In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
from sqlalchemy import create_engine
from config import db_password
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Animal_Shelter"
engine = create_engine(db_string)

In [3]:
animal_center_df = pd.read_sql_table('clean_data', con=engine)
animal_center_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_date,outcome_type,age_upon_outcome,days_in_center
0,A670075,2014-01-01,Owner Surrender,Normal,Cat,Female,6.0,Maine Coon Mix,2014-01-16,Adoption or RTO,6.0,16
1,A670000,2014-01-01,Public Assist,Normal,Cat,Male,0.0,Domestic Shorthair Mix,2014-01-11,Transfer,0.0,11
2,A670085,2014-01-01,Stray,Sick,Cat,Male,5.0,Domestic Longhair Mix,2014-01-05,Euthanasia,5.0,5
3,A670066,2014-01-01,Stray,Normal,Cat,Female,1.0,Domestic Shorthair Mix,2014-01-12,Adoption or RTO,1.0,12
4,A670056,2014-01-01,Stray,Normal,Cat,Female,2.0,Domestic Shorthair Mix,2014-01-02,Transfer,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
38319,A827775,2020-12-30,Stray,Normal,Cat,Female,0.0,Domestic Shorthair Mix,2020-12-31,Transfer,0.0,2
38320,A827752,2020-12-30,Owner Surrender,Normal,Cat,Male,0.0,Domestic Medium Hair Mix,2020-12-31,Adoption or RTO,0.0,2
38321,A827778,2020-12-30,Stray,Normal,Cat,Male,2.0,Domestic Shorthair,2020-12-30,Still in center,2.0,1
38322,A827796,2020-12-31,Owner Surrender,Sick,Cat,Male,0.0,Domestic Shorthair,2020-12-31,Still in center,0.0,1


In [4]:
# convert textual data into numerical
animal_center_encoded = pd.get_dummies(animal_center_df, columns=['intake_type', 'intake_condition', 'sex_upon_intake', 'breed'])
animal_center_encoded

Unnamed: 0,animal_id,intake_date,animal_type,age_upon_intake,outcome_date,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,...,breed_Siamese,breed_Siamese Mix,breed_Snowshoe,breed_Snowshoe Mix,breed_Sphynx,breed_Tonkinese,breed_Tonkinese Mix,breed_Turkish Angora,breed_Turkish Angora Mix,breed_Turkish Van Mix
0,A670075,2014-01-01,Cat,6.0,2014-01-16,Adoption or RTO,6.0,16,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A670000,2014-01-01,Cat,0.0,2014-01-11,Transfer,0.0,11,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A670085,2014-01-01,Cat,5.0,2014-01-05,Euthanasia,5.0,5,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A670066,2014-01-01,Cat,1.0,2014-01-12,Adoption or RTO,1.0,12,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A670056,2014-01-01,Cat,2.0,2014-01-02,Transfer,2.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38319,A827775,2020-12-30,Cat,0.0,2020-12-31,Transfer,0.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
38320,A827752,2020-12-30,Cat,0.0,2020-12-31,Adoption or RTO,0.0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
38321,A827778,2020-12-30,Cat,2.0,2020-12-30,Still in center,2.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
38322,A827796,2020-12-31,Cat,0.0,2020-12-31,Still in center,0.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# drop unncessary columns
animal_center_encoded = animal_center_encoded.drop(columns=['animal_id', 'animal_type', 'intake_date', 'outcome_date'])
animal_center_encoded

Unnamed: 0,age_upon_intake,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,...,breed_Siamese,breed_Siamese Mix,breed_Snowshoe,breed_Snowshoe Mix,breed_Sphynx,breed_Tonkinese,breed_Tonkinese Mix,breed_Turkish Angora,breed_Turkish Angora Mix,breed_Turkish Van Mix
0,6.0,Adoption or RTO,6.0,16,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,Transfer,0.0,11,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,Euthanasia,5.0,5,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,Adoption or RTO,1.0,12,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,Transfer,2.0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38319,0.0,Transfer,0.0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38320,0.0,Adoption or RTO,0.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38321,2.0,Still in center,2.0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38322,0.0,Still in center,0.0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# define the features set
X = animal_center_encoded.copy()
X = X.drop('outcome_type', axis=1)
X.head()

Unnamed: 0,age_upon_intake,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Feral,...,breed_Siamese,breed_Siamese Mix,breed_Snowshoe,breed_Snowshoe Mix,breed_Sphynx,breed_Tonkinese,breed_Tonkinese Mix,breed_Turkish Angora,breed_Turkish Angora Mix,breed_Turkish Van Mix
0,6.0,6.0,16,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,11,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,5.0,5,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1.0,12,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,2.0,2,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# create target
y = animal_center_encoded['outcome_type'].values

In [8]:
# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# creating a StandardScaler instance
scaler = StandardScaler()
# fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier
# create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1) 

In [11]:
# fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
# making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array(['Adoption or RTO', 'Adoption or RTO', 'Adoption or RTO', ...,
       'Transfer', 'Transfer', 'Transfer'], dtype=object)

In [13]:
# calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,4272,19,47,2,746
Actual 1,60,2,7,0,58
Actual 2,108,3,115,0,180
Actual 3,4,1,1,3,11
Actual 4,1308,10,70,1,2553


In [14]:
# calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7248721427825905

In [15]:
print(classification_report(y_test, predictions))

                 precision    recall  f1-score   support

Adoption or RTO       0.74      0.84      0.79      5086
Died or Missing       0.06      0.02      0.02       127
     Euthanasia       0.48      0.28      0.36       406
Still in center       0.50      0.15      0.23        20
       Transfer       0.72      0.65      0.68      3942

       accuracy                           0.72      9581
      macro avg       0.50      0.39      0.42      9581
   weighted avg       0.71      0.72      0.71      9581



In [16]:
importances = rf_model.feature_importances_
# sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.705611261792164, 'days_in_center'),
 (0.0662163887618693, 'age_upon_intake'),
 (0.0656564774242622, 'age_upon_outcome'),
 (0.02915241918515108, 'intake_type_Stray'),
 (0.023349441993086178, 'intake_type_Owner Surrender'),
 (0.017488387860469336, 'intake_condition_Normal'),
 (0.01614172847594019, 'intake_condition_Sick'),
 (0.008028508906457926, 'breed_Domestic Shorthair'),
 (0.007668060234238934, 'breed_Domestic Shorthair Mix'),
 (0.005789039432182765, 'intake_type_Public Assist'),
 (0.005457204894542556, 'sex_upon_intake_Male'),
 (0.005396735048494595, 'breed_Domestic Medium Hair Mix'),
 (0.0053476394096597015, 'sex_upon_intake_Female'),
 (0.00472517498609284, 'breed_Siamese Mix'),
 (0.004509523921846439, 'breed_Domestic Longhair Mix'),
 (0.003469767403746896, 'breed_Domestic Medium Hair'),
 (0.002105768121508781, 'breed_Domestic Longhair'),
 (0.001812267392832681, 'breed_Siamese'),
 (0.0016782122105907596, 'breed_American Shorthair Mix'),
 (0.001600819817323498, 'breed_Snowshoe M

# GradientBoosting Classifier

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=learning_rate, max_features=5, max_depth=3, random_state=1)
    classifier.fit(X_train_scaled, y_train.ravel())
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(classifier.score(X_train_scaled, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(classifier.score(X_test_scaled, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.745
Accuracy score (validation): 0.738
Learning rate:  0.1
Accuracy score (training): 0.749
Accuracy score (validation): 0.738
Learning rate:  0.25
Accuracy score (training): 0.519
Accuracy score (validation): 0.526
Learning rate:  0.5
Accuracy score (training): 0.008
Accuracy score (validation): 0.011
Learning rate:  0.75
Accuracy score (training): 0.488
Accuracy score (validation): 0.484
Learning rate:  1
Accuracy score (training): 0.478
Accuracy score (validation): 0.475


In [18]:
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_features=5, max_depth=3, random_state=1)
classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [19]:
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7380231708589917


In [20]:
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,4494,0,16,4,572
Actual 1,70,0,2,1,54
Actual 2,70,0,105,3,228
Actual 3,1,0,1,5,13
Actual 4,1441,0,31,3,2467


In [21]:
print(classification_report(y_test, predictions))

                 precision    recall  f1-score   support

Adoption or RTO       0.74      0.88      0.81      5086
Died or Missing       0.00      0.00      0.00       127
     Euthanasia       0.68      0.26      0.37       406
Still in center       0.31      0.25      0.28        20
       Transfer       0.74      0.63      0.68      3942

       accuracy                           0.74      9581
      macro avg       0.49      0.40      0.43      9581
   weighted avg       0.73      0.74      0.72      9581



# Oversampling

In [29]:
from collections import Counter
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
Counter(y_train)

Counter({'Died or Missing': 494,
         'Adoption or RTO': 15422,
         'Transfer': 11678,
         'Euthanasia': 1088,
         'Still in center': 61})

In [30]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'Died or Missing': 15422,
         'Adoption or RTO': 15422,
         'Transfer': 15422,
         'Euthanasia': 15422,
         'Still in center': 15422})

In [34]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)


LogisticRegression(random_state=1)

In [39]:
# Calculate the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5549298249066178

In [40]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[3156,  192,  399,   93, 1246],
       [  25,   15,   39,    6,   42],
       [  23,   21,  220,   59,   83],
       [   0,    0,    0,   19,    1],
       [ 709,  112,  492,  484, 2145]])

In [41]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

Adoption or RTO       0.81      0.62      0.83      0.70      0.72      0.51      5086
Died or Missing       0.04      0.12      0.97      0.06      0.34      0.10       127
     Euthanasia       0.19      0.54      0.90      0.28      0.70      0.47       406
Still in center       0.03      0.95      0.93      0.06      0.94      0.89        20
       Transfer       0.61      0.54      0.76      0.58      0.64      0.40      3942

    avg / total       0.69      0.58      0.81      0.62      0.68      0.46      9581



# 