In [1]:
import numpy as np
import pandas as pd
from path import Path

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from sqlalchemy import create_engine
from config import db_password
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Animal_Shelter"
engine = create_engine(db_string)

In [4]:
animal_center_df = pd.read_sql_table('clean_data', con=engine)
animal_center_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_date,outcome_type,age_upon_outcome,days_in_center
0,A670075,2014-01-01,Owner Surrender,Normal,Cat,Female,6.0,Maine Coon Mix,2014-01-16,Adoption or RTO,6.0,16
1,A670000,2014-01-01,Public Assist,Normal,Cat,Male,0.0,Domestic Shorthair Mix,2014-01-11,Transfer,0.0,11
2,A670085,2014-01-01,Stray,Sick,Cat,Male,5.0,Domestic Longhair Mix,2014-01-05,Euthanasia,5.0,5
3,A670066,2014-01-01,Stray,Normal,Cat,Female,1.0,Domestic Shorthair Mix,2014-01-12,Adoption or RTO,1.0,12
4,A670056,2014-01-01,Stray,Normal,Cat,Female,2.0,Domestic Shorthair Mix,2014-01-02,Transfer,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
38319,A827775,2020-12-30,Stray,Normal,Cat,Female,0.0,Domestic Shorthair Mix,2020-12-31,Transfer,0.0,2
38320,A827752,2020-12-30,Owner Surrender,Normal,Cat,Male,0.0,Domestic Medium Hair Mix,2020-12-31,Adoption or RTO,0.0,2
38321,A827778,2020-12-30,Stray,Normal,Cat,Male,2.0,Domestic Shorthair,2020-12-30,Still in center,2.0,1
38322,A827796,2020-12-31,Owner Surrender,Sick,Cat,Male,0.0,Domestic Shorthair,2020-12-31,Still in center,0.0,1


In [5]:
# Drop unncessary columns
animal_center_df = animal_center_df.drop(columns=['animal_id', 'animal_type', 'intake_date', 'outcome_date'])
animal_center_df

Unnamed: 0,intake_type,intake_condition,sex_upon_intake,age_upon_intake,breed,outcome_type,age_upon_outcome,days_in_center
0,Owner Surrender,Normal,Female,6.0,Maine Coon Mix,Adoption or RTO,6.0,16
1,Public Assist,Normal,Male,0.0,Domestic Shorthair Mix,Transfer,0.0,11
2,Stray,Sick,Male,5.0,Domestic Longhair Mix,Euthanasia,5.0,5
3,Stray,Normal,Female,1.0,Domestic Shorthair Mix,Adoption or RTO,1.0,12
4,Stray,Normal,Female,2.0,Domestic Shorthair Mix,Transfer,2.0,2
...,...,...,...,...,...,...,...,...
38319,Stray,Normal,Female,0.0,Domestic Shorthair Mix,Transfer,0.0,2
38320,Owner Surrender,Normal,Male,0.0,Domestic Medium Hair Mix,Adoption or RTO,0.0,2
38321,Stray,Normal,Male,2.0,Domestic Shorthair,Still in center,2.0,1
38322,Owner Surrender,Sick,Male,0.0,Domestic Shorthair,Still in center,0.0,1


In [6]:
# Convert textual data into numerical
animal_center_encoded = pd.get_dummies(animal_center_df, columns=['intake_type', 'intake_condition', 'sex_upon_intake', 'breed'])
animal_center_encoded

Unnamed: 0,age_upon_intake,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,...,breed_Siamese,breed_Siamese Mix,breed_Snowshoe,breed_Snowshoe Mix,breed_Sphynx,breed_Tonkinese,breed_Tonkinese Mix,breed_Turkish Angora,breed_Turkish Angora Mix,breed_Turkish Van Mix
0,6.0,Adoption or RTO,6.0,16,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,Transfer,0.0,11,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.0,Euthanasia,5.0,5,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,Adoption or RTO,1.0,12,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,Transfer,2.0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38319,0.0,Transfer,0.0,2,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38320,0.0,Adoption or RTO,0.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38321,2.0,Still in center,2.0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
38322,0.0,Still in center,0.0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Create features
X = animal_center_encoded.drop(columns='outcome_type')

# Create target
y = animal_center_encoded['outcome_type']
X.describe()

Unnamed: 0,age_upon_intake,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Feral,...,breed_Siamese,breed_Siamese Mix,breed_Snowshoe,breed_Snowshoe Mix,breed_Sphynx,breed_Tonkinese,breed_Tonkinese Mix,breed_Turkish Angora,breed_Turkish Angora Mix,breed_Turkish Van Mix
count,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,...,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0,38324.0
mean,1.265212,1.284156,28.057614,0.002348,0.001044,0.191629,0.022258,0.782721,0.000992,0.002166,...,0.005088,0.028233,0.000391,0.004384,7.8e-05,2.6e-05,0.000235,2.6e-05,0.00013,0.000209
std,2.728809,2.753081,66.237462,0.048404,0.03229,0.393588,0.147522,0.412399,0.031474,0.046488,...,0.071151,0.16564,0.01978,0.066065,0.008847,0.005108,0.015323,0.005108,0.011422,0.014447
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,35.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,22.0,22.0,1820.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Oversampling

In [8]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from collections import Counter

In [9]:
# Resample the data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X, y)
Counter(y_resampled)

Counter({'Adoption or RTO': 20508,
         'Transfer': 20508,
         'Euthanasia': 20508,
         'Died or Missing': 20508,
         'Still in center': 20508})

In [10]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_resampled, y_resampled, random_state=1)

In [11]:
# Train the Random Forest Classifier using the resampled data
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [12]:
# Balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7309134688606308

In [13]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[3512,  810,  177,    8,  742],
       [ 358, 3883,  365,  124,  297],
       [ 144,  619, 3723,  149,  457],
       [   0,    0,   54, 4681,  369],
       [ 996,  610,  439,  202, 2916]])

In [14]:
# Imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

Adoption or RTO       0.70      0.67      0.93      0.68      0.79      0.60      5249
Died or Missing       0.66      0.77      0.90      0.71      0.83      0.69      5027
     Euthanasia       0.78      0.73      0.95      0.76      0.83      0.68      5092
Still in center       0.91      0.92      0.98      0.91      0.95      0.89      5104
       Transfer       0.61      0.56      0.91      0.59      0.72      0.50      5163

    avg / total       0.73      0.73      0.93      0.73      0.82      0.67     25635



# SMOTE Oversampling

In [15]:
# Resample the data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X, y)
Counter(y_resampled)

Counter({'Adoption or RTO': 20508,
         'Transfer': 20508,
         'Euthanasia': 20508,
         'Died or Missing': 20508,
         'Still in center': 20508})

In [16]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_resampled, y_resampled, random_state=1)

In [17]:
# Train the Random Forest Classifier using the resampled data
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [18]:
# Balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.7579388022211747

In [19]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[3766,  504,  143,    6,  830],
       [ 259, 3793,  533,  183,  259],
       [ 108,  414, 3971,  143,  456],
       [   5,   14,    5, 4923,  157],
       [1056,  519,  468,  160, 2960]])

In [20]:
# Imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

Adoption or RTO       0.73      0.72      0.93      0.72      0.82      0.65      5249
Died or Missing       0.72      0.75      0.93      0.74      0.84      0.69      5027
     Euthanasia       0.78      0.78      0.94      0.78      0.86      0.72      5092
Still in center       0.91      0.96      0.98      0.94      0.97      0.94      5104
       Transfer       0.63      0.57      0.92      0.60      0.73      0.51      5163

    avg / total       0.75      0.76      0.94      0.75      0.84      0.70     25635



# Undersampling

In [21]:
# Resample the data with the RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X, y)
Counter(y_resampled)


Counter({'Adoption or RTO': 81,
         'Died or Missing': 81,
         'Euthanasia': 81,
         'Still in center': 81,
         'Transfer': 81})

In [22]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_resampled, y_resampled, random_state=1)

In [23]:
# Train the Random Forest Classifier using the resampled data
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [24]:
# Balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5019425161368896

In [25]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[ 8,  5,  1,  0,  3],
       [ 6,  4,  3,  1,  3],
       [ 2,  6, 10,  1,  5],
       [ 0,  1,  0, 20,  0],
       [ 5,  4,  3,  1, 10]])

In [26]:
# Imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

Adoption or RTO       0.38      0.47      0.85      0.42      0.63      0.38        17
Died or Missing       0.20      0.24      0.81      0.22      0.44      0.18        17
     Euthanasia       0.59      0.42      0.91      0.49      0.62      0.36        24
Still in center       0.87      0.95      0.96      0.91      0.96      0.92        21
       Transfer       0.48      0.43      0.86      0.45      0.61      0.36        23

    avg / total       0.52      0.51      0.88      0.51      0.66      0.45       102



# SMOTEENN (Combination Sampling)

In [27]:
# Resample the data with SMOTEENN
from imblearn.combine import SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'Adoption or RTO': 8415,
         'Died or Missing': 12430,
         'Euthanasia': 12116,
         'Still in center': 16515,
         'Transfer': 7002})

In [28]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X_resampled, y_resampled, random_state=1)

In [29]:
# Train the Random Forest Classifier using the resampled data
rf_model = RandomForestClassifier(n_estimators=128, random_state=1) 
rf_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [30]:
# Balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.9556398977651309

In [31]:
# Confusion matrix
confusion_matrix(y_test, y_pred)

array([[2138,   47,   11,    0,   19],
       [  52, 2880,  125,   21,   43],
       [  12,   91, 2771,   18,   47],
       [   0,    2,    4, 4061,    0],
       [  21,   30,   40,    0, 1687]])

In [32]:
# Imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                       pre       rec       spe        f1       geo       iba       sup

Adoption or RTO       0.96      0.97      0.99      0.96      0.98      0.96      2215
Died or Missing       0.94      0.92      0.98      0.93      0.95      0.90      3121
     Euthanasia       0.94      0.94      0.98      0.94      0.96      0.92      2939
Still in center       0.99      1.00      1.00      0.99      1.00      0.99      4067
       Transfer       0.94      0.95      0.99      0.94      0.97      0.94      1778

    avg / total       0.96      0.96      0.99      0.96      0.97      0.95     14120

