# Drop the "breed" column (a low-ranked feature) and rerun the model

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from path import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
from urllib.parse import quote

from sqlalchemy import create_engine
from config import db_password

# Build the PostgreSQL connection URL for the local Animal_Shelter database.
# The password is percent-encoded (safe="" also escapes "/") so that characters
# such as "@", ":", "/", "+" or spaces cannot be misread as URL delimiters.
# Passwords made only of unreserved characters produce the same URL as before.
encoded_password = quote(str(db_password), safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/Animal_Shelter"
engine = create_engine(db_string)

In [3]:
# Read the whole `clean_data` table through the SQLAlchemy engine into a DataFrame.
animal_center_df = pd.read_sql_table(table_name='clean_data', con=engine)
animal_center_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,outcome_date,outcome_type,age_upon_outcome,days_in_center
0,A670075,2014-01-01,Owner Surrender,Normal,Cat,Female,6.0,Maine Coon Mix,2014-01-16,Adoption or RTO,6.0,16
1,A670000,2014-01-01,Public Assist,Normal,Cat,Male,0.0,Domestic Shorthair Mix,2014-01-11,Transfer,0.0,11
2,A670085,2014-01-01,Stray,Sick,Cat,Male,5.0,Domestic Longhair Mix,2014-01-05,Euthanasia,5.0,5
3,A670066,2014-01-01,Stray,Normal,Cat,Female,1.0,Domestic Shorthair Mix,2014-01-12,Adoption or RTO,1.0,12
4,A670056,2014-01-01,Stray,Normal,Cat,Female,2.0,Domestic Shorthair Mix,2014-01-02,Transfer,2.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
38319,A827775,2020-12-30,Stray,Normal,Cat,Female,0.0,Domestic Shorthair Mix,2020-12-31,Transfer,0.0,2
38320,A827752,2020-12-30,Owner Surrender,Normal,Cat,Male,0.0,Domestic Medium Hair Mix,2020-12-31,Adoption or RTO,0.0,2
38321,A827778,2020-12-30,Stray,Normal,Cat,Male,2.0,Domestic Shorthair,2020-12-30,Still in center,2.0,1
38322,A827796,2020-12-31,Owner Surrender,Sick,Cat,Male,0.0,Domestic Shorthair,2020-12-31,Still in center,0.0,1


In [4]:
# One-hot encode the categorical intake columns so they become numeric indicator columns.
categorical_columns = ['intake_type', 'intake_condition', 'sex_upon_intake']
animal_center_encoded = pd.get_dummies(animal_center_df, columns=categorical_columns)
animal_center_encoded

Unnamed: 0,animal_id,intake_date,animal_type,age_upon_intake,breed,outcome_date,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,...,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,sex_upon_intake_Female,sex_upon_intake_Male
0,A670075,2014-01-01,Cat,6.0,Maine Coon Mix,2014-01-16,Adoption or RTO,6.0,16,0,...,1,0,0,0,0,1,0,0,1,0
1,A670000,2014-01-01,Cat,0.0,Domestic Shorthair Mix,2014-01-11,Transfer,0.0,11,0,...,0,1,0,0,0,1,0,0,0,1
2,A670085,2014-01-01,Cat,5.0,Domestic Longhair Mix,2014-01-05,Euthanasia,5.0,5,0,...,0,0,1,0,0,0,0,1,0,1
3,A670066,2014-01-01,Cat,1.0,Domestic Shorthair Mix,2014-01-12,Adoption or RTO,1.0,12,0,...,0,0,1,0,0,1,0,0,1,0
4,A670056,2014-01-01,Cat,2.0,Domestic Shorthair Mix,2014-01-02,Transfer,2.0,2,0,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38319,A827775,2020-12-30,Cat,0.0,Domestic Shorthair Mix,2020-12-31,Transfer,0.0,2,0,...,0,0,1,0,0,1,0,0,1,0
38320,A827752,2020-12-30,Cat,0.0,Domestic Medium Hair Mix,2020-12-31,Adoption or RTO,0.0,2,0,...,1,0,0,0,0,1,0,0,0,1
38321,A827778,2020-12-30,Cat,2.0,Domestic Shorthair,2020-12-30,Still in center,2.0,1,0,...,0,0,1,0,0,1,0,0,0,1
38322,A827796,2020-12-31,Cat,0.0,Domestic Shorthair,2020-12-31,Still in center,0.0,1,0,...,1,0,0,0,0,0,0,1,0,1


In [5]:
# Remove the columns that are not used as model inputs (including "breed").
# NOTE: this rebinds `animal_center_encoded`, so re-running the cell without
# re-running the encoding cell first raises a KeyError for the missing columns.
columns_to_drop = ['animal_id', 'breed', 'animal_type', 'intake_date', 'outcome_date']
animal_center_encoded = animal_center_encoded.drop(columns=columns_to_drop)
animal_center_encoded

Unnamed: 0,age_upon_intake,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,sex_upon_intake_Female,sex_upon_intake_Male
0,6.0,Adoption or RTO,6.0,16,0,0,1,0,0,0,0,1,0,0,1,0
1,0.0,Transfer,0.0,11,0,0,0,1,0,0,0,1,0,0,0,1
2,5.0,Euthanasia,5.0,5,0,0,0,0,1,0,0,0,0,1,0,1
3,1.0,Adoption or RTO,1.0,12,0,0,0,0,1,0,0,1,0,0,1,0
4,2.0,Transfer,2.0,2,0,0,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38319,0.0,Transfer,0.0,2,0,0,0,0,1,0,0,1,0,0,1,0
38320,0.0,Adoption or RTO,0.0,2,0,0,1,0,0,0,0,1,0,0,0,1
38321,2.0,Still in center,2.0,1,0,0,0,0,1,0,0,1,0,0,0,1
38322,0.0,Still in center,0.0,1,0,0,1,0,0,0,0,0,0,1,0,1


In [6]:
# Feature matrix: every remaining column except the target.
# DataFrame.drop returns a new frame, so `animal_center_encoded` is left unchanged.
X = animal_center_encoded.drop(columns='outcome_type')
X.head()

Unnamed: 0,age_upon_intake,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,sex_upon_intake_Female,sex_upon_intake_Male
0,6.0,6.0,16,0,0,1,0,0,0,0,1,0,0,1,0
1,0.0,0.0,11,0,0,0,1,0,0,0,1,0,0,0,1
2,5.0,5.0,5,0,0,0,0,1,0,0,0,0,1,0,1
3,1.0,1.0,12,0,0,0,0,1,0,0,1,0,0,1,0
4,2.0,2.0,2,0,0,0,0,1,0,0,1,0,0,1,0


In [7]:
# Target vector: the outcome label of each row, taken as the column's underlying array.
y = animal_center_encoded.loc[:, 'outcome_type'].values

In [8]:
# Split features and target into train and test parts; the fixed seed keeps
# the split reproducible between runs.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Standardize the features. The scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics leak into the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself
X_train_scaled, X_test_scaled = (
    X_scaler.transform(part) for part in (X_train, X_test)
)

# Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees; the fixed seed makes training repeatable.
rf_params = {'n_estimators': 100, 'random_state': 1}
rf_model = RandomForestClassifier(**rf_params)

In [11]:
# Train the random forest on the scaled training data.
# The result is assigned back so the cell does not echo the estimator repr.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [12]:
# Predict an outcome label for every row of the scaled test features.
predictions = rf_model.predict(X=X_test_scaled)
predictions

array(['Adoption or RTO', 'Adoption or RTO', 'Adoption or RTO', ...,
       'Transfer', 'Adoption or RTO', 'Transfer'], dtype=object)

In [13]:
# Confusion matrix for the random forest: rows are the actual outcomes,
# columns the predicted ones.
# Passing the model's own class list as `labels` pins the matrix shape and
# ordering, so building the DataFrame cannot fail with a shape mismatch when a
# rare class happens to be absent from both the test split and the predictions.
rf_classes = rf_model.classes_
cm = confusion_matrix(y_test, predictions, labels=rf_classes)

# Label rows and columns with the outcome names rather than positional numbers,
# so the table can be read without cross-referencing the class order.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in rf_classes],
    columns=[f"Predicted {label}" for label in rf_classes],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,4390,9,46,1,640
Actual 1,69,1,10,0,47
Actual 2,95,2,120,2,187
Actual 3,8,0,4,0,8
Actual 4,1407,4,57,0,2474


In [14]:
# Fraction of test rows whose predicted outcome matches the actual outcome.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.7290470723306545

In [15]:
# Per-class precision, recall and F1 for the random forest predictions.
rf_report = classification_report(y_test, predictions)
print(rf_report)

                 precision    recall  f1-score   support

Adoption or RTO       0.74      0.86      0.79      5086
Died or Missing       0.06      0.01      0.01       127
     Euthanasia       0.51      0.30      0.37       406
Still in center       0.00      0.00      0.00        20
       Transfer       0.74      0.63      0.68      3942

       accuracy                           0.73      9581
      macro avg       0.41      0.36      0.37      9581
   weighted avg       0.72      0.73      0.72      9581



In [16]:
# Pair each feature name with its importance from the fitted forest and list
# the pairs from most to least important.
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

[(0.7509129312236001, 'days_in_center'),
 (0.06867516341486024, 'age_upon_intake'),
 (0.06228872970726233, 'age_upon_outcome'),
 (0.036382444172508056, 'intake_type_Stray'),
 (0.026822058732258385, 'intake_type_Owner Surrender'),
 (0.020159412881855646, 'intake_condition_Normal'),
 (0.017452589728868893, 'intake_condition_Sick'),
 (0.00637958642609142, 'intake_type_Public Assist'),
 (0.003583342762026352, 'sex_upon_intake_Male'),
 (0.00328903702657442, 'sex_upon_intake_Female'),
 (0.0013024605090472255, 'intake_condition_Feral'),
 (0.0010571779199311355, 'intake_type_Euthanasia Request'),
 (0.0006670319466509303, 'intake_type_Abandoned'),
 (0.0006006984432178586, 'intake_condition_Aged'),
 (0.0004273351052468191, 'intake_condition_Pregnant')]

# Gradient Boosting Classifier

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

# Fit one gradient boosting model per candidate learning rate and print its
# accuracy on the training split and on the held-out split. The held-out split
# is the test set from train_test_split, labelled "validation" in the output.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=learning_rate,
        max_features=5,
        max_depth=3,
        random_state=1,
    )
    classifier.fit(X_train_scaled, y_train.ravel())

    train_accuracy = classifier.score(X_train_scaled, y_train)
    validation_accuracy = classifier.score(X_test_scaled, y_test)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

Learning rate:  0.05
Accuracy score (training): 0.749
Accuracy score (validation): 0.740
Learning rate:  0.1
Accuracy score (training): 0.753
Accuracy score (validation): 0.744
Learning rate:  0.25
Accuracy score (training): 0.756
Accuracy score (validation): 0.743
Learning rate:  0.5
Accuracy score (training): 0.474
Accuracy score (validation): 0.467
Learning rate:  0.75
Accuracy score (training): 0.041
Accuracy score (validation): 0.040
Learning rate:  1
Accuracy score (training): 0.568
Accuracy score (validation): 0.567


In [18]:
# Refit gradient boosting with learning_rate=0.1 and predict outcomes for the
# scaled test features. fit() returns the estimator, so the calls can be chained.
gb_params = dict(n_estimators=100, learning_rate=0.1, max_features=5, max_depth=3, random_state=1)
classifier = GradientBoostingClassifier(**gb_params)
predictions = classifier.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [19]:
# Accuracy of the gradient boosting predictions on the test split.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7440768187036844


In [20]:
# Confusion matrix for the gradient boosting model: rows are the actual
# outcomes, columns the predicted ones.
# Passing the classifier's own class list as `labels` pins the matrix shape and
# ordering, so building the DataFrame cannot fail with a shape mismatch when a
# rare class happens to be absent from both the test split and the predictions.
gb_classes = classifier.classes_
cm = confusion_matrix(y_test, predictions, labels=gb_classes)

# Label rows and columns with the outcome names rather than positional numbers.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in gb_classes],
    columns=[f"Predicted {label}" for label in gb_classes],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,4589,0,23,1,473
Actual 1,70,0,3,0,54
Actual 2,80,0,117,1,208
Actual 3,5,0,4,3,8
Actual 4,1490,0,32,0,2420


In [21]:
# Per-class precision, recall and F1 for the gradient boosting predictions.
print("Classification Report")
gb_report = classification_report(y_test, predictions)
print(gb_report)

Classification Report
                 precision    recall  f1-score   support

Adoption or RTO       0.74      0.90      0.81      5086
Died or Missing       0.00      0.00      0.00       127
     Euthanasia       0.65      0.29      0.40       406
Still in center       0.60      0.15      0.24        20
       Transfer       0.77      0.61      0.68      3942

       accuracy                           0.74      9581
      macro avg       0.55      0.39      0.43      9581
   weighted avg       0.73      0.74      0.73      9581

