In [1]:
import numpy as np
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
from sqlalchemy import create_engine
from config import db_password
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Animal_Shelter"
engine = create_engine(db_string)

In [3]:
animal_center_df = pd.read_sql_table('clean_data', con=engine)
animal_center_df

Unnamed: 0,animal_id,intake_date,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,outcome_date,outcome_type,age_upon_outcome,days_in_center
0,A670083,2014-01-01,Stray,Normal,Dog,Female,0.0,2014-01-07,Adoption or RTO,0,6
1,A670077,2014-01-01,Public Assist,Normal,Dog,Male,1.0,2014-01-19,Transfer,1,18
2,A670032,2014-01-01,Stray,Normal,Dog,Female,6.0,2014-01-01,Adoption or RTO,6,0
3,A670075,2014-01-01,Owner Surrender,Normal,Cat,Female,6.0,2014-01-16,Adoption or RTO,6,15
4,A670055,2014-01-01,Stray,Normal,Dog,Male,5.0,2014-01-05,Adoption or RTO,5,4
...,...,...,...,...,...,...,...,...,...,...,...
92089,A827802,2020-12-31,Owner Surrender,Normal,Dog,Male,11.0,2021-01-05,Euthanasia,11,5
92090,A827240,2020-12-31,Stray,Normal,Cat,Male,1.0,2021-01-07,Adoption or RTO,1,7
92091,A827796,2020-12-31,Owner Surrender,Sick,Cat,Male,0.0,2020-12-31,Still in center,0,0
92092,A827808,2020-12-31,Stray,Normal,Dog,Female,2.0,2020-12-31,Still in center,2,0


In [4]:
# convert textual data into numerical
animal_center_encoded = pd.get_dummies(animal_center_df, columns=['intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake'])
animal_center_encoded

Unnamed: 0,animal_id,intake_date,age_upon_intake,outcome_date,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,...,intake_condition_Aged,intake_condition_Behavior,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,animal_type_Cat,animal_type_Dog,sex_upon_intake_Female,sex_upon_intake_Male
0,A670083,2014-01-01,0.0,2014-01-07,Adoption or RTO,0,6,0,0,0,...,0,0,0,1,0,0,0,1,1,0
1,A670077,2014-01-01,1.0,2014-01-19,Transfer,1,18,0,0,0,...,0,0,0,1,0,0,0,1,0,1
2,A670032,2014-01-01,6.0,2014-01-01,Adoption or RTO,6,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0
3,A670075,2014-01-01,6.0,2014-01-16,Adoption or RTO,6,15,0,0,1,...,0,0,0,1,0,0,1,0,1,0
4,A670055,2014-01-01,5.0,2014-01-05,Adoption or RTO,5,4,0,0,0,...,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92089,A827802,2020-12-31,11.0,2021-01-05,Euthanasia,11,5,0,0,1,...,0,0,0,1,0,0,0,1,0,1
92090,A827240,2020-12-31,1.0,2021-01-07,Adoption or RTO,1,7,0,0,0,...,0,0,0,1,0,0,1,0,0,1
92091,A827796,2020-12-31,0.0,2020-12-31,Still in center,0,0,0,0,1,...,0,0,0,0,0,1,1,0,0,1
92092,A827808,2020-12-31,2.0,2020-12-31,Still in center,2,0,0,0,0,...,0,0,0,1,0,0,0,1,1,0


In [5]:
# drop unncessary columns
animal_center_encoded = animal_center_encoded.drop(columns=['animal_id', 'intake_date', 'outcome_date'])
animal_center_encoded

Unnamed: 0,age_upon_intake,outcome_type,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Behavior,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,animal_type_Cat,animal_type_Dog,sex_upon_intake_Female,sex_upon_intake_Male
0,0.0,Adoption or RTO,0,6,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0
1,1.0,Transfer,1,18,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1
2,6.0,Adoption or RTO,6,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0
3,6.0,Adoption or RTO,6,15,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0
4,5.0,Adoption or RTO,5,4,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92089,11.0,Euthanasia,11,5,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1
92090,1.0,Adoption or RTO,1,7,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1
92091,0.0,Still in center,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1
92092,2.0,Still in center,2,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0


In [6]:
# define the features set
X = animal_center_encoded.copy()
X = X.drop('outcome_type', axis=1)
X.head()

Unnamed: 0,age_upon_intake,age_upon_outcome,days_in_center,intake_type_Abandoned,intake_type_Euthanasia Request,intake_type_Owner Surrender,intake_type_Public Assist,intake_type_Stray,intake_condition_Aged,intake_condition_Behavior,intake_condition_Feral,intake_condition_Normal,intake_condition_Pregnant,intake_condition_Sick,animal_type_Cat,animal_type_Dog,sex_upon_intake_Female,sex_upon_intake_Male
0,0.0,0,6,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0
1,1.0,1,18,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1
2,6.0,6,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0
3,6.0,6,15,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0
4,5.0,5,4,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1


In [7]:
# create target
y = animal_center_encoded['outcome_type'].values

In [8]:
# splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [9]:
# creating a StandardScaler instance
scaler = StandardScaler()
# fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=0) 

In [11]:
# fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [12]:
# making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array(['Adoption or RTO', 'Transfer', 'Adoption or RTO', ..., 'Transfer',
       'Transfer', 'Transfer'], dtype=object)

In [13]:
# calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1", "Actual 2", "Actual 3", "Actual 4"], columns=["Predicted 0", "Predicted 1", "Predicted 2", "Predicted 3", "Predicted 4"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1,Predicted 2,Predicted 3,Predicted 4
Actual 0,13437,22,125,3,1274
Actual 1,132,3,16,0,80
Actual 2,256,3,165,0,253
Actual 3,23,0,6,1,18
Actual 4,3830,9,91,1,3276


In [14]:
# calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.7332348853370396

In [15]:
print(classification_report(y_test, predictions))

                 precision    recall  f1-score   support

Adoption or RTO       0.76      0.90      0.83     14861
Died or Missing       0.08      0.01      0.02       231
     Euthanasia       0.41      0.24      0.31       677
Still in center       0.20      0.02      0.04        48
       Transfer       0.67      0.45      0.54      7207

       accuracy                           0.73     23024
      macro avg       0.42      0.33      0.35     23024
   weighted avg       0.71      0.73      0.71     23024



In [16]:
importances = rf_model.feature_importances_
# sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.6140368894148935, 'days_in_center'),
 (0.07530534321626782, 'age_upon_outcome'),
 (0.07474004496157176, 'age_upon_intake'),
 (0.053173183196192214, 'animal_type_Cat'),
 (0.044532570535985354, 'animal_type_Dog'),
 (0.029323974019164575, 'intake_type_Stray'),
 (0.0273195326549303, 'intake_condition_Normal'),
 (0.0259909291473747, 'intake_condition_Sick'),
 (0.021417118205336757, 'intake_type_Owner Surrender'),
 (0.014651317449972362, 'intake_type_Public Assist'),
 (0.005935705293713111, 'intake_type_Euthanasia Request'),
 (0.0048396805144842805, 'sex_upon_intake_Female'),
 (0.004738654062810294, 'sex_upon_intake_Male'),
 (0.0011982590005976792, 'intake_type_Abandoned'),
 (0.0010933760089083225, 'intake_condition_Aged'),
 (0.0008706807550454535, 'intake_condition_Feral'),
 (0.0007500045041141287, 'intake_condition_Pregnant'),
 (8.27370586373501e-05, 'intake_condition_Behavior')]