In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# notebook-wide pandas display settings
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 3)
# option_context() only builds a context manager; outside a `with` block it changes
# nothing, so the row limit has to be applied with set_option() like the others
pd.set_option('display.max_rows', 50)

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import sklearn as sk

import prepare
import model

random_state = 42

In [2]:
# load the raw intake and outcome records from CSV files in the working directory
# NOTE(review): the file names suggest a 2022-03-04 extract — confirm source and date
intakes = pd.read_csv('aac_intakes_20220304.csv')
outcomes = pd.read_csv('aac_outcomes_20220304.csv')

In [3]:
df = prepare.aac_prep(intakes, outcomes)
df = prepare.aac_get_dogs(df)

In [4]:
# column the models are trained to predict
target = 'outcome_type'
# class label treated as the positive outcome when computing precision, recall and F1
positive = 'Adoption'

In [5]:
df.shape

(53507, 18)

In [6]:
df.head()

Unnamed: 0,intake_type,intake_condition,animal_type,n_previous_stays,stay_id,outcome_type,month_intake,fixed,sex,breed_mixed,breed_1,color_1,age_intake,found_in_austin,found_in_travis,found_outside_jurisdiction,found_other,akc_breed_group
8,Stray,Normal,Dog,0,A664257_0,Adoption,October,False,female,True,Podengo Pequeno,Black,1460 days,False,True,False,False,Hound
9,Stray,Normal,Dog,0,A664266_0,Transfer,October,False,female,True,Chihuahua Shorthair,Buff,365 days,True,False,False,False,Toy
14,Owner Surrender,Injured,Dog,0,A651630_0,Adoption,October,True,female,True,Labrador Retriever,Tan,2190 days,False,False,True,False,Sporting
16,Stray,Normal,Dog,0,A664269_0,Adoption,October,True,male,True,Great Pyrenees,White,730 days,True,False,False,False,Working
24,Stray,Normal,Dog,0,A664272_0,Transfer,October,True,female,True,Cairn Terrier,Brown,365 days,True,False,False,False,Terrier


#### Define all but the top 10 breeds as "other" to reduce dimensionality

In [7]:
# keep the ten most frequent primary breeds as their own category; pool the rest as 'Other'
# NOTE(review): the top-10 list is derived from the full dataset, i.e. before the
# train/validate/test split performed later in the notebook
breed_counts = df['breed_1'].value_counts()
top_10_breeds = breed_counts.index[:10].tolist()
is_top_breed = df['breed_1'].isin(top_10_breeds)
df['breed_1_reduced'] = np.where(is_top_breed, df['breed_1'], 'Other')

#### Do the same with colors

In [8]:
# keep the ten most frequent primary colors as their own category; pool the rest as 'Other'
# NOTE(review): the top-10 list is derived from the full dataset, i.e. before the
# train/validate/test split performed later in the notebook
color_counts = df['color_1'].value_counts()
top_10_colors = color_counts.index[:10].tolist()
is_top_color = df['color_1'].isin(top_10_colors)
df['color_1_reduced'] = np.where(is_top_color, df['color_1'], 'Other')

### Prep for Modeling

In [9]:
df = prepare.aac_prep_for_modeling(df)

In [10]:
df.shape

(53507, 70)

# Modeling

### Train/Validate/Test Split

In [11]:
train, validate, test = prepare.train_validate_test_split(df)

train	 n = 29963
validate n = 12842
test	 n = 10702


In [12]:
train, validate, test = prepare.scale_aac(train, validate, test)

#### Establish storage for model metadata and results

In [13]:
# accumulators passed into every model.* call below and reassigned from its return value
# per-model metadata (displayed later with model type, features and hyperparameters)
model_info = pd.DataFrame()
# metric scores per model and sample (train / validate)
model_results = pd.DataFrame()
# presumably the number of the most recently generated model — verify in model.py
model_number = 0

### Baseline

In [14]:
# take the most frequent class from the training split only, so that validate/test
# rows play no part in choosing the baseline prediction
train[target].mode()

0    Adoption
dtype: object

In [15]:
model_number, model_info, model_results = model.run_baseline(train,
                                                       validate,
                                                       target,
                                                       positive,
                                                       model_number,
                                                       model_info,
                                                       model_results)

### RFE Decision Tree

In [16]:
model_number, model_info, model_results = model.rfe_decision_tree(train,
                                                                  validate, 
                                                                  target, 
                                                                  positive, 
                                                                  model_number, 
                                                                  model_info, 
                                                                  model_results)

Generating 1 of 88 models.     
Generating 2 of 88 models.     
Generating 3 of 88 models.     
Generating 4 of 88 models.     
Generating 5 of 88 models.     
Generating 6 of 88 models.     
Generating 7 of 88 models.     
Generating 8 of 88 models.     
Generating 9 of 88 models.     
Generating 10 of 88 models.     
Generating 11 of 88 models.     
Generating 12 of 88 models.     
Generating 13 of 88 models.     
Generating 14 of 88 models.     
Generating 15 of 88 models.     
Generating 16 of 88 models.     
Generating 17 of 88 models.     
Generating 18 of 88 models.     
Generating 19 of 88 models.     
Generating 20 of 88 models.     
Generating 21 of 88 models.     
Generating 22 of 88 models.     
Generating 23 of 88 models.     
Generating 24 of 88 models.     
Generating 25 of 88 models.     
Generating 26 of 88 models.     
Generating 27 of 88 models.     
Generating 28 of 88 models.     
Generating 29 of 88 models.     
Generating 30 of 88 models.     
Generating 31 of 88

### RFE Random Forest

In [17]:
model_number, model_info, model_results = model.rfe_random_forest(train,
                                                                  validate, 
                                                                  target, 
                                                                  positive, 
                                                                  model_number, 
                                                                  model_info, 
                                                                  model_results)

Generating 1 of 99 models.     
88
Generating 2 of 99 models.     
89
Generating 3 of 99 models.     
90
Generating 4 of 99 models.     
91
Generating 5 of 99 models.     
92
Generating 6 of 99 models.     
93
Generating 7 of 99 models.     
94
Generating 8 of 99 models.     
95
Generating 9 of 99 models.     
96
Generating 10 of 99 models.     
97
Generating 11 of 99 models.     
98
Generating 12 of 99 models.     
99
Generating 13 of 99 models.     
100
Generating 14 of 99 models.     
101
Generating 15 of 99 models.     
102
Generating 16 of 99 models.     
103
Generating 17 of 99 models.     
104
Generating 18 of 99 models.     
105
Generating 19 of 99 models.     
106
Generating 20 of 99 models.     
107
Generating 21 of 99 models.     
108
Generating 22 of 99 models.     
109
Generating 23 of 99 models.     
110
Generating 24 of 99 models.     
111
Generating 25 of 99 models.     
112
Generating 26 of 99 models.     
113
Generating 27 of 99 models.     
114
Generating 28 of 99 mo

### RFE Logistic Regression

In [18]:
model_number, model_info, model_results = model.rfe_log_regression(train,
                                                                  validate, 
                                                                  target, 
                                                                  positive, 
                                                                  model_number, 
                                                                  model_info, 
                                                                  model_results)

Generating 1 of 77 models.          
Generating 2 of 77 models.          
Generating 3 of 77 models.          
Generating 4 of 77 models.          
Generating 5 of 77 models.          
Generating 6 of 77 models.          
Generating 7 of 77 models.          
Generating 8 of 77 models.          
Generating 9 of 77 models.          
Generating 10 of 77 models.          
Generating 11 of 77 models.          
Generating 12 of 77 models.          
Generating 13 of 77 models.          
Generating 14 of 77 models.          
Generating 15 of 77 models.          
Generating 16 of 77 models.          
Generating 17 of 77 models.          
Generating 18 of 77 models.          
Generating 19 of 77 models.          
Generating 20 of 77 models.          
Generating 21 of 77 models.          
Generating 22 of 77 models.          
Generating 23 of 77 models.          
Generating 24 of 77 models.          
Generating 25 of 77 models.          
Generating 26 of 77 models.          
Generating 27 of 77 m

# Model Evaluation

In [19]:
model.display_model_results(model_results)

Unnamed: 0_level_0,model_number,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,baseline
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1
accuracy,train,0.697,0.697,0.701,0.702,0.702,0.702,0.702,0.702,0.698,0.698,0.703,0.703,0.704,0.704,0.704,0.704,0.703,0.698,0.703,0.704,0.704,0.705,0.705,0.705,0.703,0.703,0.706,0.704,0.704,0.706,0.708,0.709,0.703,0.703,0.706,0.707,0.707,0.708,0.71,0.71,0.703,0.705,0.708,0.707,0.709,0.711,0.713,0.714,0.703,0.705,...,0.688,0.703,0.704,0.704,0.704,0.704,0.704,0.689,0.703,0.704,0.704,0.705,0.703,0.703,0.688,0.703,0.704,0.704,0.705,0.704,0.704,0.689,0.704,0.702,0.704,0.704,0.704,0.704,0.689,0.704,0.704,0.704,0.704,0.704,0.704,0.689,0.704,0.704,0.703,0.703,0.704,0.704,0.689,0.704,0.704,0.704,0.704,0.704,0.704,0.688
accuracy,validate,0.696,0.696,0.7,0.7,0.7,0.7,0.7,0.7,0.697,0.697,0.701,0.701,0.701,0.701,0.701,0.702,0.702,0.697,0.703,0.702,0.703,0.702,0.703,0.703,0.702,0.702,0.704,0.702,0.702,0.702,0.701,0.7,0.702,0.701,0.704,0.705,0.706,0.703,0.701,0.701,0.702,0.703,0.707,0.706,0.707,0.707,0.703,0.704,0.702,0.703,...,0.688,0.702,0.703,0.702,0.702,0.702,0.702,0.688,0.702,0.703,0.704,0.704,0.703,0.703,0.688,0.702,0.704,0.703,0.704,0.703,0.703,0.689,0.703,0.702,0.704,0.704,0.703,0.703,0.689,0.703,0.702,0.704,0.704,0.703,0.703,0.689,0.702,0.702,0.703,0.703,0.704,0.704,0.689,0.703,0.702,0.702,0.702,0.704,0.704,0.688
f1_score,train,0.809,0.809,0.815,0.815,0.816,0.816,0.816,0.816,0.818,0.818,0.817,0.817,0.818,0.818,0.818,0.818,0.821,0.818,0.817,0.818,0.818,0.819,0.819,0.819,0.821,0.819,0.818,0.818,0.818,0.819,0.821,0.82,0.821,0.818,0.819,0.82,0.82,0.82,0.821,0.821,0.821,0.82,0.82,0.82,0.82,0.821,0.821,0.823,0.821,0.82,...,0.815,0.82,0.82,0.821,0.821,0.821,0.821,0.816,0.819,0.82,0.82,0.82,0.82,0.82,0.815,0.819,0.82,0.82,0.82,0.82,0.82,0.816,0.82,0.818,0.82,0.82,0.82,0.82,0.816,0.82,0.819,0.82,0.82,0.82,0.82,0.816,0.82,0.819,0.818,0.818,0.821,0.821,0.816,0.82,0.819,0.819,0.819,0.82,0.82,0.815
f1_score,validate,0.808,0.808,0.815,0.815,0.815,0.815,0.815,0.815,0.817,0.817,0.816,0.816,0.816,0.816,0.816,0.817,0.82,0.817,0.817,0.817,0.817,0.817,0.818,0.818,0.82,0.819,0.817,0.818,0.817,0.817,0.816,0.814,0.82,0.817,0.818,0.819,0.819,0.817,0.816,0.815,0.82,0.819,0.819,0.82,0.82,0.818,0.815,0.817,0.82,0.819,...,0.815,0.82,0.819,0.82,0.82,0.82,0.82,0.815,0.818,0.819,0.82,0.82,0.82,0.82,0.815,0.818,0.82,0.82,0.82,0.82,0.82,0.815,0.819,0.818,0.82,0.82,0.82,0.82,0.815,0.819,0.818,0.82,0.82,0.82,0.82,0.815,0.819,0.818,0.818,0.818,0.821,0.821,0.815,0.819,0.818,0.818,0.818,0.82,0.82,0.815
precision,train,0.714,0.714,0.71,0.71,0.71,0.71,0.71,0.71,0.699,0.699,0.709,0.709,0.71,0.71,0.71,0.71,0.703,0.698,0.709,0.709,0.709,0.709,0.709,0.709,0.703,0.706,0.712,0.708,0.709,0.709,0.711,0.715,0.703,0.708,0.712,0.711,0.711,0.713,0.713,0.713,0.703,0.706,0.713,0.709,0.713,0.716,0.719,0.717,0.703,0.706,...,0.688,0.704,0.705,0.703,0.703,0.703,0.703,0.688,0.706,0.706,0.705,0.705,0.703,0.703,0.688,0.706,0.705,0.706,0.705,0.703,0.703,0.689,0.706,0.706,0.706,0.706,0.704,0.704,0.689,0.706,0.707,0.706,0.706,0.704,0.704,0.689,0.706,0.707,0.706,0.706,0.705,0.705,0.689,0.706,0.707,0.707,0.707,0.705,0.705,0.688
precision,validate,0.714,0.714,0.708,0.708,0.709,0.709,0.709,0.709,0.698,0.698,0.708,0.708,0.708,0.708,0.708,0.708,0.702,0.698,0.708,0.707,0.708,0.707,0.707,0.707,0.702,0.705,0.71,0.707,0.708,0.707,0.707,0.709,0.702,0.706,0.71,0.709,0.71,0.71,0.708,0.709,0.702,0.705,0.712,0.708,0.711,0.713,0.714,0.711,0.702,0.705,...,0.688,0.703,0.704,0.702,0.702,0.702,0.702,0.688,0.705,0.705,0.704,0.704,0.702,0.702,0.688,0.706,0.705,0.705,0.704,0.702,0.702,0.688,0.704,0.706,0.705,0.705,0.702,0.702,0.688,0.704,0.705,0.705,0.705,0.702,0.702,0.688,0.705,0.705,0.706,0.706,0.704,0.704,0.688,0.705,0.705,0.706,0.706,0.705,0.705,0.688
recall,train,0.933,0.933,0.957,0.957,0.958,0.958,0.958,0.958,0.987,0.987,0.963,0.964,0.964,0.965,0.965,0.966,0.987,0.987,0.965,0.967,0.967,0.969,0.97,0.97,0.987,0.976,0.963,0.969,0.966,0.97,0.971,0.962,0.987,0.97,0.963,0.968,0.968,0.965,0.969,0.966,0.987,0.979,0.963,0.972,0.966,0.962,0.956,0.966,0.987,0.977,...,1.0,0.982,0.979,0.986,0.986,0.986,0.986,1.0,0.974,0.978,0.981,0.981,0.984,0.984,1.0,0.974,0.978,0.978,0.981,0.984,0.984,1.0,0.979,0.972,0.977,0.978,0.984,0.984,1.0,0.979,0.974,0.977,0.977,0.984,0.984,1.0,0.978,0.974,0.972,0.972,0.982,0.982,1.0,0.978,0.973,0.974,0.974,0.979,0.979,1.0
recall,validate,0.932,0.932,0.958,0.958,0.959,0.959,0.959,0.959,0.987,0.987,0.963,0.964,0.963,0.965,0.965,0.965,0.987,0.987,0.966,0.968,0.968,0.969,0.97,0.97,0.987,0.976,0.962,0.97,0.967,0.969,0.964,0.956,0.987,0.968,0.963,0.969,0.969,0.962,0.963,0.96,0.987,0.978,0.963,0.975,0.967,0.961,0.95,0.959,0.987,0.977,...,1.0,0.982,0.979,0.987,0.987,0.987,0.987,1.0,0.974,0.979,0.982,0.982,0.986,0.986,1.0,0.973,0.979,0.979,0.982,0.986,0.986,1.0,0.978,0.973,0.978,0.979,0.986,0.986,1.0,0.978,0.974,0.978,0.978,0.986,0.986,1.0,0.977,0.974,0.973,0.973,0.984,0.984,1.0,0.977,0.973,0.974,0.974,0.981,0.981,1.0


In [20]:
model.display_model_results(model_results[model_results.model_number == 'baseline'])

Unnamed: 0_level_0,model_number,baseline
metric_type,sample_type,Unnamed: 2_level_1
accuracy,train,0.688
accuracy,validate,0.688
f1_score,train,0.815
f1_score,validate,0.815
precision,train,0.688
precision,validate,0.688
recall,train,1.0
recall,validate,1.0


In [21]:
model.display_model_results(model.get_best_model_results(model_results))

Unnamed: 0_level_0,model_number,178,186,187
metric_type,sample_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
accuracy,train,0.724,0.72,0.725
accuracy,validate,0.713,0.713,0.713
f1_score,train,0.83,0.828,0.83
f1_score,validate,0.823,0.824,0.823
precision,train,0.721,0.717,0.722
precision,validate,0.715,0.714,0.715
recall,train,0.977,0.978,0.977
recall,validate,0.97,0.974,0.969


In [24]:
model_info[model_info.model_number.isin([178, 186, 187])]

Unnamed: 0,model_number,model_type,features,max_depth,min_samples_leaf,c_value
178,178,random forest,"[enc_fixed_unknown, enc_breed_mixed_True, enc_...",10.0,2.0,
186,186,random forest,"[enc_fixed_unknown, enc_breed_mixed_True, enc_...",9.0,2.0,
187,187,random forest,"[enc_fixed_unknown, enc_breed_mixed_True, enc_...",10.0,2.0,


In [25]:
# list the features selected for each of the top-performing models
for model_num in (178, 186, 187):
    is_model = model_info.model_number == model_num
    feature_list = model_info.loc[is_model, 'features'].values[0]
    print(f'Model #{model_num} Features:', '-' * 20, sep='\n')
    for feature in feature_list:
        print(feature)
    print()

Model #178 Features:
--------------------
enc_fixed_unknown
enc_breed_mixed_True
enc_intake_type_Public Assist
enc_intake_condition_Normal
enc_intake_condition_Sick
enc_month_intake_September
enc_sex_unknown
enc_breed_1_reduced_Other
enc_akc_breed_group_Toy
enc_found_in_austin_True
scaled_age_intake

Model #186 Features:
--------------------
enc_fixed_unknown
enc_breed_mixed_True
enc_intake_type_Owner Surrender
enc_intake_type_Public Assist
enc_intake_condition_Normal
enc_intake_condition_Sick
enc_month_intake_September
enc_sex_unknown
enc_breed_1_reduced_Other
enc_akc_breed_group_Toy
enc_found_outside_jurisdiction_True
scaled_age_intake

Model #187 Features:
--------------------
enc_fixed_unknown
enc_breed_mixed_True
enc_intake_type_Public Assist
enc_intake_type_Stray
enc_intake_condition_Normal
enc_intake_condition_Sick
enc_month_intake_September
enc_sex_unknown
enc_breed_1_reduced_Other
enc_akc_breed_group_Toy
enc_found_in_austin_True
scaled_age_intake



## Final Test
#### Choosing Model #178 since its accuracy is very close to the other top performers, but it uses one fewer feature (11 vs. 12 features)

In [28]:
# recreate the model using the same features and hyperparameters
def test_model_178(train,
                   test,
                   target,
                   positive):
    """Refit model #178 on ``train`` and score it once on ``test``.

    The model is recreated from its feature list and hyperparameters (random
    forest, ``max_depth=10``, ``min_samples_leaf=2``) and fitted on the training
    sample; predictions are then made for the test sample only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Samples containing every column named in ``features`` below plus the
        ``target`` column.
    target : str
        Name of the column to predict.
    positive : label
        Class passed as ``pos_label`` to the precision, recall and F1 scorers.

    Returns
    -------
    pandas.DataFrame
        One row per metric, in the order accuracy, precision, recall, f1_score,
        with the columns ``model_number``, ``sample_type``, ``metric_type`` and
        ``score``.

    Notes
    -----
    Relies on the notebook-level names ``RandomForestClassifier``, ``sk`` and
    ``random_state``.
    """
    model_number = 178
    sample_type = 'test'

    features = ['enc_fixed_unknown',
                'enc_breed_mixed_True',
                'enc_intake_type_Public Assist',
                'enc_intake_condition_Normal',
                'enc_intake_condition_Sick',
                'enc_month_intake_September',
                'enc_sex_unknown',
                'enc_breed_1_reduced_Other',
                'enc_akc_breed_group_Toy',
                'enc_found_in_austin_True',
                'scaled_age_intake']

    max_depth = 10
    min_samples_leaf = 2

    # separate each sample into x (features) and y (target)
    x_train = train[features]
    y_train = train[target]

    x_test = test[features]
    y_test = test[target]

    # establish the random forest classifier (once) and fit it to the training data
    clf = RandomForestClassifier(max_depth=max_depth,
                                 min_samples_leaf=min_samples_leaf,
                                 random_state=random_state)
    clf = clf.fit(x_train, y_train)

    #####################
    ### Model Results ###
    #####################

    ####### test #######

    # predictions for the held-out test sample
    y_pred = clf.predict(x_test)

    # (metric name, score) pairs; accuracy takes no positive label
    scores = [
        ('accuracy', sk.metrics.accuracy_score(y_test, y_pred)),
        ('precision', sk.metrics.precision_score(y_test, y_pred, pos_label=positive)),
        ('recall', sk.metrics.recall_score(y_test, y_pred, pos_label=positive)),
        ('f1_score', sk.metrics.f1_score(y_test, y_pred, pos_label=positive)),
    ]

    # build the frame in one step: DataFrame.append was deprecated in pandas 1.4
    # and removed in 2.0, and growing a frame row by row is needlessly slow
    records = [{'model_number': model_number,
                'sample_type': sample_type,
                'metric_type': metric_type,
                'score': score}
               for metric_type, score in scores]

    return pd.DataFrame(records)

In [30]:
test_model_178(train, test, target, positive)[['metric_type', 'score']]

Unnamed: 0,metric_type,score
0,accuracy,0.705
1,precision,0.707
2,recall,0.969
3,f1_score,0.817
