In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

# Suppress every warning for the rest of the session. This hides all categories,
# deprecation notices included, so comment it out when debugging.
warnings.filterwarnings('ignore')

# Global pandas display settings; set_option persists for all later cells.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 3)
# option_context only applies inside a `with` block, so calling it bare changed
# nothing. Use set_option so the 50-row display limit actually takes effect.
pd.set_option('display.max_rows', 50)

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import sklearn as sk

import prepare
import model

# Presumably the seed for reproducible splits/models.
# NOTE(review): no call visible in this notebook receives this value — confirm
# whether prepare/model use it or define their own seed.
random_state = 42

In [2]:
# Load the two raw CSV exports (both file names carry a 20220304 stamp).
intakes_csv = 'aac_intakes_20220304.csv'
outcomes_csv = 'aac_outcomes_20220304.csv'

intakes = pd.read_csv(intakes_csv)
outcomes = pd.read_csv(outcomes_csv)

In [3]:
# Run the two preparation helpers back to back: the output of aac_prep is
# passed straight into aac_get_dogs, and the result becomes the working frame.
df = prepare.aac_get_dogs(
    prepare.aac_prep(intakes, outcomes)
)

In [4]:
# Column to predict, and the label passed to the model helpers as the positive class.
target, positive = 'outcome_type', 'Adoption'

In [5]:
# Sanity check: (rows, columns) of the prepared frame before feature engineering.
df.shape

(53507, 18)

In [6]:
# Preview the first rows to confirm column names and value formats.
df.head()

Unnamed: 0,intake_type,intake_condition,animal_type,n_previous_stays,stay_id,outcome_type,month_intake,fixed,sex,breed_mixed,breed_1,color_1,age_intake,found_in_austin,found_in_travis,found_outside_jurisdiction,found_other,akc_breed_group
8,Stray,Normal,Dog,0,A664257_0,Adoption,October,False,female,True,Podengo Pequeno,Black,1460 days,False,True,False,False,Hound
9,Stray,Normal,Dog,0,A664266_0,Transfer,October,False,female,True,Chihuahua Shorthair,Buff,365 days,True,False,False,False,Toy
14,Owner Surrender,Injured,Dog,0,A651630_0,Adoption,October,True,female,True,Labrador Retriever,Tan,2190 days,False,False,True,False,Sporting
16,Stray,Normal,Dog,0,A664269_0,Adoption,October,True,male,True,Great Pyrenees,White,730 days,True,False,False,False,Working
24,Stray,Normal,Dog,0,A664272_0,Transfer,October,True,female,True,Cairn Terrier,Brown,365 days,True,False,False,False,Terrier


#### Relabel all but the 10 most frequent breeds as "Other" to reduce dimensionality

In [7]:
# Keep the ten most frequent primary breeds; every other value becomes 'Other'.
breed = df.breed_1
top_10_breeds = breed.value_counts().index[:10].tolist()
is_top_breed = breed.isin(top_10_breeds)
df['breed_1_reduced'] = np.where(is_top_breed, breed, 'Other')

#### Do the same with colors

In [8]:
# Keep the ten most frequent primary colors; every other value becomes 'Other'.
color = df.color_1
top_10_colors = color.value_counts().index[:10].tolist()
is_top_color = color.isin(top_10_colors)
df['color_1_reduced'] = np.where(is_top_color, color, 'Other')

### Prep for Modeling

In [9]:
# Replace df with the modeling-ready frame returned by the helper (the shape
# check in the next cell shows the column count growing from 18 to 70).
# NOTE(review): df is rebound to its own transformed version, so re-running this
# cell feeds the already-transformed frame back in — restart from the load cell instead.
df = prepare.aac_prep_for_modeling(df)

In [10]:
# Sanity check: the row count should be unchanged; only the column count grows.
df.shape

(53507, 70)

# Modeling

### Train/Validate/Test Split

In [11]:
# Split the modeling frame into three partitions; the helper prints each partition's size.
# NOTE(review): random_state from the first cell is not passed here — confirm
# how (or whether) the split is seeded inside prepare.
train, validate, test = prepare.train_validate_test_split(df)

train	 n = 29963
validate n = 12842
test	 n = 10702


In [12]:
# Rebind the three partitions to the versions returned by the scaling helper.
# NOTE(review): the inputs are overwritten, so re-running this cell would pass
# already-processed frames back in — re-run the split cell first.
train, validate, test = prepare.scale_aac(train, validate, test)

#### Establish storage for model metadata and results

In [13]:
# Start with a zeroed model counter and two separate empty frames; the model
# helpers below receive these and hand back replacements after each run.
model_number = 0
model_info, model_results = pd.DataFrame(), pd.DataFrame()

### Baseline

In [14]:
# Most frequent outcome, used as the baseline prediction. It is taken from the
# training partition only, so validate/test rows do not influence the baseline choice.
train[target].mode()

0    Adoption
dtype: object

In [15]:
# Baseline run: pass the train/validate partitions, the target settings and the
# current tracking objects to the helper, then rebind the three tracking names
# to whatever it returns.
tracking = (model_number, model_info, model_results)
model_number, model_info, model_results = model.run_baseline(
    train, validate, target, positive, *tracking
)

### RFE Decision Tree

In [16]:
# Decision-tree sweep: same positional arguments as the baseline call; the three
# tracking names are rebound to the helper's return values.
tracking = (model_number, model_info, model_results)
model_number, model_info, model_results = model.rfe_decision_tree(
    train, validate, target, positive, *tracking
)

Generating 1 of 88 models.     
Generating 2 of 88 models.     
Generating 3 of 88 models.     
Generating 4 of 88 models.     
Generating 5 of 88 models.     
Generating 6 of 88 models.     
Generating 7 of 88 models.     
Generating 8 of 88 models.     
Generating 9 of 88 models.     
Generating 10 of 88 models.     
Generating 11 of 88 models.     
Generating 12 of 88 models.     
Generating 13 of 88 models.     
Generating 14 of 88 models.     
Generating 15 of 88 models.     
Generating 16 of 88 models.     
Generating 17 of 88 models.     
Generating 18 of 88 models.     
Generating 19 of 88 models.     
Generating 20 of 88 models.     
Generating 21 of 88 models.     
Generating 22 of 88 models.     
Generating 23 of 88 models.     
Generating 24 of 88 models.     
Generating 25 of 88 models.     
Generating 26 of 88 models.     
Generating 27 of 88 models.     
Generating 28 of 88 models.     
Generating 29 of 88 models.     
Generating 30 of 88 models.     
Generating 31 of 88

### RFE Random Forest

In [None]:
# Random-forest sweep: same positional arguments as the earlier model calls; the
# three tracking names are rebound to the helper's return values.
tracking = (model_number, model_info, model_results)
model_number, model_info, model_results = model.rfe_random_forest(
    train, validate, target, positive, *tracking
)

Generating 1 of 99 models.     
88
Generating 2 of 99 models.     
89
Generating 3 of 99 models.     
90
Generating 4 of 99 models.     
91
Generating 5 of 99 models.     
92
Generating 6 of 99 models.     
93
Generating 7 of 99 models.     


### RFE Logistic Regression

In [None]:
# Logistic-regression sweep: same positional arguments as the earlier model
# calls; the three tracking names are rebound to the helper's return values.
tracking = (model_number, model_info, model_results)
model_number, model_info, model_results = model.rfe_log_regression(
    train, validate, target, positive, *tracking
)

In [None]:
# Show the accumulated results for every model run so far.
model.display_model_results(model_results)

In [None]:
# Show only the rows whose model_number equals the string 'baseline'.
is_baseline = model_results.model_number == 'baseline'
model.display_model_results(model_results[is_baseline])

In [None]:
# Narrow the results with the module's best-model selector, then display them.
best_results = model.get_best_model_results(model_results)
model.display_model_results(best_results)

In [None]:
# Metadata rows for three hand-picked model numbers.
# NOTE(review): these numbers are hard-coded — confirm they still match the
# models of interest after a fresh run.
models_of_interest = [32, 59, 60]
in_selection = model_info.model_number.isin(models_of_interest)
model_info[in_selection]

In [None]:
# Print the feature list stored in model_info for each hand-picked model number.
for model_num in [32, 59, 60]:
    matches = model_info[model_info.model_number == model_num]
    print(f'Model #{model_num} Features:')
    print('-' * 20)
    if matches.empty:
        # A shortened or interrupted sweep never records this number; report it
        # instead of failing with IndexError on .values[0].
        print('(no entry in model_info)')
    else:
        # features holds one iterable of feature names per model row.
        for feature in matches.features.values[0]:
            print(feature)
    print()
          