In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# notebook-wide pandas display options
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 3)
# option_context only has an effect inside a `with` block, so calling it bare
# changed nothing; set_option applies the row limit to the whole notebook
pd.set_option('display.max_rows', 50)

from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import sklearn as sk

import prepare
import model

# seed value kept in one place so stochastic steps can share it
random_state = 42

In [2]:
# load the raw intake and outcome exports (file names carry a 20220304 date stamp)
intakes, outcomes = (
    pd.read_csv(csv_name)
    for csv_name in ('aac_intakes_20220304.csv', 'aac_outcomes_20220304.csv')
)

In [3]:
# combine intakes and outcomes with the project's prep step, then keep only the dog records
prepared = prepare.aac_prep(intakes, outcomes)
df = prepare.aac_get_dogs(prepared)

In [4]:
# the column being predicted, and the class treated as "positive" when scoring
target, positive = 'outcome_type', 'Adoption'

In [5]:
# (rows, columns) of the frame returned by prepare.aac_get_dogs
df.shape

(53507, 17)

In [6]:
# preview the prepared frame (pandas truncates the display to the first and last rows)
df

Unnamed: 0,intake_type,intake_condition,animal_type,n_previous_stays,stay_id,outcome_type,month_intake,fixed,sex,breed_mixed,breed_1,color_1,age_intake,found_in_austin,found_in_travis,found_outside_jurisdiction,found_other
8,Stray,Normal,Dog,0,A664257_0,Adoption,October,False,female,True,Podengo Pequeno,Black,1460 days,False,True,False,False
9,Stray,Normal,Dog,0,A664266_0,Transfer,October,False,female,True,Chihuahua Shorthair,Buff,365 days,True,False,False,False
14,Owner Surrender,Injured,Dog,0,A651630_0,Adoption,October,True,female,True,Labrador Retriever,Tan,2190 days,False,False,True,False
16,Stray,Normal,Dog,0,A664269_0,Adoption,October,True,male,True,Great Pyrenees,White,730 days,True,False,False,False
24,Stray,Normal,Dog,0,A664272_0,Transfer,October,True,female,True,Cairn Terrier,Brown,365 days,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136251,Stray,Normal,Dog,0,A852194_0,Transfer,February,False,female,False,Cane Corso,Blue Cream,120 days,True,False,False,False
136259,Owner Surrender,Normal,Dog,0,A852246_0,Adoption,February,False,female,True,Labrador Retriever,Black,60 days,False,True,False,False
136260,Owner Surrender,Normal,Dog,0,A852245_0,Adoption,February,False,male,True,Labrador Retriever,Black,60 days,False,True,False,False
136289,Owner Surrender,Normal,Dog,0,A852403_0,Transfer,March,False,female,True,Maltese,White,180 days,True,False,False,False


#### Define all but the top 10 breeds as "other" to reduce dimensionality

In [7]:
# keep the ten most frequent primary breeds; every other breed is labelled 'Other'
breed_counts = df['breed_1'].value_counts()
top_10_breeds = breed_counts.index[:10].tolist()
is_top_breed = df['breed_1'].isin(top_10_breeds)
df['breed_1_reduced'] = np.where(is_top_breed, df['breed_1'], 'Other')

#### Do the same with colors

In [8]:
# keep the ten most frequent primary colors; every other color is labelled 'Other'
color_counts = df['color_1'].value_counts()
top_10_colors = color_counts.index[:10].tolist()
is_top_color = df['color_1'].isin(top_10_colors)
df['color_1_reduced'] = np.where(is_top_color, df['color_1'], 'Other')

### Prep for Modeling

In [9]:
# hand the frame to the project's modeling-prep step (implemented in prepare.py, not shown here).
# NOTE(review): df is rebound to the result, so re-running this cell feeds already-prepped
# data back in -- confirm the helper tolerates that, or restart the kernel before re-running.
df = prepare.aac_prep_for_modeling(df)

In [10]:
# (rows, columns) after the modeling-prep step
df.shape

(53507, 61)

# Modeling

### Train/Validate/Test Split

In [11]:
# split the prepared frame into train / validate / test samples using the project helper
# (the helper prints the size of each sample)
train, validate, test = prepare.train_validate_test_split(df)

train	 n = 29963
validate n = 12842
test	 n = 10702


In [12]:
# scaling is delegated to prepare.scale_aac, which returns the three samples again
# NOTE(review): presumably the scaler is fit on train only -- confirm in prepare.py
train, validate, test = prepare.scale_aac(train, validate, test)

#### establish infrastructure for storage

In [13]:
# bookkeeping shared by every modeling function below:
#   model_info    -> one row per model (type, features, hyperparameters)
#   model_results -> one row per (model, sample, metric) score
model_info, model_results = pd.DataFrame(), pd.DataFrame()
# running counter used to number the models
model_number = 0

### Baseline

In [14]:
# the baseline class is the most frequent outcome in the train sample only
# (the same quantity run_baseline uses), so validate/test rows do not influence it
train[target].mode()

0    Adoption
dtype: object

In [15]:
def run_baseline(train,
                 validate,
                 target,
                 positive,
                 model_number,
                 model_info,
                 model_results):
    '''
    This function scores a baseline "model" that always predicts the most common value of the
    target in the train sample, and stores it alongside the other models.

    Parameters:
    - train, validate: the train and validate samples as dataframes (each must contain the target column)
    - target: the label of the target column
    - positive: the label of the positive condition (passed to sklearn as pos_label)
    - model_number: accepted so the signature matches the other modeling functions;
      its incoming value is not used
    - model_info: dataframe that receives one row describing the baseline
      (model_number and model_type are both 'baseline')
    - model_results: dataframe in tidy format that receives eight rows: accuracy, precision,
      recall and f1_score for each of the train and validate samples

    The model_number, model_info, and model_results variables are returned (in that order).
    The returned model_number is always 0. New dataframes are returned; the dataframes
    passed in are not modified.
    '''

    # label used in place of a number for the baseline rows
    baseline_label = 'baseline'

    # the baseline prediction comes from the train sample only, for both samples
    most_common = train[target].mode()[0]

    # store info about the model
    # (DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat)
    info_row = pd.DataFrame([{'model_number': baseline_label,
                              'model_type': 'baseline'}])
    model_info = pd.concat([model_info, info_row], ignore_index=True)

    # metrics that need to know which class is the positive one
    positive_scorers = {'precision': sk.metrics.precision_score,
                        'recall': sk.metrics.recall_score,
                        'f1_score': sk.metrics.f1_score}

    def score(metric_type, y_true, y_pred):
        if metric_type == 'accuracy':
            return sk.metrics.accuracy_score(y_true, y_pred)
        return positive_scorers[metric_type](y_true, y_pred, pos_label=positive)

    # each sample lists its metrics in the order the rows are stored
    samples = (('train', train[target],
                ('accuracy', 'precision', 'recall', 'f1_score')),
               ('validate', validate[target],
                ('f1_score', 'accuracy', 'precision', 'recall')))

    rows = []
    for sample_type, y_true, metric_order in samples:
        # one constant prediction per observation in the sample
        y_pred = pd.Series(most_common, index=y_true.index)
        for metric_type in metric_order:
            rows.append({'model_number': baseline_label,
                         'sample_type': sample_type,
                         'metric_type': metric_type,
                         'score': score(metric_type, y_true, y_pred)})

    # add all eight result rows in a single concat
    model_results = pd.concat([model_results, pd.DataFrame(rows)], ignore_index=True)

    # hand back 0 (not 'baseline') as the model number
    model_number = 0

    return model_number, model_info, model_results

In [16]:
# score the most-frequent-class baseline and record it in the bookkeeping dataframes
baseline_outputs = run_baseline(train=train,
                                validate=validate,
                                target=target,
                                positive=positive,
                                model_number=model_number,
                                model_info=model_info,
                                model_results=model_results)
model_number, model_info, model_results = baseline_outputs

### RFE Decision Tree

In [17]:
# decision tree models with RFE-selected features; the search itself lives in the project's
# model module (not shown here). The call takes and returns the same three bookkeeping
# objects, and the module prints a counter as it runs.
model_number, model_info, model_results = model.rfe_decision_tree(train,
                                                                  validate, 
                                                                  target, 
                                                                  positive, 
                                                                  model_number, 
                                                                  model_info, 
                                                                  model_results)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


### RFE Random Forest

In [None]:
# random forest models with RFE-selected features; implemented in the project's model module
# (not shown here). Takes and returns the same three bookkeeping objects.
# NOTE(review): presumably numbering continues from the incoming model_number -- confirm in model.py
model_number, model_info, model_results = model.rfe_random_forest(train,
                                                                  validate, 
                                                                  target, 
                                                                  positive, 
                                                                  model_number, 
                                                                  model_info, 
                                                                  model_results)

20
21
22
23
24


### RFE Logistic Regression

In [None]:
def _classification_metric_rows(model_number, sample_type, y_true, y_pred, positive, metric_order):
    '''
    Build one tidy result record (model_number, sample_type, metric_type, score) for each
    metric named in metric_order, in that order.
    '''
    scorers = {
        'accuracy': lambda: sk.metrics.accuracy_score(y_true, y_pred),
        'precision': lambda: sk.metrics.precision_score(y_true, y_pred, pos_label=positive),
        'recall': lambda: sk.metrics.recall_score(y_true, y_pred, pos_label=positive),
        'f1_score': lambda: sk.metrics.f1_score(y_true, y_pred, pos_label=positive),
    }
    return [{'model_number': model_number,
             'sample_type': sample_type,
             'metric_type': metric_type,
             'score': scorers[metric_type]()}
            for metric_type in metric_order]


def rfe_log_regression(train,
                      validate, 
                      target, 
                      positive, 
                      model_number, 
                      model_info, 
                      model_results):
    '''
    This function fits a grid of logistic regression models whose features are chosen by
    recursive feature elimination (RFE), and stores information about each one.

    Candidate features are the train columns whose names contain 'enc_' or 'scaled_'.
    For every combination of n_features in 2..5 and C in [.001, .01, .1, 1, 10, 100, 1000]
    (28 models in total) it:
    - uses RFE on the train sample to select n_features features
    - fits a LogisticRegression(C=c_value) on the train sample using only those features
    - records accuracy, precision, recall and f1_score on the train and validate samples

    Parameters:
    - train, validate: the train and validate samples as dataframes
    - target: the label of the target column
    - positive: the label of the positive condition (passed to sklearn as pos_label)
    - model_number: the number of the most recently stored model; each new model
      is numbered one higher than the previous one
    - model_info: dataframe that receives one row per model
      (model_number, model_type, features, c_value)
    - model_results: dataframe in tidy format that receives eight rows per model
      (four metrics for each of the train and validate samples)

    The updated model_number, model_info, and model_results are returned (in that order).
    New dataframes are returned; the dataframes passed in are not modified.
    '''

    # all available features
    all_features = [col for col in train.columns if 'enc_' in col or 'scaled_' in col]

    # x (features) for RFE, and y (target) for each sample
    x_train_rfe = train[all_features]
    y_train = train[target]
    y_validate = validate[target]

    # rows are collected here and added to the dataframes once at the end:
    # DataFrame.append was removed in pandas 2.0, and concatenating inside
    # the loop would re-copy the growing dataframes on every iteration
    info_rows = []
    result_rows = []

    for n_features in range(2, 6):
        for c_value in [.001, .01, .1, 1, 10, 100, 1000]:

            #####################################
            ### Recursive Feature Elimination ###
            #####################################

            # establish a logistic regression classifier
            clf = LogisticRegression(C=c_value)

            # select n_features features using RFE on the train sample
            rfe = RFE(clf, n_features_to_select=n_features)
            rfe.fit(x_train_rfe, y_train)

            # get list of the column names for the selected features
            features = x_train_rfe.columns[rfe.support_].tolist()

            ##################
            ### Model Info ###
            ##################

            # progress indicator; it runs before the increment, so it shows
            # the number of the previously stored model
            print(model_number)

            # create a new model number by adding 1 to the previous model number
            model_number += 1

            # store the features and hyperparameters used in this model instance
            info_rows.append({'model_number': model_number,
                              'model_type': 'logistic regression',
                              'features': features,
                              'c_value': c_value})

            ################
            ### Modeling ###
            ################

            x_train = train[features]
            x_validate = validate[features]

            # fit the classifier to the training data, using only the selected features
            clf = clf.fit(x_train, y_train)

            #####################
            ### Model Results ###
            #####################

            result_rows += _classification_metric_rows(
                model_number, 'train', y_train, clf.predict(x_train), positive,
                ('accuracy', 'precision', 'recall', 'f1_score'))

            result_rows += _classification_metric_rows(
                model_number, 'validate', y_validate, clf.predict(x_validate), positive,
                ('f1_score', 'accuracy', 'precision', 'recall'))

    # add everything collected above to the bookkeeping dataframes
    model_info = pd.concat([model_info, pd.DataFrame(info_rows)], ignore_index=True)
    model_results = pd.concat([model_results, pd.DataFrame(result_rows)], ignore_index=True)

    return model_number, model_info, model_results

In [None]:
# run the logistic regression grid and fold its models into the bookkeeping dataframes
log_regression_outputs = rfe_log_regression(train=train,
                                            validate=validate,
                                            target=target,
                                            positive=positive,
                                            model_number=model_number,
                                            model_info=model_info,
                                            model_results=model_results)
model_number, model_info, model_results = log_regression_outputs

In [None]:
# show the scores of every stored model using the project's display helper
model.display_model_results(model_results)

In [None]:
def get_best_model_results(model_results, metric_type='accuracy', n_models=3):
    '''
    This function takes in the model_results dataframe. This is a dataframe in tidy 
    data format containing the following data for each model created in the project:
    - model number
    - metric type (accuracy, precision, recall, f1 score)
    - sample type (train, validate)
    - score (the score for the given metric and sample types)

    The function identifies the {n_models} models with the highest scores for the given metric
    type, as measured on the validate sample. If fewer than {n_models} models have a validate
    score for that metric, all of them are used.

    It returns a dataframe of information about those models' performance in the tidy data format
    (as described above): every row of model_results (all metrics, both samples) belonging to the
    selected models, in the original row order.

    The resulting dataframe can be fed into the display_model_results function for convenient display formatting.
    '''
    # keep only validate scores for the given metric type
    is_ranked_score = ((model_results.metric_type == metric_type)
                       & (model_results.sample_type == 'validate'))

    # model numbers of the n_models highest scoring models
    best_models = (model_results[is_ranked_score]
                   .sort_values(by='score', ascending=False)
                   .head(n_models)
                   .model_number
                   .values)

    # isin works for any number of selected models, so n_models is honored
    # instead of always indexing exactly three entries
    best_model_results = model_results[model_results.model_number.isin(best_models)]

    return best_model_results

In [None]:
# show only the top models (default: 3 best by validate accuracy)
model.display_model_results(get_best_model_results(model_results))

In [None]:
# look up the stored info for the best models by validate accuracy, derived from
# model_results rather than copied by hand from an earlier run
best_model_numbers = get_best_model_results(model_results).model_number.unique()
model_info[model_info.model_number.isin(best_model_numbers)]

In [None]:
# list the features of the best models by validate accuracy; the model numbers are
# derived from model_results rather than copied by hand from an earlier run
for model_num in get_best_model_results(model_results).model_number.unique():
    print(f'Model #{model_num} Features:')
    print('-' * 20)
    features = model_info.loc[model_info.model_number == model_num, 'features'].iloc[0]
    # the baseline row stores no feature list, so there is nothing to print for it
    for feature in (features if isinstance(features, list) else []):
        print(feature)
    print()
          