In [153]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [154]:
import pandas as pd
import numpy as np

In [155]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## Load the data

In [156]:
from From_Shelter_to_Love.data_provisoria_v2 import get_data

In [157]:
df_all_2 = get_data()

## Drop the outcome features and the data with more than 60 days in shelter 

In [184]:
df_2 = df_all_2.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome', 'male_or_female_intake'], axis=1)
df_less_60_2 = df_2[df_2['days_in_shelter'] < 60]

In [185]:
df_less_60_2.head(5)

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,days_in_shelter,age_upon_intake_months,neutered_or_spayed_intake,color
0,Public Assist,Normal,Dog,Mixed,2.0,72.0,1,Bicolor
1,Owner Surrender,Normal,Dog,Pure,5.0,120.0,1,Tricolor
2,Public Assist,Injured,Dog,Pure,1.0,192.0,1,Bicolor
3,Stray,Aged,Dog,Mixed,1.0,180.0,1,Bicolor
4,Stray,Normal,Dog,Mixed,1.0,180.0,0,Bicolor


## Baseline to less than 60 days in Shelter - RMSE

In [186]:
y_pred_2 = df_less_60_2['days_in_shelter'].mean()
#RMSE
np.sqrt(np.mean((y_pred_2 - df_less_60_2['days_in_shelter'])**2))

12.985501995602426

### Split the data

In [187]:
from sklearn.model_selection import train_test_split

y = df_less_60_2["days_in_shelter"]
X = df_less_60_2.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

## Preprocessing 

In [188]:
X_train.dtypes

Intake Type                   object
Intake Condition              object
Animal Type                   object
Breed                         object
age_upon_intake_months       float64
neutered_or_spayed_intake      int64
color                         object
dtype: object

In [189]:
# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore',sparse=False))])

cat_bi_transformer = OneHotEncoder(drop='if_binary', sparse = False)


# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age_upon_intake_months', 'neutered_or_spayed_intake']),
    ('cat_bi_transformer', cat_bi_transformer, ['Animal Type', 'Breed']),
    ('cat_transformer', cat_transformer, ['color', 'Intake Condition', 'Intake Type'])])


In [190]:
X_train_transf = preprocessor.fit_transform(X_train)

## X_train_transf to a dataframe with the columns names 

In [191]:
cat_names = preprocessor.transformers_[2][1].steps[0][1].get_feature_names(['color', 'Intake Condition', 'Intake Type'])

In [192]:
cat_names

array(['color_Bicolor', 'color_Dark', 'color_Light', 'color_Tricolor',
       'Intake Condition_Aged', 'Intake Condition_Behavior',
       'Intake Condition_Feral', 'Intake Condition_Injured',
       'Intake Condition_Medical', 'Intake Condition_Normal',
       'Intake Condition_Nursing', 'Intake Condition_Other',
       'Intake Condition_Pregnant', 'Intake Condition_Sick',
       'Intake Type_Abandoned', 'Intake Type_Euthanasia Request',
       'Intake Type_Owner Surrender', 'Intake Type_Public Assist',
       'Intake Type_Stray'], dtype=object)

In [193]:
columns = ['age_upon_intake_months', 'neutered_or_spayed_intake', 
           'Animal Type', 'Breed', 'color_Bicolor', 'color_Dark', 'color_Light', 'color_Tricolor',
           'Intake Condition_Aged', 'Intake Condition_Behavior',
           'Intake Condition_Feral', 'Intake Condition_Injured',
           'Intake Condition_Medical', 'Intake Condition_Normal',
           'Intake Condition_Nursing', 'Intake Condition_Other',
           'Intake Condition_Pregnant', 'Intake Condition_Sick',
           'Intake Type_Abandoned', 'Intake Type_Euthanasia Request',
           'Intake Type_Owner Surrender', 'Intake Type_Public Assist',
           'Intake Type_Stray']
df_X_train_transf = pd.DataFrame(X_train_transf, columns = columns)

In [194]:
df_X_train_transf

Unnamed: 0,age_upon_intake_months,neutered_or_spayed_intake,Animal Type,Breed,color_Bicolor,color_Dark,color_Light,color_Tricolor,Intake Condition_Aged,Intake Condition_Behavior,...,Intake Condition_Normal,Intake Condition_Nursing,Intake Condition_Other,Intake Condition_Pregnant,Intake Condition_Sick,Intake Type_Abandoned,Intake Type_Euthanasia Request,Intake Type_Owner Surrender,Intake Type_Public Assist,Intake Type_Stray
0,0.017361,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.166667,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.006944,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.083333,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.003472,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65919,0.041667,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
65920,0.125000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
65921,0.041667,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
65922,0.006944,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [195]:
X_train.dtypes

Intake Type                   object
Intake Condition              object
Animal Type                   object
Breed                         object
age_upon_intake_months       float64
neutered_or_spayed_intake      int64
color                         object
dtype: object

## GridSearch

In [30]:
model = RandomForestRegressor(max_samples=2000)

# Hyperparameter Grid
grid = {'max_depth': [10, 20]}

# Instanciate Grid Search
search = GridSearchCV(model, grid, 
                           scoring='neg_mean_squared_error',
                           cv = 5) 
search.fit(X_train_transf,y_train)
print(search.best_params_)
print(search.best_score_)

{'max_depth': 20}
-153.22551754622432


In [15]:
import numpy as np
np.sqrt(153)

12.36931687685298

## Cross validate RandomForestRegressor

In [179]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmse_neg(y_true, y_pred):
    return 0 - np.sqrt(mean_squared_error(y_true, y_pred))
 
rmse = make_scorer(rmse)
rmse_neg = make_scorer(rmse_neg)

In [180]:
model = RandomForestRegressor(max_depth=20)

In [181]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
#cross_val_score(model, X_train_transf, y_train, cv=5, scoring='neg_mean_squared_error').mean()
Cv = cross_validate(model, X_train_transf, y_train, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})
Cv['test_rmse'].mean() 

12.537319153199551

## Test importance of features 

In [196]:
feature_names = [i for i in df_X_train_transf.columns]

train_X, val_X, train_y, val_y = train_test_split(df_X_train_transf, y_train, random_state=1)

my_model = RandomForestRegressor(max_depth=20).fit(df_X_train_transf, y_train)

from sklearn.inspection import permutation_importance

perm = permutation_importance(my_model,val_X, val_y,n_repeats=30, random_state=1)
 

In [197]:
importance_df = pd.DataFrame(np.vstack((feature_names,
                                        perm.importances_mean)).T) # Unstack results
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False) # Order by importance

Unnamed: 0,feature,score decrease
10,Intake Condition_Feral,4.199552883369373e-05
0,age_upon_intake_months,0.2282584647563642
1,neutered_or_spayed_intake,0.1361776004859971
2,Animal Type,0.1022372318915314
20,Intake Type_Owner Surrender,0.0509130072777502
13,Intake Condition_Normal,0.0348372499984128
3,Breed,0.0243500098751636
22,Intake Type_Stray,0.0138106880968986
4,color_Bicolor,0.0121490130395526
11,Intake Condition_Injured,0.0117985010494662


## Take some features and run again the model

In [199]:
df_X_train_transf_drop = df_X_train_transf.drop(columns =['Intake Condition_Behavior', 'Intake Condition_Pregnant', 
                                                          'Intake Condition_Aged', 'Intake Type_Euthanasia Request',
                                                          'Intake Condition_Medical', 'Intake Condition_Other', 
                                                          'Intake Type_Abandoned', 'Intake Condition_Sick', 
                                                          'color_Tricolor', 'color_Dark', 'Intake Type_Public Assist',
                                                          'color_Light'], axis=1)

In [201]:
Cv = cross_validate(model, df_X_train_transf_drop, y_train, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})
Cv['test_rmse'].mean() 

12.506679327411046

# Regression model is not better than a simple mean!!!!