In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## Load the data

In [4]:
from From_Shelter_to_Love.data_provisoria_v2 import get_data

In [5]:
df_all_2 = get_data()

## Drop the outcome features and the data with more than 60 days in shelter 

In [60]:
df_2 = df_all_2.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome', 'male_or_female_intake'], axis=1)
df_less_60_2 = df_2[df_2['days_in_shelter'] > 7]

In [61]:
df_22 = df_2[df_2['Animal Type']=='Dog']

In [62]:
df_less_60_2.head(5)

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,Date of Birth,days_in_shelter,age_upon_intake_number_months,age_upon_intake_months,neutered_or_spayed_intake,color
10,Stray,Normal,Cat,Mixed,2000-01-21,15.0,178.583407,168,1,Bicolor
11,Owner Surrender,Normal,Dog,Mixed,1997-10-03,50.0,192.025549,192,1,Bicolor
13,Public Assist,Not-Normal,Dog,Pure,1999-06-01,10.0,207.955103,204,1,Bicolor
14,Stray,Normal,Dog,Mixed,2000-03-05,11.0,164.725217,156,0,Bicolor
19,Owner Surrender,Normal,Cat,Mixed,2000-05-01,10.0,216.26602,216,1,Tricolor


## Baseline to less than 60 days in Shelter - RMSE

In [63]:
y_pred_2 = df_less_60_2['days_in_shelter'].mean()
#RMSE
np.sqrt(np.mean((y_pred_2 - df_less_60_2['days_in_shelter'])**2))

58.28091490784098

### Split the data

In [64]:
from sklearn.model_selection import train_test_split

y = df_less_60_2["days_in_shelter"]
X = df_less_60_2.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

## Preprocessing 

In [65]:
X_train.dtypes

Intake Type                              object
Intake Condition                         object
Animal Type                              object
Breed                                    object
Date of Birth                    datetime64[ns]
age_upon_intake_number_months           float64
age_upon_intake_months                    int64
neutered_or_spayed_intake                 int64
color                                    object
dtype: object

In [66]:
# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore',sparse=False))])

cat_bi_transformer = OneHotEncoder(drop='if_binary', sparse = False)


# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['age_upon_intake_months', 'neutered_or_spayed_intake']),
    ('cat_bi_transformer', cat_bi_transformer, ['Animal Type', 'Breed']),
    ('cat_transformer', cat_transformer, ['color', 'Intake Condition', 'Intake Type'])])


In [67]:
X_train_transf = preprocessor.fit_transform(X_train)

## X_train_transf to a dataframe with the columns names 

In [68]:
cat_names = preprocessor.transformers_[2][1].steps[0][1].get_feature_names(['color', 'Intake Condition', 'Intake Type'])

In [69]:
cat_names

array(['color_Bicolor', 'color_Dark', 'color_Light', 'color_Tricolor',
       'Intake Condition_Normal', 'Intake Condition_Not-Normal',
       'Intake Type_Abandoned', 'Intake Type_Euthanasia Request',
       'Intake Type_Owner Surrender', 'Intake Type_Public Assist',
       'Intake Type_Stray'], dtype=object)

In [70]:
columns = ['age_upon_intake_months', 'neutered_or_spayed_intake', 
           'Animal Type', 'Breed', 'color_Bicolor', 'color_Dark', 'color_Light', 'color_Tricolor',
       'Intake Condition_Normal', 'Intake Condition_Not-Normal',
       'Intake Type_Abandoned', 'Intake Type_Euthanasia Request',
       'Intake Type_Owner Surrender', 'Intake Type_Public Assist',
       'Intake Type_Stray']
df_X_train_transf = pd.DataFrame(X_train_transf, columns = columns)

In [71]:
df_X_train_transf

Unnamed: 0,age_upon_intake_months,neutered_or_spayed_intake,Animal Type,Breed,color_Bicolor,color_Dark,color_Light,color_Tricolor,Intake Condition_Normal,Intake Condition_Not-Normal,Intake Type_Abandoned,Intake Type_Euthanasia Request,Intake Type_Owner Surrender,Intake Type_Public Assist,Intake Type_Stray
0,0.263158,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.008772,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.004386,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.052632,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.052632,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31094,0.526316,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
31095,0.157895,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
31096,0.004386,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
31097,0.052632,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [72]:
X_train.dtypes

Intake Type                              object
Intake Condition                         object
Animal Type                              object
Breed                                    object
Date of Birth                    datetime64[ns]
age_upon_intake_number_months           float64
age_upon_intake_months                    int64
neutered_or_spayed_intake                 int64
color                                    object
dtype: object

## GridSearch

In [20]:
model = RandomForestRegressor(max_samples=2000)

# Hyperparameter Grid
grid = {'max_depth': [10, 20]}

# Instanciate Grid Search
search = GridSearchCV(model, grid, 
                           scoring='neg_mean_squared_error',
                           cv = 5) 
search.fit(X_train_transf,y_train)
print(search.best_params_)
print(search.best_score_)

{'max_depth': 10}
-2.7889375329814436


In [None]:
import numpy as np
np.sqrt(-2.7889)

## Cross validate RandomForestRegressor

In [77]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmse_neg(y_true, y_pred):
    return 0 - np.sqrt(mean_squared_error(y_true, y_pred))
 
rmse_s = make_scorer(rmse)
rmse_neg_s = make_scorer(rmse_neg)

In [75]:
model = RandomForestRegressor(max_depth=10)

In [78]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
#cross_val_score(model, X_train_transf, y_train, cv=5, scoring='neg_mean_squared_error').mean()
Cv = cross_validate(model, X_train_transf, y_train, cv=5, scoring={'rmse_neg': rmse_neg_s, 'rmse': rmse_s})
Cv['test_rmse'].mean() 

56.66239037016497

In [79]:
X_test_transf = preprocessor.fit_transform(X_test)
model.fit(X_train_transf, y_train)
y_pred = model.predict(X_test_transf)
y_true = y_test
rmse(y_true, y_pred)

61.39345577798158

## Test importance of features 

In [80]:
feature_names = [i for i in df_X_train_transf.columns]

train_X, val_X, train_y, val_y = train_test_split(df_X_train_transf, y_train, random_state=1)

my_model = RandomForestRegressor(max_depth=20).fit(df_X_train_transf, y_train)

from sklearn.inspection import permutation_importance

perm = permutation_importance(my_model,val_X, val_y,n_repeats=30, random_state=1)
 

In [81]:
importance_df = pd.DataFrame(np.vstack((feature_names,
                                        perm.importances_mean)).T) # Unstack results
importance_df.columns=['feature','score decrease']

importance_df.sort_values(by="score decrease", ascending = False) # Order by importance

Unnamed: 0,feature,score decrease
11,Intake Type_Euthanasia Request,1.856440732236342e-05
0,age_upon_intake_months,0.2879551867922462
1,neutered_or_spayed_intake,0.1628898075226449
2,Animal Type,0.0818247688664613
13,Intake Type_Public Assist,0.0420398199963737
5,color_Dark,0.0294789097623085
12,Intake Type_Owner Surrender,0.029241793223839
8,Intake Condition_Normal,0.026307326531574
9,Intake Condition_Not-Normal,0.0245995654534859
3,Breed,0.0214639856449442


# Regression model is not better than a simple mean!!!!