In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## Load the data

In [3]:
from From_Shelter_to_Love.data_provisoria import get_data

In [4]:
df_all = get_data()

## Drop the outcome features and the data with more than 60 days in shelter 

In [5]:
df = df_all.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome', 'male_or_female_intake'], axis=1)
df_less_60 = df[df['days_in_shelter'] < 60]


In [14]:
df_less_60.head(5)

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,days_in_shelter,age_upon_intake_months,neutered_or_spayed_intake,group_color
0,Public Assist,Normal,Dog,Mixed,2.0,72.0,1,Yellow
1,Owner Surrender,Normal,Dog,Dachshund,5.0,120.0,1,Tricolor
2,Public Assist,Injured,Dog,Shetland Sheepdog,1.0,192.0,1,Brown
3,Stray,Aged,Dog,Mixed,1.0,180.0,1,Black
4,Stray,Normal,Dog,Mixed,1.0,180.0,0,Black


## Baseline to less than 60 days in Shelter - RMSE

In [7]:
y_pred = df_less_60['days_in_shelter'].mean()
#RMSE
np.sqrt(np.mean((y_pred - df_less_60['days_in_shelter'])**2))

12.985501995602426

### Split the data

In [25]:
from sklearn.model_selection import train_test_split

y = df_less_60["days_in_shelter"]
X = df_less_60.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.3, random_state = 10)

## Preprocessing 

In [49]:
X.dtypes

Intake Type                   object
Intake Condition              object
Animal Type                   object
Breed                         object
age_upon_intake_months       float64
neutered_or_spayed_intake      int64
group_color                   object
dtype: object

In [27]:
# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = OneHotEncoder(handle_unknown='ignore',sparse=False)

cat_bi_transformer = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse = False)

# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include=['int64',"float64"])),
    ('cat_transformer', cat_transformer, make_column_selector(dtype_include=["object"]))])

In [28]:
X_train_transf = preprocessor.fit_transform(X_train)

In [31]:
X_val_transf = preprocessor.transform(X_val)

In [9]:
#cv = cross_validate(pipe_baseline, X, y, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})

## GridSearch

In [30]:
model = RandomForestRegressor(max_samples=2000)

# Hyperparameter Grid
grid = {'max_depth': [10, 20]}

# Instanciate Grid Search
search = GridSearchCV(model, grid, 
                           scoring='neg_mean_squared_error',
                           cv = 5) 
search.fit(X_train_transf,y_train)
print(search.best_params_)
print(search.best_score_)

{'max_depth': 20}
-153.22551754622432


In [15]:
import numpy as np
np.sqrt(153)

12.36931687685298

## Cross validate RandomForestRegressor

In [8]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmse_neg(y_true, y_pred):
    return 0 - np.sqrt(mean_squared_error(y_true, y_pred))
 
rmse = make_scorer(rmse)
rmse_neg = make_scorer(rmse_neg)

In [None]:
model = RandomForestRegressor(max_depth=20)

In [None]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
#cross_val_score(model, X_train_transf, y_train, cv=5, scoring='neg_mean_squared_error').mean()
Cv = cross_validate(model, X_train_transf, y_train, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})
Cv['test_rmse'].mean() 

In [33]:
model.fit(X_train_transf, y_train)
model.score(X_val_transf, y_val)

0.0860115775223389

## Test importance of features 

In [None]:
feature_names = [i for i in data.columns if data[i].dtype in [np.int64]]
X = data[feature_names]
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
my_model = RandomForestClassifier(n_estimators=100,
                                  random_state=0).fit(train_X, train_y)

import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
eli5.show_weights(perm, feature_names = val_X.columns.tolist())


In [None]:
from sklearn.inspection import permutation_importance

r = permutation_importance(model, X_val_transf, y_val,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X.feature_names[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")

## XGBoost for Regression

In [52]:
import xgboost as xgb
from sklearn.model_selection import RepeatedKFold

#model_xgbr = xgb.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)

# define model
model_2 = xgb.XGBRegressor()
# define model evaluation method
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
cross_val_score(model_2, X_train_transf, y_train, scoring='neg_mean_squared_error', cv=5, n_jobs=-1).mean()
#Cv2 = cross_validate(model, X_train_transf, y_train, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})
#Cv2['test_rmse'].mean() 


-152.94647472509286

## GradientBoostingRegressor

In [53]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
X, y = make_regression(random_state=0)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)

cross_val_score(reg, X_train_transf, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1).mean()

#reg.fit(X_train, y_train)
#reg.predict(X_test[1:2])
#reg.score(X_test, y_test)


-8.749832032708584