In [78]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [1]:
from sklearn import set_config ;set_config(display='diagram')
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor

Load the data 

In [7]:
from From_Shelter_to_Love.data_provisoria import get_data

In [8]:
df_all = get_data()

Drop features with outcome

In [47]:
df = df_all.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome'], axis=1)


In [48]:
df

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,days_in_shelter,age_upon_intake_months,neutered_or_spayed_intake,male_or_female_intake,group_color
0,Public Assist,Normal,Dog,Mixed,2.0,72.0,1,1.0,Yellow
1,Owner Surrender,Normal,Dog,Dachshund,5.0,120.0,1,1.0,Tricolor
2,Public Assist,Injured,Dog,Shetland Sheepdog,1.0,192.0,1,1.0,Brown
3,Stray,Aged,Dog,Mixed,1.0,180.0,1,0.0,Black
4,Stray,Normal,Dog,Mixed,1.0,180.0,0,0.0,Black
...,...,...,...,...,...,...,...,...,...
110852,Stray,Normal,Dog,Mixed,2.0,24.0,0,1.0,Black
110853,Stray,Normal,Dog,Mixed,2.0,24.0,1,0.0,White
110861,Stray,Normal,Dog,Mixed,1.0,24.0,1,1.0,Brown
110874,Owner Surrender,Normal,Dog,Alaskan Husky,2.0,12.0,0,1.0,Black


In [49]:
df.shape

(102725, 9)

In [50]:
df.dtypes

Intake Type                   object
Intake Condition              object
Animal Type                   object
Breed                         object
days_in_shelter              float64
age_upon_intake_months       float64
neutered_or_spayed_intake      int64
male_or_female_intake        float64
group_color                   object
dtype: object

Split the data 

In [51]:
from sklearn.model_selection import train_test_split

y = df["days_in_shelter"]
X = df.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

In [52]:
y_train.shape

(71907,)

In [53]:
X_train

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,age_upon_intake_months,neutered_or_spayed_intake,male_or_female_intake,group_color
79164,Stray,Normal,Cat,Mixed,1.0,0,1.0,Black Tabby
46730,Stray,Injured,Cat,Mixed,2.0,0,1.0,Black
66253,Stray,Normal,Dog,Mixed,24.0,1,1.0,White
92536,Stray,Normal,Cat,Mixed,1.0,0,1.0,Orange Tabby
6085,Owner Surrender,Normal,Dog,Mixed,1.0,0,1.0,Black
...,...,...,...,...,...,...,...,...
9934,Stray,Normal,Dog,Mixed,12.0,0,1.0,White
100433,Stray,Normal,Cat,Mixed,12.0,0,0.0,Tortie
54076,Stray,Injured,Cat,Mixed,1.0,0,0.0,Black
101706,Stray,Normal,Dog,Mixed,2.0,0,0.0,Black


## Preprocessing 

In [75]:

# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = OneHotEncoder(handle_unknown='ignore',sparse=False)

# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include=['int64',"float64"])),
    ('cat_transformer', cat_transformer, make_column_selector(dtype_include=["object"]))])

final_pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', Ridge())])

final_pipe_RF = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', RandomForestRegressor(n_estimators=100, max_depth=2, random_state=0, n_jobs=-1))])

final_pipe_SGD = Pipeline([
    ('preprocessing', preprocessor),
    ('linear_regression', SGDRegressor())])

In [66]:
shape = preprocessor.fit_transform(X_train).shape
shape

(71907, 268)

## Scores - Ridge, SGD, RF

In [57]:
final_pipe_trained = final_pipe.fit(X_train,y_train)
final_pipe_trained.score(X_test,y_test)

0.018339886790568527

In [58]:
final_pipe_SGD_trained = final_pipe_SGD.fit(X_train,y_train)
final_pipe_SGD_trained.score(X_test,y_test)

0.017548784163934794

In [76]:
final_pipe_RF_trained = final_pipe_RF.fit(X_train,y_train)
final_pipe_RF_trained.score(X_test,y_test)

0.016589911601590956

## Cross validate - Ridge, SGD, RF - mean_squared_error

In [59]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
cross_val_score(final_pipe, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-1752.57917951213

In [60]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
cross_val_score(final_pipe_SGD, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-1752.6916582493582

In [77]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
cross_val_score(final_pipe_RF, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-1754.3268664611085

RMSE

In [72]:
import numpy as np
np.sqrt(1752)

41.8568990729127

In [63]:
cv_results = cross_validate(final_pipe, X_train, y_train, cv=5)
cv_results

{'fit_time': array([0.34194636, 0.39222789, 0.45366383, 0.46326661, 0.51181865]),
 'score_time': array([0.15389299, 0.15667534, 0.15009308, 0.13900185, 0.17520595]),
 'test_score': array([0.01593589, 0.012062  , 0.01566102, 0.0187366 , 0.01528112])}

RMSE da Baseline 

In [80]:
y_pred = df['days_in_shelter'].mean()
#RMSE
np.sqrt(np.mean((y_pred - df['days_in_shelter'])**2))

42.65241520991277