In [2]:
%load_ext autoreload
%autoreload 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn import set_config ;set_config(display='diagram')
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## Load the data

In [4]:
from From_Shelter_to_Love.data_provisoria import get_data

In [5]:
df_all = get_data()

In [6]:
df = df_all.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome'], axis=1)

Split the data

In [7]:
from sklearn.model_selection import train_test_split

y = df["days_in_shelter"]
X = df.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

In [8]:
X_train.head(4)

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Breed,age_upon_intake_months,neutered_or_spayed_intake,male_or_female_intake,group_color
79164,Stray,Normal,Cat,Mixed,1.0,0,1.0,Black Tabby
46730,Stray,Injured,Cat,Mixed,2.0,0,1.0,Black
66253,Stray,Normal,Dog,Mixed,24.0,1,1.0,White
92536,Stray,Normal,Cat,Mixed,1.0,0,1.0,Orange Tabby


## Preprocessing

In [10]:
# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = OneHotEncoder(handle_unknown='ignore',sparse=False)

# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include=['int64',"float64"])),
    ('cat_transformer', cat_transformer, make_column_selector(dtype_include=["object"]))])

In [11]:
X_train_transf = preprocessor.fit_transform(X_train)

In [12]:
pd.DataFrame(X_train_transf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,258,259,260,261,262,263,264,265,266,267
0,0.003472,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.006944,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.083333,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.003472,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.003472,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71902,0.041667,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
71903,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
71904,0.003472,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71905,0.006944,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## GridSearch

In [13]:
models = {
    'Linear': {
        'model': LinearRegression(),
        'parameters': {
            'n_jobs': [None, -1]
           }},

    'Ridge': {
        'model': Ridge(max_iter = 100),
        'parameters': {
            'alpha': [0.01, 0.1, 1],
            'solver': ['auto', 'lsqr','sag']
        }}
}
#'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False)
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [14]:
scores_df

Unnamed: 0,model,best_parameters,score
0,Linear,{'n_jobs': None},-1.641834e+17
1,Ridge,"{'alpha': 0.1, 'solver': 'lsqr'}",0.01572139


In [17]:
models = {
    'Lasso': {
        'model': Lasso(max_iter = 500),
        'parameters': {
            'alpha': [0.01, 0.1, 1, 10]
        }},

    'RF': {
        'model': RandomForestRegressor(max_samples=2000),
        'parameters': {
            'n_jobs': [None, -1],
            'max_depth': [2, 5, 10], 
            'max_features':['auto', 'sqrt', 'log2']
        }}
}

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False)
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df_2 = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [22]:
scores_df_2

Unnamed: 0,model,best_parameters,score
0,Lasso,{'alpha': 0.01},0.015498
1,RF,"{'max_depth': 10, 'max_features': 'auto', 'n_j...",0.040278


In [24]:
scores_df_2['best_parameters'][1]

{'max_depth': 10, 'max_features': 'auto', 'n_jobs': -1}

In [33]:
models = {
    'Elastic': {
        'model': ElasticNet(max_iter=100),
        'parameters': {
            'alpha': [0.01, 0.1],
            'l1_ratio' : [0.5, 0.25] 
           }},

    'SGD': {
        'model': SGDRegressor(max_iter=100),
        'parameters': {
            'loss': ['squared_loss', 'huber'],
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01],
            'l1_ratio' : [0.15, 0.25]
        }}

}

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False)
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df_3 = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [34]:
scores_df_3 

Unnamed: 0,model,best_parameters,score
0,Elastic,"{'alpha': 0.01, 'l1_ratio': 0.5}",0.014447
1,SGD,"{'alpha': 0.0001, 'l1_ratio': 0.15, 'loss': 's...",0.01587


## Cross validation - RandomForestRegressor

In [26]:
model = RandomForestRegressor(max_samples=2000, max_depth=10,n_jobs = -1)

In [27]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
cross_val_score(model, X_train_transf, y_train, cv=5, scoring='neg_mean_squared_error').mean()

-1707.5100993519548

In [28]:
import numpy as np
np.sqrt(1707)

41.31585652022719