In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import pandas as pd
import numpy as np

In [10]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler , FunctionTransformer,OneHotEncoder
from sklearn.compose import make_column_selector,ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

## Load the data

In [5]:
from From_Shelter_to_Love.data_provisoria import get_data

In [6]:
df_all = get_data()

Drop outcome features and more than 60 days in shelter

In [7]:
df = df_all.drop(columns =['Animal ID','age_upon_intake_years', 'age_upon_outcome_years', 'Outcome Type', 'age_upon_outcome_months', 'neutered_or_spayed_outcome', 'male_or_female_outcome'], axis=1)
df_less_60 = df[df['days_in_shelter'] < 60]


## Baseline to less than 60 days in Shelter - RMSE

In [38]:
y_pred = df_less_60['days_in_shelter'].mean()
#RMSE
np.sqrt(np.mean((y_pred - df_less_60['days_in_shelter'])**2))

12.985501995602426

Split the data 

In [8]:
from sklearn.model_selection import train_test_split

y = df_less_60["days_in_shelter"]
X = df_less_60.drop(columns = ["days_in_shelter"] , axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 10)

## Preprocessing

In [11]:
# Impute then Scale for numerical variables
num_transformer = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler())])

# Encode categorical varibles 
cat_transformer = OneHotEncoder(handle_unknown='ignore',sparse=False)

# Apply transformations to desired features
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, make_column_selector(dtype_include=['int64',"float64"])),
    ('cat_transformer', cat_transformer, make_column_selector(dtype_include=["object"]))])

In [12]:
X_train_transf = preprocessor.fit_transform(X_train)

## GridSearch

In [45]:
models = {
    'Lasso': {
        'model': Lasso(max_iter = 500),
        'parameters': {
            'alpha': [0.01, 0.1, 1, 10]
           }},

    'Ridge': {
        'model': Ridge(max_iter = 100),
        'parameters': {
            'alpha': [0.01, 0.1, 1],
            'solver': ['auto', 'lsqr','sag']
        }}
}
#'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False, scoring = 'neg_mean_squared_error')
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df_2 = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [47]:
scores_df_2

Unnamed: 0,model,best_parameters,score
0,Lasso,{'alpha': 0.01},-162.059565
1,Ridge,"{'alpha': 1, 'solver': 'lsqr'}",-161.783598


In [54]:
models = {
    'Ridge': {
        'model': Ridge(max_iter = 100),
        'parameters': {
            'alpha': [0.01, 0.1, 1],
            'solver': ['auto', 'lsqr','sag']
        }},

    'RF': {
        'model': RandomForestRegressor(max_samples=2000),
        'parameters': {
            'n_jobs': [None, -1],
            'max_depth': [10, 20, 30], 
            'max_features':['auto', 'sqrt', 'log2']
        }}
}

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False, scoring = 'neg_mean_squared_error')
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [55]:
scores_df['best_parameters'][1]

{'max_depth': 20, 'max_features': 'auto', 'n_jobs': None}

In [48]:
models = {
    'Elastic': {
        'model': ElasticNet(max_iter=100),
        'parameters': {
            'alpha': [0.01, 0.1],
            'l1_ratio' : [0.5, 0.25] 
           }},

    'SGD': {
        'model': SGDRegressor(max_iter=100),
        'parameters': {
            'loss': ['squared_loss', 'huber'],
            'penalty': ['l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01],
            'l1_ratio' : [0.15, 0.25]
        }}

}

scores = [] 
#cv_shuffle = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)

for model_name, model_params in models.items():
    gs = GridSearchCV(model_params['model'], model_params['parameters'], cv = 5, return_train_score=False, scoring = 'neg_mean_squared_error')
    gs.fit(X_train_transf, y_train)
    scores.append({
        'model': model_name,
        'best_parameters': gs.best_params_,
        'score': gs.best_score_
    })

scores_df_3 = pd.DataFrame(scores, columns=['model', 'best_parameters', 'score'])

In [49]:
scores_df_3 

Unnamed: 0,model,best_parameters,score
0,Elastic,"{'alpha': 0.01, 'l1_ratio': 0.5}",-162.125886
1,SGD,"{'alpha': 0.0001, 'l1_ratio': 0.25, 'loss': 's...",-161.766443


## Cross validate RandomForestRegressor

In [17]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def rmse_neg(y_true, y_pred):
    return 0 - np.sqrt(mean_squared_error(y_true, y_pred))
 
rmse = make_scorer(rmse)
rmse_neg = make_scorer(rmse_neg)

In [18]:
model = RandomForestRegressor(max_samples=2000, max_depth=200,n_jobs = -1)

In [19]:
from sklearn.model_selection import cross_validate, cross_val_score

# Cross validate pipeline
cross_validate(model, X_train_transf, y_train, cv=5, scoring={'rmse_neg': rmse_neg, 'rmse': rmse})

{'fit_time': array([2.08453321, 1.94508195, 1.91597891, 1.63928819, 1.88549733]),
 'score_time': array([0.25677276, 0.18576145, 0.24244666, 0.26426482, 0.25204611]),
 'test_rmse_neg': array([-12.11977893, -12.19492098, -12.32385564, -12.19551041,
        -12.06742723]),
 'test_rmse': array([12.11977893, 12.19492098, 12.32385564, 12.19551041, 12.06742723])}

In [53]:
import numpy as np
np.sqrt(148)

12.165525060596439