In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
import datetime
import time


In [10]:
#Build first set of models using Cross Validation and Grid Search

In [88]:
#decorator for calculating processing time
def timer(f):
    """Decorator that reports how long ``f`` takes to run.

    The wrapped function returns a 2-tuple ``(result, elapsed_seconds)`` where
    ``result`` is whatever ``f`` returned and ``elapsed_seconds`` is the
    wall-clock duration as a float. A line of the form
    ``<name>- processing time: MMm SSs mmmms`` is printed after each call.
    """
    import functools

    @functools.wraps(f)  # keep the wrapped function's __name__/__doc__
    def wrapper(*args,**kwargs):
        start=time.time()
        x=f(*args,**kwargs)
        elapsed=time.time()-start
        # Format the duration arithmetically. Converting a duration through
        # datetime.fromtimestamp() is timezone-dependent, '%s' is not a portable
        # strftime directive, and '%f' is microseconds rather than milliseconds.
        whole_seconds=int(elapsed)
        minutes,seconds=divmod(whole_seconds,60)
        millis=int((elapsed-whole_seconds)*1000)
        t=f'{minutes:02d}m {seconds:02d}s {millis:03d}ms'
        print(f'{f.__name__}- processing time: {t}')
        return x,elapsed
    return wrapper
        
        

In [64]:
#Perform PCA transformation
def pca_transform(df):
    """Project ``df`` onto a reduced set of principal components.

    A full PCA is fitted first to obtain the cumulative explained-variance
    ratio. The number of retained components is the count of cumulative
    ratios that are strictly below 1. A second PCA with that component count
    is then fitted and its projection of ``df`` is returned.
    """
    from sklearn.decomposition import PCA

    # Pass 1: fit with every component to get the complete variance spectrum.
    full_pca = PCA()
    full_pca.fit(df)
    cumulative_ratio = np.cumsum(full_pca.explained_variance_ratio_)

    # Keep one component for every cumulative ratio still below 1.
    n_keep = int(np.count_nonzero(cumulative_ratio < 1))

    # Pass 2: refit with the reduced component count and project the data.
    reduced_pca = PCA(n_components=n_keep)
    return reduced_pca.fit_transform(df)

In [65]:

#Retrieve saved X and y data and split the dataset into training and testing sets
def data_split(d):
    """Split a frame into train/test features and the ``los_days`` target.

    Works on a copy of ``d``. The ``los_days`` column is the target and every
    other column is a feature. 20% of the rows are held out for testing, with
    ``random_state=0`` so the split is repeatable.

    Returns:
        ``(X_train, X_test, y_train, y_test)``
    """
    frame = d.copy()
    target = frame['los_days']
    features = frame.drop(['los_days'], axis=1)

    # train_test_split yields a list; callers receive a 4-tuple.
    parts = train_test_split(features, target, test_size=0.2, random_state=0)
    return tuple(parts)


In [78]:
# transform features into polynomial feature map and then perform linear regression
def build_pr(X_train,y_train,score,degree_range):
    """Polynomial regression: pick the best polynomial degree for a Ridge model.

    For every degree in ``degree_range`` the training features are expanded
    with ``PolynomialFeatures`` and handed to ``build_lr``, which grid-searches
    a Ridge model and returns ``((model, score), seconds)``.

    ``build_lr`` returns the *negated* cross-validation score. Scikit-learn
    scorers are greater-is-better, so the negated value is lower-is-better for
    every scorer, including ``'r2'``. The degree with the smallest negated
    score is therefore selected.

    Args:
        X_train: training features.
        y_train: training target.
        score: scikit-learn scoring name forwarded to ``build_lr``.
        degree_range: iterable of polynomial degrees to try.

    Returns:
        ``((best_model, best_score), total_seconds)`` where ``best_score`` is
        the negated CV score of the winning degree and ``total_seconds`` is
        the summed processing time reported by ``build_lr``.

    Raises:
        ValueError: if ``degree_range`` yields no degrees.
    """
    from sklearn.preprocessing import PolynomialFeatures

    # None sentinels instead of magic numbers (-2 / 3000): the first candidate
    # is always accepted, so best_model can never be left unbound.
    best_model=None
    best_score=None
    t_all=0
    for i in degree_range:
        polynomial_features = PolynomialFeatures(degree=i)
        X_train_pr=polynomial_features.fit_transform(X_train)
        (m,d_score),t=build_lr(X_train_pr,y_train,score,pr=True)
        t_all+=t
        if best_score is None or d_score<best_score:
            best_score=d_score
            best_model=m
    if best_model is None:
        raise ValueError('degree_range must contain at least one polynomial degree')
    # NOTE(review): the fitted PolynomialFeatures transformer is not returned,
    # so callers must re-create the same expansion before calling
    # best_model.predict on raw features.
    return (best_model,best_score),t_all

In [67]:
@timer
def build_lr(X_train,y_train,score,pr=False):
    """Linear regression: grid-search a Ridge model over ``alpha``.

    Runs a 5-fold ``GridSearchCV`` on ``Ridge`` using ``score`` as the scoring
    name.

    Args:
        X_train: training features.
        y_train: training target.
        score: scikit-learn scoring name passed to ``GridSearchCV``.
        pr: set to True when called from the polynomial-regression loop to
            suppress printing of the per-degree result.

    Returns:
        ``(best_estimator, negated_best_cv_score)``. The score is multiplied
        by -1, so ``neg_*`` error scorers come back as positive errors. The
        ``@timer`` decorator wraps this in ``(result, elapsed_seconds)``.
    """
    from sklearn.linear_model import Ridge

    # The original also fitted a plain LinearRegression whose result was never
    # used; it only added to the measured processing time and was removed.
    params_lr={'alpha':[0.2,0.5,1.0,2,4,10,20,25]}
    cv_lr=GridSearchCV(Ridge(),params_lr,cv=5,scoring=score)
    cv_lr.fit(X_train,y_train)
    best_lr=cv_lr.best_estimator_
    best_score_lr=-1*cv_lr.best_score_
    if not pr:
        print(best_lr)
        print(best_score_lr)
    return best_lr,best_score_lr

In [68]:
@timer
def build_rf(X_train,y_train,score):
    """Random forest regression: grid-search a ``RandomForestRegressor``.

    Runs a 5-fold ``GridSearchCV`` using ``score`` as the scoring name.

    Returns:
        ``(best_estimator, negated_best_cv_score)``; the ``@timer`` decorator
        wraps this in ``(result, elapsed_seconds)``.
    """
    params_rf= {'bootstrap':[True],
                  'n_estimators': [50,100,200],
                  # None means "use all features", which is what 'auto' meant
                  # for regressors. 'auto' is rejected by current scikit-learn,
                  # whereas None is accepted by every version.
                  'max_features': ['log2', 'sqrt', None],
                    'max_depth': [3,5,7],
                  'min_samples_split': [2, 3],
                    'min_samples_leaf': [8,10]
                 }
    cv_rf=GridSearchCV(RandomForestRegressor(),params_rf,cv=5,scoring=score)
    cv_rf.fit(X_train,y_train)
    best_rf=cv_rf.best_estimator_
    # Negate so that neg_* error scorers are reported as positive errors.
    best_score_rf=-1*cv_rf.best_score_
    print(best_rf)
    print(best_score_rf)
    return best_rf,best_score_rf

In [69]:
#Gradient Boost Regression
@timer
def build_gbr(X_train,y_train,score):
    """Grid-search a ``GradientBoostingRegressor`` with 5-fold cross-validation.

    Prints the winning estimator and its negated CV score.

    Returns:
        ``(best_estimator, negated_best_cv_score)``; the ``@timer`` decorator
        wraps this in ``(result, elapsed_seconds)``.
    """
    from sklearn.ensemble import GradientBoostingRegressor

    search_space = dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[5, 7, 10],
        min_samples_split=[2, 3],
        min_samples_leaf=[3, 5],
    )
    search = GridSearchCV(
        GradientBoostingRegressor(),
        search_space,
        cv=5,
        scoring=score,
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    winner_score = -1 * search.best_score_
    print(winner)
    print(winner_score)
    return winner, winner_score

In [70]:
#SVM Regression
@timer
def build_svr(X_train,y_train,score):
    """SVM regression: grid-search an ``SVR`` with 5-fold cross-validation.

    Runs ``GridSearchCV`` using ``score`` as the scoring name.

    Returns:
        ``(best_estimator, negated_best_cv_score)``. The score is negated so
        that it is on the same scale as the other ``build_*`` functions in
        this notebook. The ``@timer`` decorator wraps this in
        ``(result, elapsed_seconds)``.
    """
    from sklearn.svm import SVR
    # 'degree' is only used by the polynomial kernel. SVR() defaults to the
    # RBF kernel, so searching over degree tripled the number of fits without
    # changing any model. It is left at its default here.
    params_svr={
                'gamma':['scale'],
                'C':[1.0,1.5,2],
                'epsilon':[0.1, 0.5,1]}
    cv_svr=GridSearchCV(SVR(),params_svr,cv=5,scoring=score)
    cv_svr.fit(X_train,y_train)
    best_svr=cv_svr.best_estimator_
    # Negate like every other builder so scores are comparable in run_cv.
    best_score_svr=-1*cv_svr.best_score_
    print(best_svr)
    print(best_score_svr)
    return best_svr,best_score_svr

max_depth = 5 : This should be between 3-10. I’ve started with 5 but you can choose a different number as well. 4-6 can be good starting points.
min_child_weight = 1 : A smaller value is chosen because it is a highly imbalanced class problem and leaf nodes can have smaller size groups.
gamma = 0 : A small value such as 0.1-0.2 can also be chosen as a starting point. This will be tuned later anyway.
subsample, colsample_bytree = 0.8 : This is a commonly used start value. Typical values range between 0.5-0.9.
scale_pos_weight = 1: Because of high class imbalance.

In [71]:
@timer
def build_xgb(X_train,y_train,score):
    """XGBoost regression: grid-search an ``XGBRegressor`` with 5-fold CV.

    Runs ``GridSearchCV`` using ``score`` as the scoring name.

    Returns:
        ``(best_estimator, negated_best_cv_score)``; the ``@timer`` decorator
        wraps this in ``(result, elapsed_seconds)``.
    """
    import xgboost as xgb
    # 'min_samples_split' / 'min_samples_leaf' are scikit-learn tree
    # parameters, not XGBoost ones. XGBoost ignores them, so they only
    # multiplied the number of fits by four. 'min_child_weight' is XGBoost's
    # control for the minimum amount of data required in a child node.
    params_xgb={'n_estimators':[50,100,200],
                'learning_rate':[0.01,0.05,0.1],
                'max_depth':[3,5,7],
                'min_child_weight':[1,3,5]}
    cv_xgb=GridSearchCV(xgb.XGBRegressor(),params_xgb,cv=5,scoring=score)
    cv_xgb.fit(X_train,y_train)
    best_xgb=cv_xgb.best_estimator_
    # Negate so that neg_* error scorers are reported as positive errors.
    best_score_xgb=-1*cv_xgb.best_score_
    print(best_xgb)
    print(best_score_xgb)
    return best_xgb,best_score_xgb

In [3]:
def run_cv(X_train,y_train,score,degree_range):
    """Run every model builder and collect results and timings.

    Builders run in the order lr, pr, rf, gbr, xgb, svr. Each one yields a
    ``(result, seconds)`` pair that is stored under the model's short name.

    Returns:
        ``(model_result, processing_time)``: two dicts keyed by model name.
    """
    model_result, processing_time = {}, {}

    def record(key, outcome):
        # Split one builder's (result, seconds) pair into the two dicts.
        result, seconds = outcome
        model_result[key] = result
        processing_time[key] = seconds

    record('lr', build_lr(X_train, y_train, score))
    record('pr', build_pr(X_train, y_train, score, degree_range))
    record('rf', build_rf(X_train, y_train, score))
    record('gbr', build_gbr(X_train, y_train, score))
    record('xgb', build_xgb(X_train, y_train, score))
    record('svr', build_svr(X_train, y_train, score))
    return model_result, processing_time

In [1]:

def model_accuracy(model,X_test,y_test):
    """Evaluate ``model`` on a held-out set and print summary error figures.

    Args:
        model: any object exposing ``predict(X)``.
        X_test: features passed unchanged to ``model.predict``.
        y_test: true target values, array-like with the same shape as the
            model's predictions.

    Returns:
        ``(prediction, mse)``: the raw output of ``model.predict`` and the
        mean squared error between ``y_test`` and that prediction.

    Raises:
        ValueError: if ``y_test`` and the prediction have different shapes.
    """
    prediction = model.predict(X_test)
    y_true = np.asarray(y_test, dtype=float)
    y_pred = np.asarray(prediction, dtype=float)
    if y_true.shape != y_pred.shape:
        # Guard against silent broadcasting, e.g. (n, 1) against (n,).
        raise ValueError(
            f'y_test shape {y_true.shape} does not match prediction shape {y_pred.shape}'
        )
    diff = np.absolute(y_true - y_pred)
    mse = np.mean(diff ** 2)
    # Report on the y_test that was actually passed in (the original read an
    # undefined global, y_test_all) and sum the absolute errors rather than
    # the raw targets.
    print(f'for a length of {y_true.size} the total inaccuracy is {np.sum(diff)}')
    print(f'average inaccuracy is: {np.average(diff)}')
    print(f'mse: {mse}')
    return prediction,mse