In [None]:
# Root directory of the project; the per-run input and output folders used by the
# driver loop at the end of the notebook are built by appending to this prefix.
# NOTE(review): hard-coded absolute Windows path -- adjust before running elsewhere.
path = 'C:/Users/1/attribution_project/'

In [None]:
# Seed forwarded to run_combination by the driver loop.
random_state = 7
# Scoring name used to pick the winning grid-search candidate and to order the summary table.
criterion = 'r2' # the other supported value is 'neg_mean_squared_error'

In [None]:
# Regressors to benchmark; each name is dispatched on by algorithm_selector.
algorithms = [
    'RandomForestRegressor',
    'LinearRegression',
    'XGBRegressor',
    'LGBMRegressor',
    'CatBoostRegressor',
]
# Categorical encodings to try; the string 'None' means "leave the features unencoded".
encoders = [
    'OneHotEncoder',
    'CatBoostEncoder',
    'None',
]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

In [None]:
def encoding_selector(path, X_train, y_train, X_test, y_test, algorithm, encoder='None',
                      categorical_vars=None, save=False):
    """Encode the train and test feature sets with the requested encoder.

    Parameters
    ----------
    path : str
        Prefix prepended (by plain string concatenation) to the names of the
        pickles written when ``save`` is true.
    X_train, X_test : pandas.DataFrame
        Feature sets to encode.
    y_train
        Target for ``X_train``; only used to fit the CatBoostEncoder.
    y_test
        Not used; kept so the signature stays stable for existing callers.
    algorithm : str
        Only used to build the names of the saved pickles.
    encoder : str
        ``'None'`` (return the inputs untouched), ``'CatBoostEncoder'`` or
        ``'OneHotEncoder'``.
    categorical_vars : list, optional
        Columns handed to the OneHotEncoder (``cols=``). ``None`` is treated
        as an empty list. The CatBoostEncoder is built with its own defaults
        and does not receive this list.
    save : bool
        When true and ``encoder == 'OneHotEncoder'``, pickle the fitted encoder
        and, per encoded variable, the positions of its dummy columns in the
        encoded frame. Nothing is saved for the other encoders.

    Returns
    -------
    tuple
        ``(X_train_encoded, X_test_encoded)``.

    Raises
    ------
    ValueError
        If ``encoder`` is not one of the three supported names.
    """
    if encoder == 'None':
        return (X_train, X_test)

    # Fail with a clear message instead of an UnboundLocalError further down.
    if encoder not in ('CatBoostEncoder', 'OneHotEncoder'):
        raise ValueError("Unknown encoder %r; expected 'None', 'CatBoostEncoder' or 'OneHotEncoder'"
                         % (encoder,))

    # Imported only once an encoder is really needed, so the pass-through path
    # works without the optional dependency installed.
    import category_encoders

    if categorical_vars is None:
        categorical_vars = []

    if encoder == 'CatBoostEncoder':
        enc = category_encoders.cat_boost.CatBoostEncoder().fit(X_train, y_train)
    else:
        # Fit on train + test together so that categories present in only one
        # of the two sets still get their own dummy column.
        X = pd.concat([X_train, X_test])
        enc = category_encoders.one_hot.OneHotEncoder(cols=categorical_vars).fit(X)

        if save:
            encoded_columns = enc.transform(X).columns.to_list()
            enc_list_col = [m['mapping'].columns.to_list() for m in enc.mapping]
            enc_list_col_ids = [[encoded_columns.index(col) for col in list_col]
                                for list_col in enc_list_col]

            with open(path + algorithm + '_' + encoder + '_enc_list_col_ids.pkl', 'wb') as handle:
                pickle.dump(enc_list_col_ids, handle)

            with open(path + algorithm + '_' + encoder + '_enc.pkl', 'wb') as handle:
                pickle.dump(enc, handle)

    # transform the dataset
    X_train_encoded = enc.transform(X_train)
    X_test_encoded = enc.transform(X_test)

    return (X_train_encoded, X_test_encoded)

In [None]:
def algorithm_selector(algorithm, encoder, categorical_vars=None, eval_metric='r2', 
                       params=None, random_state=7):
    """Build the estimator for ``algorithm`` together with its hyper-parameter grid.

    Parameters
    ----------
    algorithm : str
        ``'CatBoostRegressor'``, ``'XGBRegressor'``, ``'LGBMRegressor'``,
        ``'RandomForestRegressor'`` or ``'LinearRegression'``.
    encoder : str
        Name of the encoder applied upstream. Only consulted for CatBoost:
        with ``'None'`` the model receives ``cat_features=categorical_vars``.
    categorical_vars : list, optional
        Categorical feature names; ``None`` is treated as an empty list.
    eval_metric : str
        Accepted for interface compatibility; not used by this function.
    params : dict, optional
        When ``None`` an estimator with default depth/size is returned along
        with the search grid. Otherwise it must hold the two hyper-parameters
        of the algorithm (``depth``/``iterations`` for CatBoost,
        ``max_depth``/``n_estimators`` for the others), which are set on the
        estimator.
    random_state : int
        Seed; currently only forwarded to RandomForestRegressor.

    Returns
    -------
    tuple
        ``(model, parameters, parameters_disp)``. ``parameters`` is the search
        grid (or ``params`` itself when given) and ``parameters_disp`` a
        shallow copy of it. Both are ``None`` for LinearRegression.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    # (depth-like name, ensemble-size name) per tunable algorithm. The order is
    # kept as (depth, size) because run_combination indexes the grid keys
    # positionally.
    tunable = {
        'CatBoostRegressor': ('depth', 'iterations'),
        'XGBRegressor': ('max_depth', 'n_estimators'),
        'LGBMRegressor': ('max_depth', 'n_estimators'),
        'RandomForestRegressor': ('max_depth', 'n_estimators'),
    }

    if algorithm == 'LinearRegression':
        from sklearn.linear_model import LinearRegression
        return (LinearRegression(), None, None)

    # Fail with a clear message instead of an UnboundLocalError at the return.
    if algorithm not in tunable:
        raise ValueError("Unknown algorithm %r; expected one of %s"
                         % (algorithm, sorted(list(tunable) + ['LinearRegression'])))

    if categorical_vars is None:
        categorical_vars = []

    # Constructor arguments that do not depend on the searched hyper-parameters.
    model_kwargs = {}
    if algorithm == 'CatBoostRegressor':
        from catboost import CatBoostRegressor as estimator_cls
        model_kwargs['verbose'] = 0
        if encoder == 'None':
            # Raw categorical columns: let CatBoost handle them natively.
            model_kwargs['cat_features'] = categorical_vars
    elif algorithm == 'XGBRegressor':
        from xgboost import XGBRegressor as estimator_cls
    elif algorithm == 'LGBMRegressor':
        from lightgbm import LGBMRegressor as estimator_cls
    else:
        from sklearn.ensemble import RandomForestRegressor as estimator_cls
        model_kwargs['random_state'] = random_state

    depth_name, size_name = tunable[algorithm]

    if params is None:
        parameters = {depth_name: [2, 4, 6, 8, 10, 12, 14],
                      size_name: [50, 100, 200, 500, 1000, 1500]}
    else:
        parameters = params
        model_kwargs[depth_name] = params[depth_name]
        model_kwargs[size_name] = params[size_name]

    model = estimator_cls(**model_kwargs)
    parameters_disp = parameters.copy()

    return (model, parameters, parameters_disp)

In [None]:
def plot_grid_search_scorers(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2,
                             metric='mean', scorer='r2' ):
    """Plot cross-validated test scores of a two-parameter grid search.

    One line is drawn per value of ``grid_param_2``, with ``grid_param_1`` on
    the x axis. A new figure is created and left open (and current) so the
    caller can save or show it.

    Parameters
    ----------
    cv_results : mapping
        Must contain ``'mean_test_<scorer>'`` (or ``'std_test_<scorer>'`` when
        ``metric == 'std'``), e.g. ``GridSearchCV.cv_results_``.
    grid_param_1, grid_param_2 : sequence
        Candidate values of the two searched parameters. The flat score array
        is sliced assuming ``grid_param_2`` is the fastest-varying parameter.
    name_param_1, name_param_2 : str
        Names used for the x-axis label and the legend entries.
    metric : str
        ``'std'`` plots the standard deviation of the test scores; any other
        value plots their mean.
    scorer : str
        Scorer name as used in the ``cv_results`` keys. ``'r2'`` and
        ``'neg_mean_squared_error'`` get friendly axis labels; any other name
        is used verbatim as the label.
    """
    friendly_labels = {'r2': 'r-square', 'neg_mean_squared_error': 'MSE'}
    # Fall back to the raw scorer name rather than crashing on an unbound label.
    scorer_label = friendly_labels.get(scorer, scorer)

    plot_std = metric == 'std'

    # Look the scores up before opening a figure, so a bad key leaves no stray
    # empty figure behind.
    flat_scores = cv_results[('std_test_' if plot_std else 'mean_test_') + scorer]

    # Row i collects the scores of the i-th value of grid_param_2.
    n_inner = len(grid_param_2)
    curves = []
    for offset in range(n_inner):
        curve = flat_scores[offset::n_inner]
        if not plot_std and scorer == 'neg_mean_squared_error':
            # The scorer is negated MSE; show the plain (positive) MSE.
            curve = [abs(el) for el in curve]
        curves.append(curve)
    curves = np.array(curves)

    _, ax = plt.subplots(1,1)

    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, curves[idx,:], '-o', label= name_param_2 + ': ' + str(val))

    if plot_std:
        ax.set_ylabel('CV test ' + scorer_label + ' score SD', fontsize=16)
    else:
        ax.set_ylabel('CV mean test ' + scorer_label + ' score', fontsize=16)

    ax.set_title("Grid Search Scores", fontsize=10, fontweight='bold')
    ax.set_xlabel(name_param_1, fontsize=8)
    ax.legend(loc="best", fontsize=12, framealpha=1)
    # Boolean instead of the MATLAB-style 'on' string.
    ax.grid(True)

In [None]:
def _dump_pickle(obj, filename):
    """Pickle ``obj`` to ``filename``, closing the file handle deterministically."""
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)


def _save_single_plot(filename, kind, *data):
    """Draw a black-dot scatter (``kind == 'scatter'``) or a histogram on a fresh figure, save and close it."""
    fig, ax = plt.subplots()
    if kind == 'scatter':
        ax.plot(*data, 'o', color='black')
    else:
        ax.hist(*data)
    fig.tight_layout()
    fig.savefig(filename)
    plt.close(fig)


def _save_prediction_diagnostics(y_true, y_pred, prefix):
    """Save the four diagnostic figures of one split; return (MAPE on non-zero targets, MAE on zero targets)."""
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    # Split on the target itself rather than on isinf() of the ratio: a zero
    # target with a zero prediction gives 0/0 = NaN, which isinf() misses.
    zero_target = y_true == 0

    _save_single_plot(prefix + 'y_pred.jpg', 'scatter', y_true, y_pred)
    _save_single_plot(prefix + 'y_error.jpg', 'scatter', y_true, y_pred - y_true)

    abs_pct_error = np.abs((y_true[~zero_target] - y_pred[~zero_target]) / y_true[~zero_target])
    zero_target_pred = y_pred[zero_target]

    _save_single_plot(prefix + 'y_nonzero_pred.jpg', 'hist', abs_pct_error)
    _save_single_plot(prefix + 'y_zero_pred.jpg', 'hist', zero_target_pred)

    return np.mean(abs_pct_error), np.mean(np.absolute(zero_target_pred))


def run_combination(path, algorithm, encoder, X_train, X_test, y_train, y_test, cv, categorical_vars, 
                    criterion='r2', save=False, random_state=7):
    """Train and evaluate one (algorithm, encoder) combination.

    Steps: encode the features, tune the two hyper-parameters with a grid
    search (plain cross-validation for LinearRegression), refit on the full
    training set, then write performance figures and metrics to disk.

    Parameters
    ----------
    path : str
        Output prefix. Artefacts go to ``path + '<algorithm>_<encoder>_*'`` and
        figures/metrics to ``path + 'performance/<criterion>/'`` and its
        ``train/`` and ``test/`` sub-folders. The folders must already exist.
    algorithm, encoder : str
        Names understood by ``algorithm_selector`` / ``encoding_selector``.
    X_train, X_test : pandas.DataFrame
        Feature sets.
    y_train, y_test
        One-dimensional targets.
    cv
        Cross-validation spec passed to scikit-learn as ``cv=``.
    categorical_vars : list
        Categorical feature names.
    criterion : str
        ``'r2'`` or ``'neg_mean_squared_error'``; selects the winning grid
        point and names the performance folder.
    save : bool
        When true, also pickle the encoded data, the fitted model and its
        predictions. The grid-search object, the figures and the train/test
        metrics are written regardless of this flag.
    random_state : int
        Seed forwarded to ``algorithm_selector`` for both the search estimator
        and the final refit.

    Returns
    -------
    dict
        Algorithm, encoder, winning hyper-parameters (absent for
        LinearRegression) and the cross-validated R2/MSE with their standard
        deviations. The train/test metrics are only written to
        ``results.pkl`` / ``results.csv``; they are not returned.
    """
    from sklearn.model_selection import GridSearchCV, cross_validate
    from catboost.utils import eval_metric

    artefact_prefix = path + algorithm + '_' + encoder + '_'
    path_performance = path + 'performance/'  + criterion + '/' + algorithm + '_' + encoder + '_'
    path_performance_train = path + 'performance/'  + criterion + '/train/' + algorithm + '_' + encoder + '_'
    path_performance_test = path + 'performance/'  + criterion + '/test/' + algorithm + '_' + encoder + '_'

    results = {'algorithm': algorithm, 'encoder': encoder}
    results1 = dict()

    (X_train, X_test) = encoding_selector(path, X_train, y_train, X_test, y_test, algorithm, encoder=encoder,
                                          categorical_vars=categorical_vars, save=save)

    if save:
        # Stacked copy of both splits; 'dataset' marks the origin (1 = train, 2 = test).
        X = pd.concat([X_train.assign(dataset=1), X_test.assign(dataset=2)])
        _dump_pickle(X, artefact_prefix + 'X.pkl')
        _dump_pickle(X_train, artefact_prefix + 'X_train.pkl')
        _dump_pickle(X_test, artefact_prefix + 'X_test.pkl')

    #algorithm setup
    model, parameters, parameters_disp = algorithm_selector(algorithm, encoder,
                                                            categorical_vars=categorical_vars,
                                                            eval_metric=criterion, random_state=random_state)

    # Both metrics are always collected; `criterion` only decides the winner.
    scoring = ['r2', 'neg_mean_squared_error']

    if parameters is None:
        # No hyper-parameters to tune (linear regression): plain cross-validation.
        cv_scores = cross_validate(model, X_train, y_train, cv=cv, scoring=scoring)

        results['CV R2'] = np.mean(cv_scores['test_r2'])
        results['CV R2 STD'] = np.std(cv_scores['test_r2'])
        results['CV MSE'] = abs(np.mean(cv_scores['test_neg_mean_squared_error']))
        results['CV MSE STD'] = np.std(cv_scores['test_neg_mean_squared_error'])

    else:
        param_names = list(parameters.keys())

        grid = GridSearchCV(estimator=model, param_grid=parameters, scoring=scoring, refit=criterion,
                            cv=cv, n_jobs=-1, return_train_score=True)
        grid.fit(X_train, y_train)

        _dump_pickle(grid, artefact_prefix + 'grid.pkl')

        cv_results = grid.cv_results_
        winner = np.argmax(cv_results['mean_test_' + criterion])
        winning_params = cv_results['params'][winner]

        params = {name: winning_params[name] for name in param_names}

        # Report hyper-parameters under library-independent column names.
        param_unifier = {'max_depth':'depth', 'depth':'depth', 'n_estimators':'n_estimators', 
                         'iterations':'iterations'}
        for name in param_names:
            results[param_unifier[name]] = winning_params[name]

        results['CV R2'] = cv_results['mean_test_r2'][winner]
        results['CV R2 STD'] = cv_results['std_test_r2'][winner]
        results['CV MSE'] = abs(cv_results['mean_test_neg_mean_squared_error'][winner])
        results['CV MSE STD'] = abs(cv_results['std_test_neg_mean_squared_error'][winner])

        # plot_grid_search_scorers opens a new figure on each call; lay it
        # out, save it and close it so figures do not pile up across runs.
        for scorer_name, file_tag in (('neg_mean_squared_error', 'MSE'), ('r2', 'R2')):
            for statistic in ('mean', 'std'):
                plot_grid_search_scorers(cv_results, parameters_disp[param_names[0]],
                                         parameters_disp[param_names[1]], param_names[0], param_names[1],
                                         metric=statistic, scorer=scorer_name)
                plt.tight_layout()
                plt.savefig(path_performance + 'grid_' + file_tag + '_' + statistic + '.jpg')
                plt.close()

        # Fresh estimator configured with the winning hyper-parameters. The
        # seed is forwarded so the refit honours the caller's random_state.
        model, parameters, parameters_disp = algorithm_selector(algorithm, encoder,
                                                                categorical_vars=categorical_vars,
                                                                eval_metric=criterion,
                                                                params=params,
                                                                random_state=random_state)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    results1['train_corr'] = np.corrcoef(y_train, y_train_pred)[0,1]

    y_test_pred = model.predict(X_test)
    results1['test_corr'] = np.corrcoef(y_test, y_test_pred)[0,1]

    results1['train R2'] = eval_metric(y_train, y_train_pred, 'R2')[0]
    results1['test R2'] = eval_metric(list(y_test), list(y_test_pred), 'R2')[0]

    # eval_metric is asked for RMSE; squaring yields the MSE.
    results1['train MSE'] = eval_metric(y_train, y_train_pred, 'RMSE')[0] **2
    results1['test MSE'] = eval_metric(list(y_test), list(y_test_pred), 'RMSE')[0] **2

    if save:
        _dump_pickle(model, artefact_prefix + 'model.pkl')
        _dump_pickle(y_test_pred, artefact_prefix + 'y_test_pred.pkl')
        _dump_pickle(y_train_pred, artefact_prefix + 'y_train_pred.pkl')

    #analysing results
    mape_train, mae_zero_train = _save_prediction_diagnostics(y_train, y_train_pred, path_performance_train)
    mape_test, mae_zero_test = _save_prediction_diagnostics(y_test, y_test_pred, path_performance_test)

    results1['MAPE_train_nonzero'] = mape_train
    results1['MAPE_test_nonzero'] = mape_test

    results1['MAE_train_zero'] = mae_zero_train
    results1['MAE_test_zero'] = mae_zero_test

    df_results1 = pd.DataFrame([results1])
    # The pickle keeps the underscore column names; the CSV gets spaces.
    _dump_pickle(df_results1, path_performance + 'results.pkl')
    df_results1.columns = df_results1.columns.str.replace('_',' ')
    df_results1.to_csv(path_performance + 'results.csv', index=False)

    return results

In [None]:
def _load_pickle(filename):
    """Load one pickled object, closing the file handle deterministically."""
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


# One full benchmark per value of m; inputs are read from results/m<m>/data/
# and everything produced is written below results/m<m>/output/.
for m in [10,7,3]:

    print('m ', m)

    path_in = path + 'results/m' + str(m) + '/data/'
    path_out = path + 'results/m' + str(m) + '/output/'

    X_train = _load_pickle(path_in + 'X_train.pkl')
    y_train = _load_pickle(path_in + 'y_train.pkl')

    X_test = _load_pickle(path_in + 'X_test.pkl')
    y_test = _load_pickle(path_in + 'y_test.pkl')

    cv = _load_pickle(path_in + 'cv.pkl')
    categorical_vars = _load_pickle(path_in + 'categorical_vars.pkl')

    results = []

    for algorithm in algorithms:
        for encoder in encoders:
            # Unencoded features are only tried with LightGBM and CatBoost.
            if encoder == 'None' and algorithm not in ('LGBMRegressor', 'CatBoostRegressor'):
                continue
            # The CatBoostEncoder + CatBoostRegressor pairing is skipped.
            if encoder == 'CatBoostEncoder' and algorithm == 'CatBoostRegressor':
                continue

            print(algorithm, encoder)
            run_results = run_combination(path_out, algorithm, encoder, X_train, X_test, y_train, y_test,
                                          cv, categorical_vars, criterion=criterion,
                                          save=True, random_state=random_state)

            results.append(run_results)

    df_results = pd.DataFrame(results)

    #ordering the results
    criterion2order = {'r2':'CV R2', 'neg_mean_squared_error':'CV MSE'}
    order = criterion2order[criterion]

    # Best first: lowest MSE, or highest R2.
    df_results = df_results.sort_values(by=order, ascending=(criterion == 'neg_mean_squared_error'))

    with open(path_out  + 'performance/' + criterion + '/df_results.pkl', 'wb') as handle:
        pickle.dump(df_results, handle)

    # The pickle keeps the underscore column names; the CSV gets spaces.
    df_results.columns = df_results.columns.str.replace('_',' ')

    df_results.to_csv(path_out + 'performance/' + criterion + '/results.csv', index=False)