In [102]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import LinearRegression

#additional libraries
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import cross_validate


# annoying warnings
import warnings
warnings.filterwarnings('ignore')

#additional libraries, Payman
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
import xgboost as xgb

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import tree
import glob

In [103]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 100): 
        display(df)

In [104]:
def oversample_extremes(data, low_val=42, low_weight=2, high_val=48, high_weight=2):
    '''
    A function that will create duplicate rows of specified rows in order to weight certain rows in machine learning training
    
    INPUTS:
    data: a pandas dataframe
    low_val: value of 'Quality' below which we will duplicate rows
    low_weight: integer of effective weight desired for low values
    high_val: value of 'Quality' above which we will duplicate rows
    high_weight: integer for effective weight desired for high values
    
    RETURNS:
    newdata: a pandas dataframe with duplicated rows according to the arguments passed
    
    NOTES:
    
    'low_weight' and 'high_weight' both need integer values and correspond to the effective weight of the sample compared
    to the rest of the data in the dataframe. For an example, a weight of 2 would mean creating 1 duplicate (so the
    row is now in the data twice), and a weight of 1 would return the same dataset (no additional duplicates created)
    
    Default cutoff values correspond to roughly anything outside the IQR
    '''
    # identify rows of interest
    low_df = data.loc[data['Quality'] < low_val]
    high_df = data.loc[data['Quality'] > high_val]
    
    newdata = data.copy()
    
    # loop to add on all the rows
    
    for i in range(low_weight-1): #pythonic indexing...
        newdata = newdata.append(low_df, ignore_index = True)
    for i in range(high_weight-1):
        newdata = newdata.append(high_df, ignore_index = True)
    
    return(newdata)
    

### XGBRegressor

**Without sampling**

In [105]:
xgb_dict = {}
os.chdir("../p_data")
for file in glob.glob('*.csv'):
#     print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test

    xgb_ml = xgb.XGBRegressor(n_estimators=160, max_depth=15)
    

    xgb_fit = xgb_ml.fit(X_train, y_train)
    xgb_preds = xgb_fit.predict(X_val)
#     print(xgb_ml.feature_importances_)
    
    mae = mean_absolute_error(y_val, xgb_preds)
    mse = mean_squared_error(y_val, xgb_preds)
    r_squared = r2_score(y_val, xgb_preds)
    
    xgb_dict[str(file)] = {}
    xgb_dict[str(file)]['mae'] = mae
    xgb_dict[str(file)]['mse'] = mse
    xgb_dict[str(file)]['r_squared'] = r_squared
    xgb_dict[str(file)]['feat_imp'] = xgb_ml.feature_importances_
    
    
#     print("MAE: {:.{}f}".format(mae, 2))
#     print("MSE: {:.{}f}".format(mse, 2))
#     print("R-squared: {:.{}f}".format(r_squared, 2))
    

In [106]:
xgb_df = pd.DataFrame.from_dict(xgb_dict)

**With sampling**

In [107]:
xgb_s_dict = {}
os.chdir("../p_data")
for file in glob.glob('*.csv'):
#     print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
#         print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        xgb_ml = xgb.XGBRegressor(n_estimators=160, max_depth=15)
        
        xgb_fit = xgb_ml.fit(X_train, y_train)
        xgb_preds = xgb_fit.predict(X_val)
        
        mae = mean_absolute_error(y_val, xgb_preds)
        mse = mean_squared_error(y_val, xgb_preds)
        r_squared = r2_score(y_val, xgb_preds)
        
        xgb_s_dict[str(file) + '_w' + str(w)] = {}
        xgb_s_dict[str(file) + '_w' + str(w)]['mae'] = mae
        xgb_s_dict[str(file) + '_w' + str(w)]['mse'] = mse
        xgb_s_dict[str(file) + '_w' + str(w)]['r_squared'] = r_squared
        xgb_s_dict[str(file) + '_w' + str(w)]['feat_imp'] = xgb_ml.feature_importances_


#         print("MAE: {:.{}f}".format(mae, 2))
#         print("MSE: {:.{}f}".format(mse, 2))
#         print("R-squared: {:.{}f}".format(r_squared, 2))

In [108]:
xgb_s_df = pd.DataFrame.from_dict(xgb_s_dict)

In [109]:
xgb_summ = pd.concat([xgb_df.T, xgb_s_df.T])

In [110]:
xgb_summ['Model'] = 'XGB'

### RandomForestRegressor

In [111]:
#get best hyper parameters for the model
# hyperparameters = {
#     'n_estimators': range(90, 201, 10),
#     'max_depth': range(2, 12, 2),
#     'min_samples_split' : range(2, 10, 2),
#     'min_samples_leaf' : list(range(1, 10, 1))
# #     'min_weight_fraction_leaf' : (0.0)
#                               }

# grid = GridSearchCV(base_model, param_grid=hyperparameters, n_jobs=-1)
# grid.fit(X_train, y_train)
# print('score = {}\nparams={}'.format(grid.best_score_, grid.best_params_))

**Without sampling**

In [112]:
rf_dict = {}
os.chdir("../p_data")
for file in glob.glob('*.csv'):
#     print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    rf_reg = RandomForestRegressor(n_jobs=5, n_estimators= 120, max_depth= 10,
                    min_samples_split = 9,min_samples_leaf = 4,
                    min_weight_fraction_leaf= 0.0)
    
    rf_fit = rf_reg.fit(X_train, y_train)
    rf_reg_preds = rf_fit.predict(X_val)
    
    mae = mean_absolute_error(y_val, rf_reg_preds)
    mse = mean_squared_error(y_val, rf_reg_preds)
    r_squared = r2_score(y_val, rf_reg_preds)
    
    rf_dict[str(file)] = {}
    rf_dict[str(file)]['mae'] = mae
    rf_dict[str(file)]['mse'] = mse
    rf_dict[str(file)]['r_squared'] = r_squared
    rf_dict[str(file)]['feat_imp'] = rf_reg.feature_importances_
    


In [113]:
rf_df = pd.DataFrame.from_dict(rf_dict)

**With sampling**

In [114]:
rf_s_dict = {}
os.chdir("../p_data")
for file in glob.glob('*.csv'):
#     print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
#         print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        rf_reg = RandomForestRegressor(n_jobs=5, n_estimators= 120, max_depth= 10,
                        min_samples_split = 9,min_samples_leaf = 4,
                        min_weight_fraction_leaf= 0.0)

        rf_fit = rf_reg.fit(X_train, y_train)
        rf_reg_preds = rf_fit.predict(X_val)
        
        mae = mean_absolute_error(y_val, rf_reg_preds)
        mse = mean_squared_error(y_val, rf_reg_preds)
        r_squared = r2_score(y_val, rf_reg_preds)
        
        rf_s_dict[str(file) + '_w' + str(w)] = {}
        rf_s_dict[str(file) + '_w' + str(w)]['mae'] = mae
        rf_s_dict[str(file) + '_w' + str(w)]['mse'] = mse
        rf_s_dict[str(file) + '_w' + str(w)]['r_squared'] = r_squared
        rf_s_dict[str(file) + '_w' + str(w)]['feat_imp'] = rf_reg.feature_importances_


In [115]:
rf_s_df = pd.DataFrame.from_dict(rf_s_dict)

In [116]:
rf_summ = pd.concat([rf_df.T, rf_s_df.T])

In [117]:
rf_summ['Model'] = 'RandomForest'

In [118]:
model_summ = pd.concat([xgb_summ, rf_summ])

In [119]:
model_summ

Unnamed: 0,feat_imp,mae,mse,r_squared,Model
no_tempinc.csv,"[0.010616321, 0.01124595, 0.7321606, 0.0095584...",0.614908,1.27309,0.675926,XGB
no_tempinc_minmax.csv,"[0.0040060766, 0.0, 0.004586884, 0.87501675, 0...",0.0294134,0.00268545,0.630875,XGB
no_tempinc_standard.csv,"[0.002098261, 0.0, 0.004272864, 0.9188961, 0.0...",0.37335,0.378409,0.59606,XGB
tempinc.csv,"[0.004522228, 0.005455211, 0.82258695, 0.00869...",0.655323,1.25939,0.687694,XGB
tempinc_minmax.csv,"[0.0027495276, 0.0, 0.00253314, 0.9097434, 0.0...",0.028061,0.00177771,0.687364,XGB
tempinc_standard.csv,"[0.001598057, 0.0, 0.0026110376, 0.93779457, 0...",0.318485,0.305168,0.693136,XGB
no_tempinc.csv_w1,"[0.007185624, 0.006700246, 0.8533451, 0.005470...",0.607769,0.975468,0.738696,XGB
no_tempinc.csv_w3,"[0.0025024228, 0.0038228417, 0.93996364, 0.001...",0.668146,1.18457,0.682683,XGB
no_tempinc.csv_w5,"[0.0014790962, 0.0026599183, 0.9485154, 0.0009...",0.683214,1.24799,0.665694,XGB
no_tempinc_minmax.csv_w1,"[0.0035745816, 0.0, 0.0040917215, 0.9350306, 0...",0.0312823,0.00318844,0.475876,XGB


In [122]:
model_summ.sort_values(by = 'r_squared', ascending=False)

Unnamed: 0,feat_imp,mae,mse,r_squared,Model
no_tempinc.csv_w1,"[0.007185624, 0.006700246, 0.8533451, 0.005470...",0.607769,0.975468,0.738696,XGB
no_tempinc_standard.csv,"[0.030535894528414827, 0.03083623724014279, 0....",0.361494,0.25033,0.711342,RandomForest
tempinc_standard.csv,"[0.001598057, 0.0, 0.0026110376, 0.93779457, 0...",0.318485,0.305168,0.693136,XGB
tempinc.csv,"[0.004522228, 0.005455211, 0.82258695, 0.00869...",0.655323,1.25939,0.687694,XGB
tempinc_minmax.csv,"[0.0027495276, 0.0, 0.00253314, 0.9097434, 0.0...",0.028061,0.00177771,0.687364,XGB
no_tempinc.csv_w3,"[0.0025024228, 0.0038228417, 0.93996364, 0.001...",0.668146,1.18457,0.682683,XGB
no_tempinc.csv,"[0.010616321, 0.01124595, 0.7321606, 0.0095584...",0.614908,1.27309,0.675926,XGB
tempinc.csv_w1,"[0.0075900466, 0.007901227, 0.7488843, 0.00734...",0.672679,1.39695,0.675303,XGB
no_tempinc.csv_w5,"[0.0014790962, 0.0026599183, 0.9485154, 0.0009...",0.683214,1.24799,0.665694,XGB
no_tempinc_minmax.csv,"[0.04322643208376294, 0.03836253261671352, 0.0...",0.0323015,0.00217952,0.661565,RandomForest
