In [47]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import LinearRegression

#additional libraries
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import cross_validate


# annoying warnings
import warnings
warnings.filterwarnings('ignore')

#additional libraries, Payman
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
import xgboost as xgb

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import tree
import glob

In [2]:
def display_all(df):
    with pd.option_context("display.max_rows", 100, "display.max_columns", 100): 
        display(df)

In [59]:
def oversample_extremes(data, low_val=42, low_weight=2, high_val=48, high_weight=2):
    '''
    A function that will create duplicate rows of specified rows in order to weight certain rows in machine learning training
    
    INPUTS:
    data: a pandas dataframe
    low_val: value of 'Quality' below which we will duplicate rows
    low_weight: integer of effective weight desired for low values
    high_val: value of 'Quality' above which we will duplicate rows
    high_weight: integer for effective weight desired for high values
    
    RETURNS:
    newdata: a pandas dataframe with duplicated rows according to the arguments passed
    
    NOTES:
    
    'low_weight' and 'high_weight' both need integer values and correspond to the effective weight of the sample compared
    to the rest of the data in the dataframe. For an example, a weight of 2 would mean creating 1 duplicate (so the
    row is now in the data twice), and a weight of 1 would return the same dataset (no additional duplicates created)
    
    Default cutoff values correspond to roughly anything outside the IQR
    '''
    # identify rows of interest
    low_df = data.loc[data['Quality'] < low_val]
    high_df = data.loc[data['Quality'] > high_val]
    
    newdata = data.copy()
    
    # loop to add on all the rows
    
    for i in range(low_weight-1): #pythonic indexing...
        newdata = newdata.append(low_df, ignore_index = True)
    for i in range(high_weight-1):
        newdata = newdata.append(high_df, ignore_index = True)
    
    return(newdata)
    

### XGBRegressor

**Without sampling**

In [76]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test

    xgb_ml = xgb.XGBRegressor(n_estimators=160, max_depth=15)
    

    xgb_fit = xgb_ml.fit(X_train, y_train)
    xgb_preds = xgb_fit.predict(X_val)
    
    print("MAE: {:.{}f}".format(mean_absolute_error(y_val, xgb_preds), 2))
    print("MSE: {:.{}f}".format(mean_squared_error(y_val, xgb_preds), 2))
    print("R-squared: {:.{}f}".format(r2_score(y_val, xgb_preds), 2))
    


Data: no_tempinc.csv
MAE: 0.69
MSE: 1.22
R-squared: 0.68

Data: no_tempinc_minmax.csv
MAE: 0.03
MSE: 0.00
R-squared: 0.69

Data: no_tempinc_standard.csv
MAE: 0.39
MSE: 0.50
R-squared: 0.55

Data: tempinc.csv
MAE: 0.63
MSE: 1.20
R-squared: 0.69

Data: tempinc_minmax.csv
MAE: 0.03
MSE: 0.00
R-squared: 0.62

Data: tempinc_standard.csv
MAE: 0.31
MSE: 0.24
R-squared: 0.71


**With sampling**

In [77]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
        print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        xgb_ml = xgb.XGBRegressor(n_estimators=160, max_depth=15)


        xgb_fit = xgb_ml.fit(X_train, y_train)
        xgb_preds = xgb_fit.predict(X_val)

        print("MAE: {:.{}f}".format(mean_absolute_error(y_val, xgb_preds), 2))
        print("MSE: {:.{}f}".format(mean_squared_error(y_val, xgb_preds), 2))
        print("R-squared: {:.{}f}".format(r2_score(y_val, xgb_preds), 2))


Data: no_tempinc.csv
Weight: 1
MAE: 0.63
MSE: 1.08
R-squared: 0.71
Weight: 3
MAE: 0.68
MSE: 1.12
R-squared: 0.70
Weight: 5
MAE: 0.70
MSE: 1.11
R-squared: 0.70

Data: no_tempinc_minmax.csv
Weight: 1
MAE: 0.03
MSE: 0.00
R-squared: 0.57
Weight: 3
MAE: 0.03
MSE: 0.00
R-squared: 0.52
Weight: 5
MAE: 0.03
MSE: 0.00
R-squared: 0.48

Data: no_tempinc_standard.csv
Weight: 1
MAE: 0.34
MSE: 0.30
R-squared: 0.69
Weight: 3
MAE: 0.39
MSE: 0.43
R-squared: 0.56
Weight: 5
MAE: 0.40
MSE: 0.47
R-squared: 0.52

Data: tempinc.csv
Weight: 1
MAE: 0.71
MSE: 1.38
R-squared: 0.60
Weight: 3
MAE: 0.71
MSE: 1.31
R-squared: 0.62
Weight: 5
MAE: 0.74
MSE: 1.36
R-squared: 0.61

Data: tempinc_minmax.csv
Weight: 1
MAE: 0.03
MSE: 0.00
R-squared: 0.66
Weight: 3
MAE: 0.03
MSE: 0.00
R-squared: 0.62
Weight: 5
MAE: 0.03
MSE: 0.00
R-squared: 0.63

Data: tempinc_standard.csv
Weight: 1
MAE: 0.38
MSE: 0.39
R-squared: 0.62
Weight: 3
MAE: 0.38
MSE: 0.42
R-squared: 0.59
Weight: 5
MAE: 0.39
MSE: 0.44
R-squared: 0.57


### RandomForestRegressor

In [208]:
#get best hyper parameters for the model
# hyperparameters = {
#     'n_estimators': range(90, 201, 10),
#     'max_depth': range(2, 12, 2),
#     'min_samples_split' : range(2, 10, 2),
#     'min_samples_leaf' : list(range(1, 10, 1))
# #     'min_weight_fraction_leaf' : (0.0)
#                               }

# grid = GridSearchCV(base_model, param_grid=hyperparameters, n_jobs=-1)
# grid.fit(X_train, y_train)
# print('score = {}\nparams={}'.format(grid.best_score_, grid.best_params_))

score = 0.6084947470169244
params={'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 8, 'n_estimators': 120}


**Without sampling**

In [63]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    rf_reg = RandomForestRegressor(n_jobs=5, n_estimators= 120, max_depth= 10,
                    min_samples_split = 9,min_samples_leaf = 4,
                    min_weight_fraction_leaf= 0.0)
    
    rf_fit = rf_reg.fit(X_train, y_train)
    rf_reg_preds = rf_fit.predict(X_val)
    
    print("MAE: {:.{}f}".format(mean_absolute_error(y_val, rf_reg_preds), 2))
    print("MSE: {:.{}f}".format(mean_squared_error(y_val, rf_reg_preds), 2))
    print("R-squared: {:.{}f}".format(r2_score(y_val, rf_reg_preds), 2))


Data: no_tempinc.csv
MAE: 0.76
MSE: 1.95
R-squared: 0.53

Data: no_tempinc_minmax.csv
MAE: 0.03
MSE: 0.00
R-squared: 0.63

Data: no_tempinc_standard.csv
MAE: 0.43
MSE: 0.51
R-squared: 0.56

Data: tempinc.csv
MAE: 0.69
MSE: 0.93
R-squared: 0.69

Data: tempinc_minmax.csv
MAE: 0.03
MSE: 0.00
R-squared: 0.59

Data: tempinc_standard.csv
MAE: 0.37
MSE: 0.32
R-squared: 0.65


**With sampling**

In [61]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
        print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        rf_reg = RandomForestRegressor(n_jobs=5, n_estimators= 120, max_depth= 10,
                        min_samples_split = 9,min_samples_leaf = 4,
                        min_weight_fraction_leaf= 0.0)

        rf_fit = rf_reg.fit(X_train, y_train)
        rf_reg_preds = rf_fit.predict(X_val)

        print("MAE: {:.{}f}".format(mean_absolute_error(y_val, rf_reg_preds), 2))
        print("MSE: {:.{}f}".format(mean_squared_error(y_val, rf_reg_preds), 2))
        print("R-squared: {:.{}f}".format(r2_score(y_val, rf_reg_preds), 2))




Data: no_tempinc.csv
Weight: 1
MAE: 0.77
MSE: 1.76
R-squared: 0.53
Weight: 3
MAE: 0.81
MSE: 1.67
R-squared: 0.55
Weight: 5
MAE: 0.83
MSE: 1.67
R-squared: 0.55

Data: no_tempinc_minmax.csv
Weight: 1
MAE: 0.03
MSE: 0.00
R-squared: 0.60
Weight: 3
MAE: 0.03
MSE: 0.00
R-squared: 0.60
Weight: 5
MAE: 0.03
MSE: 0.00
R-squared: 0.58

Data: no_tempinc_standard.csv
Weight: 1
MAE: 0.42
MSE: 0.45
R-squared: 0.58
Weight: 3
MAE: 0.41
MSE: 0.44
R-squared: 0.59
Weight: 5
MAE: 0.42
MSE: 0.49
R-squared: 0.54

Data: tempinc.csv
Weight: 1
MAE: 0.77
MSE: 1.38
R-squared: 0.59
Weight: 3
MAE: 0.78
MSE: 1.44
R-squared: 0.57
Weight: 5
MAE: 0.78
MSE: 1.41
R-squared: 0.58

Data: tempinc_minmax.csv
Weight: 1
MAE: 0.03
MSE: 0.00
R-squared: 0.69
Weight: 3
MAE: 0.03
MSE: 0.00
R-squared: 0.61
Weight: 5
MAE: 0.03
MSE: 0.00
R-squared: 0.53

Data: tempinc_standard.csv
Weight: 1
MAE: 0.40
MSE: 0.42
R-squared: 0.63
Weight: 3
MAE: 0.38
MSE: 0.35
R-squared: 0.69
Weight: 5
MAE: 0.37
MSE: 0.35
R-squared: 0.69


### DecisionTreeRegressor

In [274]:
# dt_reg = tree.DecisionTreeRegressor()

In [275]:
# hyperparameters = {"criterion": ["mse", "mae"],
#               "min_samples_split": [10, 20, 40],
#               "max_depth": [2, 6, 8],
#               "min_samples_leaf": [20, 40, 100],
#               "max_leaf_nodes": [5, 20, 100],
#               }

In [276]:
# grid = GridSearchCV(dt_reg, param_grid=hyperparameters, n_jobs=-1)
# grid.fit(X_train_values, y_train_values)
# print('score = {}\nparams={}'.format(grid.best_score_, grid.best_params_))

score = 0.5147994141686736
params={'criterion': 'mse', 'max_depth': 8, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'min_samples_split': 10}


**Without sampling**

In [74]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test

    dt_reg = tree.DecisionTreeRegressor(criterion = 'mse',
                                   max_depth = 8,
                                   max_leaf_nodes = 100,
                                   min_samples_leaf = 20,
                                   min_samples_split = 10
                                   )
    

    dt_fit = dt_reg.fit(X_train, y_train)
    dt_preds = dt_fit.predict(X_val)
    
    print("MAE: {:.{}f}".format(mean_absolute_error(y_val, dt_preds), 2))
    print("MSE: {:.{}f}".format(mean_squared_error(y_val, dt_preds), 2))
    print("R-squared: {:.{}f}".format(r2_score(y_val, dt_preds), 2))
    


Data: no_tempinc.csv
MAE: 0.90
MSE: 1.68
R-squared: 0.45

Data: no_tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.50

Data: no_tempinc_standard.csv
MAE: 0.49
MSE: 0.58
R-squared: 0.43

Data: tempinc.csv
MAE: 0.87
MSE: 1.67
R-squared: 0.52

Data: tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.48

Data: tempinc_standard.csv
MAE: 0.47
MSE: 0.42
R-squared: 0.57


**With sampling**

In [75]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
        print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        dt_reg = tree.DecisionTreeRegressor(criterion = 'mse',
                                       max_depth = 8,
                                       max_leaf_nodes = 100,
                                       min_samples_leaf = 20,
                                       min_samples_split = 10
                                       )


        dt_fit = dt_reg.fit(X_train, y_train)
        dt_preds = dt_fit.predict(X_val)

        print("MAE: {:.{}f}".format(mean_absolute_error(y_val, dt_preds), 2))
        print("MSE: {:.{}f}".format(mean_squared_error(y_val, dt_preds), 2))
        print("R-squared: {:.{}f}".format(r2_score(y_val, dt_preds), 2))


Data: no_tempinc.csv
Weight: 1
MAE: 0.87
MSE: 1.76
R-squared: 0.52
Weight: 3
MAE: 0.96
MSE: 2.16
R-squared: 0.41
Weight: 5
MAE: 1.01
MSE: 2.35
R-squared: 0.36

Data: no_tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.00
R-squared: 0.51
Weight: 3
MAE: 0.03
MSE: 0.00
R-squared: 0.47
Weight: 5
MAE: 0.04
MSE: 0.00
R-squared: 0.49

Data: no_tempinc_standard.csv
Weight: 1
MAE: 0.46
MSE: 0.66
R-squared: 0.42
Weight: 3
MAE: 0.45
MSE: 0.65
R-squared: 0.43
Weight: 5
MAE: 0.46
MSE: 0.70
R-squared: 0.39

Data: tempinc.csv
Weight: 1
MAE: 0.88
MSE: 2.11
R-squared: 0.43
Weight: 3
MAE: 1.00
MSE: 2.41
R-squared: 0.36
Weight: 5
MAE: 1.05
MSE: 2.74
R-squared: 0.27

Data: tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.00
R-squared: 0.49
Weight: 3
MAE: 0.04
MSE: 0.00
R-squared: 0.39
Weight: 5
MAE: 0.03
MSE: 0.00
R-squared: 0.46

Data: tempinc_standard.csv
Weight: 1
MAE: 0.49
MSE: 0.50
R-squared: 0.56
Weight: 3
MAE: 0.50
MSE: 0.59
R-squared: 0.48
Weight: 5
MAE: 0.49
MSE: 0.59
R-squared: 0.48


### GradientBoostingRegressor

In [None]:
# gbr = ensemble.GradientBoostingRegressor()

In [253]:
#get best hyper parameters for the model
# hyperparameters = {
#     'loss' : ['ls', 'lad', 'huber', 'quantile'],
#     'learning_rate' : (1,2),
#     'n_estimators': range(90, 201, 10),
#     'max_depth': range(2, 12, 2),
#     'min_samples_split' : range(2, 10, 2),
#     'min_samples_leaf' : list(range(1, 10, 1))
# #     'min_weight_fraction_leaf' : (0.0)
#                               }

# grid = GridSearchCV(gbr, param_grid=hyperparameters, n_jobs=-1)
# grid.fit(X_train_values, y_train_values)
# print('score = {}\nparams={}'.format(grid.best_score_, grid.best_params_))

score = 0.5520808583904835
params={'learning_rate': 1, 'loss': 'lad', 'max_depth': 4, 'min_samples_leaf': 9, 'min_samples_split': 4, 'n_estimators': 170}


**Without sampling**

In [78]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test

    gbr = ensemble.GradientBoostingRegressor(loss='huber',
                                         learning_rate=1, 
                                         max_depth=4, 
                                         n_estimators=170,
                                         min_samples_leaf = 9,
                                         min_samples_split=4)
    

    gbr_fit = gbr.fit(X_train, y_train)
    gbr_preds = gbr_fit.predict(X_val)
    
    print("MAE: {:.{}f}".format(mean_absolute_error(y_val, gbr_preds), 2))
    print("MSE: {:.{}f}".format(mean_squared_error(y_val, gbr_preds), 2))
    print("R-squared: {:.{}f}".format(r2_score(y_val, gbr_preds), 2))
    


Data: no_tempinc.csv
MAE: 0.86
MSE: 1.59
R-squared: 0.57

Data: no_tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.44

Data: no_tempinc_standard.csv
MAE: 0.46
MSE: 0.46
R-squared: 0.51

Data: tempinc.csv
MAE: 0.85
MSE: 1.77
R-squared: 0.50

Data: tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.55

Data: tempinc_standard.csv
MAE: 0.45
MSE: 0.48
R-squared: 0.54


**With sampling**

In [79]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
        print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        gbr = ensemble.GradientBoostingRegressor(loss='huber',
                                             learning_rate=1, 
                                             max_depth=4, 
                                             n_estimators=170,
                                             min_samples_leaf = 9,
                                             min_samples_split=4)


        gbr_fit = gbr.fit(X_train, y_train)
        gbr_preds = gbr_fit.predict(X_val)

        print("MAE: {:.{}f}".format(mean_absolute_error(y_val, gbr_preds), 2))
        print("MSE: {:.{}f}".format(mean_squared_error(y_val, gbr_preds), 2))
        print("R-squared: {:.{}f}".format(r2_score(y_val, gbr_preds), 2))
    


Data: no_tempinc.csv
Weight: 1
MAE: 0.97
MSE: 2.81
R-squared: 0.39
Weight: 3
MAE: 1.02
MSE: 2.82
R-squared: 0.39
Weight: 5
MAE: 1.03
MSE: 3.12
R-squared: 0.32

Data: no_tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.00
R-squared: 0.42
Weight: 3
MAE: 0.04
MSE: 0.00
R-squared: 0.39
Weight: 5
MAE: 0.04
MSE: 0.01
R-squared: 0.31

Data: no_tempinc_standard.csv
Weight: 1
MAE: 0.42
MSE: 0.37
R-squared: 0.61
Weight: 3
MAE: 0.45
MSE: 0.38
R-squared: 0.60
Weight: 5
MAE: 0.49
MSE: 0.56
R-squared: 0.41

Data: tempinc.csv
Weight: 1
MAE: 0.85
MSE: 1.59
R-squared: 0.52
Weight: 3
MAE: 1.04
MSE: 2.19
R-squared: 0.34
Weight: 5
MAE: 0.94
MSE: 2.23
R-squared: 0.33

Data: tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.00
R-squared: 0.58
Weight: 3
MAE: 0.04
MSE: 0.00
R-squared: 0.39
Weight: 5
MAE: 0.04
MSE: 0.00
R-squared: 0.27

Data: tempinc_standard.csv
Weight: 1
MAE: 0.47
MSE: 0.70
R-squared: 0.46
Weight: 3
MAE: 0.49
MSE: 0.75
R-squared: 0.41
Weight: 5
MAE: 0.52
MSE: 0.85
R-squared: 0.33


### LinearRegression

**Without sampling**

In [67]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_train = train['Quality']
    train.drop(['Quality'], axis = 1, inplace = True)
    X_train = train
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val
    
    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test

    l_reg = LinearRegression()
    l_reg_fit = l_reg.fit(X_train, y_train)
    l_reg_preds = l_reg_fit.predict(X_val)
    
    print("MAE: {:.{}f}".format(mean_absolute_error(y_val, l_reg_preds), 2))
    print("MSE: {:.{}f}".format(mean_squared_error(y_val, l_reg_preds), 2))
    print("R-squared: {:.{}f}".format(r2_score(y_val, l_reg_preds), 2))


Data: no_tempinc.csv
MAE: 0.93
MSE: 2.03
R-squared: 0.45

Data: no_tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.41

Data: no_tempinc_standard.csv
MAE: 0.48
MSE: 0.43
R-squared: 0.50

Data: tempinc.csv
MAE: 0.92
MSE: 2.05
R-squared: 0.45

Data: tempinc_minmax.csv
MAE: 0.04
MSE: 0.00
R-squared: 0.47

Data: tempinc_standard.csv
MAE: 0.53
MSE: 0.64
R-squared: 0.38


**With sampling**

In [69]:
os.chdir("../p_data")
for file in glob.glob('*.csv'):
    print('\nData:',file)
    df = pd.read_csv(file)
    temp_train, val, test = np.split(df.sample(frac=1), 
                                [int(.6*len(df)), 
                                 int(.8*len(df))])
    
    y_val = val['Quality']
    val.drop(['Quality'], axis = 1, inplace = True)
    X_val = val

    y_test = test['Quality']
    test.drop(['Quality'], axis = 1, inplace = True)
    X_test = test
    
    for w in range(1,6,2):
        print('Weight:',w)
        temp_df = oversample_extremes(temp_train, low_weight=w, high_weight = w)

        y_train = temp_df['Quality']
        temp_df.drop(['Quality'], axis = 1, inplace = True)
        X_train = temp_df

        l_reg = LinearRegression()
        l_reg_fit = l_reg.fit(X_train, y_train)
        l_reg_preds = l_reg_fit.predict(X_val)

        print("MAE: {:.{}f}".format(mean_absolute_error(y_val, l_reg_preds), 2))
        print("MSE: {:.{}f}".format(mean_squared_error(y_val, l_reg_preds), 2))
        print("R-squared: {:.{}f}".format(r2_score(y_val, l_reg_preds), 2))



Data: no_tempinc.csv
Weight: 1
MAE: 0.87
MSE: 1.51
R-squared: 0.48
Weight: 3
MAE: 0.97
MSE: 1.71
R-squared: 0.41
Weight: 5
MAE: 1.08
MSE: 2.04
R-squared: 0.30

Data: no_tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.00
R-squared: 0.49
Weight: 3
MAE: 0.04
MSE: 0.00
R-squared: 0.49
Weight: 5
MAE: 0.04
MSE: 0.00
R-squared: 0.49

Data: no_tempinc_standard.csv
Weight: 1
MAE: 0.51
MSE: 0.67
R-squared: 0.38
Weight: 3
MAE: 0.51
MSE: 0.67
R-squared: 0.38
Weight: 5
MAE: 0.51
MSE: 0.67
R-squared: 0.38

Data: tempinc.csv
Weight: 1
MAE: 0.91
MSE: 1.51
R-squared: 0.48
Weight: 3
MAE: 1.06
MSE: 1.88
R-squared: 0.36
Weight: 5
MAE: 1.19
MSE: 2.31
R-squared: 0.21

Data: tempinc_minmax.csv
Weight: 1
MAE: 0.04
MSE: 0.01
R-squared: 0.36
Weight: 3
MAE: 0.04
MSE: 0.01
R-squared: 0.36
Weight: 5
MAE: 0.04
MSE: 0.01
R-squared: 0.36

Data: tempinc_standard.csv
Weight: 1
MAE: 0.51
MSE: 0.65
R-squared: 0.35
Weight: 3
MAE: 0.51
MSE: 0.65
R-squared: 0.35
Weight: 5
MAE: 0.51
MSE: 0.65
R-squared: 0.35
