In [28]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pathlib as pl
import os 
import pandas as pd

# Metrics for Model Selection

In this notebook you will fit polynomials to data to decide which order of polynomial is the best fit. Unlike before, the data you will be using is 3 dimensional, meaning it isn't possible to plot. Instead, you will write functions to calculate various metrics that are used to determine model fit. 

Complete this notebook, then answer the questions that go along side it. 

In [29]:
# set random seed for reproducibility
seed = 2022
np.random.seed(seed)

## Load the data 

In [30]:
path_csv = pl.Path(os.getcwd()) / f'M6_Performance_Metrics_Data.csv'
with open(path_csv, 'rb') as file:
    data = pd.read_csv(file)

In [31]:
print(data.head())
data.shape[0]

   Unnamed: 0        x1        x2        x3          y
0           0  0.382303 -1.596593  1.233776   4.935364
1           1  1.902436  1.579109 -0.341741  25.138660
2           2 -1.689244  1.298489 -1.472081  -4.786340
3           3 -1.510509  1.937616 -1.600244  -3.185759
4           4  1.621717  0.515558 -1.869644  19.712731


100

## Section 1 : Split the data into training, validation and test sets

### TO DO: write a function that splits the data into traning, validation and test sets.

The function should take as inputs the dataframe and the percentage splits for each of training, validation and test. It should output 3 dataframes, one for each of the sets. 

In [46]:
## write your function here ##
from collections import namedtuple
Split = namedtuple('Split', ['training', 'validation', 'test'])
seed = 2022
np.random.seed(seed)
def split(dataframe, splits=Split(training=0.4,validation=0.3,test=0.3)):
    rows = dataframe.shape[0]
    
    n_train = int(rows*splits.training)
    n_validate = int(rows*splits.validation)
    
    # generate a random permutation of indices of the data and split into training, validation and test
    perm = np.random.permutation(rows)
    indices_train, indices_validate, indices_test = np.split(perm, [n_train, n_train+ n_validate])
    print('Train:', indices_train, 'Validate', indices_validate)
    return dataframe.iloc[indices_train], dataframe.iloc[indices_validate], dataframe.iloc[indices_test]

In [47]:
def split_dataframe(df, data_split):
    """function to divide a dataframe into training, validation and test dataframes
    :param df: the full dataframe which is to be divided 
    :param data_split: a list containing the fraction of the full dataframe for each
    of training, validation and test, in that order
    :return training, validation, test: dataframes for each of the sets"""
    
    training = df.sample(frac=data_split[0])
    validation = df.drop(training.index).sample(frac=data_split[1]*len(df)/len(df.drop(training.index)))
    test = df.drop(validation.index).drop(training.index)
    
    return training, validation, test

### TO DO: Use your function to split the data so the training set has 40% of the data and the validation and test sets have 30% of the data each

In [48]:
train, validation, test = split(data)


Train: [79 76 83  5 35 57 22 96 67 58 93  3 69 60 39 17 54 44 61 94 32 84 70 20
 50 81 47 51  4 97 30 10  1 25 65  7 26 31 82  6] Validate [ 9 28 62 63 89 34 95 66  8 40 90 59 36  0 68 77 46 43 78 73 21 74 85 29
 71 64 91 42 52 13]


In [33]:
#### write your code here ####
#train, validation, test = split(data)
train, validation, test = split_dataframe(data, [0.4,0.3,0.3])
print(train,validation,test)
len(data)

    Unnamed: 0        x1        x2        x3          y
79          79 -1.386593  1.158374  1.994851  12.563918
76          76  1.790229  0.389979  1.157690   9.563458
83          83 -1.893098  0.440954 -0.797749  -2.209804
5            5 -1.928868 -1.475115 -0.677217  24.007974
35          35  0.497772  0.803435 -0.801939   3.127374
57          57  0.403963 -1.396032  0.560340   2.279149
22          22 -0.359099  0.952081  1.743385  12.820280
96          96 -1.236030  1.027829  0.105539  -0.461212
67          67  1.384385 -0.544277  1.379405   4.000594
58          58 -1.268421 -1.431103  0.282959   8.868657
93          93  1.636172  0.576516 -1.850643  20.271508
3            3 -1.510509  1.937616 -1.600244  -3.185759
69          69 -1.591721  1.030819  1.479829  -0.869927
60          60  0.948066  1.626476 -0.151725   5.842609
39          39  1.552398 -1.767069 -0.447956 -10.507609
17          17 -0.880164  0.077271 -1.176139   5.167053
54          54  0.583874 -1.257991 -1.220823   4

100

## Section 2: Write Metrics Functions 

### TO DO: Write the functions that calcluate the metrics you will use to evaluate the model fits

Write Functions that return:
- The mean absolute error
- The average error
- The mean absolute percentage error 
- The root mean squared error 
- The total sum of squared errors 

In [34]:
## write your code here ##
def round_error(func):
    def wrapper(*args, **kwargs):
        result = func(*args, **kwargs)
        return round(result, 3)
    return wrapper

@round_error
def mean_absolute_error(Y_actual, Y_pred):
    return np.mean(np.abs(Y_actual - Y_pred))

@round_error
def average_error(Y_actual, Y_pred):
    return np.mean(Y_pred - Y_actual)

@round_error
def mean_absolute_perc_error(Y_actual, Y_pred):
    return 100 * np.mean(np.abs((Y_pred - Y_actual)/Y_actual))

@round_error
def RMSE(Y_actual, Y_pred):
    return np.sqrt(np.mean((Y_actual - Y_pred)**2))
    
@round_error
def total_sum_squared_error(Y_actual, Y_pred):
    return np.sum((Y_actual - Y_pred)**2)

## Section 3: Fit models to training data and calculate performance metric on validation sets

For polynomials of order 1, 2, 3, and 4, you will use fit_model to fit each each model. This function uses scikit-learn polynomial regression. 


### TODO: write function to convert dataframe into numpy arrays

The scikit-learn functions take numpy arrays as their inputs. Therefore before you can fit any data you need to write a function to turn a dataframe with columns [x1, x2, x3, y] into two numpy arrays: X and y. X should have dimensions (N, D), where N is the number of data points and D is the dimensionality of the data (in this case 3). y should have dimensions (N, ). 


In [35]:
def fit_model(X, y, order):
    """creates scikit-learn regression object and fits it to the X and y data"""
    model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                      ('linear', LinearRegression(fit_intercept=False))])
    model = model.fit(X, y)
    return model 

In [36]:
### write your function here ## 
def convert(df):
    return np.array(df[['x1', 'x2', 'x3']]), np.array(df['y'])

### TO DO: For polynomials of order 1 to 6 inclusive: 
1. Fit a polynomial to the training data using the fit_model function 
2. Use model.predict(X) to get the model predictions on the validation set
3. Store the model in a dictionary of models where the keys indicate the order and the items are the models
4. Store the predictions in a seperate dictionary where the keys indicate the order and the items are numpy arrays of the predictions 

In [37]:
## write your code here ##
models = {}
predictions = {}
X_train, Y_train = convert(train)
X_val, Y_val = convert(validation)
for order in range(1,7):
    model = fit_model(X_train,Y_train,order)
    models[order] = model
    predictions[order] = model.predict(X_val)

## Section 4: Calculate metrics for each of the models

Now we want to calculate the metrics for each of the models. 


### TODO: Use the dictionary of predictions you have to caluclate and record (could be in a dataframe, or you could plot it on a graph) each of the metrics. 
1. Calculate each of the metrics for the model using the functions you wrote before
2. Store the metrics in a dataframe, with one row for each model or plot on a graph
3. Answer the questions that go alongside this notebook 

HINT: you can write a list of functions of the form:

methods = [RMSE, average_error, mean_abs_percent_error, total_sum_squared_error]

which you can then iterate over using a for loop. 



In [38]:
## write your code here ##
str_map = {}
str_map['MAE'] = mean_absolute_error
str_map['AE'] = average_error
str_map['MAPE'] = mean_absolute_perc_error
str_map['RMSE'] = RMSE
str_map['TSSE'] = total_sum_squared_error

columns = str_map.keys()
errors = []
for key in predictions:
    error = []
    for func in str_map:    
        error.append(str_map[func](Y_val, predictions[key]))
    errors.append(error)

df = pd.DataFrame(errors, columns=columns)

print(df)

     MAE     AE    MAPE   RMSE      TSSE
0  5.101 -1.204  77.700  7.684  1771.097
1  2.625  0.389  56.927  3.282   323.116
2  4.805  0.766  86.271  6.404  1230.391
3  1.050 -0.328  15.689  1.780    95.022
4  1.799 -0.124  25.506  2.956   262.134
5  3.386  1.603  33.060  8.298  2065.943


## Section 5: Use the test set to evaluate the performance of your chosen model

### TODO: For your selected model, calculate the RMSE, Average Error and Mean Absolute Percentage Error of the test data

In [39]:
## write your code here ## 
X_test, Y_test = convert(test)
best = 4
Y_pred = models[best].predict(X_test)
print('RMSE:', RMSE(Y_test, Y_pred))
print('Average Error:', average_error(Y_test, Y_pred))
print('Mean Absolute percentage Error:', mean_absolute_perc_error(Y_test, Y_pred))

RMSE: 2.447
Average Error: 0.039
Mean Absolute percentage Error: 39.253


In [40]:
for func in str_map:    
    print(func, str_map[func](Y_test, Y_pred))

MAE 1.375
AE 0.039
MAPE 39.253
RMSE 2.447
TSSE 179.691
