In [1]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import pathlib as pl
import os 
import pandas as pd

# Metrics for Model Selection

In this notebook you will fit polynomials to data to decide which order of polynomial is the best fit. Unlike before, the data you will be using is 3 dimensional, meaning it isn't possible to plot. Instead, you will write functions to calculate various metrics that are used to determine model fit. 

Complete this notebook, then answer the questions that go along side it. 

In [2]:
# set random seed for reproducibility
seed = 2022
np.random.seed(seed)

## Load the data 

In [3]:
path_csv = pl.Path(os.getcwd()) / f'M6_Performance_Metrics_Data.csv'
with open(path_csv, 'rb') as file:
    data = pd.read_csv(file)

In [4]:
print(data.head())

   Unnamed: 0        x1        x2        x3          y
0           0  0.382303 -1.596593  1.233776   4.935364
1           1  1.902436  1.579109 -0.341741  25.138660
2           2 -1.689244  1.298489 -1.472081  -4.786340
3           3 -1.510509  1.937616 -1.600244  -3.185759
4           4  1.621717  0.515558 -1.869644  19.712731


## Section 1 : Split the data into training, validation and test sets

### TO DO: write a function that splits the data into traning, validation and test sets.

The function should take as inputs the dataframe and the percentage splits for each of training, validation and test. It should output 3 dataframes, one for each of the sets. 

In [14]:
data.iloc[:,1:-1]

Unnamed: 0,x1,x2,x3
0,0.382303,-1.596593,1.233776
1,1.902436,1.579109,-0.341741
2,-1.689244,1.298489,-1.472081
3,-1.510509,1.937616,-1.600244
4,1.621717,0.515558,-1.869644
...,...,...,...
95,-0.369062,1.344716,-0.545936
96,-1.236030,1.027829,0.105539
97,-0.415708,-0.626697,1.076959
98,-0.467675,1.527521,0.675122


In [15]:
## write your function here ##

def split_data(df, ratios_list):

    from sklearn.model_selection import train_test_split

    train_ratio = ratios_list[0]
    validation_ratio = ratios_list[1]
    test_ratio = ratios_list[2]
    
    # train is now 75% of the entire data set
    # # the _junk suffix means that we drop that variable completely
    
    x_train, x_test, y_train, y_test = train_test_split(data.iloc[:,1:-1], data.iloc[:,-1], test_size=1 - train_ratio)

    # test is now 10% of the initial data set
    # validation is now 15% of the initial data set
    x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 
    
    return x_train, x_val, x_test

### TO DO: Use your function to split the data so the training set has 40% of the data and the validation and test sets have 30% of the data each

In [20]:
#### write your code here ####

x_train, x_val, x_test = split_data(data, [0.4,0.3, 0.3])

(          x1        x2        x3
 23 -0.496090  1.552690 -1.333864
 99 -0.126524  1.807901 -0.740513
 54  0.583874 -1.257991 -1.220823
 16 -1.144002  0.949800  0.866272
 65  0.308612 -0.923909 -0.235798
 40 -1.057484 -0.005690 -0.318702
 94  0.632650 -1.628347  1.683086
 80 -1.359348 -1.780858  0.847985
 28 -1.044932 -1.981752  1.856430
 60  0.948066  1.626476 -0.151725
 0   0.382303 -1.596593  1.233776
 57  0.403963 -1.396032  0.560340
 62 -1.670056  0.362760  1.394161
 25  1.051658  1.366586 -0.973434
 56  1.127148  0.655574  0.900366
 14  1.889985  0.464764 -1.297304
 55 -1.277623 -0.817546 -1.888549
 95 -0.369062  1.344716 -0.545936
 75  0.627830 -1.468808 -1.339753
 20  1.148637 -0.128755 -0.384135
 6  -0.581466 -1.672979 -0.331812
 30 -0.251202  1.921266  0.645585
 24  1.808328 -0.137865 -0.016387
 36  0.723287 -0.578830 -0.655962
 4   1.621717  0.515558 -1.869644
 82  1.157484 -0.758447  1.424520
 89 -1.083336  0.410126 -1.567022
 79 -1.386593  1.158374  1.994851
 71 -0.162152 

## Section 2: Write Metrics Functions 

### TO DO: Write the functions that calcluate the metrics you will use to evaluate the model fits

Write Functions that return:
- The mean absolute error
- The average error
- The mean absolute percentage error 
- The root mean squared error 
- The total sum of squared errors 

In [23]:
## write your code here ##

def mae(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(np.abs(y_true - predictions))

def ae(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(y_true - predictions)

def mape(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(np.abs((y_true - predictions)/predictions))

def rmse(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.mean(np.sqrt(y_true - predictions)^2)

def sse(y_true, predictions):
    y_true, predictions = np.array(y_true), np.array(predictions)
    return np.sum((y_true - predictions)^2)

## Section 3: Fit models to training data and calculate performance metric on validation sets

For polynomials of order 1, 2, 3, and 4, you will use fit_model to fit each each model. This function uses scikit-learn polynomial regression. 


### TODO: write function to convert dataframe into numpy arrays

The scikit-learn functions take numpy arrays as their inputs. Therefore before you can fit any data you need to write a function to turn a dataframe with columns [x1, x2, x3, y] into two numpy arrays: X and y. X should have dimensions (N, D), where N is the number of data points and D is the dimensionality of the data (in this case 3). y should have dimensions (N, ). 


In [33]:
def fit_model(X, y, order):
    """creates scikit-learn regression object and fits it to the X and y data"""
    model = Pipeline([('poly', PolynomialFeatures(degree=order)),
                      ('linear', LinearRegression(fit_intercept=False))])
    model = model.fit(X, y)
    return model 

In [24]:
### write your function here ## 

def to_numpy_df(df):
    X = df.iloc[:,1:-1].to_numpy()
    y = df.iloc[:,:-1].to_numpy()
    return X,y

### TO DO: For polynomials of order 1 to 6 inclusive: 
1. Fit a polynomial to the training data using the fit_model function 
2. Use model.predict(X) to get the model predictions on the validation set
3. Store the model in a dictionary of models where the keys indicate the order and the items are the models
4. Store the predictions in a seperate dictionary where the keys indicate the order and the items are numpy arrays of the predictions 

In [41]:
#Create an empty array to store the fitted polynomial functions
p = {}

#Fit polynomials of orders 0 to 30 and store them in the array
for i in range(0,7):
    model = fit_model(X, y, i)
    model.predict(X)
    training_dict = {i: model}
    # predictions_dict = model.predict()

# display(model_1)
# model_1.predict(X)

# for i in range(1,7):
#     X, y = to_numpy_df(data)
#     fit_model(X, y, i)
training_dict

ValueError: could not broadcast input array from shape (100,3) into shape (100,0)

## Section 4: Calculate metrics for each of the models

Now we want to calculate the metrics for each of the models. 


### TODO: Use the dictionary of predictions you have to caluclate and record (could be in a dataframe, or you could plot it on a graph) each of the metrics. 
1. Calculate each of the metrics for the model using the functions you wrote before
2. Store the metrics in a dataframe, with one row for each model or plot on a graph
3. Answer the questions that go alongside this notebook 

HINT: you can write a list of functions of the form:

methods = [RMSE, average_error, mean_abs_percent_error, total_sum_squared_error]

which you can then iterate over using a for loop. 



In [None]:
## write your code here ##

## Section 5: Use the test set to evaluate the performance of your chosen model

### TODO: For your selected model, calculate the RMSE, Average Error and Mean Absolute Percentage Error of the test data

In [None]:
## write your code here ## 