# Part b) Resampling techniques, adding more complexity
In this notebook, we will use different techniques to ensure that our evaluation will be done on "fresh" data, rather than on the same data that was used for training.

In [5]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
os.sys.path.append(os.path.dirname(os.path.abspath('.')))

# Import local modules
from src.data.generate_data import FrankeFunction
from src.models.models import OLS
from src.evaluation.evaluation import mse, r_squared
from src.resampling.resampling import K_fold_splitter

In [4]:
# Design matrix: the first CSV column is used as the row index, the remaining
# columns become the rows of X.
df_X = pd.read_csv('../data/generated/X.csv', index_col=0)
X = np.array(df_X)

# Target values, one file per noise level. Only the column at position 1 is
# read from each file, so every frame has a single column; ravel() flattens
# the resulting (n, 1) array into a 1-D vector.
df_z_no_noise = pd.read_csv('../data/generated/no_noise.csv', usecols=[1])
z_no_noise = np.array(df_z_no_noise).ravel()

df_z_some_noise = pd.read_csv('../data/generated/some_noise.csv', usecols=[1])
z_some_noise = np.array(df_z_some_noise).ravel()

df_z_noisy = pd.read_csv('../data/generated/noisy.csv', usecols=[1])
z_noisy = np.array(df_z_noisy).ravel()

We will now use `train_test_split` from `sklearn.model_selection` to split our data into training and test sets. Since we have three separate sets of target values, we do this three times, once for each set.

In [19]:
# For each set of target values: hold out 30% of the rows as a test set, fit
# OLS on the remaining rows, and report the test-set MSE and R^2 both on
# stdout and in a CSV file.
targets = [
    {'name': 'No noise', 'values': z_no_noise},
    {'name': 'Some noise (sigma 0.1)', 'values': z_some_noise},
    {'name': 'Noisy (sigma 0.9)', 'values': z_noisy},
]
col_names = ['MSE', 'R^2']

# One [MSE, R^2] row per target, in the same order as `targets`. The rows are
# turned into a DataFrame once after the loop: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
score_rows = []

print('%-30s|%-10s|%-10s' %('Data', 'MSE', 'R^2'))
print('-'*50)

for target in targets:
    # A fixed random_state keeps the split reproducible between runs.
    X_train, X_test, z_train, z_test = train_test_split(
        X, target['values'], test_size=0.3, random_state=12)

    # A fresh model per target, so no fitted state carries over.
    ol = OLS()
    ol.fit(X_train, z_train)
    predictions = ol.predict(X_test)

    mse_value = mse(z_test, predictions)
    r_2_value = r_squared(z_test, predictions)
    print('%-30s|%-10f|%-10f' %(target['name'], mse_value, r_2_value))
    score_rows.append([mse_value, r_2_value])

# Rows are labelled with the target names, columns with the metric names.
output_df = pd.DataFrame(
    score_rows,
    columns=col_names,
    index=[target['name'] for target in targets],
)

output_df.to_csv('../reports/csv_files/2_mse_r2_score.csv')

Data                          |MSE       |R^2       
--------------------------------------------------
No noise                      |0.003428  |0.999630  
Some noise (sigma 0.1)        |0.014821  |0.998520  
Noisy (sigma 0.9)             |1.067697  |0.990548  


In [3]:
# Cross-validate OLS on a newly generated noisy target.
# NOTE(review): `x` and `y` are not defined in any cell visible in this part
# of the notebook, so this cell relies on kernel state from elsewhere --
# confirm where they come from and that they match the rows of X.
ols = OLS()
z = FrankeFunction(x, y, noise=3, seed=43)

# The splitter receives the number of rows in X and 10 (presumably the number
# of folds -- TODO confirm against K_fold_splitter).
kfs = K_fold_splitter(X.shape[0], 10)
fold_indices = kfs.cross_val_split()

# One output line per fold: test-set MSE followed by test-set R^2.
for fold in fold_indices:
    train_idx, test_idx = fold['train_indices'], fold['test_indices']
    X_train, z_train = X[train_idx], z[train_idx]
    X_test, z_test = X[test_idx], z[test_idx]

    # The same OLS instance is refit on every fold's training rows.
    ols.fit(X_train, z_train)
    predictions = ols.predict(X_test)

    mean_squared_error = mse(z_test, predictions)
    r_s = r_squared(z_test, predictions)
    print('%-10.3f%-10.8f' %(mean_squared_error, r_s))
    


6.878     0.97772845
11.510    0.96764420
9.960     0.97329942
10.612    0.97663653
11.641    0.96517201
8.161     0.97529395
10.097    0.97292284
11.266    0.97223209
9.160     0.97282911
7.978     0.97724881
