# Exercise 3: Polynomial Regression

## 1. Load the dataset

In [1]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before each cell runs.
%load_ext autoreload 
%autoreload 2

In [2]:
import pandas as pd 
import numpy as np 
from joblib import dump 

In [3]:
from alancarrml.data.sets import load_sets

### [2.3] Load the sets from data/processed

In [10]:
import os

# Start from the parent of the working directory (assumed to be the project
# root, i.e. the notebook is run from a sub-folder) and descend to
# data/processed, then list what is stored there.
project_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(project_dir, 'data')
datasets_dir = os.path.join(data_dir, 'processed')
os.listdir(datasets_dir)

['x_val.csv',
 'x_test.csv',
 'y_test.csv',
 'y_train.csv',
 'y_val.csv',
 'x_train.csv']

In [17]:
# Echo the resolved path to confirm where the processed sets are read from.
datasets_dir

'/home/bened/DataScience/AMLA/labs/alancarrml-project/data/processed'

In [18]:
# Sanity check: confirm x_train.csv is present in the processed-data folder.
os.path.exists(os.path.join(datasets_dir, "x_train.csv"))

True

In [20]:
# Read the training features straight from CSV and print the resulting type.
# NOTE(review): X_train is re-bound from load_sets() in a later cell, so this
# read only serves as a check that the file parses.
X_train = pd.read_csv(os.path.join(datasets_dir, "x_train.csv")) 
print(type(X_train))

<class 'pandas.core.frame.DataFrame'>


In [21]:
# NOTE(review): load_sets is already imported in an earlier cell; this import repeats it.
from alancarrml.data.sets import load_sets 

# Load all splits through the project helper (called with its default arguments).
datasets = load_sets() 

In [22]:
# Show which splits load_sets() returned.
datasets.keys()

dict_keys(['x_val', 'x_test', 'y_test', 'y_train', 'y_val', 'x_train'])

In [23]:
# Index the mapping instead of calling .get(): a missing split then raises
# KeyError in this cell rather than silently binding None and failing later
# inside transform()/fit() with a less obvious error.
X_train = datasets['x_train']
X_val = datasets['x_val']
X_test = datasets['x_test']

y_train = datasets['y_train']
y_val = datasets['y_val']
y_test = datasets['y_test']

In [34]:
def verify_datasets(dataset_names=None): 
    """
    Prints type and shape of each named dataset that exists as a global variable. 

    Names are reported in the order given. When a name starting with an upper-case
    letter (e.g. 'X_train') is not defined, the same name with a lower-case first
    letter ('x_train') is tried instead. Names found under neither spelling are
    skipped without output.

    Parameters
    ______________________________________________________________________________________________
    dataset_names: list[str] or None
        Names of dataset variables. Defaults to standard naming conventions
        ('X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test') when None. 

    Returns
    ______________________________________________________________________________________________
    None 
    """
    # Resolve the default inside the body so no mutable list is shared between calls.
    if dataset_names is None:
        dataset_names = ['X_train', 'X_val', 'X_test', 'y_train', 'y_val', 'y_test']

    global_vars = globals() 

    for name in dataset_names:
        # Requested spelling first, then the lower-case-initial variant (if different).
        candidates = [name]
        lowered = name[:1].lower() + name[1:]
        if lowered != name:
            candidates.append(lowered)

        for candidate in candidates:
            if candidate in global_vars:
                type_string, dim_string = verify_dataset(global_vars, candidate)
                print_info(candidate, type_string, dim_string)
                # Report only the first spelling that exists.
                break

def verify_dataset(global_vars, dataset_name): 
    """
    Builds the type and size description strings for one named dataset.

    Parameters
    ______________________________________________________________________________________________
    global_vars: dict
        Mapping of variable names to objects (e.g. the result of globals()).
    dataset_name: str
        Key to look up in global_vars.

    Returns
    ______________________________________________________________________________________________
    tuple[str, str] or None
        ("Type: ...", "Dimensions: ...") when the object has a .shape attribute,
        ("Type: ...", "Length: ...") otherwise; None when the name is absent.
    """
    if dataset_name not in global_vars:
        return None

    found = global_vars[dataset_name]
    type_text = f"Type: {type(found)}"
    try:
        size_text = f"Dimensions: {found.shape}"
    except AttributeError:
        # No .shape attribute: describe the object by its length instead.
        size_text = f"Length: {len(found)}"
    return type_text, size_text

def print_info(dataset_name, type_string, dim_string): 
    """
    Prints a three-line report for one dataset: its name followed by the
    indented type and size descriptions, then a blank line.

    Parameters
    ______________________________________________________________________________________________
    dataset_name: str
        Label printed as the report heading.
    type_string: str
        Pre-formatted type description.
    dim_string: str
        Pre-formatted dimensions/length description.

    Returns
    ______________________________________________________________________________________________
    None 
    """
    report_lines = (
        f"{dataset_name}:",
        f"     {type_string}",
        f"     {dim_string} \n",
    )
    for report_line in report_lines:
        print(report_line)

In [35]:
# Print type and shape for the split variables defined above.
verify_datasets()

X_train:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (30000, 9) 

X_val:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (10000, 9) 

X_test:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (10000, 9) 

y_train:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (30000, 2) 

y_val:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (10000, 2) 

y_test:
     Type: <class 'pandas.core.frame.DataFrame'>
     Dimensions: (10000, 2) 



## 3. Apply Polynomial Transformation

In [36]:
from sklearn.preprocessing import PolynomialFeatures

In [37]:
# Degree-2 polynomial expansion. With scikit-learn's defaults a constant bias
# column is included, which matches the 55 output columns recorded below for
# 9 inputs (1 bias + 9 linear + 45 second-order terms).
squarer = PolynomialFeatures(degree=2)

In [38]:
# Fit the transformer on the training split only and expand that split in one call.
X_train_squared = squarer.fit_transform(X_train)

In [40]:
# Compare shapes before and after the expansion, one per line.
print(X_train.shape, X_train_squared.shape, sep="\n")

(30000, 9)
(30000, 55)


In [41]:
# Apply the transformer fitted on the training split; it is not re-fitted here.
X_val_squared, X_test_squared = map(squarer.transform, (X_val, X_test))

In [42]:
# Confirm both expanded splits have the same column count as the training split.
print(X_val_squared.shape, X_test_squared.shape, sep="\n")

(10000, 55)
(10000, 55)


## 4. Train Linear Regression Model

In [43]:
from sklearn.linear_model import LinearRegression 

In [44]:
# Linear regression estimator with scikit-learn's default settings.
reg = LinearRegression() 

In [45]:
# Fit on the degree-2 expanded training features.
# NOTE(review): the recorded shapes show y_train with 2 columns, so this fits a
# two-output regression. Confirm the extra column is intended and not, for
# example, an index column written out with the CSV.
reg.fit(X_train_squared, y_train) 

In [50]:
import joblib 

models_dir = os.path.join(project_dir, "models") 
# Create the folder if it is missing so the dump cannot fail with
# FileNotFoundError on a fresh checkout; a no-op when it already exists.
os.makedirs(models_dir, exist_ok=True)

# Persist the fitted regressor. The with-block closes the file handle even if
# dump raises.
# NOTE(review): the fitted PolynomialFeatures transformer (`squarer`) is not
# saved here; anyone reloading this model must apply the same degree-2
# expansion to the inputs before calling predict.
with open(os.path.join(models_dir, "linear_poly_2.joblib"), 'wb') as j: 
    joblib.dump(reg, j) 

os.listdir(models_dir)

['ohe.joblib', 'linear_poly_2.joblib', 'scaler.joblib']

In [51]:
# Predict on the expanded training and validation features with the fitted model.
y_train_preds, y_val_preds = map(reg.predict, (X_train_squared, X_val_squared))

In [52]:
from alancarrml.models.performance import score_regressors 

### [4.7] Display the RMSE, MSE, MAE and R2 scores of this model on the training set

In [None]:
# Score the training predictions: predictions first, then the targets, then a
# label for the set (TODO confirm this order against score_regressors' signature).
# NOTE(review): this cell has no execution count and its recorded header reads
# "Scores for y_train", whereas the test cell prints "Scores for test". The
# output may be stale; re-run on a fresh kernel.
train_scores = score_regressors(y_train_preds, y_train, "train") 

                Scores for y_train
------------------------------------------------------------
        Root Mean Squared Error    |   3059.2819
        Mean Squared Error         |   18718412.0500
        Mean Absolute Error        |   2056.4413
        R2 Score                   |   0.8721
------------------------------------------------------------



### [4.8] Display the RMSE, MSE, MAE and R2 scores of this model on the validation set

In [None]:
# Score the validation predictions with the same helper and argument order as
# the training cell.
# NOTE(review): this cell has no execution count and its recorded header reads
# "Scores for y_val", whereas the test cell prints "Scores for test". The
# output may be stale; re-run on a fresh kernel.
val_scores = score_regressors(y_val_preds, y_val, "val") 

                Scores for y_val
------------------------------------------------------------
        Root Mean Squared Error    |   3090.1659
        Mean Squared Error         |   19098250.5787
        Mean Absolute Error        |   2081.3982
        R2 Score                   |   0.8712
------------------------------------------------------------



### [4.9] Display the RMSE, MSE, MAE and R2 scores of this model on the testing set

In [55]:
# Predict on the expanded test features and score them with the same helper
# used for the training and validation splits.
# NOTE(review): the variable is named test_preds, unlike y_train_preds /
# y_val_preds in the earlier cell.
test_preds = reg.predict(X_test_squared) 
test_scores = score_regressors(test_preds, y_test, "test") 

                Scores for test
------------------------------------------------------------
        Root Mean Squared Error    |   3035.1790
        Mean Squared Error         |   18424623.5447
        Mean Absolute Error        |   2044.1240
        R2 Score                   |   0.8758
------------------------------------------------------------

