# Lab 4 - Multivariate Linear and Polynomial Regression, and Evaluation using R-Squared, MAPE and MAE

### Importing the dependencies

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

### Load the datasets

In [5]:
# Load the diabetes data unscaled, as pandas objects: features go into x, targets into y.
x, y = datasets.load_diabetes(return_X_y=True, as_frame=True, scaled=False)

# First split: roughly 70% of the rows for training, the remaining 30% held out.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

# Second split: halve the held-out rows into test and validation sets, so that
# overall we have roughly 70% training, 15% testing and 15% validation data.
x_test, x_valid, y_test, y_valid = train_test_split(
    x_test, y_test, test_size=0.5, random_state=0
)

In [7]:
# Preview the first five rows of the feature table.
x.head(n=5)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,59.0,2.0,32.1,101.0,157.0,93.2,38.0,4.0,4.8598,87.0
1,48.0,1.0,21.6,87.0,183.0,103.2,70.0,3.0,3.8918,69.0
2,72.0,2.0,30.5,93.0,156.0,93.6,41.0,4.0,4.6728,85.0
3,24.0,1.0,25.3,84.0,198.0,131.4,40.0,5.0,4.8903,89.0
4,50.0,1.0,23.0,101.0,192.0,125.4,52.0,4.0,4.2905,80.0


## Multivariate Linear Regression

In [76]:
# Create the linear regression model and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
linear_model = LinearRegression().fit(x_train, y_train)

# Predict the target for every split with the fitted model.
y_pred_train = linear_model.predict(x_train)
y_pred_valid = linear_model.predict(x_valid)
y_pred_test = linear_model.predict(x_test)

# coef_ holds one learned coefficient per input feature; the intercept is stored
# separately and is not part of this count.
print('The number of parameteres for Multivariate Linear Regression is:', len(linear_model.coef_))

The number of parameteres for Multivariate Linear Regression is: 10


## Univariate Polynomial Regression on the BMI feature

In [77]:
# Keep only the BMI column and reshape it into a single-column 2-D array, the
# input layout expected by PolynomialFeatures and LinearRegression.
x_bmi = x['bmi'].values.reshape(-1,1)

# Split into training, testing and validation sets. The sizes and random_state
# are the same as for the multivariate split, so each set holds the same rows.
x_p_train, x_p_test, y_p_train, y_p_test = train_test_split(x_bmi, y, test_size=0.3, random_state=0)
x_p_test, x_p_valid, y_p_test, y_p_valid = train_test_split(x_p_test, y_p_test, test_size=0.5, random_state=0)

# Degree-2 expansion without the constant column: each row becomes [bmi, bmi^2].
polyn_model = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only, then reuse the fitted
# transformer for the held-out splits instead of refitting it on them.
x_pol_train = polyn_model.fit_transform(x_p_train)
x_pol_test = polyn_model.transform(x_p_test)
x_pol_valid = polyn_model.transform(x_p_valid)

pol_model = LinearRegression()

# Train against the targets produced by this cell's own split, so features and
# targets are guaranteed to come from the same rows.
pol_model.fit(x_pol_train, y_p_train)

# Predict the target for every split with the fitted model.
y_pol_pred_train = pol_model.predict(x_pol_train)
y_pol_pred_test = pol_model.predict(x_pol_test)
y_pol_pred_valid = pol_model.predict(x_pol_valid)

# coef_ holds one coefficient per polynomial term; the intercept is not counted.
print('The number of parameters in the Polynomial Regression based on the BMI feature is:', len(pol_model.coef_))

The number of parameters in the Polynomial Regression based on the BMI feature is: 2


## Multivariate Polynomial Regression

In [78]:
# Degree-2 expansion of all features without the constant column: the original
# features, their squares and all pairwise products.
polynomial_model = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only, then reuse the fitted
# transformer for the held-out splits instead of refitting it on them.
x_poly_train = polynomial_model.fit_transform(x_train)
x_poly_test = polynomial_model.transform(x_test)
x_poly_valid = polynomial_model.transform(x_valid)

poly_model = LinearRegression()

# Fit a linear model on the expanded training features.
poly_model.fit(x_poly_train, y_train)

# Predict the target for every split with the fitted model.
y_poly_pred_train = poly_model.predict(x_poly_train)
y_poly_pred_test = poly_model.predict(x_poly_test)
y_poly_pred_valid = poly_model.predict(x_poly_valid)

# coef_ holds one coefficient per polynomial term; the intercept is not counted.
print ('The number of parameters in the Multivariate polynomial regression is:', len(poly_model.coef_))

The number of parameters in the Multivariate polynomial regression is: 65


## Comparison between the models

In [79]:
# Metrics reported for every model, in print order. The key is the label used
# in the printed report, the value is the scikit-learn scoring function.
METRICS = {
    'R-Squared': r2_score,
    'Mean Absolute Error': mean_absolute_error,
    'Mean Absolute Percentage Error': mean_absolute_percentage_error,
}


def print_metric_report(header, targets, predictions):
    """Print every metric in METRICS for each data split of one model.

    Parameters
    ----------
    header : str
        Line printed before the metric values.
    targets : dict
        Maps a split name (e.g. 'train') to the actual target values.
    predictions : dict
        Maps the same split names to the model's predicted values.
    """
    print(header)
    for metric_name, metric_fn in METRICS.items():
        for split_name, y_true in targets.items():
            score = metric_fn(y_true, predictions[split_name])
            print(f'The {metric_name} of {split_name} set is {score}')


# Multivariate Linear Regression
print_metric_report(
    'The results of the Loss functions for the Multivariate Linear Regression models are:',
    {'train': y_train, 'validation': y_valid},
    {'train': y_pred_train, 'validation': y_pred_valid},
)
print('\n')

# Polynomial Regression based on the BMI feature, scored against the targets
# from that model's own split.
print_metric_report(
    'The results of the loss functions for the Polynomial Regression based on BMI are:',
    {'train': y_p_train, 'validation': y_p_valid},
    {'train': y_pol_pred_train, 'validation': y_pol_pred_valid},
)
print('\n')

# Multivariate Polynomial Regression
print_metric_report(
    'The results of the loss functions for the multivariate polynomial regression are:',
    {'train': y_train, 'validation': y_valid},
    {'train': y_poly_pred_train, 'validation': y_poly_pred_valid},
)

The results of the Loss functions for the Multivariate Linear Regression models are:
The R-Squared of train set is 0.5539378915448929
The R-Squared of validation set is 0.3577498389638193
The Mean Absolute Error of train set is 43.05497348042472
The Mean Absolute Error of validation set is 40.27022711755064
The Mean Absolute Percentage Error of train set is 0.39144712116653047
The Mean Absolute Percentage Error of validation set is 0.40037139917326187


The results of the loss functions for the Polynomial Regression based on BMI are:
The R-Squared of train set is 0.38077692043301736
The R-Squared of validation set is 0.0497533183932678
The Mean Absolute Error of train set is 51.712537284241954
The Mean Absolute Error of validation set is 50.015690579188416
The Mean Absolute Percentage Error of train set is 0.4817649534670314
The Mean Absolute Percentage Error of validation set is 0.4858660629790605


The results of the loss functions for the multivariate polynomial regression are:
The 


***

#### Loss functions Explained:
###### R-Squared:
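* R-squared is calculated by subtracting from 1 the ratio of the model's mean squared error to the variance of the actual values: $R^2 = 1 - \mathrm{MSE} / \mathrm{Var}(y)$.
* R-squared is at most 1 and usually falls between 0 and 1. A negative value means the model fits the data worse than always predicting the mean of the actual values; this often happens on unseen data when the model overfits.
* A higher value is better: the closer R-squared is to 1, the more of the variance in the target the model explains and the smaller its error.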

###### Mean Absolute Error (MAE)
* Mean absolute error is the average of the absolute differences between the predicted and actual values.
* A smaller MAE means the model's predictions are, on average, closer to the actual values.

###### Mean Absolute Percentage Error (MAPE)
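* Mean absolute percentage error is the average of the absolute differences between the predicted and actual values, each expressed relative to the actual value.
* It is similar to the MAE but independent of the scale of the target, and a lower value indicates less error. Note that scikit-learn returns it as a fraction, so 0.40 means 40%.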

#### Insights:
##### For R-Squared:
* Comparing R-squared on the validation set, Multivariate Linear Regression has the highest score (0.36), which means it explains the most variance in the target and therefore has the least error of the three models.
##### For Mean Absolute Error (MAE):
* Comparing the Mean Absolute Error on the validation set, Multivariate Linear Regression has the lowest value (40.27), which means its predictions are, on average, the closest to the actual values among the three models.
##### For Mean Absolute Percentage Error (MAPE)
* Comparing the Mean Absolute Percentage Error on the validation set, Multivariate Linear Regression also has the lowest value (0.40, i.e. 40%), which again indicates the least error among the three models.

## Questions

***

#### How many parameters are we fitting for each of the three models? Explain these values

* For the Multivariate Linear Regression model, we have 10 parameters.
  * There is one coefficient for each of the 10 features in the dataset (the intercept is not included in this count).
* For the Univariate Polynomial Regression model, we have 2 parameters.
  * We use only the BMI feature and expand it to degree 2, which gives two terms: BMI and BMI².
* For the Multivariate Polynomial Regression model, we have 65 parameters.
  * We expand all 10 features of the dataset to degree 2, which gives 10 linear terms, 10 squared terms and 45 pairwise interaction terms (10 + 10 + 45 = 65).


#### Which model would you choose for deployment and why?
* I would choose the Multivariate Linear Regression model for deployment because, on the validation set, it has the least error of all the models on every metric.