In [3]:
# Multiple Linear Regression Machine Learning In Python

# Demonstration of multiple linear regression (MLR), cross-validation, and evaluating cross-validation results.

# y = a1x1 + a2x2 + a3x3 + ... + anxn + b
# y = target
# x1..xn = the features (each x is a single feature)
# a1..an, b = parameters/coefficients of the model - one slope per feature, plus the intercept

# How do we choose a and b?
# - Define an error function for any given line
# - Choose the line that minimizes the error function
# Error function = loss function = cost function
# Regression minimizes a loss function to choose a coefficient 'a' for each feature and the intercept 'b'. Allowing these coefficients to become too large can lead to overfitting.


# __________Terms__________
# Residual:                     is the difference between the expected results from a model and the true values from data.
# Variance:                     is the variability in the expected results (predictions) of a given data point between different runs of the model.
# R-squared:                    is the proportion of the total variance in the target variable that is explained by the features. It is at most 1 (perfect fit); 0 means the model does no better than always predicting the mean, and it can be negative on held-out data when the model does worse than that.
# Mean Squared Error (MSE):     measures the amount of error in statistical models. It assesses the average squared difference between the observed and predicted values. When a model has no error, the MSE equals zero. As model error increases, its value increases.
# Root Mean Squared Error(RMSE):Root mean square error or root mean square deviation is one of the most commonly used measures for evaluating the quality of predictions. It shows how far predictions fall from measured true values using Euclidean distance.
# RSS:                          Residual sum of squares: the sum of the squared residuals, i.e. the amount of variation in the target that the model leaves unexplained.
# Ordinary Least Squares(OLS):  Goal is to Minimize RSS. A Common technique for estimating coefficients of linear regression equations which describe the relationship
#                               between one or more independent quantitative variables and a dependent variable (simple or multiple linear regression).
#                               OLS estimators minimize the sum of the squared errors (a difference between observed values and predicted values).
# - Advantages of OLS:          OLS is the most efficient linear regression estimator when the assumptions hold true. 
#                               Another benefit of satisfying these assumptions is that as the sample size increases to infinity, the coefficient estimates converge on the actual population parameters.
# - Disadvantages of OLS:       A sufficiently large data set is necessary in order to obtain reliable results.
#                               The regression results are sensitive to functional form if the error term is not adequately interpreted, which can lead to widely varying conclusions depending on how the regression is initially set up.
# Cross Fold Validation:        Splits the data into k folds of roughly equal size. In 5-fold the data is split into fifths; the model is trained on 4 folds and evaluated on the held-out 5th, and this is repeated 5 times so that each block/fold serves once as the validation set. More folds = higher computational expense.
#                               Cross-validation is a vital approach to evaluating a model. It maximizes the amount of data that is available to the model, as the model is not only trained but also tested on all of the available data.
#                               By using cross-validation, we can see how performance varies depending on how the data is split.
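# Hyperparameter:               A setting chosen before fitting (e.g. alpha in Ridge/Lasso, k in KNN) that controls how the model's parameters are learned; it is tuned rather than learned from the data.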
# Regularization:               Penalizes large coefficients.
# - Ridge Regression:           Ridge penalizes large positive or negative coefficients. It has the hyperparameter alpha, which plays a role similar to k in KNN: alpha controls model complexity.
#                               When alpha = 0 we are performing OLS (which can lead to overfitting). A very high alpha can lead to extreme penalization of coefficients, i.e. underfitting.
# - Lasso Regression:           Can be used to assess feature importance and select features, as it shrinks the coefficients of the least important features to exactly 0. The features whose coefficients are not shrunk to 0 are the ones selected by Lasso.


# pip3 install pandas
# pip3 install scikit-learn
# pip3 install numpy
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Load the 'advertising_and_sales_clean.csv' dataset into a pandas dataframe and
# discard the "influencer" column in the same step.
sales_df = pd.read_csv('../../_datasets/advertising_and_sales_clean.csv').drop(columns="influencer")

# Split the frame into the target vector y ('sales') and the feature matrix X
# (every remaining column), both as plain NumPy arrays.
y = sales_df['sales'].values
X = sales_df.drop(columns='sales').values

# The estimator under evaluation: linear regression with default settings.
reg = LinearRegression()

# Cross-validation scheme: 6 folds, rows shuffled before being assigned to folds,
# with a fixed random_state (5) so the shuffle is the same on every run.
kf = KFold(n_splits=6, shuffle=True, random_state=5)

# Fit and score the model once per fold, producing one R^2 score per fold.
cv_results = cross_val_score(estimator=reg, X=X, y=y, cv=kf)

# Show the per-fold R^2 scores.
print(cv_results)

[0.99894062 0.99909245 0.9990103  0.99896344 0.99889153 0.99903953]


In [4]:
# Average R^2 across the folds.
print(cv_results.mean())

# Spread of the fold scores (population standard deviation, NumPy's default).
print(cv_results.std())

# Approximate 95% interval of the fold scores: the 2.5% and 97.5% quantiles
# leave 2.5% of the observed values in each tail (two-sided).
# NOTE: with only a handful of fold scores this is a rough empirical range,
# not a formal confidence interval.
lower_q, upper_q = 0.025, 0.975
print(np.quantile(cv_results, [lower_q, upper_q]))

# An average score of 0.9989896443678249 with a low standard deviation is very high for a model out of the box.

0.9989896443678249
6.608118371529651e-05
[0.99889767 0.99908583]
