In [1]:
# Multiple Linear Regression Machine Learning In Python

# Demonstration of MLR, train/test split, creating a prediction, and evaluating the R2 and RMSE of the model.

# y = a1x1 + a2x2 + a3x3 + ... + anxn + b
# y = target
# x1..xn = features (one x per input variable)
# a1..an, b = parameters/coefficients of the model - slopes (one per feature), intercept

# How do we choose a and b?
# - Define an error function for any given line
# - Choose the line that minimizes the error function
# Error function = loss function = cost function

# __________Terms__________
# Residual:                     is the difference between the expected results from a model and the true values from data.
# Variance:                     is the variability in the expected results (predictions) of a given data point between different runs of the model.
# R-squared:                    is the explained variation as a proportion of total variation. It quantifies the amount of variance in the target variable that is explained by the features. Typically ranges from 0 to 1 (0=low, 1=high); it can be negative on held-out data when the model fits worse than always predicting the mean.
# Mean Squared Error (MSE):     measures the amount of error in statistical models. It assesses the average squared difference between the observed and predicted values. When a model has no error, the MSE equals zero. As model error increases, its value increases.
# Root Mean Squared Error(RMSE):Root mean square error or root mean square deviation is one of the most commonly used measures for evaluating the quality of predictions. It shows how far predictions fall from measured true values using Euclidean distance.
# RSS:                          Residual sum of squares: the sum of the squared residuals, i.e. the absolute amount of variation left unexplained by the model.
# Ordinary Least Squares(OLS):  Goal is to Minimize RSS. A Common technique for estimating coefficients of linear regression equations which describe the relationship
#                               between one or more independent quantitative variables and a dependent variable (simple or multiple linear regression).
#                               OLS estimators minimize the sum of the squared errors (a difference between observed values and predicted values).
# - Advantages of OLS:          OLS is the most efficient linear regression estimator when the assumptions hold true. 
#                               Another benefit of satisfying these assumptions is that as the sample size increases to infinity, the coefficient estimates converge on the actual population parameters.
# - Disadvantages of OLS:       A large data set is necessary in order to obtain reliable results. 
#                               The regression results are sensitive to functional form if the error term is not adequately interpreted, which can lead to widely varying conclusions depending on how the regression is initially set up.


# pip3 install pandas
# pip3 install scikit-learn
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read 'advertising_and_sales_clean.csv' into a pandas dataframe and discard
# the "influencer" column in the same step.
sales_df = pd.read_csv('../../_datasets/advertising_and_sales_clean.csv').drop(columns="influencer")

# Separate the target (y = the 'sales' column) from the features
# (X = every remaining column), both as plain NumPy arrays.
y = sales_df['sales'].values
X = sales_df.drop(columns='sales').values

# Hold out part of the data for evaluation:
# - test_size=0.3 keeps 30% of the rows for testing and 70% for training.
# - random_state=42 seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Create the (unfitted) linear regression estimator.
reg = LinearRegression()

# Fit on the training split only; as the last expression of the cell this
# also displays the fitted estimator.
reg.fit(X_train, y_train)


LinearRegression()

In [13]:
# Predict the target for the held-out test features.
y_pred = reg.predict(X_test)

# Show the first two predictions next to the corresponding true values.
# Each {} placeholder is filled, in order, by the arguments given to .format().
print(
    "Predictions: {}, Actual Values: {}".format(
        y_pred[:2],
        y_test[:2],
    )
)

Predictions: [53176.66154234 70996.19873235], Actual Values: [55261.28 67574.9 ]


In [14]:
# Compute R-squared on the held-out test split
r_squared = reg.score(X_test, y_test)

# Compute RMSE as the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so passing it raises a TypeError on
# current releases; taking the root explicitly works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# Print the metrics
print("R^2: {}".format(r_squared))
print("RMSE: {}".format(rmse))

# At 0.9990152104759368 R^2, the features explain about 99.9% of the variance in sales values

R^2: 0.9990152104759368
RMSE: 2944.433199600101
