In [None]:
# A module can be installed from within the notebook by
# uncommenting and running the following command.
# The package is published on PyPI as "scikit-learn" (it is imported as
# `sklearn`); `%pip` installs into the environment of the running kernel.

#%pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [None]:
# Read the customer records into a DataFrame and preview the first rows.
csv_path = 'Car_Purchasing_Data.csv'
auto = pd.read_csv(csv_path)
print(auto.head())

In [None]:
# (number of rows, number of columns) of the loaded data
print(auto.shape)


In [None]:
# Rich (table) preview of the first rows; as the last expression of the
# cell it is displayed without needing print()
auto.head()

In [None]:
# Column labels, followed by the data type pandas inferred for each column
print(auto.columns)
print(auto.dtypes)

### What's our goal

We have 9 columns. The dependent column is Car Purchase Amount. 

Since the Customer Name, Customer e-mail and Country columns are categorical variables that will not have any bearing on our prediction, we will drop these columns now. 

Goal: We want to find a linear equation so that we can predict Car Purchase Amount. 

We will consider Gender, Age, Annual Salary, Credit Card Debt and Net Worth as independent variables, and Car Purchase Amount will be our dependent variable, also known as the target variable. 



In [None]:
# Drop the text columns that are not used as predictors.
# The result is re-assigned instead of using inplace=True, and
# errors="ignore" skips labels that are already gone, so re-running this
# cell does not raise a KeyError.
# General form: df = df.drop(columns=['column_name1', 'column_name2'])
auto = auto.drop(
    columns=["Customer Name", "Customer e-mail", "Country"],
    errors="ignore",
)

print(auto.shape)

print(auto.columns)

What is correlation? 

Correlation describes the linear relationship between two variables. 

The correlation coefficient is a value that describes the strength and direction of the 
linear relationship between two variables. 

Correlation graph

<img src="correlation_graph.png" width="400" height="300">

Correlation coefficient formula

<img src="correlation_formula.png" width="400" height="300">

Values of $r$ range from -1 to 1: -1 represents a perfect negative correlation, 1 represents a perfect positive correlation, and 0 means there is no linear correlation. 

Reference - https://www.wallstreetmojo.com/correlation-coefficient-formula/

Which features to select?

Choose features that are not strongly correlated with each other, so that each feature contributes its own information about the target. 

In [None]:
# Plotting libraries; the magic below renders figures inside the notebook.
%matplotlib inline

import seaborn as sns
import matplotlib.pyplot as plt
 
# Basic correlogram: a grid of pairwise plots for the columns of `auto`,
# useful for spotting linear relationships by eye
sns.pairplot(auto)
plt.show()

In [None]:
# Pairwise correlation between the columns of `auto`, computed once and
# reused for the plot.
corr = auto.corr()

# Annotated heatmap of the full correlation matrix.
# The y-limits are left as seaborn sets them: shifting them by 0.5 was a
# workaround for heatmap rows being cut in half under matplotlib 3.1.1,
# and on other versions it only adds blank half-rows above and below.
ax = sns.heatmap(corr, annot=True, square=True,
                 linewidths=1, linecolor='black')
ax.set_title("Correlation matrix")
plt.show()

In [None]:
# Pairwise correlation between the columns of `auto`, computed once and
# reused for both the mask and the plot.
corr = auto.corr()

# Boolean mask that is True on and above the diagonal. Cells where the mask
# is True are hidden, so only the lower triangle is drawn. Building the mask
# from ones (rather than from the correlation values themselves) means a
# correlation of exactly 0 in the upper triangle is hidden as well.
matrix = np.triu(np.ones_like(corr, dtype=bool))

# The y-limits are left as seaborn sets them: shifting them by 0.5 was a
# workaround for heatmap rows being cut in half under matplotlib 3.1.1,
# and on other versions it only adds blank half-rows above and below.
ax = sns.heatmap(corr, annot=True, square=True,
                 linewidths=1, linecolor='black', mask=matrix)
ax.set_title("Correlation matrix (lower triangle)")
plt.show()

## Let's list which variables are correlated with which:

### Multilinear Regression

In a multilinear regression, instead of one independent variable, we will consider more than one independent variable to find a linear relationship between independent variables and dependent variable. We will consider 5 independent variables to find a linear relationship between them and our target variable, Car Purchase Amount. 

In [None]:
# creating an instance of the LinearRegression class; the model is not
# fitted until reg.fit() is called
reg = linear_model.LinearRegression()
print(reg)

#### Explanation

x = all independent variables that we are considering for our study

y = car purchase amount

Goal: get a linear relationship between y and x.

How many rows are there? 

The data is split so that we can use the majority of it for training the model. Once the model is trained, we use the remaining data, which was not used for training, to test the model. 

y = y_known

Using the model, we can compute y_predict and compare it with y_known.


train_test_split() will return 4 things: x_train, x_test, y_train, y_test


x_1  y_1

x_2  y_2

x_3  y_3

x_4  y_4

x_5  y_5


x_train = [x_1, x_2,x_3,x_4]
y_train = [y_1,y_2,y_3,y_4]

x_test = [x_5]
y_test = [y_5] 

In [None]:
# Range of the target variable. Series.min()/Series.max() are the pandas
# reductions and skip missing values by default, whereas the Python
# built-ins compare element by element and can give an order-dependent
# answer when a NaN is present.
purchase_amount = auto["Car Purchase Amount"]

print("Car Purchase min Amount: ", purchase_amount.min())

print("Car purchase max Amount: ", purchase_amount.max())

#### Standard Scaler

We have to plot each feature and see if its distribution is normal. If yes, then we can perform standard scaling so that 
the mean will be zero and the standard deviation will be 1. 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# creating an instance of StandardScaler; it has not seen any data yet
a_scaler = StandardScaler()

In [None]:
# Standardise the four selected predictors with the scaler created above.
# NOTE(review): the scaler is fitted on every row here, i.e. before the
# train/test split, so rows that end up in the test set also contribute to
# the statistics used for scaling.
# NOTE(review): the text above lists Gender among the independent variables,
# but it is not selected here - confirm which is intended.
auto_independent = a_scaler.fit_transform(auto[['Age','Annual Salary', 'Credit Card Debt', 'Net Worth']])

In [None]:
# using train_test_split(), we are splitting the data into training and test
# x_train, x_test, y_train, y_test
#
# Split first, scale second: the scaler is fitted on the training rows only
# and the same transformation is then applied to the test rows, so that no
# information from the test set leaks into the preprocessing.
feature_columns = ['Age', 'Annual Salary', 'Credit Card Debt', 'Net Worth']

x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    auto[feature_columns], auto["Car Purchase Amount"],
    test_size=0.2, random_state=4)

x_train = a_scaler.fit_transform(x_train_raw)
x_test = a_scaler.transform(x_test_raw)

In [None]:
# we have to fit the model: the coefficients and the intercept are
# estimated from the training data only
reg.fit(x_train, y_train)

In [None]:
# With one feature the model is y = mx + b, where m is the coefficient
# (or slope) of x and b is the intercept. With several features there is
# one coefficient per feature: y = b + m1*x1 + m2*x2 + ...

print(reg.coef_) # one coefficient per column of x_train, in column order
print(reg.intercept_)

In [None]:
# the relationship between the independent and dependent variables is 


#### Metrics for Linear Regression

Mean Squared Error 

For linear regression with one variable, $ y = mx + b = wx + b$

$ MSE = \frac{1}{N} \sum_{i=1}^{N} (y_i - (mx_i + b))^2 $ 

$y_i$ is the actual value and $mx_i + b$ is the predicted value.

$N$ is the number of observations.

The loss function based on the MSE is 

$ L(m, b) = \frac{1}{N} \sum_{i=1}^{N} (y_i - (mx_i + b))^2 $ 

our goal is to minimize $L$ with respect to $m$ and $b$

The gradient of $L$

$L'(m, b) = \begin{bmatrix} \frac{dL}{dm} \\ \frac{dL}{db} \end{bmatrix} = \begin{bmatrix} \frac{1}{N} \sum_{i=1}^{N} -2x_i(y_i - (mx_i + b))  \\ \frac{1}{N} \sum_{i=1}^{N} -2(y_i - (mx_i + b))  \end{bmatrix}$ 

Update equation of m and b with learning rate $\epsilon$ is

$ m_{new} = m_{old} - \epsilon \frac{dL}{dm} (m_{old}) $

$ b_{new} = b_{old} - \epsilon \frac{dL}{db} (b_{old})$

<img src="linear_loss.png" width="400" height="300">

true  predicted true-predicted  (true-predicted)^2

3       5        -2                 4

5       3         2                 4

#### Different Gradient Descents

Gradient Descent - every single data point is considered for update. 

Batch Gradient Descent - the whole training set is treated as one batch: the loss is computed over all of it and then a single update is done. 
It is slow when the training data is large. 

Stochastic Gradient Descent - a single point at random is chosen and loss is computed for update. 

Mini-batch Stochastic Gradient Descent - a mini-batch of randomly selected data points is considered and the average loss of the mini-batch is computed for the update. 

In [None]:
# yhat stands for the predicted value of y and for this we 
# have to use the x_test values

yhat = reg.predict(x_test) 

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out test data: true values first,
# predictions second
mse_test = mean_squared_error(y_test, yhat)
print(mse_test)

In [None]:
# y_t_predict is the predicted y values for the x_train data
y_t_predict = reg.predict(x_train)

# note that y_train is the true y value; this is the error on the same
# data the model was fitted to
mse_train = mean_squared_error(y_train, y_t_predict)
print(mse_train)

In [None]:
# Since MSE has no upper bound, we compare the test and train errors with
# each other: we check whether their ratio is close to one and whether
# their relative difference is less than 5%

r1 = mse_test/mse_train

# absolute gap, in squared units of the target
diff1 = np.abs(mse_train - mse_test)

# the same gap as a fraction of the training error, so that it can be
# compared with the 5% threshold
rel_diff1 = diff1 / mse_train

print(r1)

print(diff1)

print(f"{rel_diff1:.2%}")

In [None]:
from sklearn.metrics import r2_score

# r-squared (coefficient of determination) on unseen data and on the data
# the model was fitted to; true values first, predictions second
print("r-squared for the test data: ", r2_score(y_test, yhat))
    
print("r-squared for the train data: ", r2_score(y_train, y_t_predict))

#### Conclusion:

r-squared for test is  and for train is 



In [None]:
# Exercise prompt, kept as a bare string so the cell shows it when run.
"""
In-class activity: In the auto example, find the multi-linear relationship 
between 'Age','Annual Salary', 'Credit Card Debt' with 'Car Purchase Amount'. 
Find the mean squared error  and r-squared for test set 
and train set and make a conclusion. 
"""

#### Linear model example

Building a linear model with one independent variable and one dependent variable.

For this, we will consider the study time versus score from the pptx

In [None]:
# Study time (x) versus score (y) example from the pptx:
# 16 paired observations, one x value for each y value.

x = np.array([
    16, 34, 8, 38, 39, 40, 54, 21,
    16, 67, 40, 43, 47, 56, 60, 80,
])
y = np.array([
    50, 61, 45, 60, 60, 67, 65, 59,
    57, 73, 68, 71, 75, 71, 88, 94,
])

In [None]:
print(y.shape) 
# reshape(-1, 1) turns the 1-D array into a single column; in reshape, -1
# tells NumPy to infer that dimension (here the number of rows) from the
# length of the array
y = y.reshape(-1,1)
print(y.shape)
# the features are likewise reshaped into a single column (one row per observation)
x = x.reshape(-1,1)
print(x.shape)

In [None]:
# using train_test_split(), we are splitting the data into training and test 
# x_train, x_test, y_train, y_test
# NOTE: these four names and `reg` were already used for the car purchase
# model; this cell overwrites the earlier split and refits the same
# estimator, so from here on `reg` holds the study-time fit.

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
reg.fit(x_train, y_train)

In [None]:
# Slope and intercept of the fitted study-time line. Because y was reshaped
# into a column before fitting, both are printed as arrays rather than
# plain numbers.
print(reg.coef_)
print(reg.intercept_)

In [None]:
# constraints are : minimum study time can be zero. 
# The max score can be 100

In [None]:
# y = 43.562 + 0.577 * 75  (predicted score for x = 75)
print(43.562 + (0.577*75))

In [None]:
# Predicted score for a given study time using the line
# score = 43.562 + 0.577 * study_time, capped at the maximum possible score.
y_max = 100
x_input = 100

# The prediction gets its own name: assigning it to `y` would overwrite the
# array of observed scores defined earlier and break a re-run of the cells
# that split and fit on it.
y_pred = 43.562 + 0.577*x_input
print(y_pred)

# a score cannot exceed y_max, so take the smaller of the two
lower = min(y_pred, y_max)

print(lower)

In [None]:
# linear regression calculator

# https://www.socscistatistics.com/tests/regression/default.aspx

Resources -
https://towardsdatascience.com/differential-equations-basics-c72db0a8c42a

Understandable Statistics: Concepts and Methods, Enhanced 11th Edition
by Charles Henry Brase (Author), Corrinne Pellillo Brase (Author)
