# Multiple Linear Regression

In [None]:
# same as simple linear regression, but this regression has more than one independent variable
# formula of MLR is y = b + m1*x1 + m2*x2 + ... + mn*xn
# y is the dependent variable
# b is the intercept
# m1..mn are the coefficients
# x1..xn are the independent variables
# n is how many independent variables exist in the case

# to use linear regression, several assumptions must hold:
# linearity, homoscedasticity, multivariate normality, independence of errors,
# lack of multicollinearity
# all of these must be true before we can rely on linear regression

# in this startup case, we have 3 independent variables that share the same unit (dollar amounts)
# but we also have state, which is categorical data, so we have to make dummy variables for this category.
# the technique to make dummy variables is similar to one-hot encoding:
# one-hot encoding makes n variables for n categories
# dummy encoding makes n-1 variables for n categories
# ex: 4 categories: new york, california, miami, LA
# dummy var : newyork 1 0 0, california 0 1 0, miami 0 0 1, LA 0 0 0
# one-hot : newyork 1 0 0 0, california 0 1 0 0, miami 0 0 1 0, LA 0 0 0 1

# dummy variable trap
# why do dummy variables use only n-1 variables? because of the dummy variable trap
# the dummy variable trap is a case of perfect multicollinearity.
# in simple terms, it happens because one variable can be fully explained by the other variables.
# using our example, the pattern 0 0 0 alone already tells us that the row is LA.
# we don't need 0 0 0 1 to represent LA.
# the effect is that in linear regression every variable has its own coefficient,
# and with all n dummy columns they always sum to 1, which duplicates the constant column of the intercept,
# so the coefficients can no longer be determined uniquely.
# the solution is to remove one dummy variable (e.g. the last one), or to remove the intercept.

# p values
# a p value is a number from a hypothesis test that helps us decide whether to reject the null hypothesis
# the smaller the p value, the stronger the evidence against the null hypothesis
# p values are calculated from the data

# building model
# some of the independent variables have to be thrown out.
# reason 1 : garbage in, garbage out; too many independent variables is bad
# reason 2 : every variable has to be explained, including how it affects the dependent variable
# only keep the most important variables as independent variables

# to build model, we can use several method

# all in: use all variables
# when to use : you have prior knowledge, you have to, or you are preparing for backward elimination

# backward elimination
# 1. select a significance level
# 2. fit the full model with all possible predictors
# 3. consider the predictor with the highest p value; if its p value > significance level, remove the predictor,
# otherwise finish (the model is ready)
# 4. fit the model without this variable, then go back to step 3

# forward selection
# 1. select a significance level
# 2. fit all simple regression models (one per variable), select the one with the lowest p value
# 3. keep this variable, and fit the models with one extra predictor added.
# 4. consider the added predictor with the lowest p value; if its p value < significance level, go back to step 3,
# otherwise finish and keep the previous model, without the new predictor added.

# bidirectional elimination
# 1. select a significance level to enter & a significance level to stay
# 2. perform the next step of forward selection
# 3. perform all steps of backward elimination, then go back to step 2, until no new variable can enter and no old variable can exit
# model ready

# all possible models
# like brute force: resource-intensive but exhaustive
# construct all possible models and compare them.


## Importing the libraries

In [1]:
# import all module
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
# Load the startup data; every column except the last is a feature,
# and the last column is the target.
dataset = pd.read_csv('50_Startups.csv')
# Features as a NumPy array (mixed numeric/text columns).
x = dataset.iloc[:, :-1].to_numpy()
# Target kept two-dimensional (one column), matching the printed shape below.
y = dataset.iloc[:, [-1]].to_numpy()

In [3]:
# Inspect the raw feature matrix: numeric columns followed by the state name.
print(x)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

In [4]:
# Inspect the target values (a 2-D array with a single column).
print(y)

[[192261.83]
 [191792.06]
 [191050.39]
 [182901.99]
 [166187.94]
 [156991.12]
 [156122.51]
 [155752.6 ]
 [152211.77]
 [149759.96]
 [146121.95]
 [144259.4 ]
 [141585.52]
 [134307.35]
 [132602.65]
 [129917.04]
 [126992.93]
 [125370.37]
 [124266.9 ]
 [122776.86]
 [118474.03]
 [111313.02]
 [110352.25]
 [108733.99]
 [108552.04]
 [107404.34]
 [105733.54]
 [105008.31]
 [103282.38]
 [101004.64]
 [ 99937.59]
 [ 97483.56]
 [ 97427.84]
 [ 96778.92]
 [ 96712.8 ]
 [ 96479.51]
 [ 90708.19]
 [ 89949.14]
 [ 81229.06]
 [ 81005.76]
 [ 78239.91]
 [ 77798.83]
 [ 71498.49]
 [ 69758.98]
 [ 65200.33]
 [ 64926.08]
 [ 49490.75]
 [ 42559.73]
 [ 35673.41]
 [ 14681.4 ]]


## Encoding categorical data

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical state column (index 3); all other columns
# pass through unchanged. The encoded columns come first in the result,
# followed by the passthrough columns.
# NOTE: this overwrites `x`. Re-running this cell without re-running the
# dataset cell first would apply the encoder to the wrong column.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

# we don't have to do feature scaling, because every variable has its own coefficient that compensates for its scale
# this walkthrough does not check the linear regression assumptions up front;
# if the relationship is not linear, the model will show poor accuracy and we must use another model

## Splitting the dataset into the Training set and Test set

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Training the Multiple Linear Regression model on the Training set

In [7]:
# we don't have to drop a dummy column ourselves to avoid the dummy variable trap here:
# LinearRegression's least-squares solver still returns a valid fit when one dummy column is redundant,
# so the predictions are unaffected (only the individual dummy coefficients are not uniquely determined).
# so we only need to one-hot encode the category.

# note that LinearRegression does not select variables by p value and does not search for the most accurate combination;
# it simply fits all the features it is given by ordinary least squares. feature selection (e.g. backward elimination) must be done separately.

from sklearn.linear_model import LinearRegression

# Linear regression estimator with scikit-learn's default settings.
lr = LinearRegression()

# Fit on the training split only. This stays the last expression of the
# cell so the fitted estimator is echoed as the cell output.
lr.fit(X=x_train, y=y_train)

LinearRegression()

## Predicting the Test set results

In [8]:
# Predict the target for the held-out test rows.
y_pred = lr.predict(x_test)

# Print floats with two decimals from here on (global NumPy print setting).
np.set_printoptions(precision=2)

# Compare predictions with the true values as two rows:
# each array is reshaped to a single row of shape (1, n), then the rows are
# stacked on top of each other along axis 0 (np.concatenate's default).
# Row 0 holds the predictions, row 1 the actual test values.
print(
    np.concatenate(
        (
            y_pred.reshape(1, len(y_pred)),
            y_test.reshape(1, len(y_test)),
        ),
        axis=0,
    )
)

[[126362.88  84608.45  99677.49  46357.46 128750.48  50912.42 109741.35
  100643.24  97599.28 113097.43]
 [134307.35  81005.76  99937.59  64926.08 125370.37  35673.41 105733.54
  107404.34  97427.84 122776.86]]


In [9]:
# Predictions as returned by the model: one value per row (a single column).
print(y_pred)

[[126362.88]
 [ 84608.45]
 [ 99677.49]
 [ 46357.46]
 [128750.48]
 [ 50912.42]
 [109741.35]
 [100643.24]
 [ 97599.28]
 [113097.43]]


In [10]:
# The same predictions reshaped into a single row of shape (1, n).
print(y_pred.reshape(1,len(y_pred)))

[[126362.88  84608.45  99677.49  46357.46 128750.48  50912.42 109741.35
  100643.24  97599.28 113097.43]]


In [11]:
# Actual target values of the test split: one value per row (a single column).
print(y_test)

[[134307.35]
 [ 81005.76]
 [ 99937.59]
 [ 64926.08]
 [125370.37]
 [ 35673.41]
 [105733.54]
 [107404.34]
 [ 97427.84]
 [122776.86]]


In [12]:
# The same actual values reshaped into a single row of shape (1, n).
print(y_test.reshape(1,len(y_test)))

[[134307.35  81005.76  99937.59  64926.08 125370.37  35673.41 105733.54
  107404.34  97427.84 122776.86]]


In [13]:
# Learned weights, one per feature column of x (the one-hot state columns
# first, then the numeric columns), followed by the intercept on its own line.
print(lr.coef_, lr.intercept_, sep='\n')

[[-3.15e+02  6.24e+02 -3.08e+02  8.06e-01 -6.88e-02  2.99e-02]]
[54343.3]
