In [13]:
#  *** MULTIPLE LINEAR REGRESSION ***


# ***Data Preprocessing***
# ***Importing libraries***
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


# *** Load the dataset ***
df = pd.read_csv('50_Startups.csv')

# Split the frame by position: every column except the last is a feature,
# the last column is the value we want to predict.
features, target = df.iloc[:, :-1], df.iloc[:, -1:]

# Feature matrix (mixed numeric / string columns at this point)
X = features.values

# Dependent variable vector; the `-1:` slice keeps it 2-D with one column
y = target.values


# Encoding categorical data for the independent variables.
#
# `OneHotEncoder(categorical_features=[3])` is deprecated and has been removed
# from later scikit-learn releases; ColumnTransformer is the supported way to
# one-hot encode a single column. OneHotEncoder also accepts string categories
# directly, so the LabelEncoder pre-step is no longer needed.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Column 3 of X is the categorical "State" column.
onehotencoder = ColumnTransformer(
    transformers=[('state', OneHotEncoder(), [3])],
    remainder='passthrough',  # keep the numeric columns, placed after the dummies
    sparse_threshold=0,       # always return a dense array (replaces .toarray())
)
# X starts as an object array (numbers mixed with strings); cast the encoded
# result to float so downstream estimators receive a purely numeric matrix.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

# *** Avoiding the DUMMY VARIABLE TRAP ***
# Categories are encoded in sorted order, so the first dummy column is
# California. Dropping it leaves California as the baseline and keeps the
# remaining dummies from being collinear with the intercept.
X = X[:, 1:]

# *** Splitting the dataset into Training set and Test set ***
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# The feature-scaling snippet below is deliberately disabled by wrapping it in
# a string literal, so none of it executes. Because the literal is the last
# expression of the cell, the notebook echoes it back as the cell's output.
"""
# No need to do it, taken care by library
#  *** Feature Scaling ***
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)
"""

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


'\n# No need to do it, taken care by library\n#  *** Feature Scaling ***\nfrom sklearn.preprocessing import StandardScaler\nsc_X = StandardScaler()\nX_train = sc_X.fit_transform(X_train)\nX_test = sc_X.transform(X_test)\nsc_y = StandardScaler()\ny_train = sc_y.fit_transform(y_train)\n'

In [7]:
df 
# The State column holds text labels (e.g. 'New York', 'California'), so it
# has to be encoded as numbers before it can be used in a regression.

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [8]:
# Echo the feature matrix for inspection (a cell displays its last expression).
X

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 29491

In [9]:
# Echo the target vector; it is 2-D with a single column because it was
# taken from the frame with the `-1:` slice.
y

array([[192261.83],
       [191792.06],
       [191050.39],
       [182901.99],
       [166187.94],
       [156991.12],
       [156122.51],
       [155752.6 ],
       [152211.77],
       [149759.96],
       [146121.95],
       [144259.4 ],
       [141585.52],
       [134307.35],
       [132602.65],
       [129917.04],
       [126992.93],
       [125370.37],
       [124266.9 ],
       [122776.86],
       [118474.03],
       [111313.02],
       [110352.25],
       [108733.99],
       [108552.04],
       [107404.34],
       [105733.54],
       [105008.31],
       [103282.38],
       [101004.64],
       [ 99937.59],
       [ 97483.56],
       [ 97427.84],
       [ 96778.92],
       [ 96712.8 ],
       [ 96479.51],
       [ 90708.19],
       [ 89949.14],
       [ 81229.06],
       [ 81005.76],
       [ 78239.91],
       [ 77798.83],
       [ 71498.49],
       [ 69758.98],
       [ 65200.33],
       [ 64926.08],
       [ 49490.75],
       [ 42559.73],
       [ 35673.41],
       [ 14681.4 ]])

In [12]:
# Print floats as whole numbers so the encoded matrix is easier to scan.
# NOTE: set_printoptions changes NumPy's display settings for the whole
# session, not just for this cell.
np.set_printoptions(formatter={'float': "{0:0.0f}".format})
X
# Dummy-variable columns in the matrix shown below:
#   X[:, 0] -> California
#   X[:, 1] -> Florida
#   X[:, 2] -> New York

array([[0, 0, 1, 165349, 136898, 471784],
       [1, 0, 0, 162598, 151378, 443899],
       [0, 1, 0, 153442, 101146, 407935],
       [0, 0, 1, 144372, 118672, 383200],
       [0, 1, 0, 142107, 91392, 366168],
       [0, 0, 1, 131877, 99815, 362861],
       [1, 0, 0, 134615, 147199, 127717],
       [0, 1, 0, 130298, 145530, 323877],
       [0, 0, 1, 120543, 148719, 311613],
       [1, 0, 0, 123335, 108679, 304982],
       [0, 1, 0, 101913, 110594, 229161],
       [1, 0, 0, 100672, 91791, 249745],
       [0, 1, 0, 93864, 127320, 249839],
       [1, 0, 0, 91992, 135495, 252665],
       [0, 1, 0, 119943, 156547, 256513],
       [0, 0, 1, 114524, 122617, 261776],
       [1, 0, 0, 78013, 121598, 264346],
       [0, 0, 1, 94657, 145078, 282574],
       [0, 1, 0, 91749, 114176, 294920],
       [0, 0, 1, 86420, 153514, 0],
       [1, 0, 0, 76254, 113867, 298664],
       [0, 0, 1, 78389, 153773, 299737],
       [0, 1, 0, 73995, 122783, 303319],
       [0, 1, 0, 67533, 105751, 304769],
       [0,

In [14]:
# Echo X again; the output recorded below has five columns, i.e. the state of
# X after the first dummy column has been dropped.
X

array([[0, 1, 165349, 136898, 471784],
       [0, 0, 162598, 151378, 443899],
       [1, 0, 153442, 101146, 407935],
       [0, 1, 144372, 118672, 383200],
       [1, 0, 142107, 91392, 366168],
       [0, 1, 131877, 99815, 362861],
       [0, 0, 134615, 147199, 127717],
       [1, 0, 130298, 145530, 323877],
       [0, 1, 120543, 148719, 311613],
       [0, 0, 123335, 108679, 304982],
       [1, 0, 101913, 110594, 229161],
       [0, 0, 100672, 91791, 249745],
       [1, 0, 93864, 127320, 249839],
       [0, 0, 91992, 135495, 252665],
       [1, 0, 119943, 156547, 256513],
       [0, 1, 114524, 122617, 261776],
       [0, 0, 78013, 121598, 264346],
       [0, 1, 94657, 145078, 282574],
       [1, 0, 91749, 114176, 294920],
       [0, 1, 86420, 153514, 0],
       [0, 0, 76254, 113867, 298664],
       [0, 1, 78389, 153773, 299737],
       [1, 0, 73995, 122783, 303319],
       [1, 0, 67533, 105751, 304769],
       [0, 1, 77044, 99281, 140575],
       [0, 0, 64665, 139553, 137963],
       

In [15]:
# *** Fit a multiple linear regression model on the training set ***
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# Leaving the fit call as the last expression trains the model and also echoes
# the fitted estimator's configuration as the cell output.
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
# *** Predict the test set results ***
# NOTE: run this before the cell that prepends an intercept column to X_test;
# after that cell X_test has one more column than the model was fitted on.
y_pred = regressor.predict(X=X_test)

In [28]:
# *** Building the optimal model using the BACKWARD ELIMINATION method ***
# The array-based `OLS` class is provided by `statsmodels.api`; recent
# statsmodels releases no longer expose it through `statsmodels.formula.api`,
# so `sm.OLS(...)` in the following cells needs this import.
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept by itself, so prepend a column of
# 1s for the b0 term of the regression equation.
# `add_constant` sizes the column from X_train (no hard-coded row count) and,
# with has_constant='skip', leaves X_train unchanged when the cell is re-run
# and the constant column is already present.
X_train = sm.add_constant(X_train, prepend=True, has_constant='skip')

In [29]:
# Echo the training matrix; its leading column of 1s is the intercept term.
X_train

array([[1, 1, 0, 55494, 103057, 214635],
       [1, 0, 1, 46014, 85047, 205518],
       [1, 1, 0, 75329, 144136, 134050],
       [1, 0, 0, 46426, 157694, 210798],
       [1, 1, 0, 91749, 114176, 294920],
       [1, 1, 0, 130298, 145530, 323877],
       [1, 1, 0, 119943, 156547, 256513],
       [1, 0, 1, 1000, 124153, 1904],
       [1, 0, 1, 542, 51743, 0],
       [1, 0, 1, 65605, 153032, 107138],
       [1, 0, 1, 114524, 122617, 261776],
       [1, 1, 0, 61994, 115641, 91131],
       [1, 0, 0, 63409, 129220, 46085],
       [1, 0, 0, 78013, 121598, 264346],
       [1, 0, 0, 23641, 96190, 148001],
       [1, 0, 0, 76254, 113867, 298664],
       [1, 0, 1, 15506, 127382, 35534],
       [1, 0, 1, 120543, 148719, 311613],
       [1, 0, 0, 91992, 135495, 252665],
       [1, 0, 0, 64665, 139553, 137963],
       [1, 0, 1, 131877, 99815, 362861],
       [1, 0, 1, 94657, 145078, 282574],
       [1, 0, 0, 28754, 118546, 172796],
       [1, 0, 0, 0, 116984, 45173],
       [1, 0, 0, 162598, 151378, 

In [30]:
# Backward elimination, step 1: ordinary least squares on every column
# (intercept, the two remaining State dummies, R&D Spend, Administration,
# Marketing Spend).
kept_columns = [0, 1, 2, 3, 4, 5]
X_train_opt = X_train[:, kept_columns]
ols_model = sm.OLS(endog=y_train, exog=X_train_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()
# Significance level SL = 0.05: features whose p-value exceeds SL are
# candidates for removal in the next step.

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,129.7
Date:,"Fri, 19 Apr 2019",Prob (F-statistic):,3.91e-21
Time:,17:02:02,Log-Likelihood:,-421.1
No. Observations:,40,AIC:,854.2
Df Residuals:,34,BIC:,864.3
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.255e+04,8358.538,5.091,0.000,2.56e+04,5.95e+04
x1,-959.2842,4038.108,-0.238,0.814,-9165.706,7247.138
x2,699.3691,3661.563,0.191,0.850,-6741.822,8140.560
x3,0.7735,0.055,14.025,0.000,0.661,0.886
x4,0.0329,0.066,0.495,0.624,-0.102,0.168
x5,0.0366,0.019,1.884,0.068,-0.003,0.076

0,1,2,3
Omnibus:,15.823,Durbin-Watson:,2.468
Prob(Omnibus):,0.0,Jarque-Bera (JB):,23.231
Skew:,-1.094,Prob(JB):,9.03e-06
Kurtosis:,6.025,Cond. No.,1490000.0


In [31]:
# Backward elimination, step 2: refit without column 2, which had the highest
# p-value (0.850) in the previous summary.
kept_columns = [0, 1, 3, 4, 5]
X_train_opt = X_train[:, kept_columns]
ols_model = sm.OLS(endog=y_train, exog=X_train_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,166.7
Date:,"Fri, 19 Apr 2019",Prob (F-statistic):,2.87e-22
Time:,17:02:58,Log-Likelihood:,-421.12
No. Observations:,40,AIC:,852.2
Df Residuals:,35,BIC:,860.7
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.292e+04,8020.397,5.352,0.000,2.66e+04,5.92e+04
x1,-1272.1608,3639.780,-0.350,0.729,-8661.308,6116.986
x2,0.7754,0.053,14.498,0.000,0.667,0.884
x3,0.0319,0.065,0.488,0.629,-0.101,0.165
x4,0.0363,0.019,1.902,0.065,-0.002,0.075

0,1,2,3
Omnibus:,16.074,Durbin-Watson:,2.467
Prob(Omnibus):,0.0,Jarque-Bera (JB):,24.553
Skew:,-1.086,Prob(JB):,4.66e-06
Kurtosis:,6.164,Cond. No.,1430000.0


In [32]:
# Backward elimination, step 3: refit without column 1 as well, which had the
# highest p-value (0.729) in the previous summary.
kept_columns = [0, 3, 4, 5]
X_train_opt = X_train[:, kept_columns]
ols_model = sm.OLS(endog=y_train, exog=X_train_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,227.8
Date:,"Fri, 19 Apr 2019",Prob (F-statistic):,1.8499999999999998e-23
Time:,17:03:13,Log-Likelihood:,-421.19
No. Observations:,40,AIC:,850.4
Df Residuals:,36,BIC:,857.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.299e+04,7919.773,5.428,0.000,2.69e+04,5.91e+04
x1,0.7788,0.052,15.003,0.000,0.674,0.884
x2,0.0294,0.064,0.458,0.650,-0.101,0.160
x3,0.0347,0.018,1.896,0.066,-0.002,0.072

0,1,2,3
Omnibus:,15.557,Durbin-Watson:,2.481
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.539
Skew:,-1.081,Prob(JB):,1.28e-05
Kurtosis:,5.974,Cond. No.,1430000.0


In [33]:
# Backward elimination, step 4: refit without column 4, which had the highest
# p-value (0.650) in the previous summary.
kept_columns = [0, 3, 5]
X_train_opt = X_train[:, kept_columns]
ols_model = sm.OLS(endog=y_train, exog=X_train_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()
# Following backward elimination strictly, column 5 would be removed next,
# because its p-value is still above SL = 0.05.
# Before doing that, compare R-squared and adjusted R-squared with and without
# the column to decide with more certainty whether it should be kept.

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,349.0
Date:,"Fri, 19 Apr 2019",Prob (F-statistic):,9.65e-25
Time:,17:05:29,Log-Likelihood:,-421.3
No. Observations:,40,AIC:,848.6
Df Residuals:,37,BIC:,853.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.635e+04,2971.236,15.598,0.000,4.03e+04,5.24e+04
x1,0.7886,0.047,16.846,0.000,0.694,0.883
x2,0.0326,0.018,1.860,0.071,-0.003,0.068

0,1,2,3
Omnibus:,14.666,Durbin-Watson:,2.518
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.582
Skew:,-1.03,Prob(JB):,3.39e-05
Kurtosis:,5.847,Cond. No.,497000.0


In [40]:
# Trial fit with only the intercept (column 0) and column 3, i.e. with the
# Marketing Spend column (column 5) removed as well.
kept_columns = [0, 3]
X_train_aise_opt = X_train[:, kept_columns]
ols_model = sm.OLS(endog=y_train, exog=X_train_aise_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()
# Adjusted R-squared drops (0.947 -> 0.944) once Marketing Spend is removed,
# so it is better to keep that column in the model.
# Always check R-squared and adjusted R-squared, not only the p-values.

0,1,2,3
Dep. Variable:,y,R-squared:,0.945
Model:,OLS,Adj. R-squared:,0.944
Method:,Least Squares,F-statistic:,652.4
Date:,"Sun, 21 Apr 2019",Prob (F-statistic):,1.56e-25
Time:,12:36:05,Log-Likelihood:,-423.09
No. Observations:,40,AIC:,850.2
Df Residuals:,38,BIC:,853.6
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.842e+04,2842.717,17.032,0.000,4.27e+04,5.42e+04
x1,0.8516,0.033,25.542,0.000,0.784,0.919

0,1,2,3
Omnibus:,13.132,Durbin-Watson:,2.325
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.254
Skew:,-0.991,Prob(JB):,0.000295
Kurtosis:,5.413,Cond. No.,157000.0


In [37]:
# The targets are not affected by feature selection; alias them so that the
# "_opt" names can be used consistently alongside the reduced feature matrices.
y_train_opt = y_train
y_test_opt = y_test

# Prepend the intercept column of 1s to the test features. The column is sized
# from X_test itself rather than a hard-coded row count, so the cell keeps
# working if the split proportion or the dataset changes.
# NOTE: this reassigns X_test; re-running the cell prepends another column.
X_test = np.append(arr=np.ones((X_test.shape[0], 1)).astype(int), values=X_test, axis=1)

# Keep the same columns as the [0, 3, 5] training selection: intercept plus
# columns 3 and 5.
X_test_opt = X_test[:, [0, 3, 5]]

In [39]:
# Echo the reduced test matrix: the intercept column plus the two feature
# columns selected with [0, 3, 5].
X_test_opt

array([[1, 66052, 118148],
       [1, 100672, 249745],
       [1, 101913, 229161],
       [1, 27893, 164471],
       [1, 153442, 407935],
       [1, 72108, 353184],
       [1, 20230, 185265],
       [1, 61136, 88218],
       [1, 73995, 303319],
       [1, 142107, 366168]])