In [None]:
#Steps :
#1: Grab The Data
#2: Separate the Data into Dependent and Independent Variables
#3: Deal with Missing values (Impute/Drop) ** Not Required in this case **
#4: Encode Non Numeric categorical Data and create dummy variables
#5: Eliminate the Non Influencing factors (independent variables) 
#6: Feature Scale ** Not Used in this case **
#7: Apply Dimensionality Reduction ** Not Used in this case **
#8: Divide the data into training and testing data
#9: Build the Model, Check the Co-Efficient and the Intercepts
#10: Run The Model on Test Data using K-Fold Cross Val 
#11: Tune the Hyperparameters of the algorithm and Go to Step 9 till you find satisfactory accuracy
#12: Interpret the Results

In [1]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score 

# Importing the dataset
df = pd.read_csv('50_Startups.csv')
df.head() ## Looking at top 5 records

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [3]:
## Distinct values of the State column
## (equivalent to SELECT DISTINCT in SQL, or a column filter in Excel)
unique = df['State'].unique()
print(unique)

## Number of distinct states
count = unique.size
print(count)

['New York' 'California' 'Florida']
3


In [4]:
### Convert Non Numerical Categorical column into numeric ones
##Check the categories which are non numerical

categoryList = list(df.select_dtypes(include=['object']).columns) ## 'object' type means catgeorical variables
print(categoryList)
print("\n")

['State']




In [5]:
## Create dummy variables for non numerical categorical variables
dummies = pd.get_dummies(df[categoryList], prefix= categoryList)
print(dummies.head())
print("\n")

   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0




In [6]:
## Remove the non-numeric categorical columns listed in categoryList
df = df.drop(columns=categoryList)
print(df.head())
print("\n")

   R&D Spend  Administration  Marketing Spend     Profit
0  165349.20       136897.80        471784.10  192261.83
1  162597.70       151377.59        443898.53  191792.06
2  153441.51       101145.55        407934.54  191050.39
3  144372.41       118671.85        383199.62  182901.99
4  142107.34        91391.77        366168.42  166187.94




In [7]:
df=pd.concat([df,dummies], axis =1 ) ## added encoded categorical columns
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


In [8]:
## Profit is the dependent variable; every other column is a predictor
y = df['Profit']
X = df.drop(columns=['Profit'])

X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [9]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [10]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print(X_train[0:5])

    R&D Spend  Administration  Marketing Spend  State_California  \
33   55493.95       103057.49        214634.81                 0   
35   46014.02        85047.44        205517.64                 0   
26   75328.87       144135.98        134050.07                 0   
34   46426.07       157693.92        210797.67                 1   
18   91749.16       114175.79        294919.57                 0   

    State_Florida  State_New York  
33              1               0  
35              0               1  
26              1               0  
34              0               0  
18              1               0  


In [11]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained
regressor = LinearRegression().fit(X_train, y_train)

# Intercept: the predicted value of y when every independent variable is zero
print("Intercept:", regressor.intercept_)

# One coefficient per column of X_train, in column order
print("Regression Coeff:", regressor.coef_)

Intercept: 42467.52924855314
Regression Coeff: [ 7.73467193e-01  3.28845975e-02  3.66100259e-02  8.66383692e+01
 -8.72645791e+02  7.86007422e+02]


In [12]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
print(y_pred)
print("\n")
print(y_test)

[103015.20159796 132582.27760816 132447.73845174  71976.09851258
 178537.48221055 116161.24230165  67851.69209676  98791.73374687
 113969.43533012 167921.0656955 ]


28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64


In [13]:
from sklearn import metrics

## Error metrics on the hold-out split; MSE is computed once and reused for RMSE
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

## Explained variance as a percentage: the higher, the better the model
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## ~94.7% of the variability of the hold-out data is explained by the model

MAE: 7514.2936596406125
MSE: 83502864.03257766
RMSE: 9137.99015279496
Explained Variance Score: 94.69192858652778


In [14]:
from sklearn.metrics import r2_score

## R^2 on the hold-out split: the closer to 1, the better the predictions track the actual values
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

## Adjusted R^2 = 1 - (n - 1) / (n - k - 1) * (1 - R^2)
## It also rewards values close to 1, but penalises every additional predictor.
## Here n is the number of hold-out rows (len(y_test)) and k the number of fitted coefficients.
n_obs = len(y_test)
n_features = len(regressor.coef_)
adj_r2 = 1 - float(n_obs - 1) / (n_obs - n_features - 1) * (1 - r2)
print("adj_r2=", adj_r2)

r2: 0.9347068473282423
adj_r2= 0.8041205419847268


In [16]:
from sklearn.model_selection import cross_val_score

## 3-fold cross-validation on the training split.
## 'neg_median_absolute_error' is reported as a negative number (see output), so closer to 0 is better.
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=3,
    scoring='neg_median_absolute_error',
)
print(regression_avg.mean())
print(regression_avg.std())

## Same folds, scored by explained variance and shown as a percentage
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=3,
    scoring='explained_variance',
)
print(regression_avg.mean() * 100)
print(regression_avg.std() * 100)

-7049.081236618258
3284.2278039495764
89.40193252736495
7.606993774732442


# Predicting against Real Dataset

In [17]:
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
df = pd.read_csv('Startups_Test_Samp.csv')
df.head() ## Looking at top 5 records

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,185349.2,146897.8,571784.1,New York,
1,182597.7,161377.59,343898.53,California,
2,153441.51,101145.55,407934.54,Florida,
3,154372.41,128671.85,383199.62,New York,
4,132107.34,81391.77,466168.42,Florida,


In [18]:
### Convert non-numeric categorical columns of the scoring sample into numeric ones

## Columns with dtype 'object' are treated as categorical
categoryList = list(df.select_dtypes(include=['object']).columns)
print(categoryList)
print("\n")

## Create dummy variables for the non-numeric categorical variables
dummies = pd.get_dummies(df[categoryList], prefix=categoryList)
print(dummies.head())
print("\n")

## Drop the raw categorical columns
df = df.drop(columns=categoryList)
print(df.head())
print("\n")

## Append the encoded categorical columns
df = pd.concat([df, dummies], axis=1)

X = df.drop(['Profit'], axis=1)
y = df.Profit  ## Profit is empty (NaN) in this sample; it is what the model predicts

## The dummies above are derived from THIS file only. If a state is absent from the
## sample its column is missing, and a state never seen in training adds a column the
## model has no coefficient for. Fail loudly on unseen categories, then align the
## columns (names and order) with the training matrix, filling absent dummies with 0.
unseen = [col for col in X.columns if col not in X_train.columns]
if unseen:
    raise ValueError("Columns not present in the training data: %s" % unseen)
X = X.reindex(columns=X_train.columns, fill_value=0)

X.head()

['State']


   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0


   R&D Spend  Administration  Marketing Spend  Profit
0  185349.20       146897.80        571784.10     NaN
1  182597.70       161377.59        343898.53     NaN
2  153441.51       101145.55        407934.54     NaN
3  154372.41       128671.85        383199.62     NaN
4  132107.34        81391.77        466168.42     NaN




Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,185349.2,146897.8,571784.1,0,0,1
1,182597.7,161377.59,343898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,154372.41,128671.85,383199.62,0,0,1
4,132107.34,81391.77,466168.42,0,1,0


In [19]:
y = regressor.predict(X)
y

array([212378.76779102, 201684.46921256, 178537.48221055, 180915.80126906,
       163518.5503792 ])

# Building the Optimal Model using Backward Elimination

In [20]:
import statsmodels.regression.linear_model as sm
#pip install -U statsmodels

In [21]:
## Multiple Linear Regression : y=b0+b1x1+b2x2+......+bnxn
## Add x0 which is linked with variable b0 and always equals to 1.
#X=np.append(arr=X,values=np.ones((50,1)).astype(int), axis=1) ## Add 1; 50 times; axis=1 as it is a row

import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

# Reload the raw dataset so this section starts from an unmodified frame
df = pd.read_csv('50_Startups.csv')

## Non-numeric ('object' dtype) columns are the categorical ones
categoryList = df.select_dtypes(include=['object']).columns.tolist()

## One dummy (0/1) column per category value
dummies = pd.get_dummies(df[categoryList], prefix=categoryList)
print(dummies.head())

## Replace the raw categorical columns with their encoded counterparts
df = df.drop(columns=categoryList)
print(df.head())
print("\n")
df = pd.concat([df, dummies], axis=1)

## Predictors and target
X = df.drop(['Profit'], axis=1)
y = df.Profit

## NOTE(review): all three State dummies are kept. In every row they sum to 1, i.e. to
## the constant column that is added to X before the OLS fit; this is consistent with
## the very large "Cond. No." in the OLS summaries. Consider drop_first=True in
## get_dummies -- the hard-coded column indices used afterwards would then need updating.

## y = b0 + b1*x1 + ... + bn*xn can also be written as:
## y = b0*x0 + b1*x1 + ... + bn*xn   [assuming x0 = 1]

   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0
   R&D Spend  Administration  Marketing Spend     Profit
0  165349.20       136897.80        471784.10  192261.83
1  162597.70       151377.59        443898.53  191792.06
2  153441.51       101145.55        407934.54  191050.39
3  144372.41       118671.85        383199.62  182901.99
4  142107.34        91391.77        366168.42  166187.94




In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
R&D Spend           50 non-null float64
Administration      50 non-null float64
Marketing Spend     50 non-null float64
Profit              50 non-null float64
State_California    50 non-null uint8
State_Florida       50 non-null uint8
State_New York      50 non-null uint8
dtypes: float64(4), uint8(3)
memory usage: 1.8 KB


In [23]:
## Prepend a column of ones (x0 = 1) so that statsmodels OLS estimates an intercept.
## axis=1 appends along columns; the ones-column is sized from X itself (len(X) rows)
## instead of a hard-coded 50, so the cell keeps working if the dataset size changes.
## NOTE: running this cell twice prepends a second column of ones.
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)

In [24]:
print(df.head())
print("\n")
print(X[0:5])

   R&D Spend  Administration  Marketing Spend     Profit  State_California  \
0  165349.20       136897.80        471784.10  192261.83                 0   
1  162597.70       151377.59        443898.53  191792.06                 1   
2  153441.51       101145.55        407934.54  191050.39                 0   
3  144372.41       118671.85        383199.62  182901.99                 0   
4  142107.34        91391.77        366168.42  166187.94                 0   

   State_Florida  State_New York  
0              0               1  
1              0               0  
2              1               0  
3              0               1  
4              1               0  


[[1.0000000e+00 1.6534920e+05 1.3689780e+05 4.7178410e+05 0.0000000e+00
  0.0000000e+00 1.0000000e+00]
 [1.0000000e+00 1.6259770e+05 1.5137759e+05 4.4389853e+05 1.0000000e+00
  0.0000000e+00 0.0000000e+00]
 [1.0000000e+00 1.5344151e+05 1.0114555e+05 4.0793454e+05 0.0000000e+00
  1.0000000e+00 0.0000000e+00]
 [1.000000

In [25]:
regressor_OLS=sm.OLS(endog=y,exog=X).fit() ## data fitted for the model to detect unnecessary variables

In [26]:
## ## Significance Level = 0.05 or 5%
## Lower the P-Value , better it is
## Lower P Value, reject the null hypothesis, accept alternate hypothesis.
## Alternate hypothesis= This particular variable is influencing the outcome
## Find out the independent variable with highest P-Value
## If the variable with highest P-value is greater than Significance level, drop that variable
## Perform this exercise till the independent variable with highest P-Value is lower than significance level

## x1=R&D
##x2=Admin
## x3=Marketing
## x4=california
##x5=Florida
## x6=new York

regressor_OLS.summary()                  

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Sun, 05 Jul 2020",Prob (F-statistic):,1.34e-27
Time:,08:42:04,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.763e+04,5073.636,7.417,0.000,2.74e+04,4.79e+04
x1,0.8060,0.046,17.369,0.000,0.712,0.900
x2,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x3,0.0270,0.017,1.574,0.123,-0.008,0.062
x4,1.249e+04,2449.797,5.099,0.000,7554.868,1.74e+04
x5,1.269e+04,2726.700,4.654,0.000,7195.596,1.82e+04
x6,1.245e+04,2486.364,5.007,0.000,7439.285,1.75e+04

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,4.47e+21


In [27]:
X=X[:,[0,1,3,4,5,6]] ## REmoved x2=Admin
X[0]

array([1.000000e+00, 1.653492e+05, 4.717841e+05, 0.000000e+00,
       0.000000e+00, 1.000000e+00])

In [28]:
print("\n")
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary()

## x1=R&D
## x2=Marketing
## x3=california
##x4=Florida
## x5=new York





0,1,2,3
Dep. Variable:,Profit,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,215.8
Date:,"Sun, 05 Jul 2020",Prob (F-statistic):,9.720000000000001e-29
Time:,08:42:54,Log-Likelihood:,-525.53
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1071.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.525e+04,2100.376,16.782,0.000,3.1e+04,3.95e+04
x1,0.7967,0.042,18.771,0.000,0.711,0.882
x2,0.0298,0.016,1.842,0.072,-0.003,0.062
x3,1.171e+04,1910.312,6.130,0.000,7861.854,1.56e+04
x4,1.185e+04,2170.903,5.459,0.000,7477.785,1.62e+04
x5,1.169e+04,1988.428,5.879,0.000,7684.996,1.57e+04

0,1,2,3
Omnibus:,14.64,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.037
Skew:,-0.938,Prob(JB):,2.7e-05
Kurtosis:,5.565,Cond. No.,5.19e+21


In [29]:
X=X[:,[0,1,3,4,5]] ## Removed x2=Marketing
regressor_OLS=sm.OLS(endog=y,exog=X).fit()
regressor_OLS.summary() 

## Column labels after removing Marketing (the summary now lists x1..x4 only):
## x1=R&D
## x2=california
## x3=Florida
## x4=new York

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.943
Method:,Least Squares,F-statistic:,272.4
Date:,"Sun, 05 Jul 2020",Prob (F-statistic):,2.76e-29
Time:,08:43:19,Log-Likelihood:,-527.35
No. Observations:,50,AIC:,1063.0
Df Residuals:,46,BIC:,1070.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.686e+04,1959.786,18.806,0.000,3.29e+04,4.08e+04
x1,0.8530,0.030,28.226,0.000,0.792,0.914
x2,1.189e+04,1956.677,6.079,0.000,7955.697,1.58e+04
x3,1.306e+04,2122.665,6.152,0.000,8785.448,1.73e+04
x4,1.19e+04,2036.022,5.847,0.000,7805.580,1.6e+04

0,1,2,3
Omnibus:,13.418,Durbin-Watson:,1.122
Prob(Omnibus):,0.001,Jarque-Bera (JB):,17.605
Skew:,-0.907,Prob(JB):,0.00015
Kurtosis:,5.271,Cond. No.,1.71e+21


In [30]:
print(df.head())
print("\n")
print(X[0])

   R&D Spend  Administration  Marketing Spend     Profit  State_California  \
0  165349.20       136897.80        471784.10  192261.83                 0   
1  162597.70       151377.59        443898.53  191792.06                 1   
2  153441.51       101145.55        407934.54  191050.39                 0   
3  144372.41       118671.85        383199.62  182901.99                 0   
4  142107.34        91391.77        366168.42  166187.94                 0   

   State_Florida  State_New York  
0              0               1  
1              0               0  
2              1               0  
3              0               1  
4              1               0  


[1.000000e+00 1.653492e+05 0.000000e+00 0.000000e+00 1.000000e+00]


In [31]:
X=X[:,[1,2,3,4]] ## Dont pick up the 1 added as a constant

In [32]:
X[0]

array([1.653492e+05, 0.000000e+00, 0.000000e+00, 1.000000e+00])

In [33]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [34]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
print(y_pred)
print("\n")
print(y_test)

[104968.88239767 133957.33622217 135497.0349131   72485.3105571
 179362.04719701 109820.27677127  65657.67476855 100480.71994663
 111730.61659318 169713.51906841]


28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64


In [36]:
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

## Hold-out error metrics; MSE is computed once and reused for RMSE
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

## Explained variance as a percentage: the higher, the better the model
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model

## 10-fold cross-validation on the training split: mean absolute error (reported negated)
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='neg_mean_absolute_error',
)
print("Cross val Mean Abs Error:", regression_avg.mean())
print("Std Dev: Cross val Mean Abs Error:", regression_avg.std())

## NOTE(review): with cv=10 on the 80% training split each validation fold is very small,
## so the per-fold explained variance is likely noisy -- confirm before relying on it.
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='explained_variance',
)
print("Cross Val Explained Variance:", regression_avg.mean()*100)

## R^2 and adjusted R^2 on the hold-out split (closer to 1 is better)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

n_obs = len(y_test)
n_features = len(regressor.coef_)
adj_r2 = 1 - float(n_obs - 1) / (n_obs - n_features - 1) * (1 - r2)
print("adj_r2=", adj_r2)

MAE: 6789.980111923095
MSE: 67565210.75863671
RMSE: 8219.806004927166
Explained Variance Score: 95.91235085783279
Cross val Mean Abs Error: -8603.304421372253
Std Dev: Cross val Mean Abs Error: 3677.6547003358123
Cross Val Explained Variance: 57.380038144763176
r2: 0.947168930401692
adj_r2= 0.9049040747230457


# Including Marketing Spend 

In [37]:
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

# Reload the raw dataset so this section starts from an unmodified frame
df = pd.read_csv('50_Startups.csv')

## Non-numeric ('object' dtype) columns are the categorical ones
categoryList = df.select_dtypes(include=['object']).columns.tolist()

## One dummy (0/1) column per category value
dummies = pd.get_dummies(df[categoryList], prefix=categoryList)
print(dummies.head())

## Replace the raw categorical columns with their encoded counterparts
df = df.drop(columns=categoryList)
print(df.head())
print("\n")
df = pd.concat([df, dummies], axis=1)

## Keep Marketing Spend this time; only Administration is left out of the predictors
y = df.Profit
X = df.drop(['Profit', 'Administration'], axis=1)

## y = b0 + b1*x1 + ... + bn*xn can also be written as:
## y = b0*x0 + b1*x1 + ... + bn*xn   [assuming x0 = 1]

   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0
   R&D Spend  Administration  Marketing Spend     Profit
0  165349.20       136897.80        471784.10  192261.83
1  162597.70       151377.59        443898.53  191792.06
2  153441.51       101145.55        407934.54  191050.39
3  144372.41       118671.85        383199.62  182901.99
4  142107.34        91391.77        366168.42  166187.94




In [38]:
X.head()

Unnamed: 0,R&D Spend,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,471784.1,0,0,1
1,162597.7,443898.53,1,0,0
2,153441.51,407934.54,0,1,0
3,144372.41,383199.62,0,0,1
4,142107.34,366168.42,0,1,0


In [39]:
from sklearn.model_selection import train_test_split

## Same 80/20 split and seed as the other sections
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Refit the existing `regressor` on the new feature set; fit() returns the estimator,
## so the hold-out prediction is chained onto it
y_pred = regressor.fit(X_train, y_train).predict(X_test)

print("Intercept:", regressor.intercept_)

# Coefficient of each variable, in the column order of X_train
print("Regression Coeff:", regressor.coef_)

Intercept: 46238.45656989747
Regression Coeff: [ 7.85141717e-01  3.39564023e-02  9.06036086e+01 -7.35864550e+02
  6.45260941e+02]


In [41]:
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

## Hold-out error metrics; MSE is computed once and reused for RMSE
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

## Explained variance as a percentage: the higher, the better the model
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## ~95.5% of the variance is explained by the model

## 10-fold cross-validation on the training split: mean absolute error (reported negated)
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='neg_mean_absolute_error',
)
print("Cross Val Mean Absolute Error", regression_avg.mean())
print("Std Dev : Cross Val Mean Absolute Error", regression_avg.std())

regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='explained_variance',
)
print("Cross Val Explained Variance:", regression_avg.mean()*100)

## R^2 and adjusted R^2 on the hold-out split (closer to 1 is better)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

n_obs = len(y_test)
n_features = len(regressor.coef_)
adj_r2 = 1 - float(n_obs - 1) / (n_obs - n_features - 1) * (1 - r2)
print("adj_r2=", adj_r2)

MAE: 7109.161997627603
MSE: 70726699.1446261
RMSE: 8409.916714488087
Explained Variance Score: 95.46038229358123
Cross Val Mean Absolute Error -8125.0355918930945
Std Dev : Cross Val Mean Absolute Error 3290.3457957037294
Cross Val Explained Variance: 75.76668685601769
r2: 0.9446968769428328
adj_r2= 0.8755679731213737


# Code to automate Backward Elimination

In [42]:
import numpy as np
#import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset (reloaded so this section starts from an unmodified frame)
df = pd.read_csv('50_Startups.csv')

### Convert non-numeric categorical columns into numeric ones
## Columns with dtype 'object' are treated as categorical
categoryList = list(df.select_dtypes(include=['object']).columns)

## Create dummy variables for the non-numeric categorical variables
dummies = pd.get_dummies(df[categoryList], prefix=categoryList)
print(dummies.head())

## Drop the raw categorical columns
df = df.drop(columns=categoryList)
print(df.head())
print("\n")

## Append the encoded categorical columns
df = pd.concat([df, dummies], axis=1)

X = df.drop(['Profit'], axis=1)
y = df.Profit

## Prepend a column of ones (x0 = 1) so that statsmodels OLS estimates an intercept.
## axis=1 appends along columns; the ones-column is sized from X itself (len(X) rows)
## instead of a hard-coded 50, so the cell keeps working if the dataset size changes.
X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)

   State_California  State_Florida  State_New York
0                 0              0               1
1                 1              0               0
2                 0              1               0
3                 0              0               1
4                 0              1               0
   R&D Spend  Administration  Marketing Spend     Profit
0  165349.20       136897.80        471784.10  192261.83
1  162597.70       151377.59        443898.53  191792.06
2  153441.51       101145.55        407934.54  191050.39
3  144372.41       118671.85        383199.62  182901.99
4  142107.34        91391.77        366168.42  166187.94




In [43]:
X[0]

array([1.000000e+00, 1.653492e+05, 1.368978e+05, 4.717841e+05,
       0.000000e+00, 0.000000e+00, 1.000000e+00])

In [44]:
        import statsmodels.regression.linear_model as sm
        def backwardElimination(x, sl, endog=None):
            """Backward elimination on the columns of ``x`` using OLS p-values.

            Repeatedly fits ``sm.OLS(target, x)`` and removes the single column with
            the highest p-value, until no remaining p-value exceeds ``sl`` (or no
            columns are left).

            Parameters
            ----------
            x : 2-D array-like
                Design matrix, one column per candidate variable. Every column,
                including a constant column if one is present, is treated alike.
            sl : float
                Significance level; a column is dropped while its p-value is the
                largest one and greater than ``sl``.
            endog : array-like, optional
                Dependent variable. When omitted, the notebook-level variable ``y``
                is used, so existing ``backwardElimination(X, sl)`` calls still work.

            Returns
            -------
            numpy.ndarray
                ``x`` restricted to the surviving columns, in their original order.

            Notes
            -----
            Changes relative to the previous version: exactly one column is removed
            per refit (tied p-values used to be deleted in one pass with indices that
            were stale after the first deletion), the loop stops as soon as nothing
            exceeds ``sl`` instead of refitting unchanged data, and the discarded
            ``summary()`` call was dropped.
            """
            # Fall back to the notebook-level target for backward compatibility.
            target = y if endog is None else endog
            x = np.asarray(x)

            while x.shape[1] > 0:
                regressor_OLS = sm.OLS(target, x).fit()
                pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)

                # Nothing comparable to rank (all p-values undefined): stop.
                if np.isnan(pvalues).all():
                    break

                # Position of the least significant column in the CURRENT matrix.
                worst = int(np.nanargmax(pvalues))
                if not pvalues[worst] > sl:
                    break  # every remaining column is significant at level sl

                x = np.delete(x, worst, 1)  # drop that column, then refit

            return x

In [45]:
sl= 0.05 ## try with 0.08
X = backwardElimination(X, sl)

X[0]

array([1.000000e+00, 1.653492e+05, 0.000000e+00, 0.000000e+00,
       1.000000e+00])

In [46]:
X=X[:,[1,2,3,4]] 

In [47]:
print(df.head())
print("\n")
print(X[0])

   R&D Spend  Administration  Marketing Spend     Profit  State_California  \
0  165349.20       136897.80        471784.10  192261.83                 0   
1  162597.70       151377.59        443898.53  191792.06                 1   
2  153441.51       101145.55        407934.54  191050.39                 0   
3  144372.41       118671.85        383199.62  182901.99                 0   
4  142107.34        91391.77        366168.42  166187.94                 0   

   State_Florida  State_New York  
0              0               1  
1              0               0  
2              1               0  
3              0               1  
4              1               0  


[1.653492e+05 0.000000e+00 0.000000e+00 1.000000e+00]


In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
y_pred

array([104968.88239767, 133957.33622217, 135497.0349131 ,  72485.3105571 ,
       179362.04719701, 109820.27677127,  65657.67476855, 100480.71994663,
       111730.61659318, 169713.51906841])

In [49]:
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

## Hold-out error metrics; MSE is computed once and reused for RMSE
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

## Explained variance as a percentage: the higher, the better the model
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred)*100)
## 95.9% of the variance is explained by the model

## 10-fold cross-validation on the training split: mean absolute error (reported negated)
regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='neg_mean_absolute_error',
)
print("Cross Val Mean Absolute Error", regression_avg.mean())

regression_avg = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10,
    scoring='explained_variance',
)
print("Cross Val Explained Variance:", regression_avg.mean()*100)

## R^2 and adjusted R^2 on the hold-out split (closer to 1 is better)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

n_obs = len(y_test)
n_features = len(regressor.coef_)
adj_r2 = 1 - float(n_obs - 1) / (n_obs - n_features - 1) * (1 - r2)
print("adj_r2=", adj_r2)

MAE: 6789.980111923095
MSE: 67565210.75863671
RMSE: 8219.806004927166
Explained Variance Score: 95.91235085783279
Cross Val Mean Absolute Error -8603.304421372253
Cross Val Explained Variance: 57.380038144763176
r2: 0.947168930401692
adj_r2= 0.9049040747230457
