    Has 5 methods of building models:
All in  |  Backward Elimination | Forward Selection | Bidirectional Elimination | Score Comparison

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pts

In [2]:
# Import the dataset: read 50_Startups.csv (a path relative to the current
# working directory) into a pandas DataFrame.

dataset = pd.read_csv('50_Startups.csv')

In [3]:
# Preview the first 10 rows rather than rendering the whole frame.
dataset.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
#Separating matrix of features and dependent variable

# Both slices are expressed relative to the last column so they stay in sync
# with each other regardless of how many feature columns the file has.
X = dataset.iloc[:, :-1].values #independent: every column except the last
Y = dataset.iloc[:, -1].values #dependent: the last column

In [5]:
#Encoding Categorical Data - Dummy Encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 3 of X and pass every other column through unchanged.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')

# `np.float` was only an alias for the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, so the builtin is used directly.
X = np.array(ct.fit_transform(X), dtype=float)

In [6]:
#------------AVOIDING THE DUMMY VARIABLE TRAP------------#
# Drop the first column of X so one of the one-hot dummy columns is removed
# (assumes the encoded columns come first in X - TODO confirm).
# NOTE: X is reassigned from itself, so re-running this cell without
# re-running the previous ones drops one more column each time.

X = X[:, 1:]

In [7]:
#Splitting the Dataset into the Training set and Test set
# test_size = 0.2 holds out 20% of the rows as the test set;
# random_state = 0 fixes the shuffle so the split is reproducible.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

We don't need to apply feature scaling for multiple linear regression because the library will take care of that for us!

#### Fitting Multiple Linear Regression to the Training set

In [8]:
from sklearn.linear_model import LinearRegression
# Create the regressor with its default settings.
lr = LinearRegression()

# Fit on the training split only; the test split is kept for evaluation.
lr.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Predicting the Test set results

In [9]:
# Predict the dependent variable for the held-out test rows.
Y_pred = lr.predict(X_test) 

In [10]:
# Print each test-set prediction next to the actual value, numbered from 1.
for position, predicted in enumerate(Y_pred, start=1):
    print('#', position, predicted, '=!', Y_test[position - 1])

# 1 103015.20159796216 =! 103282.38
# 2 132582.27760815847 =! 144259.4
# 3 132447.73845174976 =! 146121.95
# 4 71976.09851258763 =! 77798.83
# 5 178537.4822105438 =! 191050.39
# 6 116161.24230163351 =! 105008.31
# 7 67851.69209676176 =! 81229.06
# 8 98791.73374687924 =! 97483.56
# 9 113969.43533011667 =! 110352.25
# 10 167921.06569550227 =! 166187.94


In [11]:
#Building the optimal model using Backward Elimination

import statsmodels.api as sm

# Prepend a column of ones to X for the intercept term of the OLS fits below.
# The row count is taken from X itself instead of being hard-coded to 50, so
# the cell keeps working if the dataset size changes.
# NOTE: X is reassigned from itself; re-running this cell alone prepends
# another column of ones.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)

In [12]:
# Design matrix for this fit: only columns 0 and 3 of X.
X_opt = X[:,[0,3]]

lr_OLS = sm.OLS(endog = Y, exog = X_opt).fit() #ordinary least squares

# Last expression of the cell, so the summary table is rendered as its output.
lr_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Wed, 03 Jun 2020",Prob (F-statistic):,3.5000000000000004e-32
Time:,18:14:11,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


# Backward Elimination Function with p-values only:

In [13]:
def backwardElimination(x, sl, y):
    """Backward elimination driven by p-values only.

    Repeatedly fits an OLS model of ``y`` on ``x`` and removes the single
    column with the highest p-value, until no remaining p-value exceeds
    ``sl`` or no column is left. The summary of the last fitted model is
    printed before returning.

    Parameters
    ----------
    x : 2-D array-like
        Matrix of features, one column per candidate predictor. No intercept
        column is added here; include one in ``x`` if it is wanted.
    sl : float
        Significance level; a column is removed while its p-value is the
        largest one and is greater than ``sl``.
    y : array-like
        Dependent variable, one value per row of ``x``.

    Returns
    -------
    numpy.ndarray
        ``x`` without the eliminated columns. The input array is not modified.
    """
    import statsmodels.api as sm

    x = np.asarray(x)
    regressor_OLS = None

    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(endog = y, exog = x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)

        # argmax picks exactly one column per fit, so tied p-values can never
        # cause a removal that is based on indices from an outdated fit.
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > sl:
            # Every remaining column is significant: nothing left to remove.
            break
        x = np.delete(x, worst, 1)

    if regressor_OLS is not None:
        print (regressor_OLS.summary())
    return x

#Example:
SL = 0.05  # significance level passed to the elimination function
X_opt = X[:, 0:6]  # columns 0 through 5 of X as the starting candidates
X_Modeled = backwardElimination(X_opt, SL, Y)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.947
Model:                            OLS   Adj. R-squared:                  0.945
Method:                 Least Squares   F-statistic:                     849.8
Date:                Wed, 03 Jun 2020   Prob (F-statistic):           3.50e-32
Time:                        18:14:11   Log-Likelihood:                -527.44
No. Observations:                  50   AIC:                             1059.
Df Residuals:                      48   BIC:                             1063.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.903e+04   2537.897     19.320      0.0

# Backward Elimination Function with p-values and Adjusted R Squared:

In [14]:
def backwardEliminationEvolution(x, SL, y):
    """Backward elimination driven by p-values and adjusted R-squared.

    Starting from an OLS fit of ``y`` on ``x``, the column with the highest
    p-value is tentatively removed while that p-value exceeds ``SL``. The
    removal is kept only if the adjusted R-squared of the reduced model is
    strictly greater than that of the current model; otherwise elimination
    stops and the current matrix is returned. The summary of the model that
    corresponds to the returned matrix is printed.

    Parameters
    ----------
    x : 2-D array-like
        Matrix of features, one column per candidate predictor. No intercept
        column is added here; include one in ``x`` if it is wanted.
    SL : float
        Significance level; only a column whose p-value is the largest one
        and is greater than ``SL`` is considered for removal.
    y : array-like
        Dependent variable, one value per row of ``x``.

    Returns
    -------
    numpy.ndarray
        ``x`` without the eliminated columns, in their original order and
        with their original values. The input array is not modified.
    """
    import statsmodels.api as sm

    x = np.asarray(x)
    regressor_OLS = sm.OLS(endog = y, exog = x).fit()

    # The last remaining column is never removed: there would be no reduced
    # model left to compare the adjusted R-squared against.
    while x.shape[1] > 1:
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        if not pvalues[worst] > SL:
            break

        # Build the reduced matrix separately so the current one stays intact;
        # "rolling back" is then simply returning x as it is, with no
        # fixed-size scratch buffer and no loss of precision.
        x_reduced = np.delete(x, worst, 1)
        tmp_regressor = sm.OLS(endog = y, exog = x_reduced).fit()

        adjR_before = float(regressor_OLS.rsquared_adj)
        adjR_after = float(tmp_regressor.rsquared_adj)
        if adjR_before >= adjR_after:
            # Removing the column did not improve the model: keep it and stop.
            break

        x, regressor_OLS = x_reduced, tmp_regressor

    print (regressor_OLS.summary())
    return x

#Example:
SL = 0.05  # significance level passed to the elimination function
# NOTE(review): `:-1` leaves the last column of X out of the candidate set,
# whereas the p-value-only example passes X[:, 0:6] - confirm this is intended.
X_opt = X[:, :-1]
X_Modeled = backwardEliminationEvolution(X_opt, SL, Y)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.948
Model:                            OLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     426.8
Date:                Wed, 03 Jun 2020   Prob (F-statistic):           7.29e-31
Time:                        18:14:11   Log-Likelihood:                -526.83
No. Observations:                  50   AIC:                             1060.
Df Residuals:                      47   BIC:                             1065.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       5.489e+04   6016.718      9.122      0.0