In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [94]:
# the data set is of 50 companies - create a model to predict profit based on the features

In [95]:
# multiple linear regression :
# y = Bo + B1*X1 + B2*X2 + B3*X3 + B4*X4 + ........ + Bn*Xn 

# y -> dependent variable 
# X1,X2,X3,... -> independent variables

# assumptions of a linear regression -> linearity , homoscedasticity , multivariate normality , independence of errors , lack of multicollinearity

# dummy variables :
                #  a categorical independent variable cannot be put into the equation directly : the approach to take care of it : dummy variables
                #  for each different category : create a separate column
                #  for each row : if the row's category is the same as the column's category : put 1 , else put 0
                #  similar to one hot encoding 
                #  these new columns are called dummy variables -> in the regression eqn they appear as new variables -> D1,D2,D3,... 
                #  the dummy variables -> work like a switch -> 1,0 
                #  however , the effect of the omitted (baseline) category , for which all dummies are 0 , is included in Bo 

# Dummy Variable Trap :
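                      # including a dummy for every category leads to perfect multicollinearity (the dummies always sum to 1 , which duplicates the constant)
                      # e.g. with two categories (D2 = 1 - D1) we cant have Bo , B4*D1 and B5*D2 in the model at the same time
                      # always omit one dummy variable 
                      # lets say -> total different categories -> 10 , so create 9 dummy variables 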

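# P-value : probability of getting results at least as extreme as the observed ones if the predictor had no real effect -> a measure to check if the results of an experiment are within the normal range of values for the events being observed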

In [96]:
# Building a model 
# we need to decide which parameters we wanna keep and which we wanna remove
# some parameters do not actually contribute and some might lead to conflicts

# Methods of building models :
    # * All-in
    # * Stepwise Regression :
    #                          1. Backward Elimination
    #                          2. Forward Selection
    #                          3. Bidirectional Elimination
    # * Score Comparison

# All-In : just use all 
        #  : prior knowledge of using all or have to use all
        #  : preparing for backward elimination

# Backward-Elimination -
                    # step 1 : select a significance level to stay in the model  eg: SL=0.05
                    # step 2 : fit the full model with all possible predictors
                    # step 3 : consider the predictor with the highest P-value. If P > SL go to step 4 , otherwise go to FIN
                    # step 4 : remove the predictor
                    # step 5 : fit model without this variable 
                    # again go to step 3
                    # FIN - finish -> model is ready

# Forward-Selection -
                    # step 1 : select a significance level to enter the model eg SL=0.05
                    # step 2 : fit a simple regression model with every single independent variable that we have -> select the one with the lowest P-Value
                    # step 3 : keep this variable and fit all possible models with one extra predictor added to the one(s) we already have -> basically , create a reg model with all possible 2 var regression
                    # step 4 : now out of all this -> consider the predictor with lowest P-value -> if P<Sl go to step 3 , otherwise go to FIN
                    # FIN - finish -> ready

# Bidirectional-Elimination -
                    # step 1 : select a significance level to enter and to stay in the model eg:SLENTER = 0.05 , SLSTAY = 0.05
                    # step 2 : perform the next step of Forward Selection (new variables must have P<SLENTER to enter)
                    # step 3 : perform all the steps of Backward Elimination (old variables must have P<SLSTAY to stay)
                    # again go to step 2 -> very iterative process
                    # stop when -> no new variables can enter and no old variables can exit
                    # FIN - finish -> ready

# All Possible Models -
                    # step 1 : select a criterion of goodness of fit 
                    # step 2 : construct all possible regression models : 2^n - 1 total combinations
                    # step 3 :  select the one with best criterion
                    # expo growth -> very resource consuming
                    

In [97]:
# CODE : load the data , one-hot encode the categorical column , split into train and test

# X = every column except the last one (features) ; Y = column index 4 (target)
# NOTE(review): the two slices only agree if the CSV has exactly 5 columns , so that
# index 4 is also the last column -- presumably true for this file ; verify.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,4].values
# print(X)


# need to encode the categorical variable - independent variable in this case

# earlier attempt , kept for reference only
# NOTE(review): `categorical_features` is an older OneHotEncoder argument --
# presumably not accepted by current scikit-learn releases ; verify before reviving this.
# from sklearn.preprocessing import LabelEncoder , OneHotEncoder
# labelEncoder_X = LabelEncoder()
# X[:,3] = labelEncoder_X.fit_transform(X[:,3])
# onehotencoder = OneHotEncoder(categorical_features=[3])
# X = onehotencoder.fit_transform(X).toarray()

# alternative that was also considered (pandas one-hot encoding of the 'State' column) :
# X = pd.get_dummies(dataset,columns =['State'])

# LabelEncoder is imported but not used by the active code in this cell
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# State column : one-hot encode column index 3 and pass every other column through unchanged .
# In the X printed later in this notebook the 0/1 dummy columns come first ,
# followed by the numeric columns .
ct = ColumnTransformer([("State", OneHotEncoder(), [3])], remainder = 'passthrough')
X = ct.fit_transform(X)


# avoiding the dummy variable trap -> omit one col
# column 0 is one of the dummy columns , so slicing from 1 drops exactly one dummy
X=X[:,1:]
# print(X) 

#splitting the dataset into train and test
# test_size=0.2 holds out 20% of the rows ; random_state=0 fixes the shuffle so the split is reproducible
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
# print(X_train)

# no feature scaling is applied in this notebook ; the least-squares fit used below
# does not need it ( coefficients simply adapt to each column's scale )

In [98]:
# fit model : multiple linear regression on the training split only
# ( X_train / Y_train come from the train_test_split call in the preprocessing cell )
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() # regressor is an object of the LinearRegression class
# fit() estimates the coefficients from X_train / Y_train ; the test split is not touched here
regressor.fit(X_train,Y_train)

In [99]:
# testing performance of regressor 
# predict the target for the held-out test rows ; compare these values against Y_test
# to judge the fit ( no metric is computed in this cell )
Y_pred = regressor.predict(X_test)
# bare expression -> the notebook displays the prediction array as the cell output
Y_pred

array([103015.20159797, 132582.27760815, 132447.73845175,  71976.09851258,
       178537.48221054, 116161.24230164,  67851.69209676,  98791.73374688,
       113969.43533012, 167921.0656955 ])

In [100]:
# building optimal model using backward elimination

# NOTE(review): this alias is not used by the visible cells below ( they use
# statsmodels.regression.linear_model as lm ) ; kept so later cells that may rely on it still work
import statsmodels.formula.api as sm
# the statsmodels OLS class doesnt include Bo ( the intercept ) in the regression eqn on its own ,
# so we add a col of ones in the beginning of X ; this col acts as the basis for Bo
# the number of rows is taken from X itself instead of hard-coding 50 , so the cell
# keeps working if the dataset grows or shrinks
# NOTE: this cell is not idempotent -> running it twice prepends a second col of ones ;
# re-run the preprocessing cell first if you need to execute it again
X = np.append(arr=np.ones((X.shape[0],1)).astype(int),values=X,axis=1)
print(X)  # added a col of ones in beginning of X

[[1 0.0 1.0 165349.2 136897.8 471784.1]
 [1 0.0 0.0 162597.7 151377.59 443898.53]
 [1 1.0 0.0 153441.51 101145.55 407934.54]
 [1 0.0 1.0 144372.41 118671.85 383199.62]
 [1 1.0 0.0 142107.34 91391.77 366168.42]
 [1 0.0 1.0 131876.9 99814.71 362861.36]
 [1 0.0 0.0 134615.46 147198.87 127716.82]
 [1 1.0 0.0 130298.13 145530.06 323876.68]
 [1 0.0 1.0 120542.52 148718.95 311613.29]
 [1 0.0 0.0 123334.88 108679.17 304981.62]
 [1 1.0 0.0 101913.08 110594.11 229160.95]
 [1 0.0 0.0 100671.96 91790.61 249744.55]
 [1 1.0 0.0 93863.75 127320.38 249839.44]
 [1 0.0 0.0 91992.39 135495.07 252664.93]
 [1 1.0 0.0 119943.24 156547.42 256512.92]
 [1 0.0 1.0 114523.61 122616.84 261776.23]
 [1 0.0 0.0 78013.11 121597.55 264346.06]
 [1 0.0 1.0 94657.16 145077.58 282574.31]
 [1 1.0 0.0 91749.16 114175.79 294919.57]
 [1 0.0 1.0 86419.7 153514.11 0.0]
 [1 0.0 0.0 76253.86 113867.3 298664.47]
 [1 0.0 1.0 78389.47 153773.43 299737.29]
 [1 1.0 0.0 73994.56 122782.75 303319.26]
 [1 1.0 0.0 67532.53 105751.03 30476

In [101]:
# backward-elimination : step 1 -> fit the full model ( constant + all 5 predictors )


# earlier attempts , kept for reference only ( not executed ) :
# import statsmodels.regression.linear_model as lm
# import statsmodels.api as smf;
# X_opt = X[:,[0,1,2,3,4,5]]  # optimal matrix that will have only significant variables
# # regressor_new = lm.OLS(endog=Y,exog=X_opt).fit()  #object of statsmodel ols class
# regressor_new = smf.OLS(Y,X_opt).fit()

import statsmodels.regression.linear_model as lm
# the col of ones was already added in the previous cell , so this stays commented out :
# X = np.append(arr = np.ones((50,1)).astype(int), values = X, axis = 1)
# X_opt holds the columns currently kept in the model ( column 0 is the constant )
# NOTE(review): .tolist() presumably converts the mixed int/float array into plain
# numbers that OLS accepts -- verify whether the array could be passed directly
X_opt = X[:, [0, 1, 2, 3, 4, 5]].tolist()
SL = 0.05  # significance level a predictor must meet to stay in the model
regression_OLS = lm.OLS(endog = Y, exog = X_opt). fit()
regression_OLS.summary()
# the lower the P value -> the more significant the parameter is
# x2 ( column 2 of X ) has the highest P value , 0.99 -> remove it


0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,169.9
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,1.34e-27
Time:,17:07:39,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1063.0
Df Residuals:,44,BIC:,1074.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.013e+04,6884.820,7.281,0.000,3.62e+04,6.4e+04
x1,198.7888,3371.007,0.059,0.953,-6595.030,6992.607
x2,-41.8870,3256.039,-0.013,0.990,-6604.003,6520.229
x3,0.8060,0.046,17.369,0.000,0.712,0.900
x4,-0.0270,0.052,-0.517,0.608,-0.132,0.078
x5,0.0270,0.017,1.574,0.123,-0.008,0.062

0,1,2,3
Omnibus:,14.782,Durbin-Watson:,1.283
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.266
Skew:,-0.948,Prob(JB):,2.41e-05
Kurtosis:,5.572,Cond. No.,1450000.0


In [102]:
# backward elimination , next step : column 2 of X ( a 0/1 dummy col , P = 0.990 in the full model ) is dropped
X_opt = X[:, [0, 1, 3, 4, 5]].tolist()
SL = 0.05  # significance level to stay in the model
regression_OLS = lm.OLS(endog = Y, exog = X_opt). fit()
regression_OLS.summary()
# x1 of this summary is column 1 of X ( the remaining dummy col ) ; it now has the
# highest P value ( 0.940 ) -> remove it next

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,8.49e-29
Time:,17:09:52,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04,6.35e+04
x1,220.1585,2900.536,0.076,0.940,-5621.821,6062.138
x2,0.8060,0.046,17.606,0.000,0.714,0.898
x3,-0.0270,0.052,-0.523,0.604,-0.131,0.077
x4,0.0270,0.017,1.592,0.118,-0.007,0.061

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [104]:
# backward elimination , next step : column 1 of X ( P = 0.940 in the previous fit ) is dropped
# -> the model now has the constant plus columns 3 , 4 and 5
X_opt = X[:, [0,3,4,5]].tolist()
SL = 0.05  # significance level to stay in the model
regression_OLS = lm.OLS(endog = Y, exog = X_opt). fit()
# in this summary x1 , x2 , x3 correspond to columns 3 , 4 , 5 of X
regression_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,4.53e-30
Time:,17:13:36,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715,0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130,0.076
x3,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [106]:
# backward elimination , next step : column 4 of X ( P = 0.602 in the previous fit ) is dropped
# -> the model now has the constant plus columns 3 and 5
X_opt = X[:, [0, 3, 5]].tolist()
SL = 0.05  # significance level to stay in the model
regression_OLS = lm.OLS(endog = Y, exog = X_opt). fit()
# in this summary x1 , x2 correspond to columns 3 , 5 of X
regression_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,2.1600000000000003e-31
Time:,17:15:20,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04,5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713,0.880
x2,0.0299,0.016,1.927,0.060,-0.001,0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [108]:
# column 5 of X ( x2 in the previous summary ) has a P-value of 0.060 -> only just above SL -> should we remove it or not
# applying the rule strictly ( 0.060 > SL = 0.05 ) -> removing it :
X_opt = X[:, [0, 3]].tolist()
SL = 0.05  # significance level to stay in the model
regression_OLS = lm.OLS(endog = Y, exog = X_opt). fit()
# only the constant and column 3 of X are left in the model
regression_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Tue, 03 Jan 2023",Prob (F-statistic):,3.5000000000000004e-32
Time:,17:17:01,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04,5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795,0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [116]:
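# after following backward elimination 
# only one parameter turned out to be significant -> column 3 of X
# for column 5 , whose P-value ( 0.060 ) was only just above the SL -> also look at other metrics such as R-squared or adjusted R-squared ( adjusted R-squared went from 0.948 to 0.945 when it was removed )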


# another imp thing -> rather than copy and paste back-ward elimination each time
# use this 

import statsmodels.regression.linear_model as lm
def backwardElimination(x, sl, y=None):
    """Run backward elimination and return the columns that survive it.

    An OLS model is fitted on ``x``; if the largest p-value exceeds ``sl`` the
    corresponding column is removed and the model is refitted. This repeats
    until every remaining p-value is at or below ``sl`` or no column is left.

    Exactly one column is removed per fit, so the p-values that drive each
    decision always belong to the current set of columns. When several columns
    tie for the largest p-value, the left-most one is removed first and the
    others are re-evaluated on the next fit.

    Parameters
    ----------
    x : 2-D array-like of numbers
        Design matrix, one row per observation. A constant column, if wanted,
        must already be part of ``x``; it is treated like any other column and
        may be eliminated.
    sl : float
        Significance level a predictor must meet to stay in the model.
    y : array-like, optional
        Dependent variable. When omitted, the notebook-level ``Y`` is used.

    Returns
    -------
    numpy.ndarray
        Float array with the same rows as ``x`` and only the surviving
        columns, in their original order.
    """
    if y is None:
        y = Y  # fall back to the notebook-level target
    # Work on a float copy: the caller's data is never mutated and the return
    # type is an ndarray whether or not anything gets removed.
    x = np.array(x, dtype=float)
    while x.shape[1] > 0:
        regressor_OLS = lm.OLS(endog=y, exog=x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        worst = int(np.argmax(pvalues))
        # Stop as soon as the least significant predictor passes the test
        # (also stops if its p-value is NaN instead of deleting blindly).
        if not pvalues[worst] > sl:
            break
        x = np.delete(x, worst, axis=1)
    return x
 
SL = 0.05  # significance level a predictor must meet to stay in the model
# start from the full matrix : constant ( column 0 ) + all 5 predictors
X_opt = X[:, [0, 1, 2, 3, 4, 5]].tolist()
# automated version of the manual steps done in the cells above
X_Modeled = backwardElimination(X_opt, SL)
# the printed result has 2 columns : the constant and column 3 of X
print(X_Modeled)
# print(X_Modeled.dtype)

[[1.0000000e+00 1.6534920e+05]
 [1.0000000e+00 1.6259770e+05]
 [1.0000000e+00 1.5344151e+05]
 [1.0000000e+00 1.4437241e+05]
 [1.0000000e+00 1.4210734e+05]
 [1.0000000e+00 1.3187690e+05]
 [1.0000000e+00 1.3461546e+05]
 [1.0000000e+00 1.3029813e+05]
 [1.0000000e+00 1.2054252e+05]
 [1.0000000e+00 1.2333488e+05]
 [1.0000000e+00 1.0191308e+05]
 [1.0000000e+00 1.0067196e+05]
 [1.0000000e+00 9.3863750e+04]
 [1.0000000e+00 9.1992390e+04]
 [1.0000000e+00 1.1994324e+05]
 [1.0000000e+00 1.1452361e+05]
 [1.0000000e+00 7.8013110e+04]
 [1.0000000e+00 9.4657160e+04]
 [1.0000000e+00 9.1749160e+04]
 [1.0000000e+00 8.6419700e+04]
 [1.0000000e+00 7.6253860e+04]
 [1.0000000e+00 7.8389470e+04]
 [1.0000000e+00 7.3994560e+04]
 [1.0000000e+00 6.7532530e+04]
 [1.0000000e+00 7.7044010e+04]
 [1.0000000e+00 6.4664710e+04]
 [1.0000000e+00 7.5328870e+04]
 [1.0000000e+00 7.2107600e+04]
 [1.0000000e+00 6.6051520e+04]
 [1.0000000e+00 6.5605480e+04]
 [1.0000000e+00 6.1994480e+04]
 [1.0000000e+00 6.1136380e+04]
 [1.0000