# Multiple Linear Regression - Python

In [1]:
# Importing dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
# Load the data: every column except the last is a feature, the last column
# is the regression target.
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
# Pick the target by position from the end so it always stays in step with the
# feature slice above (previously a hard-coded column index 4).
Y = dataset.iloc[:, -1].values

## Encoding categorical data

In [3]:
# Encoding the independent categorical variable (column 3 of X)

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Replace the category labels in column 3 with integer codes.
labelencoder_X = LabelEncoder()
X[:,3] = labelencoder_X.fit_transform(X[:,3])

# One-hot encode only that column. The `categorical_features` argument that was
# used here before was removed from OneHotEncoder in scikit-learn 0.22, so the
# column is handed to the encoder on its own and X is reassembled by hand.
oneHotEnconder = OneHotEncoder()
dummy_columns = oneHotEnconder.fit_transform(X[:, [3]]).toarray()

# Dummy columns first, then the remaining features in their original order --
# the same layout `categorical_features = [3]` used to produce. The final cast
# turns the object array into a plain float matrix.
X = np.hstack((dummy_columns, np.delete(X, 3, axis = 1))).astype(float)

### Avoiding Dummy variable trap
To avoid the __dummy variable trap__, one of the columns generated by one-hot encoding must be removed.

In [4]:
# Drop the first column of X (one of the one-hot dummy columns)

X = np.delete(X, 0, axis = 1)

## Splitting the dataset into training and test set

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.2,
    random_state = 0,
)

## Fitting Model

In [6]:
# Fit a multiple linear regression on the training split

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X = X_train, y = Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Predict Test Set Results

In [7]:
# Predict the target for the held-out test rows with the fitted model
Y_pred = model.predict(X = X_test)

## Backward Elimination

In [8]:
# Building the optimal model using Backward Elimination

# import statsmodels.api as sm
# statsmodels' OLS does not add an intercept term by itself, so a column of
# ones is prepended to X. The row count comes from X instead of being
# hard-coded to 50, so this keeps working if the dataset size changes.
# NOTE: running this cell more than once prepends another column each time.
X = np.append(arr = np.ones((X.shape[0], 1)).astype(int), values = X, axis = 1)

#|> First iteration
# X_opt = X[:,[0,1,2,3,4,5]]
# model_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
# model_OLS.summary()

#|> Second iteration - removing index 2
# X_opt = X[:,[0,1,3,4,5]]
# model_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
# model_OLS.summary()

#|> Third iteration - removing index 1
# X_opt = X[:,[0,3,4,5]]
# model_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
# model_OLS.summary()

#|> Fourth iteration - removing index 4
# X_opt = X[:,[0,3,5]]
# model_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
# model_OLS.summary()

#|> Fifth iteration - removing index 5
# X_opt = X[:,[0,3]]
# model_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
# model_OLS.summary()

In [9]:
# Backward Elimination with p-values only:

# OLS lives in statsmodels.api; statsmodels.formula.api no longer exposes it.
import statsmodels.api as sm

def backwardElimination(x, sl, y = None):
    """Backward elimination driven by p-values only.

    Repeatedly fits an OLS model of `y` on the columns of `x`, and while the
    largest p-value exceeds `sl`, removes the corresponding column and refits.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per candidate predictor (including the
        intercept column if one is wanted).
    sl : float
        Significance level; a predictor is dropped when its p-value is
        strictly greater than this.
    y : 1-D array-like, optional
        Response values. Defaults to the notebook-level `Y`.

    Returns
    -------
    numpy.ndarray
        `x` with the eliminated columns removed (remaining columns keep their
        relative order).
    """
    if y is None:
        y = Y
    x = np.asarray(x)

    # Loop on the current width of x rather than a fixed iteration count, so a
    # zero-column input returns immediately instead of failing.
    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype = float)
        worst = int(np.argmax(pvalues))

        # Every remaining predictor is significant: nothing left to remove.
        if pvalues[worst] <= sl:
            break

        # Remove exactly one column per fit; the p-values are stale as soon as
        # a column is gone, so the model must be refitted before the next drop.
        x = np.delete(x, worst, 1)

    return x

# Significance level for elimination, then run it on the first six columns of X
SL = 0.05
X_opt = X[:, np.arange(6)]
X_Modeled = backwardElimination(X_opt, SL)

In [14]:
# Backward Elimination with p-values and Adjusted R Squared:

# OLS lives in statsmodels.api; statsmodels.formula.api no longer exposes it.
import statsmodels.api as sm

def backwardElimination(x, SL, y = None):
    """Backward elimination using p-values with an adjusted R-squared check.

    Repeatedly fits an OLS model of `y` on the columns of `x`. While the
    largest p-value exceeds `SL`, the corresponding column is tentatively
    removed; if the removal does not increase the adjusted R-squared, the
    removal is undone, the summary of the model before removal is printed,
    and the matrix from before the removal is returned.

    Parameters
    ----------
    x : 2-D array-like
        Design matrix, one column per candidate predictor (including the
        intercept column if one is wanted).
    SL : float
        Significance level; a predictor is a removal candidate when its
        p-value is strictly greater than this.
    y : 1-D array-like, optional
        Response values. Defaults to the notebook-level `Y`.

    Returns
    -------
    numpy.ndarray
        The retained columns of `x`, in their original relative order and
        with their original values.
    """
    if y is None:
        y = Y
    x = np.asarray(x)

    while x.shape[1] > 0:
        regressor_OLS = sm.OLS(y, x).fit()
        pvalues = np.asarray(regressor_OLS.pvalues, dtype = float)
        worst = int(np.argmax(pvalues))

        # Every remaining predictor is significant: nothing left to remove.
        if pvalues[worst] <= SL:
            break

        adjR_before = float(regressor_OLS.rsquared_adj)
        reduced = np.delete(x, worst, 1)
        adjR_after = float(sm.OLS(y, reduced).fit().rsquared_adj)

        if adjR_before >= adjR_after:
            # Dropping the column did not improve the fit. `x` was never
            # modified, so returning it is the rollback -- no scratch buffer
            # with a fixed shape or integer dtype is needed, and no column
            # can be lost, truncated or reordered.
            print(regressor_OLS.summary())
            return x

        x = reduced

    return x
 
# Significance level for elimination, then run it on the first six columns of X
SL = 0.05
X_opt = X[:, np.arange(6)]
X_Modeled = backwardElimination(X_opt, SL)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.950
Model:                            OLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     450.8
Date:                Sun, 29 Apr 2018   Prob (F-statistic):           2.16e-31
Time:                        23:10:32   Log-Likelihood:                -525.54
No. Observations:                  50   AIC:                             1057.
Df Residuals:                      47   BIC:                             1063.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.698e+04   2689.933     17.464      0.0