# Multiple Linear Regression

## Importing the libraries

In [66]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [67]:
# Load the startup data from the CSV file in the working directory.
dataset = pd.read_csv('50_Startups.csv')

# Feature matrix: every column except the last one.
# Target vector: the column at position 4.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, 4].values

## Taking care of missing data

In [68]:
# Nothing in this cell is executed: both imputation snippets below are kept
# commented out for reference only.
#
# Deprecated API (sklearn.preprocessing.Imputer):
# from sklearn.preprocessing import Imputer
# imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
#
# Newer API (sklearn.impute.SimpleImputer), mean-imputing columns 1 and 2:
#from sklearn.impute import SimpleImputer
#imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#imputer = imputer.fit(X[:, 1:3])
#X[:, 1:3] = imputer.transform(X[:, 1:3])

## Encoding categorical data

In [69]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3 and pass every other
# column through unchanged.
#
# The `categorical_features` argument of OneHotEncoder was deprecated in
# scikit-learn 0.20 and removed in 0.22; ColumnTransformer is the supported
# replacement. OneHotEncoder accepts string categories directly, so the
# separate LabelEncoder pass is no longer needed (both sort the categories,
# so the order of the dummy columns is the same).
#
# The encoded dummy columns come first in the output, followed by the
# passthrough columns, matching the layout the old API produced.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    transformers=[('onehot', onehotencoder, [3])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array (replaces .toarray())
)

# The passthrough columns come from an object-dtype array, so cast the
# combined result to float explicitly.
X = np.asarray(column_transformer.fit_transform(X), dtype=float)

## Avoiding the Dummy Variable Trap

In [70]:
# Drop the first dummy column so the remaining dummies are not redundant
# with each other (the "dummy variable trap"). The original author notes that
# sklearn copes without this step; it is done explicitly here anyway.
# NOTE: this rebinds X from itself, so re-running the cell drops one more column.
X = np.delete(X, 0, axis=1)

## Splitting the dataset into the Training set and Test set

In [71]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Feature Scaling

In [72]:
# Whether scaling is needed depends on the machine-learning method being used.
# It is not applied in this notebook; the snippet below is kept commented out
# for reference (fit the scaler on the training set, then reuse it on the test set).
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_test = sc_X.transform(X_test)

## Fitting multiple linear regression to the training set

In [73]:
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training split in one expression;
# fit() hands back the fitted estimator.
regressor = LinearRegression().fit(X_train, y_train)

# Leave the estimator as the last expression so the cell displays its repr.
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Predicting the test set results

In [74]:
# Predict the target for the held-out test rows with the fitted regressor.
y_pred = regressor.predict(X_test)

## Building the optimal model using Backward Elimination

In [75]:
# Use statsmodels.api: it exposes the array-based OLS class used below.
# statsmodels.formula.api only provides the lowercase formula interface
# (`ols`) in current releases, so `sm.OLS` would fail with that import.
import statsmodels.api as sm

# sm.OLS does not add an intercept on its own, so prepend a column of ones.
# The row count is taken from X instead of being hard-coded to 50.
# NOTE: this rebinds X from itself, so re-running the cell prepends another
# column of ones.
X = np.append(arr=np.ones((X.shape[0], 1)), values=X, axis=1)

In [93]:
# Backward elimination: repeatedly fit an OLS model on the currently selected
# columns of X and drop the column with the largest p-value, until every
# remaining p-value is at or below the significance level.
SIGNIFICANCE_LEVEL = 0.05

# Start from every column of X (derived from its shape rather than a
# hard-coded index list, so the cell works for any number of features).
x_selection = np.arange(X.shape[1])

# The size guard stops the loop if every column ends up eliminated; fitting
# or taking argmax over an empty selection would otherwise raise.
while x_selection.size > 0:
    X_opt = X[:, x_selection]
    regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
    p_values = regressor_OLS.pvalues

    # Position (within the current selection) of the least significant column.
    index_max = np.argmax(p_values)
    print('X indexes = ', x_selection)
    print('p_values = ', p_values)
    print('index to check = ', index_max, 'p_value to check = ', p_values[index_max])
    print('-----------------')

    if p_values[index_max] > SIGNIFICANCE_LEVEL:
        # Not significant: remove that column and refit.
        x_selection = np.delete(x_selection, index_max)
    else:
        # Every remaining column is significant: done.
        break

X indexes =  [0 1 2 3 4 5]
p_values =  [  4.44417839e-09   9.53242901e-01   9.89794124e-01   2.57877192e-21
   6.07737327e-01   1.22676927e-01]
index to check =  2 p_value to check =  0.989794124161
-----------------
X indexes =  [0 1 3 4 5]
p_values =  [  1.63919646e-09   9.39832977e-01   8.29397484e-22   6.03729160e-01
   1.18461365e-01]
index to check =  1 p_value to check =  0.939832977258
-----------------
X indexes =  [0 3 4 5]
p_values =  [  1.05737916e-09   2.63496772e-22   6.01755108e-01   1.04716819e-01]
index to check =  2 p_value to check =  0.60175510785
-----------------
X indexes =  [0 3 5]
p_values =  [  3.50406217e-22   6.04043259e-24   6.00303972e-02]
index to check =  2 p_value to check =  0.0600303971911
-----------------
X indexes =  [0 3]
p_values =  [  2.78269692e-24   3.50032224e-32]
index to check =  0 p_value to check =  2.78269692297e-24
-----------------


## Evaluate performance

In [98]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression

# Baseline: train and evaluate using columns 0-4 of the train/test matrices.
x_indexes = [0, 1, 2, 3, 4]

regressor = LinearRegression()
regressor.fit(X_train[:, x_indexes], y_train)
y_pred = regressor.predict(X_test[:, x_indexes])

# Root-mean-squared error on the test set.
mse = metrics.mean_squared_error(y_test, y_pred)
performance = np.sqrt(mse)
print(performance)

9137.99015279


In [99]:
# Refit using only column 2 of the train/test matrices. This corresponds to
# X index 3 in the backward-elimination printout: the intercept column was
# prepended to X after the train/test split, shifting every index by one.
x_indexes = [2]

regressor.fit(X_train[:, x_indexes], y_train)
y_pred = regressor.predict(X_test[:, x_indexes])

# Root-mean-squared error on the test set.
mse = metrics.mean_squared_error(y_test, y_pred)
performance = np.sqrt(mse)
print(performance)

8274.86801823
