In [32]:
# Multiple Linear Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [33]:
# verificar se os tipos estao corretos
dataset.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [56]:
# features
X = dataset.iloc[:, :-1]
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [57]:
# one hot enconding - dummy variables
X = pd.get_dummies(X)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [58]:
# Avoiding the Dummy Variable Trap
X = X.iloc[:, :-1]
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida
0,165349.2,136897.8,471784.1,0,0
1,162597.7,151377.59,443898.53,1,0
2,153441.51,101145.55,407934.54,0,1
3,144372.41,118671.85,383199.62,0,0
4,142107.34,91391.77,366168.42,0,1


In [60]:
# features
X = X.iloc[:,:].values

# targets
y = dataset.iloc[:, 4].values

In [61]:
# Splitting the dataset into the Training set and Test set
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [62]:
# Fitting Multiple Linear Regression to the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [63]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)

results = pd.DataFrame()
results['y_test'] = y_test
results['y_pred'] = y_pred
results['diff'] = abs(y_pred - y_test)
results['diff %'] = results['diff'] / y_test * 100

results

Unnamed: 0,y_test,y_pred,diff,diff %
0,103282.38,103015.201598,267.178402,0.258687
1,144259.4,132582.277608,11677.122392,8.094531
2,146121.95,132447.738452,13674.211548,9.358082
3,77798.83,71976.098513,5822.731487,7.484343
4,191050.39,178537.482211,12512.907789,6.549533
5,105008.31,116161.242302,11152.932302,10.621
6,81229.06,67851.692097,13377.367903,16.468697
7,97483.56,98791.733747,1308.173747,1.341943
8,110352.25,113969.43533,3617.18533,3.277854
9,166187.94,167921.065696,1733.125696,1.042871


In [66]:
import statsmodels.formula.api as sm

X = np.append(arr=np.ones((50,1)), values=X, axis=1)

# backward elimination

In [67]:
X_opt = X[:, [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 01 Oct 2017",Prob (F-statistic):,4.53e-30
Time:,16:20:37,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,1.671e+04,2190.784,7.626,0.000,1.23e+04 2.11e+04
x1,1.671e+04,2190.784,7.626,0.000,1.23e+04 2.11e+04
x2,1.671e+04,2190.784,7.626,0.000,1.23e+04 2.11e+04
x3,0.8057,0.045,17.846,0.000,0.715 0.897
x4,-0.0268,0.051,-0.526,0.602,-0.130 0.076
x5,0.0272,0.016,1.655,0.105,-0.006 0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,8.62e+17


In [21]:
X_opt = X[:, [0,1,2,3,5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.946
Method:,Least Squares,F-statistic:,217.2
Date:,"Sun, 01 Oct 2017",Prob (F-statistic):,8.49e-29
Time:,16:13:21,Log-Likelihood:,-525.38
No. Observations:,50,AIC:,1061.0
Df Residuals:,45,BIC:,1070.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.011e+04,6647.870,7.537,0.000,3.67e+04 6.35e+04
x1,0.8060,0.046,17.606,0.000,0.714 0.898
x2,-0.0270,0.052,-0.523,0.604,-0.131 0.077
x3,0.0270,0.017,1.592,0.118,-0.007 0.061
x4,220.1585,2900.536,0.076,0.940,-5621.821 6062.138

0,1,2,3
Omnibus:,14.758,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.172
Skew:,-0.948,Prob(JB):,2.53e-05
Kurtosis:,5.563,Cond. No.,1400000.0


In [22]:
X_opt = X[:, [0,1,2,3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Sun, 01 Oct 2017",Prob (F-statistic):,4.53e-30
Time:,16:13:21,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04 6.34e+04
x1,0.8057,0.045,17.846,0.000,0.715 0.897
x2,-0.0268,0.051,-0.526,0.602,-0.130 0.076
x3,0.0272,0.016,1.655,0.105,-0.006 0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [23]:
X_opt = X[:, [0,1,3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,450.8
Date:,"Sun, 01 Oct 2017",Prob (F-statistic):,2.1600000000000003e-31
Time:,16:13:21,Log-Likelihood:,-525.54
No. Observations:,50,AIC:,1057.0
Df Residuals:,47,BIC:,1063.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.698e+04,2689.933,17.464,0.000,4.16e+04 5.24e+04
x1,0.7966,0.041,19.266,0.000,0.713 0.880
x2,0.0299,0.016,1.927,0.060,-0.001 0.061

0,1,2,3
Omnibus:,14.677,Durbin-Watson:,1.257
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.161
Skew:,-0.939,Prob(JB):,2.54e-05
Kurtosis:,5.575,Cond. No.,532000.0


In [24]:
X_opt = X[:, [0,1]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.947
Model:,OLS,Adj. R-squared:,0.945
Method:,Least Squares,F-statistic:,849.8
Date:,"Sun, 01 Oct 2017",Prob (F-statistic):,3.5000000000000004e-32
Time:,16:13:21,Log-Likelihood:,-527.44
No. Observations:,50,AIC:,1059.0
Df Residuals:,48,BIC:,1063.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
const,4.903e+04,2537.897,19.320,0.000,4.39e+04 5.41e+04
x1,0.8543,0.029,29.151,0.000,0.795 0.913

0,1,2,3
Omnibus:,13.727,Durbin-Watson:,1.116
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.536
Skew:,-0.911,Prob(JB):,9.44e-05
Kurtosis:,5.361,Cond. No.,165000.0


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X_opt, y, test_size = 0.2, random_state = 0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

results_opt = pd.DataFrame()
results_opt['y_test'] = y_test
results_opt['y_pred'] = y_pred
results_opt['diff'] = abs(y_pred - y_test)
results_opt['diff %'] = results_opt['diff'] / y_test * 100

results['diff %'].sum(), results_opt['diff %'].sum()

(64.49754085314093, 62.88208342834584)

In [77]:
results_opt

Unnamed: 0,y_test,y_pred,diff,diff %
0,103282.38,103901.89697,619.51697,0.599828
1,144259.4,132763.059931,11496.340069,7.969214
2,146121.95,133567.9037,12554.0463,8.591486
3,77798.83,72911.789767,4887.040233,6.281637
4,191050.39,179627.925672,11422.464328,5.97877
5,105008.31,115166.648648,10158.338648,9.673843
6,81229.06,67113.576906,14115.483094,17.377381
7,97483.56,98154.806868,671.246868,0.688574
8,110352.25,114756.115552,4403.865552,3.990735
9,166187.94,169064.014088,2876.074088,1.730615


In [76]:
results

Unnamed: 0,y_test,y_pred,diff,diff %
0,103282.38,103015.201598,267.178402,0.258687
1,144259.4,132582.277608,11677.122392,8.094531
2,146121.95,132447.738452,13674.211548,9.358082
3,77798.83,71976.098513,5822.731487,7.484343
4,191050.39,178537.482211,12512.907789,6.549533
5,105008.31,116161.242302,11152.932302,10.621
6,81229.06,67851.692097,13377.367903,16.468697
7,97483.56,98791.733747,1308.173747,1.341943
8,110352.25,113969.43533,3617.18533,3.277854
9,166187.94,167921.065696,1733.125696,1.042871
