In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

try:
    from sklearn.model_selection import train_test_split as tts
except ImportError:  # scikit-learn < 0.18 only ships the legacy module
    from sklearn.cross_validation import train_test_split as tts
# Load the startup dataset: every column except the last is a feature and the
# last column (Profit) is the regression target.
data = pd.read_csv('50_Startups.csv')
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values



In [2]:
# Preview the first rows of the loaded dataset to check the column layout.
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# One-hot encode the categorical 4th feature column with pandas instead of
# OneHotEncoder(categorical_features=...), an argument scikit-learn has removed.
# Building from `data` (rather than overwriting X in place) lets this cell be
# re-run without corrupting X.
features = data.iloc[:, :-1]
print(features.shape)
# get_dummies orders the indicator columns by sorted category, matching the
# LabelEncoder ordering; the indicators go first and the numeric columns
# follow, which is the layout the old encoder produced.
state_dummies = pd.get_dummies(features.iloc[:, 3]).values.astype(float)
numeric_part = features.iloc[:, :3].values.astype(float)
X = np.hstack([state_dummies, numeric_part])
print(X.shape)
X = X[:, 1:]  # drop the first indicator column to avoid the dummy variable trap

(50, 4)
(50, 6)


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = tts(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [5]:
# Show the shapes of the four split arrays on one line.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(40, 5) (10, 5) (40,) (10,)


In [6]:
# Baseline: ordinary linear regression on all encoded features, then predictions
# for the held-out rows (fit returns the estimator itself, so it can be chained).
model = LinearRegression().fit(X_train, Y_train)
Y_ = model.predict(X_test)

In [7]:
# Backward elimination (next cells) removes non-significant variables using
# statsmodels OLS, which does not add an intercept by itself: without a leading
# column of ones the fitted model is Y = b1*x1 + b2*x2 + ... with no b0 term.
import statsmodels.formula.api as stat
import statsmodels.tools

X_train, X_test = (
    statsmodels.tools.add_constant(data=features, prepend=True)
    for features in (X_train, X_test)
)

# Equivalent manual approach: the ones are passed as `arr` and X as `values`
# so that the ones column comes first (axis=1 appends columns):
#X = np.append(arr = np.ones((50,1)).astype(int), values = X, axis=1 )

In [8]:
significance_level = 0.30
# Threshold for backward elimination: a lower p-value means a more significant
# predictor, so columns whose p-value is at or above significance_level are removed.

In [9]:
# Backward elimination: repeatedly fit OLS on the surviving columns and drop the
# predictor with the largest p-value, until every remaining predictor's p-value
# is below significance_level.
cols = list(range(X_train.shape[1]))
while True:
    selected_data = X_train[:, cols]
    # endog = output (target), exog = input (features)
    modified_model = sm.OLS(endog=Y_train, exog=selected_data).fit()
    P_values = np.asarray(modified_model.pvalues)
    if len(cols) == 1:
        # Only the intercept is left; there is nothing more to eliminate.
        break
    # Position 0 is the intercept prepended by add_constant. It is never a
    # candidate for removal, so only the predictors are searched.
    index_of_max_P = 1 + int(np.argmax(P_values[1:]))
    if P_values[index_of_max_P] < significance_level:
        break
    print("Removed Column index ",cols[index_of_max_P])
    del cols[index_of_max_P]
modified_model.summary()

Removed Column index  2
Removed Column index  1
Removed Column index  4


0,1,2,3
Dep. Variable:,y,R-squared:,0.95
Model:,OLS,Adj. R-squared:,0.947
Method:,Least Squares,F-statistic:,349.0
Date:,"Sun, 13 Jan 2019",Prob (F-statistic):,9.65e-25
Time:,19:21:40,Log-Likelihood:,-421.3
No. Observations:,40,AIC:,848.6
Df Residuals:,37,BIC:,853.7
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.635e+04,2971.236,15.598,0.000,4.03e+04,5.24e+04
x1,0.7886,0.047,16.846,0.000,0.694,0.883
x2,0.0326,0.018,1.860,0.071,-0.003,0.068

0,1,2,3
Omnibus:,14.666,Durbin-Watson:,2.518
Prob(Omnibus):,0.001,Jarque-Bera (JB):,20.582
Skew:,-1.03,Prob(JB):,3.39e-05
Kurtosis:,5.847,Cond. No.,497000.0


In [10]:
# Side-by-side comparison on the test rows: actual profit, the full
# LinearRegression prediction, and the backward-eliminated OLS prediction.
print("Actual Profit", "\t", "Pred_LinearRegression","\t","pred_modified")
for actual, baseline, row in zip(Y_test, Y_, X_test):
    reduced = modified_model.predict(row[cols])[0]
    print(actual,"\t",baseline,"\t",reduced)

Actual Profit 	 Pred_LinearRegression 	 pred_modified
103282.38 	 103015.20159796256 	 102284.64605182887
144259.4 	 132582.27760815827 	 133873.92383811902
146121.95 	 132447.73845174964 	 134182.149516501
77798.83 	 71976.09851258734 	 73701.10693630343
191050.39 	 178537.4822105436 	 180642.2529973609
105008.31 	 116161.24230163355 	 114717.24903894297
81229.06 	 67851.69209676137 	 68335.07575312015
97483.56 	 98791.73374687947 	 97433.45922275007
110352.25 	 113969.43533011663 	 114580.92136452146
166187.94 	 167921.065695502 	 170343.3197949841
