In [199]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('start_up_data.csv')
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

In [170]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [171]:
# state column is the categorical data that is text, as you will know by now we can't have txt in data if we want to run
# any kind of model in it

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
x[:, 3] = label_encoder.fit_transform(x[:, 3])

In [172]:
x[:, 3]

array([2, 0, 1, 2, 1, 2, 0, 1, 2, 0, 1, 0, 1, 0, 1, 2, 0, 2, 1, 2, 0, 2,
       1, 1, 2, 0, 1, 2, 1, 2, 1, 2, 0, 1, 0, 2, 1, 0, 2, 0, 0, 1, 0, 2,
       0, 2, 1, 0, 2, 0], dtype=object)

In [173]:
# since there are more then two categorical data that is converted to numberic and all the numeric are in the same column
# so the model might misunderstood that the data are in some order, so to avoid it we will use OneHotEncoder which will put
# each data in different columns

onehotencoder = OneHotEncoder(categorical_features = [3])
x = onehotencoder.fit_transform(x).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [174]:
x[1]

array([1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
       1.5137759e+05, 4.4389853e+05])

In [175]:
# spliting the dataset inot train set and test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.02, random_state = 0)

In [176]:
# Fitting model to the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [177]:
# Predict the test result
y_pred = regressor.predict(x_test)

In [178]:
y_pred

array([101496.90531747])

In [132]:
# Finding the coefficient
regressor.coef_

array([-7.62970625e+00,  6.30520607e+01, -5.54223545e+01,  8.06252704e-01,
       -2.98835254e-02,  2.72232883e-02])

In [133]:
# Finding the y-intercept
regressor.intercept_

50421.34737358676

In [161]:
# Finding the optimal model using back elimination 
import statsmodels.formula.api as sm
x = np.append( arr = np.ones((50, 1)).astype(int) , values = x, axis = 1 )
x_opt = x[: , [0,1,2,3,4,5]]
regressor_OLS = sm.OLS(endog = y , exog = x_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.043
Date:,"Sat, 29 Jun 2019",Prob (F-statistic):,0.312
Time:,01:32:13,Log-Likelihood:,-600.12
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x1,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x2,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x3,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x4,2.324e+04,1402.665,16.567,0.000,2.04e+04,2.61e+04
x5,-1.228e+04,1.2e+04,-1.021,0.312,-3.65e+04,1.19e+04

0,1,2,3
Omnibus:,0.079,Durbin-Watson:,0.073
Prob(Omnibus):,0.961,Jarque-Bera (JB):,0.19
Skew:,0.087,Prob(JB):,0.909
Kurtosis:,2.753,Cond. No.,2.5e+50


In [162]:
# in each step we should we should see the 4th column the p value, which we usually take 0.5 and see the values in that columns
# any number which is greater then 0.5 should be removed
x_opt  = x[: , [0,1,3,4, 5]]
regressor_OLS = sm.OLS(endog = y , exog = x_opt).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.043
Date:,"Sat, 29 Jun 2019",Prob (F-statistic):,0.312
Time:,01:32:28,Log-Likelihood:,-600.12
No. Observations:,50,AIC:,1204.0
Df Residuals:,48,BIC:,1208.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.905e+04,1753.331,16.567,0.000,2.55e+04,3.26e+04
x1,2.905e+04,1753.331,16.567,0.000,2.55e+04,3.26e+04
x2,2.905e+04,1753.331,16.567,0.000,2.55e+04,3.26e+04
x3,2.905e+04,1753.331,16.567,0.000,2.55e+04,3.26e+04
x4,-1.228e+04,1.2e+04,-1.021,0.312,-3.65e+04,1.19e+04

0,1,2,3
Omnibus:,0.079,Durbin-Watson:,0.073
Prob(Omnibus):,0.961,Jarque-Bera (JB):,0.19
Skew:,0.087,Prob(JB):,0.909
Kurtosis:,2.753,Cond. No.,8.530000000000001e+34


In [179]:
x_opt = x[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog = y , exog = x_opt).fit()
regressor_OLS.summary()

x_opt = x[: , [0,3,5]]
regressor_OLS = sm.OLS(endog = y , exog = x_opt).fit()
regressor_OLS.summary()

x_opt = x[: , [0,5]]
regressor_OLS = sm.OLS(endog = y , exog = x_opt).fit()
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.899
Model:,OLS,Adj. R-squared:,0.895
Method:,Least Squares,F-statistic:,213.7
Date:,"Sat, 29 Jun 2019",Prob (F-statistic):,1.26e-24
Time:,03:03:25,Log-Likelihood:,-597.93
No. Observations:,50,AIC:,1200.0
Df Residuals:,48,BIC:,1204.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,2.457e+04,1.04e+04,2.362,0.022,3654.855,4.55e+04
x2,0.4346,0.025,17.436,0.000,0.385,0.485

0,1,2,3
Omnibus:,3.328,Durbin-Watson:,1.889
Prob(Omnibus):,0.189,Jarque-Bera (JB):,2.299
Skew:,0.386,Prob(JB):,0.317
Kurtosis:,3.712,Cond. No.,464000.0


In [195]:
# finally we will remain wiht the value which is less then 0.5 
# Fitting the Multiple Linear Regression in the Optimal Training set

X_Optimal_Train, X_Optimal_Test = train_test_split(x_opt,test_size = 0.02, random_state = 0)
regressor.fit(X_Optimal_Train, y_train)

# Predicting the Optimal Test set results

Y_Optimal_Pred = regressor.predict(X_Optimal_Test)

In [196]:
Y_Optimal_Pred

array([133238.95252862])