# Multilinear Regression

__Import libraries__

In [1]:
# essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# split
from sklearn.model_selection import train_test_split

# encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# ML algoritm
from sklearn.linear_model import LinearRegression

In [2]:
dataset = pd.read_csv('50_Startups.csv')

In [3]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [4]:
print(X.shape)
print(y.shape)

(50, 4)
(50,)


In [5]:
# encode the categorical feature
labelencoder = LabelEncoder().fit(X[:, 3])
X[:, 3] = labelencoder.transform(X[:, 3])
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [6]:
X.shape

(50, 6)

In [7]:
# split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [8]:
# instantiate the model 
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)
print("Test set score: {}".format(linreg.score(X_test, y_test)))

Test set score: 0.9347068473282095


In [9]:
X_notdumm = X[:, 1:]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_notdumm, y, random_state=0, test_size=0.2)

In [11]:
# instantiate the model, after avoiding dummy trap
linreg = LinearRegression().fit(X_train, y_train)
y_pred_new = linreg.predict(X_test)
print("Test set score: {}".format(linreg.score(X_test, y_test)))

Test set score: 0.9347068473283267


Not so different from the data that we not manually avoid the dummy trap

In [16]:
import statsmodels.formula.api as sm

X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]

In [17]:
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.024
Model:,OLS,Adj. R-squared:,-0.018
Method:,Least Squares,F-statistic:,0.5748
Date:,"Tue, 06 Aug 2019",Prob (F-statistic):,0.567
Time:,05:11:44,Log-Likelihood:,-600.05
No. Observations:,50,AIC:,1206.0
Df Residuals:,47,BIC:,1212.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x1,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x2,3.364e+04,1725.786,19.495,0.000,3.02e+04,3.71e+04
x3,2974.4815,8105.911,0.367,0.715,-1.33e+04,1.93e+04
x4,1.784e+04,8254.519,2.162,0.036,1237.380,3.44e+04
x5,1.283e+04,8105.911,1.582,0.120,-3481.236,2.91e+04

0,1,2,3
Omnibus:,0.111,Durbin-Watson:,0.081
Prob(Omnibus):,0.946,Jarque-Bera (JB):,0.207
Skew:,0.104,Prob(JB):,0.902
Kurtosis:,2.762,Cond. No.,5.31e+17


---

# IMPORTANT POINTS

- Same as Simple Linear Regression
- The difference is now have more than one feature (more than one dimension)