# Multi Linear Regression

Get the dataset

Goal: Predict Profit

In [3]:
import pandas as pd
dataset = pd.read_csv('50_Startups.csv')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
#Separating Dependent and Independent Varaibles
X = dataset.iloc[:, :-1].values #Independent Variables
y = dataset.iloc[:, -1].values #Dependent Variables

## Since the dataset has one categorical data column "State", we use OneHotEncoder

In [7]:
#Encoding Categorical Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

transformer= [('encoder',OneHotEncoder(),[3])]
ct= ColumnTransformer(transformers=transformer,remainder='passthrough')

In [8]:
import numpy as np
X= np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

# Split dataset into Train and Test set

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=0)

# Train Multiple Linear Regression Model on Train Set

In [10]:
from sklearn.linear_model import LinearRegression
regressor= LinearRegression()
regressor.fit(X_train,y_train)

# Predict the Test Results

In [11]:
y_pred= regressor.predict(X_test)

In [14]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


# Now lets do feature scaling and again predict the test results

In [17]:
X_train2= X_train
X_test2= X_test

In [18]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train2[:,:3]= sc.fit_transform(X_train[:,:3])
X_test2[:,:3]= sc.fit_transform(X_test[:,:3])
X_train2

array([[-0.8164965809277259, 1.7320508075688774, -0.733799385705343,
        55493.95, 103057.49, 214634.81],
       [-0.8164965809277259, -0.5773502691896258, 1.3627702877384942,
        46014.02, 85047.44, 205517.64],
       [-0.8164965809277259, 1.7320508075688774, -0.733799385705343,
        75328.87, 144135.98, 134050.07],
       [1.2247448713915887, -0.5773502691896258, -0.733799385705343,
        46426.07, 157693.92, 210797.67],
       [-0.8164965809277259, 1.7320508075688774, -0.733799385705343,
        91749.16, 114175.79, 294919.57],
       [-0.8164965809277259, 1.7320508075688774, -0.733799385705343,
        130298.13, 145530.06, 323876.68],
       [-0.8164965809277259, 1.7320508075688774, -0.733799385705343,
        119943.24, 156547.42, 256512.92],
       [-0.8164965809277259, -0.5773502691896258, 1.3627702877384942,
        1000.23, 124153.04, 1903.93],
       [-0.8164965809277259, -0.5773502691896258, 1.3627702877384942,
        542.05, 51743.15, 0.0],
       [-0.8164965

In [19]:
regressor2= LinearRegression()
regressor2.fit(X_train2,y_train)

In [20]:
y_pred2= regressor2.predict(X_test)

In [21]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred2.reshape(len(y_pred2),1), y_test.reshape(len(y_test),1)),1))

[[103418.04 103282.38]
 [132897.47 144259.4 ]
 [132850.58 146121.95]
 [ 72378.94  77798.83]
 [178940.32 191050.39]
 [116486.22 105008.31]
 [ 68176.67  81229.06]
 [ 99116.72  97483.56]
 [114372.28 110352.25]
 [168323.91 166187.94]]


In [22]:
print(np.concatenate((y_pred2.reshape(len(y_pred2),1), y_pred.reshape(len(y_pred),1)),1))

[[103418.04 103015.2 ]
 [132897.47 132582.28]
 [132850.58 132447.74]
 [ 72378.94  71976.1 ]
 [178940.32 178537.48]
 [116486.22 116161.24]
 [ 68176.67  67851.69]
 [ 99116.72  98791.73]
 [114372.28 113969.44]
 [168323.91 167921.07]]
