# Multiple Linear Regression

In [1]:
# importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the startups CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer="datasets/50_Startups.csv")

In [3]:
# Preview the first five rows of the raw data.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Profit is the target variable.
# State is the only categorical feature; the remaining columns are numeric.

In [5]:
# Dimensions of the loaded data as (rows, columns).
dataset.shape

(50, 5)

In [6]:
# Number of missing values in each column (summed down the rows).
dataset.isnull().sum(axis=0)

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
# How many startups fall into each state category.
dataset.State.value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [8]:
# State is a nominal (unordered) categorical variable, so it is one-hot encoded.


In [9]:
# One-hot encode State. drop_first=True removes the first category so the
# remaining indicator columns are not perfectly collinear with the intercept.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 would otherwise
# produce boolean columns, which no longer match the output shown below.
new_state = pd.get_dummies(dataset['State'], drop_first=True, dtype=int)

In [10]:
# Preview the first five rows of the encoded state indicators.
new_state.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [14]:
# Remove the raw State column now that it has been encoded separately.
# errors='ignore' keeps this cell idempotent: re-running it after State is
# already gone is a no-op instead of raising KeyError.
dataset = dataset.drop(columns=['State'], errors='ignore')

In [15]:
# Preview the first five rows after removing the State column.
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,165349.2,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [17]:
# Append the state indicator columns to the numeric features.
# Any indicator columns already present are dropped first so that re-running
# this cell does not add duplicate 'Florida' / 'New York' columns.
dataset = pd.concat(
    (dataset.drop(columns=new_state.columns, errors='ignore'), new_state),
    axis=1,
)

In [18]:
# Preview the first five rows of the combined frame.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [19]:
# List the column labels so the feature names can be copied exactly.
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit', 'Florida',
       'New York'],
      dtype='object')

In [21]:
# Feature matrix: the three spend columns plus the two state indicators.
x_data = dataset.loc[
    :,
    ['R&D Spend', 'Administration', 'Marketing Spend', 'Florida', 'New York'],
]

In [22]:
# Target vector: the Profit column.
y_data = dataset.loc[:, 'Profit']

In [23]:
# Preview the first five rows of the feature matrix.
x_data.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [24]:
# Preview the first five target values.
y_data.head(n=5)

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [25]:
# performing data splitting
from sklearn.model_selection import train_test_split

In [26]:
from sklearn.metrics import r2_score

In [29]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=12,
)

In [30]:
# (rows, columns) of the training features.
xtrain.shape

(40, 5)

In [31]:
# (rows, columns) of the held-out test features.
xtest.shape

(10, 5)

In [32]:
from sklearn.preprocessing import StandardScaler

In [33]:
# Scaler used to standardize the feature columns before fitting the model.
std = StandardScaler()

In [34]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test information leaks into it.
std.fit(xtrain)
std_train, std_test = std.transform(xtrain), std.transform(xtest)

In [36]:
from sklearn.linear_model import LinearRegression

In [37]:
# Ordinary least-squares linear regression with the estimator's default settings.
reg = LinearRegression()

In [38]:
# Fit the regression on the standardized training features.
reg.fit(X=std_train, y=ytrain)

LinearRegression()

In [39]:
# Predict profit for the standardized test features.
y_pred = reg.predict(X=std_test)

In [41]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=ytest, y_pred=y_pred)

0.9473897619182123