In [2]:
# Dataset: 50 Startups, available at
# https://raw.githubusercontent.com/krishnaik06/Multiple-Linear-Regression/master/50_Startups.csv

In [3]:
# import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Load the 50 Startups dataset.
# Prefer the local copy when it exists; otherwise fall back to the public
# source so the notebook still runs on machines that do not have that path.
LOCAL_CSV = r"D:\ds_resources\50_Startups.csv"
DATA_URL = "https://raw.githubusercontent.com/krishnaik06/Multiple-Linear-Regression/master/50_Startups.csv"

try:
    df = pd.read_csv(LOCAL_CSV)
except FileNotFoundError:
    # read_csv accepts a URL as well as a filesystem path
    df = pd.read_csv(DATA_URL)

df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# get the basic dataset information: row count, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [6]:
# check for null values: count the missing entries in each column
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [7]:
# 'State' is a string (object) column, so it has to be encoded numerically
# before it can be fed to the model.
# One-hot encode it and drop the first category: with three states, two
# indicator columns are enough (the dropped state, California, is the row
# where both indicators are 0).
# dtype=int yields 0/1 integers on every pandas version; newer releases
# would otherwise return booleans.
state_dummies = pd.get_dummies(df['State'], drop_first=True, dtype=int)

# The assignment below pairs the labels with the dummy columns in order, so
# make sure the labels really match what get_dummies produced.
assert list(state_dummies.columns) == ['Florida', 'New York'], state_dummies.columns

df[['Florida', 'New York']] = state_dummies

In [8]:
# preview the first rows to confirm the 'Florida' and 'New York' dummy columns were added
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Florida,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [9]:
# Cast each state indicator column to a plain integer dtype
for state_col in ('Florida', 'New York'):
    df[state_col] = df[state_col].astype(int)

In [10]:
# re-check the dtypes to confirm the dummy columns are now integers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
R&D Spend          50 non-null float64
Administration     50 non-null float64
Marketing Spend    50 non-null float64
State              50 non-null object
Profit             50 non-null float64
Florida            50 non-null int32
New York           50 non-null int32
dtypes: float64(4), int32(2), object(1)
memory usage: 2.5+ KB


In [11]:
# The indicator columns now carry the state information, so the original
# text column is no longer needed. Rebinding df (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run once
# 'State' has already been removed.
df = df.drop(columns=['State'], errors='ignore')

In [12]:
# preview the frame to confirm the 'State' column is gone
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [13]:
# Separate the data into the feature matrix 'x' (every column except Profit)
# and the target 'y' (Profit, kept as a one-column DataFrame)
target_col = 'Profit'
y = df[[target_col]]
x = df.drop(columns=[target_col])

In [14]:
# preview the feature matrix
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [15]:
# preview the target column
y.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [16]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [19]:
# peek at the training features and training target; both are shown together as a tuple
x_train.head(), y_train.head()

(    R&D Spend  Administration  Marketing Spend  Florida  New York
 33   55493.95       103057.49        214634.81        1         0
 35   46014.02        85047.44        205517.64        0         1
 26   75328.87       144135.98        134050.07        1         0
 34   46426.07       157693.92        210797.67        0         0
 18   91749.16       114175.79        294919.57        1         0,
        Profit
 33   96778.92
 35   96479.51
 26  105733.54
 34   96712.80
 18  124266.90)

In [21]:
# Fit a multiple linear regression model on the training split
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
# fit() is the last expression, so the fitted estimator is displayed as the cell output
regressor.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [24]:
# Predict Profit for the held-out test features
y_pred = regressor.predict(X=x_test)

In [25]:
# Evaluate the test-set predictions with R^2 (the coefficient of determination)
from sklearn.metrics import r2_score

score = r2_score(y_true=y_test, y_pred=y_pred)

In [26]:
# display the R^2 value computed above
score

0.9347068473282424

In [27]:
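# An R^2 of about 0.93 means the model explains roughly 93% of the variance in Profit on the test set (R^2 is a goodness-of-fit measure, not a classification-style accuracy)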