#Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Read the dataset

In [2]:
dataset = pd.read_csv('/content/drive/MyDrive/IBMR/ML/Supervised/Regression/MultipleLinearRegression/50_Startups.csv')
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Splitting the dataset into Independent Variable (X) and Dependent Variable (y)

In [3]:
X = dataset.iloc[:, :-1]
X

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida
5,131876.9,99814.71,362861.36,New York
6,134615.46,147198.87,127716.82,California
7,130298.13,145530.06,323876.68,Florida
8,120542.52,148718.95,311613.29,New York
9,123334.88,108679.17,304981.62,California


In [6]:
y = dataset.iloc[:,-1].values
y

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

# Convert categorical data into neumerical values.

If all the features in the dataset are of numerical datatype we can easily apply mathematical computations on the datset to create Machine Learning models.

 Here column containing categorical data is 'State'. Hence we will transform it into neumerical representation using Sklearn's OnehotEncoder  

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Create an object (ct) from ColumnTransformer class that will be applied on the dataset

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')

In [9]:
X = np.array(ct.fit_transform(X))

In [11]:
# First 3 columns represent the numerical encoded feature 'State'

# 0 , 0 , 1 ---> New York
# 1 , 0, 0 ---> California
# 0, 1, 0 ---> Florida

X

array([[0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.5344151e+05,
        1.0114555e+05, 4.0793454e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.4437241e+05,
        1.1867185e+05, 3.8319962e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.4210734e+05,
        9.1391770e+04, 3.6616842e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.3187690e+05,
        9.9814710e+04, 3.6286136e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.2054252e+05,
        1.4871895e+05, 3.1161329e+05],
       [1.0000000e+00, 0.0000000e+00,

# Splitting the data into training and testing

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

#Training the model on the Training Set

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
regressor = LinearRegression()

In [15]:
regressor.fit(X_train, y_train)

# Predicting the profits on Test set

In [16]:
y_pred = regressor.predict(X_test)

In [23]:
y_pred


array([128628.91222473, 158417.46574625, 115286.64713299, 194826.70268539,
       160119.04503381,  73404.18351055,  73862.43077401, 190419.10178457,
       101661.77349818,  63231.19393249])

In [22]:
y_test

array([124266.9 , 156122.51, 105008.31, 192261.83, 155752.6 ,  77798.83,
        90708.19, 191792.06, 107404.34,  65200.33])

In [24]:
X_test

array([[0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 9.1749160e+04,
        1.1417579e+05, 2.9491957e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.3461546e+05,
        1.4719887e+05, 1.2771682e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 7.2107600e+04,
        1.2786455e+05, 3.5318381e+05],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 1.6534920e+05,
        1.3689780e+05, 4.7178410e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 1.3029813e+05,
        1.4553006e+05, 3.2387668e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 2.7892920e+04,
        8.4710770e+04, 1.6447071e+05],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 2.8663760e+04,
        1.2705621e+05, 2.0112682e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 1.6259770e+05,
        1.5137759e+05, 4.4389853e+05],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 6.4664710e+04,
        1.3955316e+05, 1.3796262e+05],
       [1.0000000e+00, 0.0000000e+00,

In [25]:
# Create a pandas dataframe that consists of X_test, y_test, y_pred

df = pd.DataFrame({'Actual Profit': y_test, 'Predicted Profit': y_pred})

In [26]:
df

Unnamed: 0,Actual Profit,Predicted Profit
0,124266.9,128628.912225
1,156122.51,158417.465746
2,105008.31,115286.647133
3,192261.83,194826.702685
4,155752.6,160119.045034
5,77798.83,73404.183511
6,90708.19,73862.430774
7,191792.06,190419.101785
8,107404.34,101661.773498
9,65200.33,63231.193932


#Calculate error margin between the actual and predicted value

MAE (Mean Absolute Error)

MSE (Mean Squared Error)

RMSE (Root Mean Squared Error)


In [30]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [28]:
# MSE
print('MSE', mean_squared_error(y_test, y_pred))

MSE 49741470.36794742


In [29]:
# RMSE

print(np.sqrt(mean_squared_error(y_test, y_pred)))

7052.763314329173


In [31]:
# MAE

print(mean_absolute_error(y_test, y_pred))

5419.168932336992
