### 1 - Reading the data

In [3]:
import pandas as pd
import numpy as np

In [4]:
dataset = pd.read_csv('data/50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
dataset.shape

(50, 5)

In [9]:
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


### 2 - define input & output & features engineering

In [25]:
dataset['State'].value_counts()
# we got a categorical variable, with 3 values.

New York      17
California    17
Florida       16
Name: State, dtype: int64

In [31]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [32]:
# Checking the shape of input and output features
print('Shape of the input features:', X.shape)
print('Shape of the output features:', y.shape)


Shape of the input features: (50, 4)
Shape of the output features: (50,)


In [20]:
# Input feature matrix before categorical encoding
print(X)

[[165349.2 136897.8 471784.1 'New York']
 [162597.7 151377.59 443898.53 'California']
 [153441.51 101145.55 407934.54 'Florida']
 [144372.41 118671.85 383199.62 'New York']
 [142107.34 91391.77 366168.42 'Florida']
 [131876.9 99814.71 362861.36 'New York']
 [134615.46 147198.87 127716.82 'California']
 [130298.13 145530.06 323876.68 'Florida']
 [120542.52 148718.95 311613.29 'New York']
 [123334.88 108679.17 304981.62 'California']
 [101913.08 110594.11 229160.95 'Florida']
 [100671.96 91790.61 249744.55 'California']
 [93863.75 127320.38 249839.44 'Florida']
 [91992.39 135495.07 252664.93 'California']
 [119943.24 156547.42 256512.92 'Florida']
 [114523.61 122616.84 261776.23 'New York']
 [78013.11 121597.55 264346.06 'California']
 [94657.16 145077.58 282574.31 'New York']
 [91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 'New York']
 [76253.86 113867.3 298664.47 'California']
 [78389.47 153773.43 299737.29 'New York']
 [73994.56 122782.75 303319.26 'Florida']
 [67532

**Since there are some categorical values present on the dataset, we need to perform Categorical Encoding.**
- We'll be using one-hot encoding since categories are nominal.

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

After performing categorical encoding, let's have look at input features again

In [48]:
# Input feature matrix after categorical encoding
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [49]:
# splitting the features into train & test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [50]:
# Checking the shape of the training and test sets
print('Shape of the training patterns:', X_train.shape, y_train.shape)

print('Shape of the test patterns:', X_test.shape, y_test.shape)


Shape of the training patterns: (40, 6) (40,)
Shape of the test patterns: (10, 6) (10,)


### 3 - Defining and training the multiple linear regression model

In [43]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

In [53]:
# Fitting the linear regression model

regressor.fit(X_train, y_train)

LinearRegression()

In [57]:
# Checking the regression coefficients of the fitted model
print(f'The Regression coefficients of the fitted model: {regressor.coef_}')

The Regression coefficients of the fitted model: [ 8.66383692e+01 -8.72645791e+02  7.86007422e+02  7.73467193e-01
  3.28845975e-02  3.66100259e-02]


In [58]:
# Checking the regression coefficients of the fitted model
print(f'The intercept of the fitted model: {regressor.intercept_}')

The intercept of the fitted model: 42467.52924854109


### 4 - Making predictions with the trained model
Now, let us make the predictions on the test data using this trained multiple linear regression model.

In [59]:
# predicting test data
y_pred = regressor.predict(X_test)

In [61]:
preview = pd.DataFrame(data={'Predicted profit':y_pred,
                            'Actual profit': y_test})

See differences between actual values & predicted ones.

In [62]:
preview

Unnamed: 0,Predicted profit,Actual profit
0,103015.201598,103282.38
1,132582.277608,144259.4
2,132447.738452,146121.95
3,71976.098513,77798.83
4,178537.482211,191050.39
5,116161.242302,105008.31
6,67851.692097,81229.06
7,98791.733747,97483.56
8,113969.43533,110352.25
9,167921.065696,166187.94


### 5 - Evaluate the prediction performance

In [65]:
# MSE
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test, y_pred)
print('Mean Squared Error is:', MSE)


Mean Squared Error is: 83502864.0325


In [67]:
# RMSE
import math
RMSE = math.sqrt(MSE)
print('Root Mean Squared Error is:', RMSE)


Root Mean Squared Error is: 9137.990152790711


In [71]:
# R-squared
from sklearn.metrics import r2_score

r2 = r2_score(y_pred, y_test)
print('R-squared is:', r2)

R-squared is: 0.9293749209319268


In [72]:
# Adjusted R-Squared
adj = 1-(
    (1-r2)*(X_train.shape[0]-1)/
    (X_train.shape[0]- X_train.shape[1] -1))

print('Adjusted R-Squared is:', adj)

Adjusted R-Squared is: 0.9165339974650043


#### Results:
1. As we can see above, the value of R-squared is very neat to 1 and that indicates this is the best fit model.
2. The adjusted R-squared is not much less than R-squared, which means, having these independant variables to fit the model, has not much impacted the model.
    - if it was much less than Adjusted r-squared, we'd say the model isn't properly fitted because of the large number of independant variables.
    