In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing the libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Importing the dataset

In [None]:
# Read the startups CSV from the mounted Drive folder and preview its first rows.
csv_path = '/content/drive/MyDrive/Colab Notebooks/ML_algorithms/1- Linear Regression/50_Startups.csv'
dataset = pd.read_csv(csv_path)
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dataset.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


# Dropping the rows with empty cells

In [None]:
# Count every missing cell, then collect the index labels of rows holding at least one NaN.
nan_num = dataset.isna().values.sum()
print(f'Number of nan: {nan_num}')
row_has_nan = dataset.isnull().any(axis=1)
rows_with_nan = dataset.index[row_has_nan].tolist()
print(f'NaN_indices: {rows_with_nan}')

Number of nan: 1
NaN_indices: [19]


In [None]:
# Show the rows surrounding the first NaN row, drop every incomplete row,
# then show the same neighbourhood again to confirm the row is gone.
first_nan = rows_with_nan[0]
X_inital = dataset.iloc[:, :-1].values
window_before = X_inital[first_nan - 1:first_nan + 2]
print(f'Input with NaN:\n{window_before}')
dataset.dropna(inplace=True)
# Rebuild features (all columns but the last) and target (last column) from the cleaned frame.
X_inital, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
window_after = X_inital[first_nan - 1:first_nan + 1]
print(f'Input without NaN:\n{window_after}')

Input with NaN:
[[91749.16 114175.79 294919.57 'Florida']
 [86419.7 153514.11 0.0 nan]
 [76253.86 113867.3 298664.47 'California']]
Input without NaN:
[[91749.16 114175.79 294919.57 'Florida']
 [76253.86 113867.3 298664.47 'California']]


In [None]:
# One-hot encode the State column and pass the numeric columns through unchanged.
# X_inital has four columns (indices 0-3) and State is column 3, so only that
# index is handed to the encoder; the previous [2,3,4] included the numeric
# Marketing Spend column and an index that does not exist in X_inital.
print(f'Input with strings:\n{X_inital[:,3]}')
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X_categorical = np.array(ct.fit_transform(X_inital))
# The encoded State columns are placed first, ahead of the passthrough columns.
print(f'Categorical input:\n{X_categorical[:,0:3]}')

Input with strings:
['New York' 'California' 'Florida' 'New York' 'Florida' 'New York'
 'California' 'Florida' 'New York' 'California' 'Florida' 'California'
 'Florida' 'California' 'Florida' 'New York' 'California' 'New York'
 'Florida' 'California' 'New York' 'Florida' 'Florida' 'New York'
 'California' 'Florida' 'New York' 'Florida' 'New York' 'Florida'
 'New York' 'California' 'Florida' 'California' 'New York' 'Florida'
 'California' 'New York' 'California' 'California' 'Florida' 'California'
 'New York' 'California' 'New York' 'Florida' 'California' 'New York'
 'California']
Categorical input:
[[0.0 0.0 1.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [0.0 0.0 1.0]
 [0.0 1.0 0.0]
 [0.0 0.0 1.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [0.0 0.0 1.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [0.0 0.0 1.0]
 [1.0 0.0 0.0]
 [0.0 0.0 1.0]
 [0.0 1.0 0.0]
 [1.0 0.0 0.0]
 [0.0 0.0 1.0]
 [0.0 1.0 0.0]
 [0.0 1.0 0.0]
 [0.0 0.0 1.0]
 [1.0 0.0 0.0]
 [0.0 1.0 0.0]
 [0.

# Label encoding the data

In [None]:
# Replace the State strings in the DataFrame with integer codes, then rebuild
# the feature matrix and target from the encoded frame.
# NOTE(review): X is built from the label-encoded frame, so the one-hot matrix
# X_categorical computed earlier is not used from here on, and the integer codes
# give State an implicit order in a linear model - confirm this is intended.
le = LabelEncoder()
dataset['State'] = le.fit_transform(dataset['State'].values)
print(f'String inputs:\n{X_inital[:,3]}')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values
print(f'Numerical inputs:\n{X[:,3]}')

String inputs:
['New York' 'California' 'Florida' 'New York' 'Florida' 'New York'
 'California' 'Florida' 'New York' 'California' 'Florida' 'California'
 'Florida' 'California' 'Florida' 'New York' 'California' 'New York'
 'Florida' 'California' 'New York' 'Florida' 'Florida' 'New York'
 'California' 'Florida' 'New York' 'Florida' 'New York' 'Florida'
 'New York' 'California' 'Florida' 'California' 'New York' 'Florida'
 'California' 'New York' 'California' 'California' 'Florida' 'California'
 'New York' 'California' 'New York' 'Florida' 'California' 'New York'
 'California']
Numerical inputs:
[2. 0. 1. 2. 1. 2. 0. 1. 2. 0. 1. 0. 1. 0. 1. 2. 0. 2. 1. 0. 2. 1. 1. 2.
 0. 1. 2. 1. 2. 1. 2. 0. 1. 0. 2. 1. 0. 2. 0. 0. 1. 0. 2. 0. 2. 1. 0. 2.
 0.]


# Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Data normalization

In [None]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied to both splits.
print(f'Before normalizing:\nMax value: {X_train.max()}\nMin value: {X_train.min()}')
scaler = MinMaxScaler().fit(X_train)
X_train_norm, X_test_norm = scaler.transform(X_train), scaler.transform(X_test)
print(f'After normalizing:\nMax value: {X_train_norm.max()}\nMin value: {X_train_norm.min()}')

Before normalizing:
Max value: 471784.1
Min value: 0.0
After normalizing:
Max value: 1.0
Min value: 0.0


# Training the regression model on the Training set

In [None]:
# Fit an ordinary least-squares model on the scaled training features.
regressor = LinearRegression().fit(X_train_norm, y_train)
# Leave the fitted estimator as the last expression so the cell still displays it.
regressor

LinearRegression()

# Predicting the Test set results

In [None]:
# Predict the target for the held-out rows from their scaled features.
y_pred = regressor.predict(X_test_norm)

In [None]:
# Print predictions followed by the true targets for a quick visual comparison.
print(y_pred)
print(y_test)

[ 96827.82060053 170613.01087169 115156.8813957   96281.29608897
  96457.47697284  66734.52455824  88124.05990501  72453.75270116
 160349.45714255 133727.70143094 133570.46858737  96584.58584028
  90148.8354893 ]
[ 99937.59 166187.94 105008.31  97483.56  96778.92  81229.06  96479.51
  77798.83 155752.6  146121.95 144259.4   97427.84  96712.8 ]


# Get the Metrics

In [None]:
# Score the test-set predictions with the usual regression metrics.
R_squared = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of MSE rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)
# R-squared and MAPE are fractions, so they are scaled by 100 for display.
print(f'R-squared: {round(R_squared*100,2)}%')
print(f'MAE: {round(MAE,2)}')
print(f'MSE: {round(MSE,2)}')
print(f'RMSE: {round(RMSE,2)}')
print(f'MAPE: {round(MAPE*100,2)}%')

R-squared: 92.61%
MAE: 6345.34
MSE: 59620584.18
RMSE: 7721.44
MAPE: 5.91%
