# Multiple Linear Regression
- Multiple linear regression is used when a single target value is predicted from several independent variables (features).

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## Loading Dataset

In [4]:
# Read the startups CSV and replace its header with snake_case column names
# so later cells can refer to columns consistently.
csv_path = "../../../Datasets/startups_profits.csv"
column_names = ['rd_spend', 'administration', 'marketing_spend', 'state', 'profit']

dataset = pd.read_csv(csv_path, sep=",")
dataset.columns = column_names
# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,rd_spend,administration,marketing_spend,state,profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
# Row and column totals of the loaded frame. At this point the column total
# still includes the `profit` target column, not just the input features.
samples_count = dataset.shape[0]
features_count = dataset.shape[1]
(samples_count, features_count)

(50, 5)

## Separating target column from dataset

In [12]:
# Split the frame into the value to predict (`profit`, the last column) and
# the remaining input columns, both as NumPy arrays.
# NOTE: this rebinds `dataset` from a DataFrame to an array, so the cell
# cannot be re-run without first re-running the loading cell.
target = dataset['profit'].values
dataset = dataset.drop(columns=['profit']).values

## Encoding Categorical Data

In [13]:
# One-hot encode the categorical `state` column (positional index 3) and pass
# the numeric columns through unchanged. In the result the encoded indicator
# columns come first, followed by the passthrough columns.
#
# sparse_threshold=0 forces a dense result. With the default threshold the
# transformer may return a SciPy sparse matrix when the stacked output is
# mostly zeros, and np.array() on a sparse matrix yields a 0-d object array
# instead of the 2-D feature matrix the following cells expect.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
dataset = np.array(ct.fit_transform(dataset))
print(dataset)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

## Train & Test split

In [14]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
train_x, test_x, train_y, test_y = train_test_split(
    dataset,
    target,
    test_size=0.2,
    random_state=0,
)

## Multiple Linear Regression

In [15]:
# Create the linear model with default settings and fit it on the training
# portion only. The fit call is the last expression, so the fitted estimator
# is displayed as the cell output.
regressor = LinearRegression()
regressor.fit(X=train_x, y=train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

## Predicting the test set results


In [20]:
# Predict profits for the held-out rows.
predictions = regressor.predict(test_x)

# Compare predicted results (left column) with actual results (right column).
# The two-decimal precision is applied through a context manager so it only
# affects this printout instead of permanently changing NumPy's global print
# options for every later cell.
with np.printoptions(precision=2):
    print(np.column_stack((predictions, test_y)))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]


## Predicting profit from user-supplied input

In [30]:
# Prompt for the feature values of a single startup. Each answer is kept as
# text; the prediction cell converts the three spend figures with float().
# Surrounding whitespace is stripped so that an accidental leading/trailing
# space (e.g. "California ") does not change the state label that is handed
# to the fitted encoder.
rd_spend = input('rd_spend::').strip()
administration = input('administration::').strip()
marketing_spend = input('marketing_spend::').strip()
state = input('state::').strip()

rd_spend::32456
administration::891234
marketing_spend::341290
state::California


In [31]:
# Build a single-row sample in the same column order used for training:
# the three numeric spends (converted from text) followed by the state label.
numeric_inputs = (rd_spend, administration, marketing_spend)
user_sample = [[float(raw_value) for raw_value in numeric_inputs] + [state]]

# Reuse the already-fitted transformer so the row is encoded exactly like the
# training data, then take the one prediction returned for the one row.
user_sample_encoded = ct.transform(user_sample)
predicted_profit = regressor.predict(user_sample_encoded)[0]

rounded_profit = round(predicted_profit, 2)
print('Profit::', rounded_profit)

Profit:: 109460.33
