# Multiple Linear Regression

## Importing the libraries

In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

## Importing the dataset

In [16]:
dataset = pd.read_csv('data/50_Startups.csv')
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


## Encoding categorical data

In [17]:
ct = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), [3])
    ],
    remainder='passthrough'
)
X = np.array(ct.fit_transform(X))

In [18]:
print(X)

[[0.00e+00 0.00e+00 1.00e+00 1.65e+05 1.37e+05 4.72e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.63e+05 1.51e+05 4.44e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.53e+05 1.01e+05 4.08e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.44e+05 1.19e+05 3.83e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.42e+05 9.14e+04 3.66e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.32e+05 9.98e+04 3.63e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.35e+05 1.47e+05 1.28e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.30e+05 1.46e+05 3.24e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.21e+05 1.49e+05 3.12e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.23e+05 1.09e+05 3.05e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.02e+05 1.11e+05 2.29e+05]
 [1.00e+00 0.00e+00 0.00e+00 1.01e+05 9.18e+04 2.50e+05]
 [0.00e+00 1.00e+00 0.00e+00 9.39e+04 1.27e+05 2.50e+05]
 [1.00e+00 0.00e+00 0.00e+00 9.20e+04 1.35e+05 2.53e+05]
 [0.00e+00 1.00e+00 0.00e+00 1.20e+05 1.57e+05 2.57e+05]
 [0.00e+00 0.00e+00 1.00e+00 1.15e+05 1.23e+05 2.62e+05]
 [1.00e+00 0.00e+00 0.00e+00 7.80e+04 1.22e+05 2.64e+05]
 [0.00e+00 0.00e+00 1.00e+00 9.

## Splitting the dataset into the training set and the test set

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Training the multiple linear regression model on the training set

In [20]:
regressor = LinearRegression()
regressor.fit(
    X_train,
    y_train
)

## Predicting the test set results

In [24]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)),1))

[[103015.2  103282.38]
 [132582.28 144259.4 ]
 [132447.74 146121.95]
 [ 71976.1   77798.83]
 [178537.48 191050.39]
 [116161.24 105008.31]
 [ 67851.69  81229.06]
 [ 98791.73  97483.56]
 [113969.44 110352.25]
 [167921.07 166187.94]]
