In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Read the advertising dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="Advertising.csv")
# Show the first five rows as a quick sanity check.
data.head(n=5)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [5]:
# Dataset dimensions as a (rows, columns) tuple.
data.shape

(200, 4)

# Observations:
- The dataset contains 200 rows and 4 columns
- Each row contains the advertising expenditure on each medium (TV, Radio, Newspaper) along with the resulting Sales

In [6]:
# Count the missing values in each column.
data.isna().sum()

TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64

In [7]:
# Features: every column except the target.
X = data.drop(columns=["Sales"])
# Target: the Sales column.
y = data["Sales"]

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Apply Linear Regression on X_train and y_train

In [9]:
from sklearn.linear_model import LinearRegression
# Create the (not yet fitted) linear regression estimator with default settings.
lr = LinearRegression()
# Display the estimator as the cell output.
lr

In [10]:
# Fit the linear model on the training features and training target.
lr.fit(X=X_train, y=y_train)

# Perform Predictions on X_test data

In [12]:
# Predict Sales for the held-out test rows and show the predictions.
y_pred = lr.predict(X=X_test)
y_pred

array([15.77878608, 18.13963419,  9.71988374, 12.35754726, 11.49928311,
       19.61708868, 18.21140615,  5.20501106, 21.08144021, 12.31552854,
       18.2949829 , 15.24007957, 21.08194368, 16.00360445, 19.4292147 ,
       16.44473148, 12.24866175, 18.93242528,  9.63263457, 13.80533867,
       15.69803572, 10.88679356,  9.31700686, 17.16148085, 22.81234026,
       13.01057258,  9.61963047,  9.95583328, 16.94678959, 16.51530909,
       14.90223951, 12.99738447, 10.32722344, 17.84866777,  8.86380094,
        9.71636772, 13.24728558, 22.00587075, 10.63854841, 11.56151208])

In [13]:
# Preview the first ten test rows instead of dumping the whole test frame.
X_test.head(10)

Unnamed: 0,TV,Radio,Newspaper
112,175.7,15.4,2.4
165,234.5,3.4,84.8
12,23.8,35.1,65.9
73,129.4,5.7,31.3
144,96.2,14.8,38.9
20,218.4,27.7,53.4
199,232.1,8.6,8.7
8,8.6,2.1,1.0
39,228.0,37.7,32.0
88,88.3,25.5,73.4


In [20]:
# Fitted coefficients (slopes) of the model; the output below shows three values.
m = lr.coef_
m

array([0.05505166, 0.10289854, 0.00435186])

In [17]:
# Unpack the fitted coefficients into one slope per feature
# (the feature columns are TV, Radio and Newspaper, in that order).
m1, m2, m3 = lr.coef_
m1, m2, m3

(0.055051656659943514, 0.10289854060587739, 0.0043518590685206054)

# Note:
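- The three m values displayed in the above output are the coefficients (slopes) of the three independent variables, in column order: m1 for TV, m2 for Radio and m3 for Newspaper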

In [16]:
# Intercept (constant term) of the fitted model.
c = lr.intercept_
c

4.511128020091029

# Notes:

- The Linear Regression equation for the above dataset will be of the form : y = m1x1 + m2x2 + m3x3 + c
- Sales = m1 * TV + m2 * Radio + m3 * Newspaper + c
- Sales = 0.055 * TV + 0.103 * Radio + 0.004 * Newspaper + 4.511

In [25]:
# Predict Sales for TV = 50, Radio = 100, Newspaper = 0.
# The input is a DataFrame carrying the training column names (in training order),
# so scikit-learn can validate the features instead of receiving an unnamed list.

lr.predict(pd.DataFrame({"TV": [50], "Radio": [100], "Newspaper": [0]}))

array([17.55356491])

In [22]:
# Predict Sales for TV = 10, Radio = 20, Newspaper = 100.
# The input is a DataFrame carrying the training column names (in training order),
# so scikit-learn can validate the features instead of receiving an unnamed list.

lr.predict(pd.DataFrame({"TV": [10], "Radio": [20], "Newspaper": [100]}))

array([7.55480131])

In [23]:
# Check the ordering of the slopes: Radio above TV, and TV above Newspaper.
(m2 > m1) and (m1 > m3)

True

# Observation:
- When we look at the m values, m2 > m1 > m3, which means that each additional unit spent on Radio is associated with a larger increase in Sales than the same unit spent on TV or Newspaper
- Spend more on Radio, followed by TV
- Advertising on Newspaper can be stopped, as it is not adding much to Sales

In [26]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.9006409689782175

In [27]:
## Checking the error

from sklearn.metrics import mean_squared_error
# Root mean squared error (RMSE) on the test set, in the same units as Sales.
# Arguments follow scikit-learn's (y_true, y_pred) order, consistent with the
# r2_score call; the value is unchanged because squared error is symmetric.
np.sqrt(mean_squared_error(y_test, y_pred))

1.3045111912297258

In [28]:
# Predict Sales for TV = 50, Radio = 100, Newspaper = 0.
# The input is a DataFrame carrying the training column names (in training order),
# so scikit-learn can validate the features instead of receiving an unnamed list.
lr.predict(pd.DataFrame({"TV": [50], "Radio": [100], "Newspaper": [0]}))

array([17.55356491])

# Observations:
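- The root mean squared error (RMSE) on the test set is about 1.30, i.e. predictions are typically off by roughly 1.30 units of Sales.
- For the above prediction (17.55), the Sales value should therefore lie roughly in the range 16.25 to 18.86 (prediction ± RMSE)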