In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("advertising.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [4]:
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
df

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,9.7
197,177.0,9.3,6.4,12.8
198,283.6,42.0,66.2,25.5


## Problem Statement

- Build a model which predicts sales based on the money spent on different platforms for marketing

In [6]:
df.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   radio      200 non-null    float64
 2   newspaper  200 non-null    float64
 3   sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


- there are no null values
- all are having datatype of float64
- tv, radio have very small or insignificant skewness
- newspaper has siginificant skewness
- newspaper has mean > median and hence it is right skewed or +ve skewed

In [8]:
df.corr().style.background_gradient()

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.054809,0.056648,0.782224
radio,0.054809,1.0,0.354104,0.576223
newspaper,0.056648,0.354104,1.0,0.228299
sales,0.782224,0.576223,0.228299,1.0


- there is no correlation between tv and radio
- there is no correlation between tv and newspaper
- there is strong correlation between radio and newspaper
- due to this, the 2 columns are correlated with each other
- this is a case of multi collinearity
- so we will have to handle the multi collinearity case of radio and newspaper
- we will solve this issue by removing either of radio or newspaper columns
- but now which one to remove ?
- for that, we can see the least significant column based on its impact on the target variable
- we can see that radio is having a correlation value of 0.57 with target variable
- we can see that newspaper is having a correlation value of 0.228 with target variable
- so, to eliminate this scenario of multi collinearity, we are going to drop the newspaper column

### Building a model without dropping newspaper column 

In [16]:
x = df.iloc[:, 0:3]
y = df.iloc[:, -1]

In [17]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=123)

In [18]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(xtrain, ytrain)
ypred1 = lr.predict(xtest)

In [20]:
from sklearn.metrics import r2_score
r2_score(ytest, ypred1)

0.9205864667717336

### Building a model by dropping newspaper column 

In [21]:
x = df.iloc[:, 0:2]
y = df.iloc[:, -1]

In [22]:
lr2 = LinearRegression()
lr2.fit(xtrain, ytrain)
ypred2 = lr2.predict(xtest)

In [23]:
r2_score(ytest, ypred2)

0.9205864667717336