In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
# To split in test and train
from sklearn.model_selection import train_test_split


In [3]:
# Load the advertising dataset from a CSV located relative to the working directory
DATA_FILE = "Advertising Budget and Sales.csv"
ads = pd.read_csv(DATA_FILE)

In [4]:
# Preview the first five rows of the freshly loaded frame
ads.head(n=5)

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
ads.describe()

Unnamed: 0.1,Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,147.0425,23.264,30.554,14.0225
std,57.879185,85.854236,14.846809,21.778621,5.217457
min,1.0,0.7,0.0,0.3,1.6
25%,50.75,74.375,9.975,12.75,10.375
50%,100.5,149.75,22.9,25.75,12.9
75%,150.25,218.825,36.525,45.1,17.4
max,200.0,296.4,49.6,114.0,27.0


In [6]:
# Column names, non-null counts, dtypes and memory usage of the frame
ads.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               200 non-null    int64  
 1   TV Ad Budget ($)         200 non-null    float64
 2   Radio Ad Budget ($)      200 non-null    float64
 3   Newspaper Ad Budget ($)  200 non-null    float64
 4   Sales ($)                200 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [9]:
# Total number of cells in the frame (rows x columns)
ads.size

1000

Preprocessing and Cleaning Phase


In [13]:
# Remove the leftover CSV index column, which carries no information.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op instead of a KeyError on the already-removed column.
ads = ads.drop(columns=['Unnamed: 0'], errors='ignore')

# Check for null values: number of missing entries per column
ads.isna().sum()

In [14]:
# Preview the first five rows to confirm the unwanted column is gone
ads.head(n=5)

Unnamed: 0,TV Ad Budget ($),Radio Ad Budget ($),Newspaper Ad Budget ($),Sales ($)
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [None]:
# Shorten a column name by passing an explicit old -> new mapping
ads.rename(columns={'Sales ($)': 'Sales'}, inplace=True)
# or rename every column in one go (next cell)

In [15]:
# Rename by explicit old -> new mapping instead of assigning a positional list.
# A positional list silently mislabels data if the column order ever changes
# and raises if the column count differs; a mapping touches only the columns
# it names and ignores names that are already renamed, so the cell can be
# re-run safely.
ads = ads.rename(columns={
    'TV Ad Budget ($)': 'TV',
    'Radio Ad Budget ($)': 'Radio',
    'Newspaper Ad Budget ($)': 'News',
    'Sales ($)': 'Sales',
})
ads.head()

Unnamed: 0,TV,Radio,News,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


Create Objects to Train and Test the Model; Find
the sales figures for each channel

In [16]:
# Feature matrix: every column except the target
X = ads.drop(columns=['Sales'])
X

Unnamed: 0,TV,Radio,News
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4
...,...,...,...
195,38.2,3.7,13.8
196,94.2,4.9,8.1
197,177.0,9.3,6.4
198,283.6,42.0,66.2


In [19]:
# Target as a one-column DataFrame; selecting with a list of labels keeps it
# two-dimensional, whereas a single label would return a Series
y = ads.loc[:, ['Sales']]
y

Unnamed: 0,Sales
0,22.1
1,10.4
2,9.3
3,18.5
4,12.9
...,...
195,7.6
196,9.7
197,12.8
198,25.5


In [22]:
# Split into train and test sets.
# A fixed random_state makes the shuffle reproducible, so the fitted
# coefficients and the metrics below are the same on every run.
RANDOM_STATE = 3
# With no test_size given, a quarter of the rows are held out (150 / 50 here);
# pass test_size=0.5 for a 50 percent split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(150, 3) (50, 3) (150, 1) (50, 1)


In [23]:
# Peek at the first ten training rows. The row labels are out of order,
# which shows the split shuffled the data -- randomization matters here.
X_train.iloc[:10]

Unnamed: 0,TV,Radio,News
68,237.4,27.5,11.0
10,66.1,5.8,24.2
170,50.0,11.6,18.4
79,116.0,7.7,23.1
22,13.2,15.9,49.6
182,56.2,5.7,29.7
165,234.5,3.4,84.8
28,248.8,27.1,22.9
116,139.2,14.3,25.6
174,222.4,3.4,13.1


In [24]:
# The target rows carry the same shuffled labels as X_train, so each
# feature row stays paired with its own sales value.
y_train.iloc[:10]

Unnamed: 0,Sales
68,18.9
10,8.6
170,8.4
79,11.0
22,5.6
182,8.7
165,11.9
28,18.9
116,12.2
174,11.5


Model

In [28]:
# Instantiate the linear regression estimator
lr = LinearRegression()
# Learn the best-fit line from the training split
lr.fit(X=X_train, y=y_train)

In [30]:
# Show the fitted coefficients (one per feature), then the intercept
print(lr.coef_, lr.intercept_, sep="\n")

[[ 0.0462253   0.18219645 -0.00243461]]
[3.08057668]


In [45]:
# Predict sales for the held-out test rows
test_predict = lr.predict(X_test)
# Number of predictions produced
test_predict.shape[0]

50

In [None]:
# Build a side-by-side table of actual vs. predicted sales for the test set.

# Predictions as a one-column frame with a default 0..n-1 index
df = pd.DataFrame(test_predict, columns=['Column_A'])
# Move y_test's shuffled row labels into an 'index' column so the rows are
# renumbered 0..n-1 ...
y_test1 = y_test.reset_index()
# ... then discard those labels, leaving only the true target values
df2 = y_test1.drop(['index'], axis=1)
# Both frames now share the same 0..n-1 index, so a column-wise concat lines
# up each true value with its prediction. (pd.DataFrame(df2, df) failed
# because the second positional argument is `index`, not additional data.)
prediction = pd.concat([df2, df], axis=1)
prediction.head()

In [49]:
# Mean squared error
from sklearn.metrics import mean_squared_error,r2_score

In [48]:
# Mean squared error between the true test targets and the predictions
mean_squared_error(y_true=y_test, y_pred=test_predict)

2.4353295596663047

In [50]:
# R^2 score: how well the fitted line explains the test data.
# A value around 0.9 means roughly 90 percent of the variance in sales is
# captured by the regression line.
# To reproduce the exact same split (and therefore the same score), pass a
# fixed random_state (e.g. random_state=3) to train_test_split.
r2_score(y_true=y_test, y_pred=test_predict)

0.9072617738507079