In [1]:
"""
Problem Statement

You have been provided with a dataset that contains the costs of advertising on different media channels
and corresponding sales of XYZ firm. Evaluate the dataset to:
1. Find the features or media channels used by the firm.
2. Find the sales figures for each channel.
3. Create a model to predict the sales outcome.
4. Split it into training and testing datasets for the model.
5. Calculate the mean squared error (MSE)
"""

'\nProblem Statement\n\nYou have been provided with a dataset that contains the costs of advertising on different media channels\nand corresponding sales of XYZ firm. Evaluate the dataset to:\n1. Find the features or media channels used by the firm.\n2. Find the sales figures for each channel.\n3. Create a model to predict the sales outcome.\n4. Split it into training and testing datasets for the model.\n5. Calculate the mean squared error (MSE)\n'

In [2]:
#import the required libraries
import pandas as pd

In [3]:
#import the advertising dataset
#the path is a raw string: it contains Windows backslashes, and '\d' / '\A' are not
#valid escape sequences in a normal string literal
#index_col=0 uses the first CSV column as the row index
df_adv_data = pd.read_csv(r'C:\dataset\Advertising.csv', index_col=0)

In [4]:
#preview the first five rows of the advertising data
df_adv_data.head(n=5)

Unnamed: 0,TV,Radio,Newspaper,Sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


In [5]:
#view dataset size: the total number of cells (rows x columns), not the row count
df_adv_data.size

800

In [6]:
#view shape of the dataset as a (rows, columns) tuple
df_adv_data.shape
#the output shows 200 observations (rows) and 4 columns

(200, 4)

In [7]:
#view the column labels of the dataset
df_adv_data.columns

#this shows that the dataset has 3 features (TV, Radio, Newspaper) and 1 response (Sales)

Index(['TV', 'Radio', 'Newspaper', 'Sales'], dtype='object')

In [8]:
#build the feature frame from the three media-channel columns
X_feature = df_adv_data.loc[:, ['Newspaper', 'Radio', 'TV']]

In [9]:
#preview the first five rows of the feature frame
X_feature.head(n=5)

Unnamed: 0,Newspaper,Radio,TV
1,69.2,37.8,230.1
2,45.1,39.3,44.5
3,69.3,45.9,17.2
4,58.5,41.3,151.5
5,58.4,10.8,180.8


In [10]:
#build the target from the Sales column (the response in the dataset)
#selecting with a one-item list keeps it a one-column DataFrame
Y_target = df_adv_data.loc[:, ['Sales']]

In [11]:
#preview the first five rows of the target frame
Y_target.head(n=5)

Unnamed: 0,Sales
1,22.1
2,10.4
3,9.3
4,18.5
5,12.9


In [12]:
#view the feature object shape as a (rows, columns) tuple
X_feature.shape

(200, 3)

In [13]:
#view the target object shape as a (rows, columns) tuple
Y_target.shape

(200, 1)

In [18]:
#hold out part of the data for testing
#test_size=0.25 spells out the split used when no size is given (75% training, 25% testing)
#random_state=1 fixes the shuffle

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X_feature,
    Y_target,
    test_size=0.25,
    random_state=1,
)

In [19]:
#show the shapes of the splits, one per line, in this order:
#x_train, y_train, x_test, y_test
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape, sep='\n')

(150, 3)
(150, 1)
(50, 3)
(50, 1)


In [20]:
#bring in the linear regression estimator
from sklearn.linear_model import LinearRegression

#instantiate the estimator with its default settings
linreg = LinearRegression()

#train the model on the training split (features first, then response)
linreg.fit(X=x_train, y=y_train)

In [24]:
#show the fitted intercept, then the coefficients (one per feature column)
print(linreg.intercept_, linreg.coef_, sep='\n')

[2.87696662]
[[0.00345046 0.17915812 0.04656457]]


In [25]:
#predict the response for the held-out test features with the fitted model
#the result is an array of predicted values; compare it with the actual
#values in y_test to judge whether the model is accurate
y_pred = linreg.predict(X=x_test)
y_pred

array([[21.70910292],
       [16.41055243],
       [ 7.60955058],
       [17.80769552],
       [18.6146359 ],
       [23.83573998],
       [16.32488681],
       [13.43225536],
       [ 9.17173403],
       [17.333853  ],
       [14.44479482],
       [ 9.83511973],
       [17.18797614],
       [16.73086831],
       [15.05529391],
       [15.61434433],
       [12.42541574],
       [17.17716376],
       [11.08827566],
       [18.00537501],
       [ 9.28438889],
       [12.98458458],
       [ 8.79950614],
       [10.42382499],
       [11.3846456 ],
       [14.98082512],
       [ 9.78853268],
       [19.39643187],
       [18.18099936],
       [17.12807566],
       [21.54670213],
       [14.69809481],
       [16.24641438],
       [12.32114579],
       [19.92422501],
       [15.32498602],
       [13.88726522],
       [10.03162255],
       [20.93105915],
       [ 7.44936831],
       [ 3.64695761],
       [ 7.22020178],
       [ 5.9962782 ],
       [18.43381853],
       [ 8.39408045],
       [14

In [26]:
#A better way to test your model's accuracy is to calculate its MSE (mean square error)
#import the required libraries for calculating MSE (mean square error)
from sklearn import metrics
import numpy as np


In [27]:
#calculate the mean squared error (MSE) and its square root (RMSE)
#metrics.mean_squared_error gives the MSE of the predictions against the test response
mse = metrics.mean_squared_error(y_test, y_pred)

#np.sqrt takes the square root of the MSE, which gives the RMSE
rmse = np.sqrt(mse)

#the value printed here is the RMSE, not the MSE; the MSE itself is held in `mse`
print(rmse)

1.4046514230328957
