In [1]:
# conventional way to import pandas
import pandas as pd
import numpy as np

In [2]:
# read CSV file directly from a URL and save the results
data = pd.read_csv('/home/avinash/Documents/datasets/Advertising.csv')

# display the first 5 rows
data.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
Unnamed: 0    200 non-null int64
TV            200 non-null float64
Radio         200 non-null float64
Newspaper     200 non-null float64
Sales         200 non-null float64
dtypes: float64(4), int64(1)
memory usage: 7.9 KB


In [4]:
# display the last 5 rows
data.tail()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
195,196,38.2,3.7,13.8,7.6
196,197,94.2,4.9,8.1,9.7
197,198,177.0,9.3,6.4,12.8
198,199,283.6,42.0,66.2,25.5
199,200,232.1,8.6,8.7,13.4


In [5]:
# check the shape of the DataFrame (rows, columns)
data.shape

(200, 5)

In [6]:
data.corr()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
Unnamed: 0,1.0,0.017715,-0.11068,-0.154944,-0.051616
TV,0.017715,1.0,0.054809,0.056648,0.782224
Radio,-0.11068,0.054809,1.0,0.354104,0.576223
Newspaper,-0.154944,0.056648,0.354104,1.0,0.228299
Sales,-0.051616,0.782224,0.576223,0.228299,1.0


In [7]:
# use the list to select a subset of the original DataFrame
X = data[['TV', 'Radio', 'Newspaper']]

# print the first 5 rows
X.head()

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [8]:
# check the type and shape of X
print(type(X))

print(X.shape)

<class 'pandas.core.frame.DataFrame'>
(200, 3)


In [9]:
# select a Series from the DataFrame
y = data['Sales']

# equivalent command that works if there are no spaces in the column name
y = data.Sales

# print the first 5 values
y.head()

0    22.1
1    10.4
2     9.3
3    18.5
4    12.9
Name: Sales, dtype: float64

In [13]:
#split training set and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

In [14]:
# default split is 75% for training and 25% for testing
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(150, 3)
(150,)
(50, 3)
(50,)


In [15]:

# import model
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
# print the intercept and coefficients
print(linreg.intercept_)
print(linreg.coef_)

2.99657301707572
[ 0.04579495  0.19366795 -0.00641799]


In [17]:
feature_cols=['TV', 'Radio', 'Newspaper']
# pair the feature names with the coefficients
zip(feature_cols, linreg.coef_)

[('TV', 0.045794950455703504),
 ('Radio', 0.19366795411234963),
 ('Newspaper', -0.00641798808528736)]

In [18]:
# make predictions on the testing set
y_pred = linreg.predict(X_test)

In [19]:
# calculate MAE using scikit-learn
from sklearn import metrics
print(metrics.mean_absolute_error(y_test,y_pred))

1.147437695943176


In [20]:
#calculate mean squared error
print(metrics.mean_squared_error(y_test, y_pred))

2.290961909377689


In [21]:
# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

1.5135923854782334


In [22]:
# compute the R Square for model
metrics.r2_score(y_test, y_pred)

0.8919805972342759

In [23]:
#Does Newspaper "belong" in our model? In other words, does it improve the quality of our predictions?
#Let's remove it from the model and check the RMSE!
# create a Python list of feature names
feature_cols = ['TV', 'Radio']

# use the list to select a subset of the original DataFrame
X = data[feature_cols]

# select a Series from the DataFrame
y = data.Sales

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# compute the R Square for model
print(metrics.r2_score(y_test, y_pred))

1.3879034699382888
0.9176214942248908


In [None]:
#The RMSE decreased when we removed Newspaper from the model. 
#(Error is something we want to minimize, so a lower number for RMSE is better.) 
#Thus, it is unlikely that this feature is not useful for predicting Sales, and should be removed from the model.