# Linear Regression

### Mounting GDrive locally


In [0]:
# Mounts Google Drive at /content/gdrive; the read_csv paths used below live under that folder.
# Left commented out here - uncomment both lines when running on Google Colab.
# from google.colab import drive
# drive.mount('/content/gdrive')

## Load the data and check basic things about the data

In [0]:
import pandas as pd

In [0]:
# Read the TV marketing CSV from the Google Drive path and preview the first rows.
tv_csv_path = '/content/gdrive/My Drive/Colab Notebooks/Datasets/advertising/tvmarketing.csv'
tv_adv = pd.read_csv(tv_csv_path)
tv_adv.head()

In [0]:
# Preview the last rows of the frame
tv_adv.tail()

In [0]:
# Column dtypes and non-null counts
tv_adv.info()

In [0]:
# (rows, columns) of the frame
tv_adv.shape

In [0]:
# Summary statistics of the numeric columns
tv_adv.describe()

## Visualise data using seaborn library

In [0]:
import seaborn as sns

# To visualise in the notebook
%matplotlib inline

In [0]:
# Visualise the relationship between the feature (TV) and the response (Sales) using a scatterplot.
# `height` replaces the `size` keyword, which seaborn renamed in v0.9 and later removed.
sns.pairplot(tv_adv, x_vars=['TV'], y_vars='Sales', height=7, aspect=0.7, kind='scatter')

## Performing Simple Linear Regression

### Preparing X and y


*   scikit-learn expects X (the feature variables) as a 2-D array and y (the response variable) as a 1-D array
*   Pandas DataFrames and Series are accepted as well, because Pandas is built on top of NumPy



In [0]:
# Single feature column; selecting one column yields a Series, which is converted further below
X = tv_adv['TV']

X.head()

In [0]:
# Response variable
y = tv_adv['Sales']
y.head()

### Splitting data into Training and testing sets

In [0]:
from sklearn.model_selection import train_test_split
# 70% of the rows go to training, the rest to testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [0]:
# Report the container type of each piece of the split
for split_part in (X_train, X_test, y_train, y_test):
    print(type(split_part))

In [0]:
# Number of rows in the training and testing features
for split_part in (X_train, X_test):
    print(len(split_part))

In [0]:
# Shape of the training feature before it is converted to a 2-D array
X_train.shape

### Converting feature series into NumPy array

In [0]:
import numpy as np

# This is needed only when you are using a single feature; in this case, 'TV'.
# scikit-learn expects a 2-D feature matrix, so turn the 1-D feature into a single column.
# np.asarray(...).reshape(-1, 1) replaces `Series[:, np.newaxis]`, which recent pandas
# versions reject, and it is idempotent, so re-running this cell is safe.
X_train = np.asarray(X_train).reshape(-1, 1)
X_test = np.asarray(X_test).reshape(-1, 1)

In [0]:
# Shape of the training feature after the conversion above
print(X_train.shape)

In [0]:
# Report the container types again after the feature conversion
for split_part in (X_train, X_test, y_train, y_test):
    print(type(split_part))

### Perform Linear regression

In [0]:
from sklearn.linear_model import LinearRegression

# Creating LinearRegression object
lr = LinearRegression()

# fit() stores the learned parameters on `lr` itself (read back below via
# lr.intercept_ and lr.coef_), so its return value does not need to be assigned.
lr.fit(X_train, y_train)

In [0]:
# Intercept first, then the coefficient array, one per line
print(lr.intercept_, lr.coef_, sep='\n')

### Predictions

In [0]:
# Making predictions on the testing set with the model fitted above
y_pred = lr.predict(X_test)

In [0]:
# Inspect the container type of the predictions
type(y_pred)

In [0]:
# First five predicted values
print(y_pred[:5])

#### Computing MSE and R^2 values

In [0]:
from sklearn.metrics import mean_squared_error, r2_score
# Mean squared error between actual and predicted test values (not its square root)
mse = mean_squared_error(y_test, y_pred)

In [0]:
# R^2 score of the predictions on the test set
r_squared = r2_score(y_test, y_pred)

In [0]:
# Print each metric next to its label
for metric_label, metric_value in (('Mean Squared Error: ', mse), ('R_Squared value: ', r_squared)):
    print(metric_label, metric_value)

#### Actual vs prediction evaluation

In [0]:
# Actual vs Predicted

import matplotlib.pyplot as plt

# Plotting actual and predicted sales against a 1-based observation index.
# The index length is derived from y_test rather than hard-coded (it was fixed at 60),
# so the plot keeps working if the dataset or the train/test split changes.
c = list(range(1, len(y_test) + 1))  # generating index
fig = plt.figure()
plt.plot(c, y_test, color='blue', linewidth=2.5, linestyle='-', label='Actual')
plt.plot(c, y_pred, color='red', linewidth=2.5, linestyle='-', label='Predicted')
fig.suptitle('Actual  and Predicted', fontsize=20)
plt.xlabel('Index', fontsize=18)
plt.ylabel('Sales', fontsize=16)
# A legend tells the reader which line is which
plt.legend();


In [0]:
# Error terms (actual minus predicted) for each test observation.
# The index length follows y_test instead of the previously hard-coded 60.
c = list(range(1, len(y_test) + 1))
fig = plt.figure()
plt.plot(c, y_test - y_pred, color='blue', linewidth=2.5, linestyle='-')
fig.suptitle('Error terms', fontsize=20)
plt.xlabel('Index', fontsize=18)
plt.ylabel('y_test - y_pred', fontsize=16);

In [0]:
# Histogram of the error terms

# sns.histplot(..., kde=True) replaces the deprecated sns.distplot;
# stat='density' keeps the bars on a density scale next to the KDE curve.
sns.histplot(y_test - y_pred, kde=True, stat='density')
# On a histogram the x-axis carries the error values themselves, not an index
plt.xlabel('y_test - y_pred', fontsize=18)
plt.ylabel('Density', fontsize=16)
plt.show()

# The error terms look roughly normally distributed

In [0]:
# Scatter plot of actual (y_test) against predicted (y_pred) values

plt.scatter(y_test, y_pred)
plt.xlabel('y_test')
plt.ylabel('y_pred')

# The two look correlated: a straight line can be drawn through the points with small error.

## Multiple Linear Regression

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Load the data

In [0]:
# Read the full advertising CSV from the Google Drive path and preview the first rows.
advertising_csv_path = '/content/gdrive/My Drive/Colab Notebooks/Datasets/advertising/advertising.csv'
advertising = pd.read_csv(advertising_csv_path)
advertising.head()

In [0]:
# Column dtypes and non-null counts
advertising.info()

In [0]:
# Summary statistics of the numeric columns
advertising.describe()

### Visualise the data

In [0]:
# Pairwise plots across all columns of the frame
sns.pairplot(advertising)

In [0]:
# Scatterplots of each feature against Sales.
# `height` replaces the `size` keyword, which seaborn renamed in v0.9 and later removed.
sns.pairplot(advertising, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', height=7, aspect=0.7, kind='scatter')

### Preparing X and y

In [0]:
# Three feature columns (a DataFrame) and the response column (a Series)
X = advertising[['TV','Radio','Newspaper']]
y= advertising['Sales']

In [0]:
# Report the container types of the features and the response
for data_part in (X, y):
    print(type(data_part))

### Splitting data into train and test sets

In [0]:
from sklearn.model_selection import train_test_split

# Same 70/30 split settings and random_state as the single-feature split earlier in the notebook
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

In [0]:
# Report the container type of each piece of the split
for split_part in (X_train, X_test, y_train, y_test):
    print(type(split_part))

### Performing Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression

# Note: `lr` is rebound here, replacing the single-feature model fitted earlier
lr = LinearRegression()

# Fit on all three features (TV, Radio, Newspaper)
lr.fit(X_train, y_train)

In [0]:
# Show the intercept, then tabulate one coefficient per feature column

print(lr.intercept_)

coeff_df = pd.DataFrame(data=lr.coef_, index=X_test.columns, columns=['Coefficient'])
coeff_df

From the above result, we may infer that if the TV advertising value increases by 1 unit, Sales increases by about 0.045 units, with Radio and Newspaper held constant.

### Predictions

In [0]:
# Predictions of the three-feature model on the test set
y_pred = lr.predict(X_test)

In [0]:
# Inspect the container type of the predictions
print(type(y_pred))

In [0]:
# First ten predicted values
print(y_pred[:10])

#### Calculating Error terms

In [0]:
from sklearn.metrics import mean_squared_error, r2_score
# Mean squared error and R^2 of the three-feature model on the test set
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

In [0]:
# Print each metric next to its label
for metric_label, metric_value in (('Mean squared error: ', mse), ('R_square_value: ', r_squared)):
    print(metric_label, metric_value)

#### Optional: Checking for P-value using StatsModels

In [0]:
import statsmodels.api as sm

# statsmodels' OLS does not fit a constant on its own (sklearn does),
# so a constant column is added to the training features first.
X_train_sm = sm.add_constant(X_train)
print(X_train_sm.head())

# Fit ordinary least squares on the training features with the constant added
lr_1 = sm.OLS(y_train, X_train_sm).fit()

# Show the fitted coefficients
lr_1.params

In [0]:
# Full regression summary, including the p-values discussed below
lr_1.summary()

From the summary above, we can see that Newspaper is statistically insignificant because of its high p-value.

In [0]:
%matplotlib inline

In [0]:
# Annotated heatmap of the pairwise correlations between the columns
plt.figure(figsize=(5,5))
sns.heatmap(advertising.corr(), annot=True)

The heatmap above shows that Radio and Newspaper are highly correlated. We need to check the individual R-squared value of Newspaper against Sales.

#### Simple Linear Regression model on Newspaper vs Sales

In [0]:
import numpy as np
import pandas as pd

# Reuse the `advertising` frame loaded earlier instead of reading the same CSV a second time.
news_adv = advertising

X_news = news_adv['Newspaper']
y_news = news_adv['Sales']

from sklearn.model_selection import train_test_split
X_news_train, X_news_test, y_news_train, y_news_test = train_test_split(X_news, y_news, train_size=0.7, random_state=110)

# Shapes before the conversion (1-D)
print(X_news_train.shape)
print(X_news_test.shape)

# scikit-learn expects a 2-D feature matrix, so turn the single feature into one column.
# np.asarray(...).reshape(-1, 1) replaces `Series[:, np.newaxis]`, which recent pandas versions reject.
X_news_train = np.asarray(X_news_train).reshape(-1, 1)
X_news_test = np.asarray(X_news_test).reshape(-1, 1)

# Shapes after the conversion (one column)
print(X_news_train.shape)
print(X_news_test.shape)

# Fit a single-feature model of Sales on Newspaper
lr_news = LinearRegression()
lr_news.fit(X_news_train, y_news_train)
print(lr_news.intercept_)
print(lr_news.coef_)

y_news_pred = lr_news.predict(X_news_test)

# R^2 of the Newspaper-only model on its test set
r_squared_news = r2_score(y_news_test, y_news_pred)
print(r_squared_news)

Above, we can see that the R-squared value is very low, almost 0. Hence, removing Newspaper from the features should have little effect on predicting Sales.

### Implementing the results and running the model again

In [0]:
# Feature columns go into X, the response into y
X = advertising[['TV','Radio','Newspaper']]
y = advertising['Sales']

# NOTE(review): random_state=101 here differs from the 100 used for the earlier
# three-feature split, so the test rows are not the same as before - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=101)

# Removing Newspaper from our dataset
X_train_new = X_train[['TV','Radio']]
X_test_new = X_test[['TV','Radio']]

In [0]:
# Refit on TV and Radio only; `lr` is rebound to the new model
lr = LinearRegression().fit(X_train_new, y_train)

# Intercept first, then the coefficient array, one per line
print(lr.intercept_, lr.coef_, sep='\n')

In [0]:
# Predictions of the TV + Radio model on the matching test features
y_pred_new = lr.predict(X_test_new)

In [0]:
# Actual vs Predicted (TV + Radio model)

import matplotlib.pyplot as plt

# Plotting actual sales against the predictions of the refitted TV + Radio model.
# Uses y_pred_new: the older y_pred came from the three-feature model and a
# different train/test split, so it does not line up with the current y_test.
c = list(range(1, len(y_test) + 1))  # 1-based index sized to the test set
fig = plt.figure()
plt.plot(c, y_test, color='blue', linewidth=2.5, linestyle='-', label='Actual')
plt.plot(c, y_pred_new, color='red', linewidth=2.5, linestyle='-', label='Predicted')
fig.suptitle('Actual  and Predicted', fontsize=20)
plt.xlabel('Index', fontsize=18)
plt.ylabel('Sales', fontsize=16)
# A legend tells the reader which line is which
plt.legend();

In [0]:
# Error terms of the TV + Radio model.
# Uses y_pred_new (predictions for the current y_test); the older y_pred belongs
# to the previous model and a different train/test split.
c = list(range(1, len(y_test) + 1))  # 1-based index sized to the test set
fig = plt.figure()
plt.plot(c, y_test - y_pred_new, color='blue', linewidth=2.5, linestyle='-')
fig.suptitle('Error terms', fontsize=20)
plt.xlabel('Index', fontsize=18)
plt.ylabel('y_test - y_pred', fontsize=16);

In [0]:
# Plotting the distribution of the error term for the TV + Radio model.
# Uses y_pred_new (predictions for the current y_test); the older y_pred belongs
# to the previous model and a different train/test split.
# sns.histplot(..., kde=True) replaces the deprecated sns.distplot.
sns.histplot(y_test - y_pred_new, kde=True, stat='density')
plt.xlabel("Error = y - y_pred");

In [0]:
from sklearn.metrics import mean_squared_error, r2_score
# Mean squared error and R^2 of the TV + Radio model on the current test set
mse = mean_squared_error(y_test, y_pred_new)
r_squared = r2_score(y_test, y_pred_new)

In [0]:
# Print each metric next to its label
for metric_label, metric_value in (('Mean squared error: ', mse), ('R_Squared: ', r_squared)):
    print(metric_label, metric_value)

In [0]:
# Imported here as well: the earlier statsmodels cell is marked optional,
# so this cell must not depend on it having been run.
import statsmodels.api as sm

# statsmodels' OLS needs an explicit constant column to fit an intercept
X_train_final = sm.add_constant(X_train_new)

# Fit OLS on TV and Radio only
lm_final = sm.OLS(y_train, X_train_final).fit()

print(lm_final.summary())

The model with only TV and Radio now looks like a good fit, with a high R-squared value.