In [26]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.linear_model import LinearRegression

In [27]:
data = pd.read_csv('Advertising.csv')
data.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [28]:
data.describe()

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [29]:
x = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']

In [30]:
reg = LinearRegression()
reg.fit(x, y)

LinearRegression()

In [31]:
reg.coef_

array([ 0.04576465,  0.18853002, -0.00103749])

In [32]:
reg.intercept_

2.9388893694594085

Calculating R-squared

In [34]:
reg.score(x, y)

0.8972106381789522

### Formula  for adjusted r^2
$R^2_{adj.} = 1 - (1 - R^2)*\frac{n-1}{n-p-1}$

In [40]:
x.shape

(200, 3)

In [41]:
r2 = reg.score(x, y)

n = x.shape[0]
p = x.shape[1]

adjusted_r2 = 1 - (1-r2)* (n-1)/(n-p-1)
adjusted_r2

0.8956373316204668

### Feature Selection

In [42]:
from sklearn.feature_selection import f_regression


In [43]:
f_regression(x, y)

(array([312.14499437,  98.42158757,  10.88729908]),
 array([1.46738970e-42, 4.35496600e-19, 1.14819587e-03]))

p_values = f_regression(x, y)[1]
p_values.round(3)

### Crating a summary Table

In [49]:
reg_summary = pd.DataFrame(data = x.columns.values, columns=['Fearures'])
reg_summary

Unnamed: 0,Fearures
0,TV
1,Radio
2,Newspaper


In [50]:
reg_summary['coefficient'] = reg.coef_
reg_summary['p-values'] = p_values.round(3)
reg_summary

Unnamed: 0,Fearures,coefficient,p-values
0,TV,0.045765,0.0
1,Radio,0.18853,0.0
2,Newspaper,-0.001037,0.001


### Standardization

In [52]:
from sklearn.preprocessing import StandardScaler

In [54]:
scaler = StandardScaler()

In [55]:
scaler.fit(x)

StandardScaler()

In [57]:
x_scaled = scaler.transform(x)

### Regression with scaled features

In [59]:
reg = LinearRegression()
reg.fit(x_scaled, y)

LinearRegression()

In [60]:
reg.coef_

array([ 3.91925365,  2.79206274, -0.02253861])

In [61]:
reg.intercept_

14.0225

### Creating Summary Table

In [65]:
reg_summary = pd.DataFrame([['Tv'], ['Radio'], ['Newspaper']], columns=['Fearures'])
reg_summary['Weights'] = reg.intercept_, reg.coef_[0], reg.coef_[1]

In [66]:
reg_summary

Unnamed: 0,Fearures,Weights
0,Tv,14.0225
1,Radio,3.919254
2,Newspaper,2.792063


### Making Predictions with standardized coefficients(weights)

In [69]:
new_data = pd.DataFrame(data=[[230, 40, 70]], columns=['Tv', 'Radio', 'Newspaper'])
new_data

Unnamed: 0,Tv,Radio,Newspaper
0,230,40,70


In [73]:
new_data_scaled = scaler.transform(new_data)
new_data_scaled

Feature names unseen at fit time:
- Tv
Feature names seen at fit time, yet now missing:
- TV



array([[0.96868458, 1.1300743 , 1.81577092]])

In [75]:
reg.predict(new_data_scaled)

array([20.93333399])