In [51]:
!pip install pandas plotly statsmodels &> /dev/null

In [73]:
import pandas as pd
import numpy as np
import plotly.express as px

import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.options.plotting.backend = "plotly"

### Basic Linear Regression

In [53]:
df = pd.read_csv("advertising.csv")
df.head(3)

Unnamed: 0,tv,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3


In [72]:
# Relationship between TV adverts and Sales
fig = df.plot(kind="scatter", x="tv", y="sales", trendline="ols")
fig.data[1].line={'dash': 'dash', 'color': 'red'}
fig

In [77]:
# Ordinary Least Squares Model (regress sales against TV adverts)
# Can just use plotly to get a univariate statsmodel OLS 
tv_model = px.get_trendline_results(fig).iloc[0][0] # == smf.ols(data=df, formula='sales ~ tv').fit()
tv_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x13af00d90>

In [81]:
full_model = smf.ols( data=df, formula='sales ~ tv + radio + newspaper').fit()
full_model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.897
Model:,OLS,Adj. R-squared:,0.896
Method:,Least Squares,F-statistic:,570.3
Date:,"Tue, 10 May 2022",Prob (F-statistic):,1.58e-96
Time:,19:54:28,Log-Likelihood:,-386.18
No. Observations:,200,AIC:,780.4
Df Residuals:,196,BIC:,793.6
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.9389,0.312,9.422,0.000,2.324,3.554
tv,0.0458,0.001,32.809,0.000,0.043,0.049
radio,0.1885,0.009,21.893,0.000,0.172,0.206
newspaper,-0.0010,0.006,-0.177,0.860,-0.013,0.011

0,1,2,3
Omnibus:,60.414,Durbin-Watson:,2.084
Prob(Omnibus):,0.0,Jarque-Bera (JB):,151.241
Skew:,-1.327,Prob(JB):,1.44e-33
Kurtosis:,6.332,Cond. No.,454.0


In [82]:
interaction_model = smf.ols( data=df, formula='sales ~ tv + radio + tv:radio').fit()
interaction_model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.968
Model:,OLS,Adj. R-squared:,0.967
Method:,Least Squares,F-statistic:,1963.0
Date:,"Tue, 10 May 2022",Prob (F-statistic):,6.68e-146
Time:,19:55:31,Log-Likelihood:,-270.14
No. Observations:,200,AIC:,548.3
Df Residuals:,196,BIC:,561.5
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.7502,0.248,27.233,0.000,6.261,7.239
tv,0.0191,0.002,12.699,0.000,0.016,0.022
radio,0.0289,0.009,3.241,0.001,0.011,0.046
tv:radio,0.0011,5.24e-05,20.727,0.000,0.001,0.001

0,1,2,3
Omnibus:,128.132,Durbin-Watson:,2.224
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1183.719
Skew:,-2.323,Prob(JB):,9.089999999999998e-258
Kurtosis:,13.975,Cond. No.,18000.0


In [90]:
import random

random_row = random.sample(range(199), 1)
df_train = df.drop(random_row, axis=0)
df_test = df.iloc[random_row]

In [91]:
model = smf.ols(data=df, formula = 'sales ~ tv + radio + tv:radio').fit()

In [92]:
(pd.DataFrame(
    dict(prediction=model.predict(df_test.drop('sales', axis=1)), 
         actual=df_test['sales']))
    .assign(diff=lambda df: df.prediction - df.actual))


Unnamed: 0,prediction,actual,diff
145,9.774562,10.3,-0.525438


### --

- TODO add extra examples from private notes and extra next step links:

`https://www.analyticsvidhya.com/blog/2020/03/what-is-multicollinearity/`