In [56]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import warnings
import seaborn as sns
import pandas as pd
import matplotlib as mpl
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): two matplotlib backend magics are issued back to back;
# presumably only the last one (widget) stays in effect — confirm which
# backend is intended and drop the other.
%matplotlib inline
%matplotlib widget
import statsmodels.formula.api as sm
from statsmodels.formula.api import ols
from math import sqrt
from mpl_toolkits.mplot3d import Axes3D



In [51]:
# Load the advertising data set; the first CSV column becomes the row index.
advertising_df = pd.read_csv('datasets/Advertising.csv', index_col=0)

In [52]:
# Preview the first four rows. Positional `head` replaces the label slice
# `.loc[0:4]`: the row index starts at 1, so label 0 does not exist and the
# slice only worked because the index happens to be sorted.
advertising_df.head(4)

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5


In [53]:
# Grid of pairwise plots for every pair of columns; kind='reg' overlays a
# fitted regression line on each scatter panel.
sns.pairplot(data=advertising_df,kind='reg')

<IPython.core.display.Javascript object>

<seaborn.axisgrid.PairGrid at 0x1a25066d68>

In [6]:
# Simple linear regression: sales explained by the TV budget only.
mod = ols(formula='sales ~ TV', data=advertising_df)
# `res` is read by the following cells (p-values, error estimate).
res = mod.fit()
# Text report with coefficients, standard errors, t-statistics and fit metrics.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.612
Model:                            OLS   Adj. R-squared:                  0.610
Method:                 Least Squares   F-statistic:                     312.1
Date:                Tue, 14 Aug 2018   Prob (F-statistic):           1.47e-42
Time:                        09:39:59   Log-Likelihood:                -519.05
No. Observations:                 200   AIC:                             1042.
Df Residuals:                     198   BIC:                             1049.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      7.0326      0.458     15.360      0.0

Esto dice que el intervalo de confianza del 95 % de $\beta_{0}$ es [6.130, 7.935], lo que quiere decir que, en ausencia de propaganda en TV, las ventas promedio se encontrarán en ese intervalo.
También dice que el intervalo de confianza del 95 % para $\beta_{1}$ es [0.042, 0.053], lo que quiere decir que por cada \$1000 adicionales invertidos en TV las ventas aumentan, en promedio, entre 42 y 53 unidades.

In [7]:
# P-values of the coefficient t-tests for the most recently fitted `res`
# (here: Intercept and TV).
res.pvalues

Intercept    1.406300e-35
TV           1.467390e-42
dtype: float64

Los p-valores son bajísimos, con lo cual se rechaza la hipótesis nula de que los coeficientes son cero en favor de la hipótesis alternativa de que los coeficientes son distintos de cero.

In [8]:
# Square root of the model's *total* mean squared error.
# NOTE(review): `mse_total` is derived from the total sum of squares of the
# response, so this looks like the spread of `sales` itself rather than the
# residual standard error of the fit (that would be sqrt(res.mse_resid)) —
# confirm which quantity is intended.
sqrt(res.mse_total)

5.217456565710478

In [9]:
# Multiple linear regression: sales explained by all three advertising budgets.
# Note that `mod` and `res` from the earlier single-predictor fit are overwritten here.
mod = ols(formula='sales ~ TV + radio + newspaper', data=advertising_df)
res = mod.fit()
# Text report with coefficients, standard errors, t-statistics and fit metrics.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     570.3
Date:                Tue, 14 Aug 2018   Prob (F-statistic):           1.58e-96
Time:                        09:40:03   Log-Likelihood:                -386.18
No. Observations:                 200   AIC:                             780.4
Df Residuals:                     196   BIC:                             793.6
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9389      0.312      9.422      0.0

In [10]:
# Simple linear regression: sales explained by the newspaper budget only,
# for comparison with the multiple regression above.
# `mod` and `res` are overwritten once more.
mod = ols(formula='sales ~ newspaper', data=advertising_df)
res = mod.fit()
# Text report with coefficients, standard errors, t-statistics and fit metrics.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     10.89
Date:                Tue, 14 Aug 2018   Prob (F-statistic):            0.00115
Time:                        09:40:04   Log-Likelihood:                -608.34
No. Observations:                 200   AIC:                             1221.
Df Residuals:                     198   BIC:                             1227.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.3514      0.621     19.876      0.0

In [11]:
# Pairwise correlation matrix of all columns (TV, radio, newspaper, sales).
advertising_df.corr()

Unnamed: 0,TV,radio,newspaper,sales
TV,1.0,0.054809,0.056648,0.782224
radio,0.054809,1.0,0.354104,0.576223
newspaper,0.056648,0.354104,1.0,0.228299
sales,0.782224,0.576223,0.228299,1.0


Haciendo la regresión de sales solo sobre newspaper se ve que newspaper parece tener influencia sobre sales, pero haciendo la regresión sobre todas las variables se ve que tiene poca influencia.

The correlation between radio and newspaper is 0.35. This reveals a
tendency to spend more on newspaper advertising in markets where more
is spent on radio advertising.

Consider an absurd example to illustrate the point. Running
a regression of shark attacks versus ice cream sales for data collected at
a given beach community over a period of time would show a positive
relationship, similar to that seen between sales and newspaper. Of course
no one (yet) has suggested that ice creams should be banned at beaches
to reduce shark attacks. In reality, higher temperatures cause more people
to visit the beach, which in turn results in more ice cream sales and more
shark attacks. A multiple regression of attacks versus ice cream sales and
temperature reveals that, as intuition implies, the former predictor is no
longer significant after adjusting for temperature.

In [62]:
# Additive model without newspaper: sales explained by TV and radio.
mod = ols(formula='sales ~ TV + radio', data=advertising_df)
# Kept under its own name so it can be passed to the surface-plotting helper.
simple_model_fit = mod.fit()
# Text report with coefficients, standard errors, t-statistics and fit metrics.
print(simple_model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.897
Model:                            OLS   Adj. R-squared:                  0.896
Method:                 Least Squares   F-statistic:                     859.6
Date:                Tue, 14 Aug 2018   Prob (F-statistic):           4.83e-98
Time:                        10:57:03   Log-Likelihood:                -386.20
No. Observations:                 200   AIC:                             778.4
Df Residuals:                     197   BIC:                             788.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.9211      0.294      9.919      0.0

In [64]:
# Model with a TV x radio interaction term in addition to both main effects.
# NOTE(review): in the formula syntax `TV*radio` presumably already expands to
# TV + radio + TV:radio, so the explicit `TV + radio` looks redundant (the
# summary reports Df Model = 3 either way) — harmless, confirm and simplify.
mod = ols(formula='sales ~ TV + radio + TV*radio', data=advertising_df)
# Kept under its own name so it can be passed to the surface-plotting helper.
complex_model_fit = mod.fit()
# Text report with coefficients, standard errors, t-statistics and fit metrics.
print(complex_model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.968
Model:                            OLS   Adj. R-squared:                  0.967
Method:                 Least Squares   F-statistic:                     1963.
Date:                Tue, 14 Aug 2018   Prob (F-statistic):          6.68e-146
Time:                        10:57:47   Log-Likelihood:                -270.14
No. Observations:                 200   AIC:                             548.3
Df Residuals:                     196   BIC:                             561.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      6.7502      0.248     27.233      0.0

In [61]:
def plot_linear_regression_surface(advertising_df, model_fit, grid_size=100):
    """Plot observed sales together with a fitted regression surface in 3D.

    The surface is evaluated on a regular grid spanning the observed range
    of the ``TV`` and ``radio`` columns, and drawn semi-transparently over a
    scatter of the actual observations.

    Parameters
    ----------
    advertising_df : pandas.DataFrame
        Observations; must provide the columns ``TV``, ``radio`` and ``sales``.
    model_fit : fitted model results
        Object whose ``predict(exog=...)`` accepts a DataFrame with the
        columns ``TV`` and ``radio`` and returns one prediction per row.
    grid_size : int, optional
        Number of grid points along each axis. Defaults to 100, the value
        that was previously hard-coded.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    tv_axis = np.linspace(advertising_df['TV'].min(), advertising_df['TV'].max(), grid_size)
    radio_axis = np.linspace(advertising_df['radio'].min(), advertising_df['radio'].max(), grid_size)
    x_surf, y_surf = np.meshgrid(tv_axis, radio_axis)

    # One row per grid node, using the column names the model's predict() is given.
    grid_points = pd.DataFrame({'TV': x_surf.ravel(), 'radio': y_surf.ravel()})

    # np.asarray replaces Series.as_matrix(), which was deprecated and later
    # removed from pandas; it works whether predict() returns a Series or an array.
    fitted_surface = np.asarray(model_fit.predict(exog=grid_points)).reshape(x_surf.shape)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(advertising_df['TV'], advertising_df['radio'], advertising_df['sales'],
               c='red', marker='o')
    # Semi-transparent so the observations behind the surface remain visible.
    ax.plot_surface(x_surf, y_surf, fitted_surface, alpha=0.5)
    ax.set_xlabel('TV')
    ax.set_ylabel('Radio')
    ax.set_zlabel('Sales')
    plt.show()

In [None]:
# The helper requires the data and a fitted model; calling it with no
# arguments raises TypeError. Plot the additive TV + radio fit here; pass
# `complex_model_fit` instead to see the surface with the interaction term.
plot_linear_regression_surface(advertising_df, simple_model_fit)