# Variable Selection Example


04/08/2019 - Jeff Smith

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

## Sample Datasets

In [None]:
# Cereals dataset from Larose and Larose.
# Load the raw CSV file.
cereals = pd.read_csv("../data/cereals.csv")
# Values in the Name column carry trailing spaces; strip them so the
# name comparison below matches.
cereals["Name"] = cereals["Name"].str.strip()
# Drop Quaker Oatmeal -- it has no Sugars value (see the book).
cereals = cereals.loc[cereals["Name"] != "Quaker_Oatmeal"]
#cereals.head()

## Statsmodels OLS function

### Cereals Dataset

In [None]:
# Cereals -- model 1: regress Rating on Sugars.
y = cereals["Rating"]
# sm.OLS does not fit an intercept on its own, so add a constant column
# to the design matrix explicitly.
X = sm.add_constant(cereals[["Sugars"]])
m1 = sm.OLS(y, X).fit()
predictions = m1.predict(X)
# Fit diagnostics. Note: statsmodels' `ssr` is the sum of squared residuals.
print(f"Std. Error of the estimate: {np.sqrt(m1.scale):.3f}")
print(f"                       SSR: {m1.ssr:.3f}")
print(f"                       MSE: {m1.mse_resid:.3f}")
# Keep the summary as the cell's final expression so the notebook renders it.
m1.summary()

In [None]:
# Model 2: regress Rating on Sugars and Fiber.
y = cereals["Rating"]
# Add the intercept column explicitly; sm.OLS does not include one itself.
X = sm.add_constant(cereals[["Sugars", "Fiber"]])
m2 = sm.OLS(y, X).fit()
predictions = m2.predict(X)
# Fit diagnostics. Note: statsmodels' `ssr` is the sum of squared residuals.
print(f"Std. Error of the estimate: {np.sqrt(m2.scale):.3f}")
print(f"                       SSR: {m2.ssr:.3f}")
print(f"                       MSE: {m2.mse_resid:.3f}")
# Keep the summary as the cell's final expression so the notebook renders it.
m2.summary()

In [None]:
# Partial F-test: compare the Sugars-only model (m1) with the nested
# extension that also includes Fiber (m2).
# From https://stackoverflow.com/questions/45243802/how-do-i-do-an-f-test-to-compare-nested-linear-models-in-python
anovaResults = anova_lm(m1, m2)
print(anovaResults)

In [None]:
# Model 3: regress Rating on Sugars, Fiber and Sodium.
y = cereals["Rating"]
# Add the intercept column explicitly; sm.OLS does not include one itself.
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium"]])
m3 = sm.OLS(y, X).fit()
predictions = m3.predict(X)
# Fit diagnostics. Note: statsmodels' `ssr` is the sum of squared residuals.
print(f"Std. Error of the estimate: {np.sqrt(m3.scale):.3f}")
print(f"                       SSR: {m3.ssr:.3f}")
print(f"                       MSE: {m3.mse_resid:.3f}")
# Keep the summary as the cell's final expression so the notebook renders it.
m3.summary()

In [None]:
# Partial F-test: compare the Sugars + Fiber model (m2) with the nested
# extension that also includes Sodium (m3).
# From https://stackoverflow.com/questions/45243802/how-do-i-do-an-f-test-to-compare-nested-linear-models-in-python
anovaResults = anova_lm(m2, m3)
print(anovaResults)

In [None]:
# Model 4: regress Rating on Sugars, Fiber, Sodium and Fat.
y = cereals["Rating"]
# Add the intercept column explicitly; sm.OLS does not include one itself.
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium", "Fat"]])
m4 = sm.OLS(y, X).fit()
predictions = m4.predict(X)
# Fit diagnostics. Note: statsmodels' `ssr` is the sum of squared residuals.
print(f"Std. Error of the estimate: {np.sqrt(m4.scale):.3f}")
print(f"                       SSR: {m4.ssr:.3f}")
print(f"                       MSE: {m4.mse_resid:.3f}")
# Keep the summary as the cell's final expression so the notebook renders it.
m4.summary()

In [None]:
# Partial F-test: compare the Sugars + Fiber + Sodium model (m3) with the
# nested extension that also includes Fat (m4).
# From https://stackoverflow.com/questions/45243802/how-do-i-do-an-f-test-to-compare-nested-linear-models-in-python
anovaResults = anova_lm(m3, m4)
print(anovaResults)

In [None]:
# Model 5: regress Rating on Sugars, Fiber, Sodium, Fat and Carbo.
y = cereals["Rating"]
# Add the intercept column explicitly; sm.OLS does not include one itself.
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium", "Fat", "Carbo"]])
m5 = sm.OLS(y, X).fit()
predictions = m5.predict(X)
# Fit diagnostics. Note: statsmodels' `ssr` is the sum of squared residuals.
print(f"Std. Error of the estimate: {np.sqrt(m5.scale):.3f}")
print(f"                       SSR: {m5.ssr:.3f}")
print(f"                       MSE: {m5.mse_resid:.3f}")
# Keep the summary as the cell's final expression so the notebook renders it.
m5.summary()

In [None]:
# Partial F-test: compare the Sugars + Fiber + Sodium + Fat model (m4) with
# the nested extension that also includes Carbo (m5).
# From https://stackoverflow.com/questions/45243802/how-do-i-do-an-f-test-to-compare-nested-linear-models-in-python
anovaResults = anova_lm(m4, m5)
print(anovaResults)