# Multiple Regression and Model Building
Material from Chapter 9 of Larose and Larose and external sources

02/20/2019 - Jeff Smith

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [None]:
# Some support functions - based on w[0] as intercept; w[1] as slope
def cost(y, x, w) :
    return sum((y-(w[0] + w[1]*x))**2)

def show(y, x, w = [0,0], show_reg = 1) :
    if w[0] or w[1]:
        print("Solution: RSS={:,.3f}; w = [{:.4f}, {:.4f}]".format(cost(y, x, w), w[0], w[1]))
    plt.figure(figsize=(8,6))
    plt.scatter(x, y);
    if show_reg:
        axes = plt.gca()
        x_vals = np.array(axes.get_xlim())
        y_vals = w[0] + w[1] * x_vals
        plt.plot(x_vals, y_vals, '--');

## Sample Datasets

In [3]:
# Cerals dataset from Larose and Larose
# Read the raw data file
cereals = pd.read_csv("../data/cereals.csv")
# The Name field has some trailing spaces -- remove them
cereals.Name = cereals.Name.str.strip()
# Get rid of Quaker Oatmeal -- no Sugar values (see the book)
cereals = cereals[cereals.Name != 'Quaker_Oatmeal']
cereals.head()

Unnamed: 0,Name,Manuf,Type,Calories,Protein,Fat,Sodium,Fiber,Carbo,Sugars,...,Weight,Cups,Rating,Cold,Nabisco,Quaker,Kelloggs,GeneralMills,Ralston,AHFP
0,100%_Bran,N,C,70,4,1,130,10.0,5.0,6.0,...,1.0,0.33,68.402973,1,1,0,0,0,0,0
1,100%_Natural_Bran,Q,C,120,3,5,15,2.0,8.0,8.0,...,1.0,1.0,33.983679,1,0,1,0,0,0,0
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,...,1.0,0.33,59.425505,1,0,0,1,0,0,0
3,All-Bran_with_Extra_Fiber,K,C,50,4,0,140,14.0,8.0,0.0,...,1.0,0.5,93.704912,1,0,0,1,0,0,0
4,Almond_Delight,R,C,110,2,2,200,1.0,14.0,8.0,...,1.0,0.75,34.384843,1,0,0,0,0,1,0


In [None]:
cereals.describe()

In [None]:
cereals.columns

In [None]:
# Show the scatter plot
w = [0,0]
show(cereals.Rating,cereals.Sugars,w,0)

In [None]:
# Poverty dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file
poverty = pd.read_csv("../data/teen_birthrate_poverty.csv")
poverty.head()

In [None]:
poverty.describe()

In [None]:
# Show the scatter plot - Brth15to17 ~ PovPct
w =[0,0]
show(poverty.Brth15to17, poverty.PovPct, w, 0)

In [None]:
# Lung Function dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file
lung = pd.read_csv("../data/lung_function.csv")
lung.head()

In [None]:
lung.describe()

In [None]:
# FEV ~ age
w =[0,0]
show(lung.FEV, lung.age, w, 0)

## Statsmodels OLS function

### Cereals Dataset

In [None]:
# Cereals
X = cereals[['Sugars']]
y = cereals.Rating
# by default OLS doesn't compute the constant
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

In [None]:
X = cereals[['Sugars', 'Fiber']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

In [None]:
X = cereals[['Fiber']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

In [None]:
X = cereals[['Sugars', 'Fiber', 'Sodium']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

In [None]:
X = cereals[['Sugars', 'Fiber', 'Sodium', 'Fat']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

In [None]:
X = cereals[['Sugars', 'Fiber', 'Sodium', 'Fat', 'Carbo']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
print("                       SSR: {:.3f}".format(model.ess))
model.summary()

### Regression with Categorical Variables Using Indicator Variables

In [None]:
# Categorial variable with the cereal shelf (1, 2, 3)
cereals['Shelf']

In [None]:
# Create the two indicator variables
cereals['shelf_1'] = 0
cereals['shelf_2'] = 0
# Set the values according to the category for each
cereals.loc[cereals.Shelf==1, 'shelf_1'] = 1
cereals.loc[cereals.Shelf==2, 'shelf_2'] = 1
# show the three variables
cereals[['Shelf', 'shelf_1', 'shelf_2']]

In [None]:
# Regress on the two shelf indicator variables
X = cereals[['shelf_1', 'shelf_2']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

In [None]:
# Add sugars and fiber
X = cereals[['Sugars', 'Fiber', 'shelf_1', 'shelf_2']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

In [None]:
# Remove shelf_1
X = cereals[['Sugars', 'Fiber', 'shelf_2']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

### Poverty Dataset

In [None]:
X = poverty[['PovPct']]
y = poverty.TeenBrth
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

In [None]:
X = poverty[['PovPct', 'ViolCrime']]
y = poverty.TeenBrth
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

## Sequential Sum of Squares

In [32]:
def seq_sum_squares(df, fields, target):
    curr = []
    prev = 0.0
    for col in fields:
        curr.append(col)
        X = df[curr]
        X = sm.add_constant(X)
        y = df[target]
        model = sm.OLS(y, X).fit()
        print("Seq. SS: {:8.1f}; SSR: {:8.1f}; {:}".format(model.ess - prev, model.ess, curr))
        prev = model.ess

In [33]:
cols = ['Sugars', 'Fiber', 'shelf_1', 'shelf_2']
seq_sum_squares(cereals, cols, 'Rating')

Seq. SS:   8711.9; SSR:   8711.9; ['Sugars']
Seq. SS:   3476.6; SSR:  12188.6; ['Sugars', 'Fiber']
Seq. SS:      7.0; SSR:  12195.6; ['Sugars', 'Fiber', 'shelf_1']
Seq. SS:    159.9; SSR:  12355.4; ['Sugars', 'Fiber', 'shelf_1', 'shelf_2']


In [34]:
cols = ['shelf_1', 'shelf_2', 'Sugars', 'Fiber']
seq_sum_squares(cereals, cols, 'Rating')

Seq. SS:    282.7; SSR:    282.7; ['shelf_1']
Seq. SS:   1392.7; SSR:   1675.4; ['shelf_1', 'shelf_2']
Seq. SS:   7179.0; SSR:   8854.4; ['shelf_1', 'shelf_2', 'Sugars']
Seq. SS:   3501.0; SSR:  12355.4; ['shelf_1', 'shelf_2', 'Sugars', 'Fiber']


In [35]:
cols = ['Sugars', 'Fiber', 'Sodium', 'Fat', 'Carbo']
seq_sum_squares(cereals, cols, 'Rating')

Seq. SS:   8711.9; SSR:   8711.9; ['Sugars']
Seq. SS:   3476.6; SSR:  12188.6; ['Sugars', 'Fiber']
Seq. SS:   1575.7; SSR:  13764.3; ['Sugars', 'Fiber', 'Sodium']
Seq. SS:    760.9; SSR:  14525.2; ['Sugars', 'Fiber', 'Sodium', 'Fat']
Seq. SS:      4.4; SSR:  14529.5; ['Sugars', 'Fiber', 'Sodium', 'Fat', 'Carbo']


## Multicollinearity

In [39]:
# Add sugars and fiber
X = cereals[['Fiber', 'Potass']]
y = cereals.Rating
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
predictions = model.predict(X)
print("Std. Error of the estimate: {:.3f}".format(np.sqrt(model.scale)))
model.summary()

  umr_maximum(a, axis, None, out, keepdims),
  umr_minimum(a, axis, None, None, keepdims),


MissingDataError: exog contains inf or nans