# Multiple Regression and Model Building
Material from Chapter 9 of Larose and Larose and external sources

02/20/2019 - Jeff Smith

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import datasets

In [2]:
# Some support functions - based on w[0] as intercept; w[1] as slope
def cost(y, x, w):
    """Return the residual sum of squares (RSS) of the line ``w[0] + w[1]*x``.

    Parameters
    ----------
    y : array-like or scalar
        Observed response values.
    x : array-like or scalar
        Predictor values, combined element-wise with ``y``.
    w : sequence of two numbers
        ``w[0]`` is the intercept and ``w[1]`` is the slope.

    Returns
    -------
    numpy scalar
        Sum of the squared residuals ``y - (w[0] + w[1]*x)``.
    """
    residuals = y - (w[0] + w[1] * x)
    # Convert only after the arithmetic so pandas inputs are combined exactly
    # as before (index alignment, NaN propagation). np.sum then reduces in one
    # vectorized call and, unlike the builtin sum(), also accepts a single
    # scalar observation instead of raising TypeError.
    return np.sum(np.asarray(residuals) ** 2)

def show(y, x, w = (0, 0), show_reg = 1):
    """Scatter-plot ``y`` against ``x`` and optionally overlay the line ``w``.

    Parameters
    ----------
    y, x : array-like
        Response and predictor values; ``x`` goes on the horizontal axis.
    w : sequence of two numbers, optional
        ``w[0]`` is the intercept and ``w[1]`` the slope.  When either one is
        non-zero, the RSS (from ``cost``) and the coefficients are printed.
        The default is an immutable tuple so it cannot be altered between calls.
    show_reg : bool or int, optional
        When truthy, draw ``w[0] + w[1]*x`` as a dashed line spanning the
        x-limits of the scatter plot.
    """
    if w[0] or w[1]:
        print("Solution: RSS={:,.3f}; w = [{:.4f}, {:.4f}]".format(cost(y, x, w), w[0], w[1]))
    fig, ax = plt.subplots(figsize=(8,6))
    ax.scatter(x, y)
    # Label the axes when the inputs carry a name (e.g. pandas Series) so the
    # figure can be read on its own; unnamed inputs are left unlabelled.
    x_name = getattr(x, "name", None)
    y_name = getattr(y, "name", None)
    if x_name is not None:
        ax.set_xlabel(str(x_name))
    if y_name is not None:
        ax.set_ylabel(str(y_name))
    if show_reg:
        # Evaluate the line at the current x-limits so it spans the plot
        x_vals = np.array(ax.get_xlim())
        y_vals = w[0] + w[1] * x_vals
        ax.plot(x_vals, y_vals, '--')

## Sample Datasets

In [10]:
# Cereals dataset from Larose and Larose
cereals = (
    # Read the raw data file
    pd.read_csv("../data/cereals.csv")
    # The Name field has some trailing spaces -- strip them
    .assign(Name=lambda frame: frame["Name"].str.strip())
    # Drop Quaker Oatmeal -- it has no Sugars value (see the book)
    .loc[lambda frame: frame["Name"] != 'Quaker_Oatmeal']
)
cereals.head()

Unnamed: 0,Name,Manuf,Type,Calories,Protein,Fat,Sodium,Fiber,Carbo,Sugars,...,Weight,Cups,Rating,Cold,Nabisco,Quaker,Kelloggs,GeneralMills,Ralston,AHFP
0,100%_Bran,N,C,70,4,1,130,10.0,5.0,6.0,...,1.0,0.33,68.402973,1,1,0,0,0,0,0
1,100%_Natural_Bran,Q,C,120,3,5,15,2.0,8.0,8.0,...,1.0,1.0,33.983679,1,0,1,0,0,0,0
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5.0,...,1.0,0.33,59.425505,1,0,0,1,0,0,0
3,All-Bran_with_Extra_Fiber,K,C,50,4,0,140,14.0,8.0,0.0,...,1.0,0.5,93.704912,1,0,0,1,0,0,0
4,Almond_Delight,R,C,110,2,2,200,1.0,14.0,8.0,...,1.0,0.75,34.384843,1,0,0,0,0,1,0


In [None]:
# Summary statistics for the cereals data (describe() covers the numeric columns by default)
cereals.describe()

In [None]:
# Scatter plot of Rating (vertical axis) against Sugars; no regression line is drawn
w = [0, 0]
show(y=cereals["Rating"], x=cereals["Sugars"], w=w, show_reg=0)

In [12]:
# Poverty dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file (the path is relative to the notebook's working directory)
poverty = pd.read_csv("../data/teen_birthrate_poverty.csv")
# Preview the first rows to confirm the load and see the available columns
poverty.head()

Unnamed: 0,Location,PovPct,Brth15to17,Brth18to19,ViolCrime,TeenBrth
0,Alabama,20.1,31.5,88.7,11.2,54.5
1,Alaska,7.1,18.9,73.7,9.1,39.5
2,Arizona,16.1,35.0,102.5,10.4,61.2
3,Arkansas,14.9,31.6,101.7,10.4,59.9
4,California,16.7,22.6,69.1,11.2,41.1


In [None]:
# Summary statistics for the poverty data (describe() covers the numeric columns by default)
poverty.describe()

In [None]:
# Scatter plot - Brth15to17 ~ PovPct (PovPct on the horizontal axis); no regression line
w = [0, 0]
show(y=poverty["Brth15to17"], x=poverty["PovPct"], w=w, show_reg=0)

In [None]:
# Lung Function dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file (the path is relative to the notebook's working directory)
lung = pd.read_csv("../data/lung_function.csv")
# Preview the first rows to confirm the load and see the available columns
lung.head()

In [None]:
# Summary statistics for the lung-function data (describe() covers the numeric columns by default)
lung.describe()

In [None]:
# Scatter plot - FEV ~ age (age on the horizontal axis); no regression line
w = [0, 0]
show(y=lung["FEV"], x=lung["age"], w=w, show_reg=0)

In [None]:
# Random, uncorrelated values
# rnd - x ~ U(0,100), y ~ expo(18)
# Draw from a locally seeded generator so that Restart & Run All reproduces
# the same sample (and the same statistics/plots below) on every run.
rng = np.random.RandomState(42)
x = rng.uniform(0, 100, 150)
y = rng.exponential(18, 150)
rnd = pd.DataFrame(data={"x":x, "y":y})
rnd.head()

In [None]:
# Summary statistics for the generated x and y columns
rnd.describe()

In [None]:
# Scatter plot of the random y against x; no regression line is drawn
w = [0, 0]
show(y=rnd["y"], x=rnd["x"], w=w, show_reg=0)

## Statsmodels OLS function

### Cereals Dataset

In [None]:
# Cereals: Rating ~ Sugars (add_constant supplies the intercept column)
y = cereals["Rating"]
X = sm.add_constant(cereals[["Sugars"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

In [None]:
# Cereals: Rating ~ Sugars + Fiber (add_constant supplies the intercept column)
y = cereals["Rating"]
X = sm.add_constant(cereals[["Sugars", "Fiber"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

In [None]:
# Cereals: Rating ~ Sugars + Fiber + Sodium (add_constant supplies the intercept column)
y = cereals["Rating"]
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

In [None]:
# Cereals: Rating ~ Sugars + Fiber + Sodium + Fat (add_constant supplies the intercept column)
y = cereals["Rating"]
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium", "Fat"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

In [None]:
# Cereals: Rating ~ Sugars + Fiber + Sodium + Fat + Carbo (add_constant supplies the intercept column)
y = cereals["Rating"]
X = sm.add_constant(cereals[["Sugars", "Fiber", "Sodium", "Fat", "Carbo"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

### Poverty Dataset

In [19]:
# Poverty: TeenBrth ~ PovPct (add_constant supplies the intercept column)
y = poverty["TeenBrth"]
X = sm.add_constant(poverty[["PovPct"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,TeenBrth,R-squared:,0.495
Model:,OLS,Adj. R-squared:,0.484
Method:,Least Squares,F-statistic:,47.95
Date:,"Wed, 20 Feb 2019",Prob (F-statistic):,8.67e-09
Time:,17:33:28,Log-Likelihood:,-182.53
No. Observations:,51,AIC:,369.1
Df Residuals:,49,BIC:,372.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,15.6737,4.032,3.888,0.000,7.571,23.776
PovPct,2.0255,0.292,6.925,0.000,1.438,2.613

0,1,2,3
Omnibus:,2.3,Durbin-Watson:,2.221
Prob(Omnibus):,0.317,Jarque-Bera (JB):,1.377
Skew:,0.046,Prob(JB):,0.502
Kurtosis:,2.2,Cond. No.,45.1


In [20]:
# Poverty: TeenBrth ~ PovPct + ViolCrime (add_constant supplies the intercept column)
y = poverty["TeenBrth"]
X = sm.add_constant(poverty[["PovPct", "ViolCrime"]])
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)
model.summary()

0,1,2,3
Dep. Variable:,TeenBrth,R-squared:,0.561
Model:,OLS,Adj. R-squared:,0.543
Method:,Least Squares,F-statistic:,30.68
Date:,"Wed, 20 Feb 2019",Prob (F-statistic):,2.61e-09
Time:,17:52:13,Log-Likelihood:,-178.93
No. Observations:,51,AIC:,363.9
Df Residuals:,48,BIC:,369.6
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.6848,3.869,4.571,0.000,9.906,25.463
PovPct,1.6304,0.312,5.227,0.000,1.003,2.258
ViolCrime,0.4037,0.150,2.697,0.010,0.103,0.705

0,1,2,3
Omnibus:,1.413,Durbin-Watson:,2.24
Prob(Omnibus):,0.493,Jarque-Bera (JB):,1.077
Skew:,0.071,Prob(JB):,0.584
Kurtosis:,2.302,Cond. No.,56.5
