# Simple Linear Regression
Material from Chapter 8 of Larose and Larose and external sources

02/15/2019 - Jeff Smith

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

In [None]:
# Some support functions - based on w[0] as intercept; w[1] as slope
def cost(y, x, w):
    """Return the residual sum of squares of the line w[0] + w[1]*x against y.

    y, x -- observed responses and predictor values; they must support
            element-wise arithmetic (e.g. NumPy arrays or pandas Series).
    w    -- indexable pair: w[0] is the intercept, w[1] is the slope.
    """
    fitted = w[0] + w[1]*x
    residuals = y - fitted
    return sum(residuals**2)

def show(y, x, w = (0, 0), show_reg = 1) :
    """Scatter-plot y against x and optionally overlay the line w[0] + w[1]*x.

    y, x     -- response and predictor values to plot.
    w        -- indexable pair (intercept, slope). The default is an immutable
                tuple so no mutable object is shared between calls.
    show_reg -- when truthy, draw the regression line as a dashed line
                spanning the current x-axis limits.

    When either coefficient is non-zero, the RSS (via cost) and the
    coefficients are printed before plotting.
    """
    if w[0] or w[1]:
        print("Solution: RSS={:,.3f}; w = [{:.4f}, {:.4f}]".format(cost(y, x, w), w[0], w[1]))
    plt.figure(figsize=(8,6))
    plt.scatter(x, y)
    if show_reg:
        # Evaluate the line at the two x-axis limits only; two points define it.
        x_vals = np.array(plt.gca().get_xlim())
        y_vals = w[0] + w[1] * x_vals
        plt.plot(x_vals, y_vals, '--')

## Sample Datasets

In [None]:
# Cereals dataset from Larose and Larose
# Load the raw data file
cereals = pd.read_csv("../data/cereals.csv")
# Name values carry stray whitespace -- strip it so the name filter below matches
cereals["Name"] = cereals["Name"].str.strip()
# Drop Quaker Oatmeal -- no Sugar values (see the book)
cereals = cereals.loc[cereals["Name"] != 'Quaker_Oatmeal']
cereals.head()

In [None]:
# Summary statistics for the cereals data (after Quaker Oatmeal was removed)
cereals.describe()

In [None]:
# Scatter plot of Rating against Sugars; show_reg=0 suppresses the regression line
w = [0, 0]
show(cereals["Rating"], cereals["Sugars"], w, show_reg=0)

In [None]:
# Poverty dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file into a DataFrame and preview the first rows
poverty = pd.read_csv("../data/teen_birthrate_poverty.csv")
poverty.head()

In [None]:
# Summary statistics for the poverty data
poverty.describe()

In [None]:
# Scatter plot only (no regression line) - Brth15to17 ~ PovPct
w = [0, 0]
show(poverty["Brth15to17"], poverty["PovPct"], w, show_reg=0)

In [None]:
# Lung Function dataset from https://newonlinecourses.science.psu.edu/stat462/node/101/
# Read the raw data file into a DataFrame and preview the first rows
lung = pd.read_csv("../data/lung_function.csv")
lung.head()

In [None]:
# Summary statistics for the lung function data
lung.describe()

In [None]:
# Scatter plot only (no regression line) - FEV ~ age
w = [0, 0]
show(lung["FEV"], lung["age"], w, show_reg=0)

In [None]:
# Random, uncorrelated values
# rnd - x ~ U(0,100), y ~ expo(18); 150 independent draws of each.
# No seed is set, so the sample differs on every run.
x = np.random.uniform(low=0, high=100, size=150)
y = np.random.exponential(scale=18, size=150)
rnd = pd.DataFrame({"x": x, "y": y})
rnd.head()

In [None]:
# Summary statistics for the random data
rnd.describe()

In [None]:
# Scatter plot only (no regression line) - y ~ x for the random data
w = [0, 0]
show(rnd["y"], rnd["x"], w, show_reg=0)

## Our Regression Function

In [None]:
def regress(y, x, show = 0):
    """Fit y = w[0] + w[1]*x by least squares using the closed-form sums.

    y, x -- response and predictor values; they must support element-wise
            multiplication (e.g. NumPy arrays or pandas Series).
    show -- when truthy, print the intermediate sums and the solution.
            Inside this function the name hides the module-level show()
            plotting helper.

    Returns the list [intercept, slope].
    """
    n_obs = len(y)
    sum_y, sum_x = np.sum(y), np.sum(x)
    sum_xy, sum_xx = np.sum(x*y), np.sum(x*x)
    if show:
        print("N = {:}, sy = {:}, sx = {:}, sxy = {:}, sxx = {:}".format(n_obs, sum_y, sum_x, sum_xy, sum_xx))
    # slope = (Sxy - Sx*Sy/N) / (Sxx - Sx^2/N)
    slope = (sum_xy - (sum_x * sum_y)/n_obs) / (sum_xx - (sum_x * sum_x)/n_obs)
    # intercept = y_bar - slope * x_bar
    intercept = sum_y/n_obs - slope*sum_x/n_obs
    if show:
        print("W = ({:}, {:})".format(intercept, slope))
    return [intercept, slope]

In [None]:
# cereals: fit Rating ~ Sugars with our regress(), then plot the data and the line
w = regress(cereals["Rating"], cereals["Sugars"])
show(cereals["Rating"], cereals["Sugars"], w)

In [None]:
# poverty: fit Brth15to17 ~ PovPct, printing the intermediate sums (show=1)
w = regress(poverty["Brth15to17"], poverty["PovPct"], show=1)
show(poverty["Brth15to17"], poverty["PovPct"], w)

In [None]:
# lung function: fit FEV ~ age, then plot the data and the line
w = regress(lung["FEV"], lung["age"])
show(lung["FEV"], lung["age"], w)

In [None]:
# rnd: fit y ~ x on the random data (no intermediate output), then plot
w = regress(rnd["y"], rnd["x"], show=0)
show(rnd["y"], rnd["x"], w)

### Hiking Data Example

In [None]:
# Hiking example from Larose and Larose
hike = pd.DataFrame({
    'x': [2, 2, 3, 4, 4, 5, 6, 7, 8, 9],
    'y': [10, 11, 12, 13, 14, 15, 20, 18, 22, 25],
})
# Fit y ~ x with our regress() and plot the data with the fitted line
w = regress(hike["y"], hike["x"], show=0)
show(hike["y"], hike["x"], w)

In [None]:
# mean response (y_bar)
hike["y"].mean()

In [None]:
# (y_j - y_bar)^2 -- squared deviation of each response from the mean response
(hike["y"] - hike["y"].mean())**2

In [None]:
# SST -- total sum of squares: the squared deviations from the mean, summed
sum((hike["y"] - hike["y"].mean())**2)

In [None]:
# Per-observation pieces of the sums of squares, using the line y = 6 + 2x
# and the value 16 as the reference level for the total and regression terms.
hike['y_j'] = 6 + 2*hike['x']                   # value on the line at each x
hike['residual'] = hike['y'] - hike['y_j']      # observed minus line value
hike['perr'] = hike['residual']**2              # squared residual (summed as SSE)
hike['terr'] = (16 - hike['y'])**2              # squared deviation from 16 (summed as SST)
hike['reg'] = (16 - hike['y_j'])**2             # squared deviation of line value from 16 (summed as SSR)
hike

In [None]:
# Sum the per-observation columns into SSE, SST and SSR
print("SSE = {:.2f}; SST = {:.2f}, SSR = {:.2f}".format(
    sum(hike['perr']), sum(hike['terr']), sum(hike['reg'])))

In [None]:
# r^2 = SSR / SST
print("r^2 = {:.2f}".format(sum(hike['reg']) / sum(hike['terr'])))

## Scipy's Basic Regression Function

In [None]:
# Cereals: Rating ~ Sugars via scipy.
# The first two unpacked values go to w[1] (slope) and w[0] (intercept).
w = [0, 0]
w[1], w[0], rval, pval, stderr = scipy.stats.linregress(cereals["Sugars"], cereals["Rating"])
print(
    "Intercept: {:.2f}; Slope: {:.2f}; r-Square: {:.2f}; p-value: {:.2f}; se grad.: {:.2f}".format(
        w[0], w[1], rval**2, pval, stderr
    )
)
show(cereals["Rating"], cereals["Sugars"], w)

In [None]:
# Poverty: Brth15to17 ~ PovPct via scipy.
# The first two unpacked values go to w[1] (slope) and w[0] (intercept).
w = [0, 0]
w[1], w[0], rval, pval, stderr = scipy.stats.linregress(poverty["PovPct"], poverty["Brth15to17"])
print(
    "Intercept: {:.2f}; Slope: {:.2f}; r-Square: {:.2f}; p-value: {:.2f}; se grad.: {:.2f}".format(
        w[0], w[1], rval**2, pval, stderr
    )
)
show(poverty["Brth15to17"], poverty["PovPct"], w)

In [None]:
# Lung Function: FEV ~ age via scipy.
# The first two unpacked values go to w[1] (slope) and w[0] (intercept).
w = [0, 0]
w[1], w[0], rval, pval, stderr = scipy.stats.linregress(lung["age"], lung["FEV"])
print(
    "Intercept: {:.2f}; Slope: {:.2f}; r-Square: {:.2f}; p-value: {:.2f}; se grad.: {:.2f}".format(
        w[0], w[1], rval**2, pval, stderr
    )
)
show(lung["FEV"], lung["age"], w)

In [None]:
# rnd: y ~ x on the random data via scipy.
# The first two unpacked values go to w[1] (slope) and w[0] (intercept).
w = [0, 0]
w[1], w[0], rval, pval, stderr = scipy.stats.linregress(rnd["x"], rnd["y"])
print(
    "Intercept: {:.2f}; Slope: {:.2f}; r-Square: {:.2f}; p-value: {:.2f}; se grad.: {:.2f}".format(
        w[0], w[1], rval**2, pval, stderr
    )
)
show(rnd["y"], rnd["x"], w)

In [None]:
# Hiking Data: y ~ x via scipy.
# The first two unpacked values go to w[1] (slope) and w[0] (intercept).
w = [0, 0]
w[1], w[0], rval, pval, stderr = scipy.stats.linregress(hike["x"], hike["y"])
print(
    "Intercept: {:.2f}; Slope: {:.2f}; r-Square: {:.2f}; p-value: {:.2f}; se grad.: {:.2f}".format(
        w[0], w[1], rval**2, pval, stderr
    )
)
show(hike["y"], hike["x"], w)

## Statsmodels OLS function

In [None]:
import statsmodels.api as sm

In [None]:
# Cereals: Rating ~ Sugars with statsmodels OLS
# add_constant supplies the constant column so the model has an intercept term
X = sm.add_constant(cereals["Sugars"])
result = sm.OLS(endog=cereals["Rating"], exog=X).fit()
result.summary()

In [None]:
# rnd: y ~ x with statsmodels OLS
# add_constant supplies the constant column so the model has an intercept term
X = sm.add_constant(rnd["x"])
result = sm.OLS(endog=rnd["y"], exog=X).fit()
result.summary()

## Normal Probability Plots of Residuals

In [None]:
# The hike residuals were already computed and stored in hike['residual']
res = scipy.stats.probplot(hike["residual"], plot=plt)

In [None]:
# Poverty Brth15to17 ~ PovPct
# Residual = observed - fitted. The coefficients come from a fit on the data
# rather than hand-typed rounded values, so they are exact and follow the data.
poverty_fit = scipy.stats.linregress(poverty.PovPct, poverty.Brth15to17)
poverty['residuals'] = poverty.Brth15to17 - (poverty_fit.intercept + poverty_fit.slope*poverty.PovPct)

In [None]:
 res = scipy.stats.probplot(poverty.residuals, plot=plt)

In [None]:
# Rnd data
# rnd is drawn at random without a seed, so hard-coded coefficients would not
# match the current sample. Fit the line here and use its coefficients.
# Residual = observed - fitted.
rnd_fit = scipy.stats.linregress(rnd.x, rnd.y)
rnd['residuals'] = rnd.y - (rnd_fit.intercept + rnd_fit.slope*rnd.x)

In [None]:
 res = scipy.stats.probplot(rnd.residuals, plot=plt)