In [1]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from numpy import e, log, dot, array, matrix, ones
from numpy.random import normal, binomial
from statsmodels.api import families
from graphviz import Digraph

  from pandas.core import datetools


# Simulation Part 6 - Generalized Linear Models

- $E(y) = \mu = g^{-1}(\boldsymbol{X}\boldsymbol{\beta})$
- $y \sim \mathrm{distribution}(\mu)$

## Let's define some link functions and their inverse functions

In [2]:
def identity(x):
    """Identity link function: hand back ``x`` exactly as it was given."""
    return x

In [3]:
def inv_identity(y):
    """Inverse of the identity link: hand back ``y`` exactly as it was given."""
    return y

In [4]:
def logit(x):
    """Logit link function: the natural log of the odds ``x / (1 - x)``.

    Args:
        x: Value(s) to transform; scalars and NumPy arrays are both handled
            element-wise by the arithmetic and ``log``.

    Returns:
        ``log(x / (1 - x))``.
    """
    odds = x / (1 - x)
    return log(odds)

In [5]:
def inv_logit(y):
    """Inverse logit (logistic sigmoid): map ``y`` to ``e**y / (1 + e**y)``.

    The textbook expression ``e ** y / (1 + e ** y)`` overflows once ``y``
    exceeds roughly 709: on arrays it evaluates to ``inf / inf = nan`` and on
    plain Python floats it raises ``OverflowError``. The same quantity is
    computed here as ``exp(-log(1 + exp(-y)))`` using ``np.logaddexp``, which
    never forms the huge intermediate, so extreme inputs saturate cleanly to
    0 or 1.

    Args:
        y: Scalar or array-like of values on the linear-predictor scale.

    Returns:
        The logistic transform of ``y``, element-wise, as a NumPy float or
        array with values in the closed interval [0, 1].
    """
    # log(1 + exp(-y)) evaluated without overflow for either sign of y.
    log_denominator = np.logaddexp(0.0, np.negative(y))
    return np.exp(-log_denominator)

## Some helper functions

In [6]:
def binomial_1(p):
    """Draw from a binomial distribution with a single trial per draw.

    Args:
        p: Success probability (or array of probabilities) forwarded to
            ``numpy.random.binomial``.

    Returns:
        Whatever ``binomial`` returns for one trial at probability ``p``.
    """
    return binomial(n=1, p=p)

In [7]:
def design_matrix(xs):
    """Build a design matrix with a leading column of ones.

    Args:
        xs: Sequence of predictor vectors; after conversion to an array it is
            transposed, so each vector in ``xs`` ends up as one column.

    Returns:
        Array whose first column is all ones (the intercept) followed by the
        transposed predictors.
    """
    # Transpose so that the inputs run along columns
    predictors = array(xs).T
    # One intercept entry per row of the transposed predictors
    intercept = ones(predictors.shape[0])
    return np.c_[intercept, predictors]

In [8]:
def generate_outcome(X, B, inv_link_func, distribution):
    """Simulate outcomes for a generalized linear model.

    Args:
        X: Design matrix.
        B: Coefficient vector that is dotted with ``X``.
        inv_link_func: Callable applied to ``dot(X, B)`` to obtain the
            expected value of each observation.
        distribution: Callable that receives those expected values and
            returns the outcomes.

    Returns:
        The result of ``distribution`` applied to the expected values.
    """
    # Linear predictor
    linear_predictor = dot(X, B)
    # Expected value of each observation on the response scale
    mu = inv_link_func(linear_predictor)
    # Outcomes produced from the supplied distribution
    return distribution(mu)

## Generate our data

In [9]:
# Seed NumPy's global random state so that a fresh Restart & Run All draws the
# same simulated data every time; the `normal` and `binomial` functions
# imported from numpy.random both draw from this global state.
SEED = 42
np.random.seed(SEED)

# Number of simulated observations
n = 1000

In [10]:
# Two standard-normal predictors of length n, drawn one after the other,
# assembled into a design matrix by design_matrix.
X = design_matrix([normal(size=n) for _ in range(2)])
X

array([[ 1.        ,  2.77822676, -0.647232  ],
       [ 1.        ,  0.66382539, -1.07268135],
       [ 1.        ,  1.74985856, -1.58401425],
       ...,
       [ 1.        ,  0.79190217,  0.63315631],
       [ 1.        ,  0.86137079,  1.11740233],
       [ 1.        , -0.2425289 ,  2.46920343]])

In [11]:
# Coefficients used to simulate the outcomes, in design-matrix column order:
# the first entry multiplies the column of ones (the intercept), the
# remaining two multiply the two predictors.
B = [1, 2, 3]

In [12]:
# Outcome for the linear model: identity inverse link, values drawn by `normal`
y_linear = generate_outcome(
    X,
    B,
    inv_link_func=inv_identity,
    distribution=normal,
)

In [13]:
# Outcome for the logistic model: inverse-logit link, values drawn by binomial_1
y_logit = generate_outcome(
    X,
    B,
    inv_link_func=inv_logit,
    distribution=binomial_1,
)

## Fit our data

In [14]:
# Gaussian family with an identity link, fitted to the linearly generated
# outcome. Two API points:
#   * the link is passed as an instance (`Identity()`), not as the bare class;
#     recent statsmodels releases reject a link class.
#   * the array-based `GLM` class is taken from `statsmodels.api` (`sm`);
#     `statsmodels.formula.api` is meant for the formula interface and newer
#     releases no longer expose the uppercase model classes there.
linear_family = families.Gaussian(link=families.links.Identity())
sm.GLM(y_linear, X, family=linear_family).fit().summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,1000.0
Model:,GLM,Df Residuals:,997.0
Model Family:,Gaussian,Df Model:,2.0
Link Function:,identity,Scale:,0.9590970874003126
Method:,IRLS,Log-Likelihood:,-1396.6
Date:,"Mon, 28 May 2018",Deviance:,956.22
Time:,09:16:22,Pearson chi2:,956.0
No. Iterations:,2,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.0250,0.031,33.030,0.000,0.964,1.086
x1,2.0143,0.030,67.271,0.000,1.956,2.073
x2,3.0105,0.029,102.402,0.000,2.953,3.068


In [15]:
# Binomial family with a logit link, fitted to the binary outcome (logistic
# regression as a GLM). Two API points:
#   * the link is passed as an instance (`Logit()`), not as the bare class;
#     recent statsmodels releases reject a link class.
#   * the array-based `GLM` class is taken from `statsmodels.api` (`sm`);
#     `statsmodels.formula.api` is meant for the formula interface and newer
#     releases no longer expose the uppercase model classes there.
logreg_family = families.Binomial(link=families.links.Logit())
sm.GLM(y_logit, X, family=logreg_family).fit().summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,1000.0
Model:,GLM,Df Residuals:,997.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-281.87
Date:,"Mon, 28 May 2018",Deviance:,563.75
Time:,09:16:22,Pearson chi2:,845.0
No. Iterations:,7,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.0611,0.122,8.684,0.000,0.822,1.301
x1,2.1564,0.167,12.934,0.000,1.830,2.483
x2,3.2274,0.222,14.534,0.000,2.792,3.663
