# Chapter 12 - Linear Models

# 13.2 Logistic Regression

In [1]:
import pandas as pd

In [2]:
# Let's look at the data from the ACS(American Community Survey) of New York


acs = pd.read_csv("C:/Users/adri_/Documents/GitHub- Adriana/Pandas for everyone/data/acs_ny.csv")
acs

Unnamed: 0,Acres,FamilyIncome,FamilyType,NumBedrooms,NumChildren,NumPeople,NumRooms,NumUnits,NumVehicles,NumWorkers,OwnRent,YearBuilt,HouseCosts,ElectricBill,FoodStamp,HeatingFuel,Insurance,Language
0,1-10,150,Married,4,1,3,9,Single detached,1,0,Mortgage,1950-1959,1800,90,No,Gas,2500,English
1,1-10,180,Female Head,3,2,4,6,Single detached,2,0,Rented,Before 1939,850,90,No,Oil,0,English
2,1-10,280,Female Head,4,0,2,8,Single detached,3,1,Mortgage,2000-2004,2600,260,No,Oil,6600,Other European
3,1-10,330,Female Head,2,1,2,4,Single detached,1,0,Rented,1950-1959,1800,140,No,Oil,0,English
4,1-10,330,Male Head,3,1,2,5,Single attached,1,0,Mortgage,Before 1939,860,150,No,Gas,660,Spanish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22740,10+,565000,Married,5,3,5,10,Single detached,2,2,Mortgage,1990-1999,1700,370,No,Gas,1000,English
22741,10+,599000,Married,4,0,2,6,Single detached,2,2,Mortgage,Before 1939,1300,100,No,Gas,3500,English
22742,10+,611700,Married,4,1,5,9,Single detached,5,3,Mortgage,Before 1939,410,100,No,Oil,1300,Spanish
22743,10+,621430,Married,3,2,4,11,Single detached,2,3,Mortgage,1970-1979,1600,80,No,Gas,800,Spanish


In [3]:
# Let's see all the column variables

acs.columns

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')

In [4]:
# To model this, we first need to create a binary response variable. 
# we split the FamilyIncome variable into a binary one


acs["ge150k"] = pd.cut(acs["FamilyIncome"],[0, 150000, acs["FamilyIncome"].max()],
                      labels = [0,1])

acs["ge150k_i"] = acs["ge150k"].astype(int)
acs["ge150k_i"].value_counts() 

0    18294
1     4451
Name: ge150k_i, dtype: int64

In [5]:
acs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22745 entries, 0 to 22744
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Acres         22745 non-null  object  
 1   FamilyIncome  22745 non-null  int64   
 2   FamilyType    22745 non-null  object  
 3   NumBedrooms   22745 non-null  int64   
 4   NumChildren   22745 non-null  int64   
 5   NumPeople     22745 non-null  int64   
 6   NumRooms      22745 non-null  int64   
 7   NumUnits      22745 non-null  object  
 8   NumVehicles   22745 non-null  int64   
 9   NumWorkers    22745 non-null  int64   
 10  OwnRent       22745 non-null  object  
 11  YearBuilt     22745 non-null  object  
 12  HouseCosts    22745 non-null  int64   
 13  ElectricBill  22745 non-null  int64   
 14  FoodStamp     22745 non-null  object  
 15  HeatingFuel   22745 non-null  object  
 16  Insurance     22745 non-null  int64   
 17  Language      22745 non-null  object  
 18  ge150k

# 13.2.1 Using statsmodels

In [6]:
# to perform logistic regression we can use the "logit" function

import statsmodels.formula.api as sm 

In [7]:
# Create the model
model = sm.logit("ge150k_i ~ HouseCosts + NumWorkers + OwnRent + NumBedrooms + FamilyType",
                data = acs)


results = model.fit()


Optimization terminated successfully.
         Current function value: 0.391651
         Iterations 7


In [8]:
# See logistic regression results
results. summary()


0,1,2,3
Dep. Variable:,ge150k_i,No. Observations:,22745.0
Model:,Logit,Df Residuals:,22737.0
Method:,MLE,Df Model:,7.0
Date:,"Fri, 14 Aug 2020",Pseudo R-squ.:,0.2078
Time:,11:10:38,Log-Likelihood:,-8908.1
converged:,True,LL-Null:,-11244.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-5.8081,0.120,-48.456,0.000,-6.043,-5.573
OwnRent[T.Outright],1.8276,0.208,8.782,0.000,1.420,2.236
OwnRent[T.Rented],-0.8763,0.101,-8.647,0.000,-1.075,-0.678
FamilyType[T.Male Head],0.2874,0.150,1.913,0.056,-0.007,0.582
FamilyType[T.Married],1.3877,0.088,15.781,0.000,1.215,1.560
HouseCosts,0.0007,1.72e-05,42.453,0.000,0.001,0.001
NumWorkers,0.5873,0.026,22.393,0.000,0.536,0.639
NumBedrooms,0.2365,0.017,13.985,0.000,0.203,0.270


In [9]:
# to interpret the results we will exponentiate them
import numpy as np


odds_ratios = np.exp(results.params)
odds_ratios


Intercept                  0.003003
OwnRent[T.Outright]        6.219147
OwnRent[T.Rented]          0.416310
FamilyType[T.Male Head]    1.332901
FamilyType[T.Married]      4.005636
HouseCosts                 1.000731
NumWorkers                 1.799117
NumBedrooms                1.266852
dtype: float64

# 13.2.2 Using sklearn

In [10]:
# We need to create dummy variables

predictors = pd.get_dummies(acs[["HouseCosts","NumWorkers", "OwnRent", "NumBedrooms","FamilyType"]],
                           drop_first = True)


In [11]:
# Create model
from sklearn import linear_model

lr = linear_model.LogisticRegression()

In [12]:
# Fit the model

results = lr.fit(X = predictors, y = acs["ge150k_i"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
# Show coeffcients and intercept

print(results.coef_)
print(results.intercept_)

[[ 5.84632930e-04  7.30468510e-01  2.83562884e-01  7.06690063e-02
  -2.12153835e+00 -1.03220120e+00  2.53193845e-01]]
[-4.8312]


In [14]:
# Printing results in a more atractive format

values = np.append(results.intercept_,results.coef_)

# Get the names of the values
names = np.append("intercept", predictors.columns)

In [15]:
# Put everything in a labelede dataframe

results = pd.DataFrame(values, index = names, columns = ["coef"])
results

Unnamed: 0,coef
intercept,-4.8312
HouseCosts,0.000585
NumWorkers,0.730469
NumBedrooms,0.283563
OwnRent_Outright,0.070669
OwnRent_Rented,-2.121538
FamilyType_Male Head,-1.032201
FamilyType_Married,0.253194


In [16]:
# To interpret the coefficients we still need to exponentiate the values

results["or"] = np.exp(results["coef"])
results

Unnamed: 0,coef,or
intercept,-4.8312,0.007977
HouseCosts,0.000585,1.000585
NumWorkers,0.730469,2.076053
NumBedrooms,0.283563,1.327852
OwnRent_Outright,0.070669,1.073226
OwnRent_Rented,-2.121538,0.119847
FamilyType_Male Head,-1.032201,0.356222
FamilyType_Married,0.253194,1.288133


# 13.3 Poisson Regression

# 13.3.1 Using statsmodels

In [17]:
 # Create the model
model = sm.poisson("NumChildren ~ FamilyIncome + FamilyType + OwnRent ",
                data = acs)


results = model.fit()

Optimization terminated successfully.
         Current function value: nan
         Iterations 1


  L = np.exp(np.dot(X,params) + exposure + offset)
  return -np.dot(L*X.T, X)
  return -np.dot(L*X.T, X)
  L = np.exp(np.dot(X,params) + offset + exposure)
  return mu >= 0
  oldparams) > tol)):


In [18]:
#see results

results.summary()

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,NumChildren,No. Observations:,22745.0
Model:,Poisson,Df Residuals:,22739.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 14 Aug 2020",Pseudo R-squ.:,
Time:,11:10:38,Log-Likelihood:,
converged:,True,LL-Null:,-30977.0
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,,,,,,
FamilyType[T.Male Head],,,,,,
FamilyType[T.Married],,,,,,
OwnRent[T.Outright],,,,,,
OwnRent[T.Rented],,,,,,
FamilyIncome,,,,,,
