In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import math

### Outcome is a 2-level categorical variable
- The outcome $Y_i$ takes the value 1 with probability $p_i$ and the value 0 with probability $1 - p_i$

$\textit{transformation}(p_i) = \beta_0 + \beta_1\textit{x}_{1,i} + \beta_2\textit{x}_{2,i} + ... + \beta_k\textit{x}_{k,i}$

Logistic regression uses the logit (the log-odds) as this transformation:

$\textit{logit}(p_i) = \log_e\left(\frac{p_i}{1-p_i}\right)$

In [126]:
# Load the resume data; the path is relative to the notebook's working directory.
resume_csv = '../data/resume.csv'
df = pd.read_csv(resume_csv)

In [127]:
# Peek at the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,job_ad_id,job_city,job_industry,job_type,job_fed_contractor,job_equal_opp_employer,job_ownership,job_req_any,job_req_communication,job_req_education,...,honors,worked_during_school,years_experience,computer_skills,special_skills,volunteer,military,employment_holes,has_email_address,resume_quality
0,384,Chicago,manufacturing,supervisor,,1,unknown,1,0,0,...,0,0,6,1,0,0,0,1,0,low
1,384,Chicago,manufacturing,supervisor,,1,unknown,1,0,0,...,0,1,6,1,0,1,1,0,1,high
2,384,Chicago,manufacturing,supervisor,,1,unknown,1,0,0,...,0,1,6,1,0,0,0,0,0,low
3,384,Chicago,manufacturing,supervisor,,1,unknown,1,0,0,...,0,0,6,1,1,1,0,1,1,high
4,385,Chicago,other_service,secretary,0.0,1,nonprofit,1,0,0,...,0,1,22,1,0,0,0,0,1,high


In [128]:
# Check column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4870 entries, 0 to 4869
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   job_ad_id               4870 non-null   int64  
 1   job_city                4870 non-null   object 
 2   job_industry            4870 non-null   object 
 3   job_type                4870 non-null   object 
 4   job_fed_contractor      3102 non-null   float64
 5   job_equal_opp_employer  4870 non-null   int64  
 6   job_ownership           4870 non-null   object 
 7   job_req_any             4870 non-null   int64  
 8   job_req_communication   4870 non-null   int64  
 9   job_req_education       4870 non-null   int64  
 10  job_req_min_experience  2124 non-null   object 
 11  job_req_computer        4870 non-null   int64  
 12  job_req_organization    4870 non-null   int64  
 13  job_req_school          4870 non-null   object 
 14  received_callback       4870 non-null   

In [129]:
# Single-predictor logistic regression of received_callback on honors.
# Response: the binary outcome being modelled.
y = df['received_callback']

# Design matrix: add_constant adds the intercept column (reported as 'const')
# alongside the 'honors' predictor.
X = sm.add_constant(df['honors'])

# Build the (not yet fitted) model; fitting happens in the next cell.
mod = sm.Logit(endog=y, exog=X)

In [130]:
# Fit by maximum likelihood; fit() itself prints the optimizer's convergence message.
res = mod.fit()

# Build the text summary (coefficients plus fit statistics such as AIC/BIC) and print it.
fit_summary = res.summary2()
print(fit_summary)

Optimization terminated successfully.
         Current function value: 0.277898
         Iterations 6
                          Results: Logit
Model:              Logit             Pseudo R-squared: 0.007     
Dependent Variable: received_callback AIC:              2710.7232 
Date:               2022-01-23 21:43  BIC:              2723.7049 
No. Observations:   4870              Log-Likelihood:   -1353.4   
Df Model:           1                 LL-Null:          -1363.5   
Df Residuals:       4868              LLR p-value:      6.9836e-06
Converged:          1.0000            Scale:            1.0000    
No. Iterations:     6.0000                                        
--------------------------------------------------------------------
           Coef.    Std.Err.      z       P>|z|     [0.025    0.975]
--------------------------------------------------------------------
const     -2.4998     0.0556   -44.9577   0.0000   -2.6088   -2.3908
honors     0.8668     0.1776     4.8800   0.0

In [131]:
# Fitted coefficients on the log-odds scale: the intercept ('const') and the 'honors' slope.
res.params

const    -2.499795
honors    0.866827
dtype: float64

In [132]:
# The intercept is logit(p) when honors = 0, so exponentiating it gives the
# odds p / (1 - p) of a callback for a resume without honors.
intercept = res.params['const']
output = np.exp(intercept)
print(output)

0.08210180623973737


In [133]:
# Convert the odds back to a probability. Starting from odds = p / (1 - p):
#   p = odds * (1 - p) = odds - odds * p
#   p * (1 + odds) = odds
#   p = odds / (1 + odds)
odds = output
p = odds / (1 + odds)
print(p)

0.07587253414264045


### Akaike information criterion

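$\mathrm{AIC} = 2k - 2\ln(\hat L)$, where $k$ is the number of estimated parameters and $\hat L$ is the maximized value of the model's likelihood function $P(y \mid X)$. For the model above, $k = 2$ (intercept and `honors`) and $\ln(\hat L) \approx -1353.36$, so $\mathrm{AIC} \approx 4 + 2706.72 = 2710.72$.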



In [134]:
# AIC of the fitted single-predictor model; a baseline for comparing against larger models.
res.aic

2710.7231826312927

### Building the logistic model with many variables

#### Backward elimination / forward selection using p-values or AIC (lower AIC is better)

### Diagnostics
- Each outcome $Y_i$ is independent of the other outcomes.
- Each predictor $x_j$ is linearly related to $\textit{logit}(p_i)$ when all other predictors are held constant.