In [1]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
pd.set_option("max_columns", 80)

In [3]:
# Read cleaned dataset
df = pd.read_csv('data/interim_data/cdc_data_cleaned_with_ethnicities.csv')
df.head()

Unnamed: 0,resident_status,level_of_education,month_of_death,age,place_of_death_patient_status,marital_status,day_of_week_of_death,year,manner_of_death,was_autopsy_done,place_of_injury_for_icd_w00_y34_except_y06_and_y07_,icd_code_10th_revision,358_causes_of_death,113_causes_of_death,130_causes_of_infant_death,39_cause_recode,number_of_entity_axis_conditions,entity_condition_1,entity_condition_2,entity_condition_3,number_of_record_axis_conditions,record_condition_1,record_condition_2,record_condition_3,race,race_recode_3,race_recode_5,hispanic_origin,hispanic_originrace_recode,ethnicity
0,RESIDENTS,3.0,January,49,"Hospital, clinic or Medical Center",Divorced,Tuesday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",9,11O268,21O988,22O992,2,O266,O268,Not Available,White,White,White,Mexican,Mexican,Hispanic
1,RESIDENTS,3.0,January,40,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",6,11O995,21O995,31O266,2,O266,O268,Not Available,White,White,White,Mexican,Mexican,Hispanic
2,RESIDENTS,9.0,April,35,"Hospital, clinic or Medical Center",Married,Friday,2015,Natural,Yes,Not Available,O720,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",3,11O994,21O720,22O991,1,O720,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic
3,RESIDENTS,6.0,April,41,"Hospital, clinic or Medical Center",Married,Wednesday,2015,Natural,No,Not Available,O721,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",3,11O721,12O991,21O95,1,O721,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic
4,INTRASTATE NONRESIDENTS,3.0,June,28,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O038,Spontaneous abortion,Pregnancy with abortive outcome,Not Available,"Pregnancy, childbirth and the puerperium",8,11O996,12O998,13O995,1,O038,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic


In [4]:
# drop columns
df = df.drop([
    '39_cause_recode', 'entity_condition_2', 'entity_condition_3', 'record_condition_2', 'record_condition_3',
    'race', 'race_recode_3', 'race_recode_5', 'hispanic_origin', 'hispanic_originrace_recode',
    '130_causes_of_infant_death'], axis = 1)

In [5]:
df

Unnamed: 0,resident_status,level_of_education,month_of_death,age,place_of_death_patient_status,marital_status,day_of_week_of_death,year,manner_of_death,was_autopsy_done,place_of_injury_for_icd_w00_y34_except_y06_and_y07_,icd_code_10th_revision,358_causes_of_death,113_causes_of_death,number_of_entity_axis_conditions,entity_condition_1,number_of_record_axis_conditions,record_condition_1,ethnicity
0,RESIDENTS,3.0,January,49,"Hospital, clinic or Medical Center",Divorced,Tuesday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",9,11O268,2,O266,Hispanic
1,RESIDENTS,3.0,January,40,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",6,11O995,2,O266,Hispanic
2,RESIDENTS,9.0,April,35,"Hospital, clinic or Medical Center",Married,Friday,2015,Natural,Yes,Not Available,O720,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",3,11O994,1,O720,Hispanic
3,RESIDENTS,6.0,April,41,"Hospital, clinic or Medical Center",Married,Wednesday,2015,Natural,No,Not Available,O721,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",3,11O721,1,O721,Hispanic
4,INTRASTATE NONRESIDENTS,3.0,June,28,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O038,Spontaneous abortion,Pregnancy with abortive outcome,8,11O996,1,O038,Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7130,RESIDENTS,4.0,October,33,"Hospital, clinic or Medical Center",Married,Tuesday,2009,Not Available,No,Not Available,Combination of conditions classifiable to O99....,Indirect obstetric deaths,"Other complications of pregnancy, childbirth a...",1,11O998,1,O998,Black
7131,RESIDENTS,4.0,October,27,"Hospital, clinic or Medical Center",Married,Thursday,2009,Natural,No,Not Available,unspecified pre-eclampsia,Eclampsia and pre-eclampsia,"Other complications of pregnancy, childbirth a...",2,11O149,1,O149,Black
7132,RESIDENTS,6.0,December,25,"Hospital, clinic or Medical Center","Never married, Single",Wednesday,2009,Not Available,No,Not Available,"Exhaustion and fatigue, Peripheral neuritis, o...",All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",2,11O268,1,O268,Black
7133,RESIDENTS,3.0,December,40,"Hospital, clinic or Medical Center","Never married, Single",Saturday,2009,Natural,Yes,Not Available,O96,"Other deaths related to pregnancy, childbirth ...","Other complications of pregnancy, childbirth a...",2,11O96,1,O96,Black


In [7]:
#Recode values to integer: LinearRegression does not take strings

def recode_to_int(column_name):
    df[column_name] = df[column_name].astype('category')
    df[column_name] = df[column_name].cat.codes

for column in df.columns:
    recode_to_int(column)

In [8]:
df

Unnamed: 0,resident_status,level_of_education,month_of_death,age,place_of_death_patient_status,marital_status,day_of_week_of_death,year,manner_of_death,was_autopsy_done,place_of_injury_for_icd_w00_y34_except_y06_and_y07_,icd_code_10th_revision,358_causes_of_death,113_causes_of_death,number_of_entity_axis_conditions,entity_condition_1,number_of_record_axis_conditions,record_condition_1,ethnicity
0,3,1,4,38,2,0,5,6,3,0,1,66,0,0,8,76,1,64,1
1,3,1,4,29,2,2,3,6,3,0,1,66,0,0,5,146,1,64,1
2,3,7,0,24,2,2,0,6,3,2,1,104,3,0,2,145,0,103,1
3,3,4,0,30,2,2,6,6,3,0,1,105,3,0,2,100,0,104,1
4,2,1,6,17,2,2,3,6,3,0,1,21,12,1,7,147,0,13,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7130,3,2,10,22,2,2,5,0,4,0,1,0,4,0,0,149,0,157,0
7131,3,2,10,16,2,2,4,0,3,0,1,157,1,0,1,57,0,43,0
7132,3,4,2,14,2,3,6,0,4,0,1,6,0,0,1,76,0,65,0
7133,3,1,2,29,2,3,2,0,3,2,1,136,11,0,1,127,0,135,0


In [16]:
df['358_causes_of_death'].unique()

array([ 0,  3, 12, 11,  2,  7,  6,  1,  4, 10,  9,  8,  5], dtype=int8)

In [26]:
df_mlr = df[['ethnicity', 'marital_status', '358_causes_of_death']]
df_mlr

Unnamed: 0,ethnicity,marital_status,358_causes_of_death
0,1,0,0
1,1,2,0
2,1,2,3
3,1,2,3
4,1,2,12
...,...,...,...
7130,0,2,4
7131,0,2,1
7132,0,3,0
7133,0,3,11


In [27]:
#Create training and test datasets

X = df_mlr.drop(['marital_status', 'ethnicity'], axis=1) 
y = df_mlr['marital_status']

print(list(X.columns.values)) 

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.20, random_state = 5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['358_causes_of_death']
(5708, 1)
(1427, 1)
(5708,)
(1427,)


In [28]:
model1 = LogisticRegression(
    random_state=0,
    multi_class='multinomial',
    penalty='none',
    solver='newton-cg'
).fit(X_train, y_train)
preds = model1.predict(X_test)

#print the tunable parameters (They were not tuned in, everything kept as default)
params = model1.get_params()
print(params)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [29]:
#Print model parameters
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [ 0.28628941 -1.84747825  1.64621018  1.56248096 -1.6475023 ]
Coefficients: 
 [[-0.00894642]
 [-0.01622549]
 [ 0.01895773]
 [ 0.01913111]
 [-0.01291694]]


In [30]:
np.exp(model1.coef_)

array([[0.99109348],
       [0.98390543],
       [1.01913857],
       [1.01931528],
       [0.98716613]])

In [31]:
#Use statsmodels to assess variables

logit_model=sm.MNLogit(y_train,sm.add_constant(X_train))
logit_model
result=logit_model.fit()
stats1=result.summary()
stats2=result.summary2()
print(stats1)
print(stats2)

Optimization terminated successfully.
         Current function value: 1.069263
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:         marital_status   No. Observations:                 5708
Model:                        MNLogit   Df Residuals:                     5700
Method:                           MLE   Df Model:                            4
Date:                Sat, 31 Jul 2021   Pseudo R-squ.:               0.0008285
Time:                        18:42:53   Log-Likelihood:                -6103.4
converged:                       True   LL-Null:                       -6108.4
Covariance Type:            nonrobust   LLR p-value:                   0.03842
   marital_status=1       coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -2.1338      0.183    -11.655      0.000      -2.493      -1.775
35