In [1]:
#load all necessary libraries
import pandas as pd 
import numpy as np 
import scipy as scp
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.metrics import confusion_matrix

import statsmodels.api as sm
import matplotlib.pyplot as plt

In [2]:
pd.set_option("max_columns", 80)

In [3]:
# Read cleaned dataset
df = pd.read_csv('data/interim_data/cdc_data_cleaned_with_ethnicities.csv')
df

Unnamed: 0,resident_status,level_of_education,month_of_death,age,place_of_death_patient_status,marital_status,day_of_week_of_death,year,manner_of_death,was_autopsy_done,place_of_injury_for_icd_w00_y34_except_y06_and_y07_,icd_code_10th_revision,358_causes_of_death,113_causes_of_death,130_causes_of_infant_death,39_cause_recode,number_of_entity_axis_conditions,entity_condition_1,entity_condition_2,entity_condition_3,number_of_record_axis_conditions,record_condition_1,record_condition_2,record_condition_3,race,race_recode_3,race_recode_5,hispanic_origin,hispanic_originrace_recode,ethnicity
0,RESIDENTS,3.0,January,49,"Hospital, clinic or Medical Center",Divorced,Tuesday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",9,11O268,21O988,22O992,2,O266,O268,Not Available,White,White,White,Mexican,Mexican,Hispanic
1,RESIDENTS,3.0,January,40,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O266,All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",6,11O995,21O995,31O266,2,O266,O268,Not Available,White,White,White,Mexican,Mexican,Hispanic
2,RESIDENTS,9.0,April,35,"Hospital, clinic or Medical Center",Married,Friday,2015,Natural,Yes,Not Available,O720,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",3,11O994,21O720,22O991,1,O720,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic
3,RESIDENTS,6.0,April,41,"Hospital, clinic or Medical Center",Married,Wednesday,2015,Natural,No,Not Available,O721,Hemorrhage of pregnancy and childbirth and pla...,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",3,11O721,12O991,21O95,1,O721,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic
4,INTRASTATE NONRESIDENTS,3.0,June,28,"Hospital, clinic or Medical Center",Married,Sunday,2015,Natural,No,Not Available,O038,Spontaneous abortion,Pregnancy with abortive outcome,Not Available,"Pregnancy, childbirth and the puerperium",8,11O996,12O998,13O995,1,O038,Not Available,Not Available,White,White,White,Mexican,Mexican,Hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7130,RESIDENTS,4.0,October,33,"Hospital, clinic or Medical Center",Married,Tuesday,2009,Not Available,No,Not Available,Combination of conditions classifiable to O99....,Indirect obstetric deaths,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",1,11O998,Not Available,Not Available,1,O998,Not Available,Not Available,Black,Black,Black,Non – Hispanic,Non - Hispanic black,Black
7131,RESIDENTS,4.0,October,27,"Hospital, clinic or Medical Center",Married,Thursday,2009,Natural,No,Not Available,unspecified pre-eclampsia,Eclampsia and pre-eclampsia,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",2,11O149,12O759,Not Available,1,O149,Not Available,Not Available,Black,Black,Black,Non – Hispanic,Non - Hispanic black,Black
7132,RESIDENTS,6.0,December,25,"Hospital, clinic or Medical Center","Never married, Single",Wednesday,2009,Not Available,No,Not Available,"Exhaustion and fatigue, Peripheral neuritis, o...",All other direct obstetric causes,"Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",2,11O268,21O995,Not Available,1,O268,Not Available,Not Available,Black,Black,Black,Non – Hispanic,Non - Hispanic black,Black
7133,RESIDENTS,3.0,December,40,"Hospital, clinic or Medical Center","Never married, Single",Saturday,2009,Natural,Yes,Not Available,O96,"Other deaths related to pregnancy, childbirth ...","Other complications of pregnancy, childbirth a...",Not Available,"Pregnancy, childbirth and the puerperium",2,11O96,61O96,Not Available,1,O96,Not Available,Not Available,Black,Black,Black,Non – Hispanic,Non - Hispanic black,Black


In [4]:
# drop columns
df = df.drop([
    '39_cause_recode', 'entity_condition_2', 'entity_condition_3', 'record_condition_2', 'record_condition_3',
    'race', 'race_recode_3', 'race_recode_5', 'hispanic_origin', 'hispanic_originrace_recode',
    '130_causes_of_infant_death'], axis = 1)

In [5]:
df.columns

Index(['resident_status', 'level_of_education', 'month_of_death', 'age',
       'place_of_death_patient_status', 'marital_status',
       'day_of_week_of_death', 'year', 'manner_of_death', 'was_autopsy_done',
       'place_of_injury_for_icd_w00_y34_except_y06_and_y07_',
       'icd_code_10th_revision', '358_causes_of_death', '113_causes_of_death',
       'number_of_entity_axis_conditions', 'entity_condition_1',
       'number_of_record_axis_conditions', 'record_condition_1', 'ethnicity'],
      dtype='object')

In [6]:
df['marital_status'].unique()

array(['Divorced', 'Married', 'Never married, Single',
       'Marital Status unknown', 'Widowed'], dtype=object)

In [7]:
#Recode values to integer: LinearRegression does not take strings

# def recode_to_int(column_name):
#     df[column_name] = df[column_name].astype('category')
#     df[column_name] = df[column_name].cat.codes

# for column in df.columns:
#     recode_to_int(column)

In [8]:
df_mlr = df[['ethnicity', 'marital_status']]
df_mlr

Unnamed: 0,ethnicity,marital_status
0,Hispanic,Divorced
1,Hispanic,Married
2,Hispanic,Married
3,Hispanic,Married
4,Hispanic,Married
...,...,...
7130,Black,Married
7131,Black,Married
7132,Black,"Never married, Single"
7133,Black,"Never married, Single"


In [9]:
# can do the same with eclampsia vs not eclampsia deaths, etc
df_mlr['black'] = df_mlr['ethnicity'].apply(lambda x: 1 if x=="Black" else 0)
df_mlr

df_mlr['single'] = df_mlr['marital_status'].apply(lambda x: 1 if x=="Never married, Single" else 0)
df_mlr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mlr['black'] = df_mlr['ethnicity'].apply(lambda x: 1 if x=="Black" else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mlr['single'] = df_mlr['marital_status'].apply(lambda x: 1 if x=="Never married, Single" else 0)


Unnamed: 0,ethnicity,marital_status,black,single
0,Hispanic,Divorced,0,0
1,Hispanic,Married,0,0
2,Hispanic,Married,0,0
3,Hispanic,Married,0,0
4,Hispanic,Married,0,0
...,...,...,...,...
7130,Black,Married,1,0
7131,Black,Married,1,0
7132,Black,"Never married, Single",1,1
7133,Black,"Never married, Single",1,1


In [10]:
df_mlr = df_mlr.drop(['ethnicity', 'marital_status'], axis = 1)
df_mlr

Unnamed: 0,black,single
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
7130,1,0
7131,1,0
7132,1,1
7133,1,1


In [11]:
#Create training and test datasets
X = df_mlr['black'].to_frame()
X

Unnamed: 0,black
0,0
1,0
2,0
3,0
4,0
...,...
7130,1
7131,1
7132,1
7133,1


In [12]:
# variable to predict
y = df_mlr['single']
y

0       0
1       0
2       0
3       0
4       0
       ..
7130    0
7131    0
7132    1
7133    1
7134    0
Name: single, Length: 7135, dtype: int64

In [13]:
print(list(X.columns.values))

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size = 0.20, random_state = 5)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

['black']
(5708, 1)
(1427, 1)
(5708,)
(1427,)


In [14]:
model1 = LogisticRegression(
    random_state=0,
    # multi_class='multinomial'
    multi_class='ovr',
    penalty='none',
    solver='newton-cg'
).fit(X_train, y_train)

log_odds = model1.coef_[0]

pd.DataFrame(log_odds, 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
black,1.206262


In [15]:
# exponentiating the log odds coefficients
odds = np.exp(model1.coef_[0])
pd.DataFrame(odds, 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
black,3.340972


In [16]:
preds = model1.predict(X_test)

#print the tunable parameters (They were not tuned in, everything kept as default)
params = model1.get_params()
print(params)

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'ovr', 'n_jobs': None, 'penalty': 'none', 'random_state': 0, 'solver': 'newton-cg', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [17]:
#Print model parameters
print('Intercept: \n', model1.intercept_)
print('Coefficients: \n', model1.coef_)

Intercept: 
 [-0.73033194]
Coefficients: 
 [[1.20626178]]


In [18]:
#Calculate odds ratio estimates
np.exp(model1.coef_)

# odds of dying as a single mom if you're black / odds of dying single if not black

array([[3.340972]])

In [19]:
#Use statsmodels to assess variables

logit_model=sm.MNLogit(y_train,sm.add_constant(X_train))
logit_model
result=logit_model.fit()
stats1=result.summary()
stats2=result.summary2()
print(stats1)
print(stats2)

Optimization terminated successfully.
         Current function value: 0.641757
         Iterations 4
                          MNLogit Regression Results                          
Dep. Variable:                 single   No. Observations:                 5708
Model:                        MNLogit   Df Residuals:                     5706
Method:                           MLE   Df Model:                            1
Date:                Wed, 04 Aug 2021   Pseudo R-squ.:                 0.05558
Time:                        23:55:03   Log-Likelihood:                -3663.2
converged:                       True   LL-Null:                       -3878.7
Covariance Type:            nonrobust   LLR p-value:                 9.093e-96
  single=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.7303      0.034    -21.356      0.000      -0.797      -0.663
black          1.2063      0.

In [20]:
# To check accuracy
# Create a confusion matrix
confusion_matrix(y_test, preds)

array([[633, 198],
       [308, 288]])

In [21]:
#transform confusion matrix into array
confmtrx = np.array(confusion_matrix(y_test, preds))

In [22]:
#Accuracy statistics

print('Accuracy Score:', metrics.accuracy_score(y_test, preds))  

#Create classification report
class_report=classification_report(y_test, preds)
print(class_report)

Accuracy Score: 0.6454099509460406
              precision    recall  f1-score   support

           0       0.67      0.76      0.71       831
           1       0.59      0.48      0.53       596

    accuracy                           0.65      1427
   macro avg       0.63      0.62      0.62      1427
weighted avg       0.64      0.65      0.64      1427

