Applying a multinomial logistic regression model to a three-class bleeding outcome (major, minor, or no bleed)

Import necessary libraries 

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Sanity check: this line is only reached if every import above succeeded.
print("complete")

complete


Read dataset

In [2]:
# Location of the de-identified ibrutinib dataset.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs on a machine where this exact file exists.
DATA_PATH = "/Users/anthonyquint/Desktop/LHSC_Work_Folder/Mina/Bleeding_study/Ibrutinib Data Set, July 13,2021, de-identified data.csv"

df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head()

Unnamed: 0,Age at diagnosis,gender,Diagnosis year,Plt at diagnosis,plt at start of ibrutinib,plt at the time of bleed,Plt Nadir while on Ibrutinib,Platelets < 50 (Y/N),hb at diognosis,hb at start of Ibrutinib,...,action?,post op bleed? /action,INR,past medical history,PMHx bleeding risk (Y/N),Ibrutinib Dose,Comments,other ibrutinib SE,Unnamed: 58,Unnamed: 59
0,48,f,2006,260,15,,15,Y,130,71,...,,,,"deppression, schwanoma of leg",Y,"420mg,",ITP at the time of starting ibrutinib,,,
1,66,m,2017,175,83,155.0,93,N,145,93,...,,,,"cryoglobinemia,MGUS,CAD,HTN,COPD",N,420mg,,,,
2,74,F,2016,189,200,,189,N,116,87,...,,,,"dm2,htn,",N,"420mg,",reaction to first obino so switched to ibrutin...,,,
3,53,F,2002,237,67,,40,Y,135,118,...,,,,"HTN,B12 def,IDA",N,420mg,WAIHA,easy bruising,,
4,60,m,1999,198,85,70.0,49,Y,154,104,...,died,,1.1,"prostitis,mycosis,chronc sinusitis",N,ibrutinib dose reduced to 140 in oct 2015 for ...,"cutaneous oral mucosal involvement w CLL, als...",,,


Cleaning the dataset

In [3]:
## Keep only the columns corresponding to our relevant independent variables
## (indicated at top of notebook) and the dependent variable (Major Bleed (Y/N)).
## .copy() makes the selection an independent frame, so the assignment below
## modifies this frame itself rather than a temporary view of the raw data.
df = df[["Plt Nadir while on Ibrutinib","Anemia (hb < 100) (Y/N)","anticoagulation (Y/N)","Major Bleed (Y/N)"]].copy()

# Recode NaN values in "Major Bleed (Y/N)" as 'M', which gives three classes:
# Y = major bleed, N = minor bleed, M = no bleed.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['...']: that chained form operates on an
# intermediate object, triggers chained-assignment warnings, and leaves df
# unchanged when pandas copy-on-write is active.
df['Major Bleed (Y/N)'] = df['Major Bleed (Y/N)'].fillna('M')

# Delete index 112 since it has a question mark for age 
#df = df.drop(labels=112, axis=0)

df.head()

Unnamed: 0,Plt Nadir while on Ibrutinib,Anemia (hb < 100) (Y/N),anticoagulation (Y/N),Major Bleed (Y/N)
0,15,Y,N,M
1,93,Y,N,N
2,189,Y,N,M
3,40,N,N,M
4,49,N,N,Y


Counting number of people who had major vs. minor vs. no bleed

In [4]:
# Class sizes after the recoding above; dropna=False gives any NaN that is
# still present its own row in the counts.
df['Major Bleed (Y/N)'].value_counts(dropna=False) 

M    128
N     25
Y     17
Name: Major Bleed (Y/N), dtype: int64

Converting categorical data into numerical representation

In [5]:
# Replace the letter codes in each categorical column with integer codes.
# LabelEncoder numbers the sorted distinct strings, so for the outcome column
# 'M' < 'N' < 'Y' becomes 0 = no bleed, 1 = minor bleed, 2 = major bleed.
number = LabelEncoder()

for column in ('Anemia (hb < 100) (Y/N)', 'anticoagulation (Y/N)', 'Major Bleed (Y/N)'):
    # astype('str') turns a missing value into the literal string 'nan', which
    # would then be encoded as a category of its own.
    df[column] = number.fit_transform(df[column].astype('str'))

df.head()

Unnamed: 0,Plt Nadir while on Ibrutinib,Anemia (hb < 100) (Y/N),anticoagulation (Y/N),Major Bleed (Y/N)
0,15,1,0,0
1,93,1,0,1
2,189,1,0,0
3,40,0,0,0
4,49,0,0,2


Splitting the data into independent and dependent variables, then into training and testing sets, and then upsampling the minority classes in the training set

In [6]:
clinical_features = [
    "Plt Nadir while on Ibrutinib",
    "Anemia (hb < 100) (Y/N)",
    "anticoagulation (Y/N)",
]

X = df[clinical_features]    # independent variables (predictors)
y = df['Major Bleed (Y/N)']  # dependent variable (encoded bleed class)

# Hold out 25% of the rows for testing. stratify=y keeps the class proportions
# the same in both partitions, and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

# Training predictors and target side by side, as input for row-wise upsampling.
X_Y_train = pd.concat([X_train, y_train], axis=1)

def upsample_minority(df, target='Major Bleed (Y/N)', majority_label=0, random_state=123):
    """Balance the classes by resampling every non-majority class with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictor columns together with the ``target`` column.
    target : str, default 'Major Bleed (Y/N)'
        Name of the class-label column.
    majority_label : default 0
        Label of the class whose row count every other class is brought up to.
    random_state : int, default 123
        Seed handed to ``resample`` for each class, so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The majority rows unchanged, followed, for each other label in sorted
        order, by ``len(majority)`` rows drawn with replacement from that class.
        Rows whose label is missing (NaN) are not included.

    Raises
    ------
    ValueError
        If no row of ``df`` carries ``majority_label``.
    """
    is_majority = df[target] == majority_label
    df_majority = df.loc[is_majority]
    if df_majority.empty:
        raise ValueError(
            "no rows with {!r} == {!r}; nothing to upsample to".format(target, majority_label)
        )

    pieces = [df_majority]
    # Only labels that actually occur are resampled, so a class that is absent
    # from this frame is skipped instead of making resample() fail on an empty
    # selection.
    minority_labels = sorted(df.loc[~is_majority, target].dropna().unique())
    for label in minority_labels:
        df_minority = df.loc[df[target] == label]
        pieces.append(
            resample(
                df_minority,
                replace=True,
                n_samples=len(df_majority),
                random_state=random_state,
            )
        )

    return pd.concat(pieces)

def smote(train_1, train_2, random_state=27):
    """Oversample the minority classes with synthetic samples generated by SMOTE.

    Parameters
    ----------
    train_1
        Predictor values, passed to ``SMOTE.fit_resample`` as the feature data.
    train_2
        Class labels that belong to ``train_1``.
    random_state : int, default 27
        Seed for the SMOTE sampler, so the synthetic samples are repeatable.

    Returns
    -------
    tuple
        ``(features, labels)`` as produced by ``SMOTE.fit_resample``.
    """
    # Named `oversampler` rather than `sm`, so the statsmodels alias `sm`
    # imported at the top of the notebook is not shadowed inside this function.
    oversampler = SMOTE(random_state=random_state)
    features, labels = oversampler.fit_resample(train_1, train_2)

    return features, labels

# Option 1 - random upsampling: minority-class rows are repeated (drawn with
# replacement) until every class is as large as the majority class.
# Training partition only:
X_Y_train_upsample = upsample_minority(X_Y_train)
X_train_upsample = X_Y_train_upsample.drop(columns='Major Bleed (Y/N)')
y_train_upsample = X_Y_train_upsample['Major Bleed (Y/N)']

# Whole dataset.
# NOTE(review): these rows include the test partition, so a model fitted on
# them must not be scored on X_test / y_test - confirm how they are used.
df_upsample = upsample_minority(df)
X_upsample = df_upsample.drop(columns='Major Bleed (Y/N)')
y_upsample = df_upsample['Major Bleed (Y/N)']

# Option 2 - SMOTE: minority classes are filled up with synthetic samples.
X_train_smote, y_train_smote = smote(X_train, y_train)
X_smote, y_smote = smote(X, y)  # whole dataset; same caveat as df_upsample

# Choosing which of the 2 sets of resampled training data the model is built
# with (upsampling vs. smote). Upsampling produces better results.
XX, yy = X_train_upsample, y_train_upsample

# Quick check of the balanced class counts:
#X_Y_train_upsample['Major Bleed (Y/N)'].value_counts(dropna=False)


## https://towardsdatascience.com/methods-for-dealing-with-imbalanced-data-5b761be45a18

Removing uncorrelated independent variables (i.e., variables that are uncorrelated with both outcome classes, Major Bleed = 1 and Major Bleed = 2)

Using statsmodels to assess the coefficients

In [7]:
# statsmodels does not add an intercept by itself, so a constant column is
# prepended to the (float-cast) predictors before fitting.
# NOTE(review): XX holds the upsampled training rows, i.e. repeated copies of
# minority-class rows; the standard errors and p-values below treat those
# copies as independent observations - interpret with care.
design_matrix = sm.add_constant(XX.astype(float))

logit_model = sm.MNLogit(yy, design_matrix)
result = logit_model.fit()

stats1 = result.summary()
print(stats1)

Optimization terminated successfully.
         Current function value: 0.920290
         Iterations 6
                          MNLogit Regression Results                          
Dep. Variable:      Major Bleed (Y/N)   No. Observations:                  285
Model:                        MNLogit   Df Residuals:                      277
Method:                           MLE   Df Model:                            6
Date:                Wed, 28 Jul 2021   Pseudo R-squ.:                  0.1623
Time:                        22:55:02   Log-Likelihood:                -262.28
converged:                       True   LL-Null:                       -313.10
Covariance Type:            nonrobust   LLR p-value:                 1.139e-19
         Major Bleed (Y/N)=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           -1.9312      0.440     -4.394      0.000

Implementing the model with LogisticRegression

In [8]:
# Unpenalised multinomial logistic regression, fitted on the upsampled training
# data and then applied to the untouched test predictors.
# NOTE(review): newer scikit-learn releases spell "no penalty" as penalty=None
# and deprecate multi_class - confirm against the installed version.
logreg = LogisticRegression(
    penalty='none',
    multi_class='multinomial',
    solver='newton-cg',
    random_state=0,
)
model1 = logreg.fit(XX, yy)
preds = model1.predict(X_test)

Assessing the model

In [9]:
# Rows are the true classes, columns the predicted classes.
# The label order is passed explicitly so the matrix always has one row/column
# per class, in the same order as the names below - even if a class happens to
# be absent from both y_test and preds.
class_labels = [0, 1, 2]  # encoder codes: 0 = 'M' (no bleed), 1 = 'N' (minor), 2 = 'Y' (major)

# confusion_matrix already returns a NumPy array, so no extra np.array() wrap.
confmtrx = confusion_matrix(y_test, preds, labels=class_labels)
pd.DataFrame(confmtrx, index=['None','Minor', 'Major'],
             columns=['predicted_None', 'predicted_Minor', 'predicted_Major'])

Unnamed: 0,predicted_None,predicted_Minor,predicted_Major
,17,5,11
Minor,0,2,4
Major,1,2,1


Accuracy score

In [10]:
# Share of test rows whose predicted class equals the true class.
accuracy = metrics.accuracy_score(y_test, preds)
print('Accuracy Score:', accuracy)

Accuracy Score: 0.46511627906976744


Classification report

In [11]:
# Per-class precision / recall / F1 on the held-out test set.
# labels + target_names make the report rows read None / Minor / Major (the
# same naming as the confusion matrix) instead of the opaque codes 0 / 1 / 2.
class_report = classification_report(
    y_test,
    preds,
    labels=[0, 1, 2],
    target_names=['None', 'Minor', 'Major'],
)
print(class_report)

              precision    recall  f1-score   support

           0       0.94      0.52      0.67        33
           1       0.22      0.33      0.27         6
           2       0.06      0.25      0.10         4

    accuracy                           0.47        43
   macro avg       0.41      0.37      0.34        43
weighted avg       0.76      0.47      0.56        43

