# Analysis of Non-DoD_DCR file

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the DCR dataset and show the names of all of its columns
df = pd.read_csv("data/DCR.csv")
list(df.columns)

['Patient',
 'INR (hospital)',
 'PT (ISR) (s)',
 'Died of wounds (yes 1, no 0)',
 'Fg (mg/dl)',
 'PTT (ISR) (s)',
 'PTT (hospital) (s)',
 'D-Dimer (ug/ml)',
 'pH (ISR)',
 'pH (hospital)',
 'HCO3 (ISR) (mM)',
 'HCO3 (hospital) (mM)',
 'Lactate (ISR) (mM)',
 'Lactate (hospital) (mM)',
 'CAT Lagtime (min)',
 'CAT ETP (nM*min)',
 'CAT Peak (nM)',
 'CAT ttPeak (min)',
 'CT (s)',
 'MCF (mm)',
 'alpha (degrees)',
 'LI30 (%)',
 'LI45 (%)',
 'LI60 (%)',
 'Total protein (ug/ml)',
 'Fibrin degradation products (ng/ml)',
 'Crystalloid (mL given pre-hospital)',
 'Prelab time (estimated) (min)',
 'Sodium',
 'Potassium',
 'BUN',
 'Creatine',
 'Mg',
 'Ica']

In [3]:
# Keep the full list of column names, then check whether any cell in the table is missing
all_cols = list(df.columns)
df.isnull().any().any()

True

In [18]:
# Subset of 28 columns: 27 candidate predictors followed by the outcome column.
# All "(hospital)" measurements and 'Ica' are left out.
# The outcome must stay LAST: later cells use cols_isr[:-1] as the predictor list.
cols_isr = [
    'Patient',
    # coagulation / chemistry measurements
    'PT (ISR) (s)',
    'Fg (mg/dl)',
    'PTT (ISR) (s)',
    'D-Dimer (ug/ml)',
    'pH (ISR)',
    'HCO3 (ISR) (mM)',
    'Lactate (ISR) (mM)',
    # CAT measurements
    'CAT Lagtime (min)',
    'CAT ETP (nM*min)',
    'CAT Peak (nM)',
    'CAT ttPeak (min)',
    # clot measurements
    'CT (s)',
    'MCF (mm)',
    'alpha (degrees)',
    'LI30 (%)',
    'LI45 (%)',
    'LI60 (%)',
    # remaining fields
    'Total protein (ug/ml)',
    'Fibrin degradation products (ng/ml)',
    'Crystalloid (mL given pre-hospital)',
    'Prelab time (estimated) (min)',
    'Sodium',
    'Potassium',
    'BUN',
    'Creatine',
    'Mg',
    # outcome
    'Died of wounds (yes 1, no 0)',
]
# Shape of the selected subset (rows, columns)
df[cols_isr].shape

(67, 28)

## Analysis on ISR data and imputing with -1

In [5]:
# Impute: replace every missing value in the selected columns with the sentinel -1,
# then confirm that nothing is left missing (expected result: False)
df_isr = df.loc[:, cols_isr].fillna(value=-1)
df_isr.isnull().any().any()

False

In [19]:
# Shape after imputation: 67 records x 28 columns
# (27 candidate predictors plus the 'Died of wounds' outcome column).
df_isr.shape

(67, 28)

In [7]:
# Records of alive (0) vs dead (1) -- shows how imbalanced the outcome is.
# print(...) with a single argument produces the same output on Python 2 and 3.
print(df_isr['Died of wounds (yes 1, no 0)'].value_counts())

0    60
1     7
Name: Died of wounds (yes 1, no 0), dtype: int64


In [8]:
# Preparing X and y: every entry of cols_isr except the last one (the outcome) is a predictor.
# NOTE(review): X still contains 'Patient', which looks like a record identifier rather
# than a measurement -- confirm whether it should really be used as a predictor.
X = df_isr[cols_isr[:-1]]
y = df_isr['Died of wounds (yes 1, no 0)']
# Single formatted argument so the line runs (and prints the same) on Python 2 and 3.
print("%s %s" % (X.shape, y.shape))

(67, 27) (67,)


In [9]:
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

In [10]:
# RandomForestClassifier
def RFC(X, y):
    """Cross-validate a small random forest on (X, y) and print its performance.

    Prints the per-fold accuracies, their mean and, through model_perf, the
    confusion matrix and per-class precision/recall of the out-of-fold
    predictions. model_perf must be defined before this function is called.

    Parameters
    ----------
    X : feature table accepted by scikit-learn estimators.
    y : class labels aligned with the rows of X.

    Returns
    -------
    The configured RandomForestClassifier. It is not fitted by this function:
    the cross-validation helpers train clones of it, so call .fit() on the
    returned model before reading fitted attributes.
    """
    n_est = 10
    max_f = "sqrt" #float, int or log2
    max_d = 8
    random_state = 42
    # Pinned explicitly: the notebook's results assume 3 folds (train on 2/3,
    # test on 1/3), but scikit-learn's default changed from 3 to 5 in 0.22.
    n_folds = 3
    model = rfc(n_estimators=n_est, max_features=max_f, max_depth=max_d,
                oob_score=True, random_state=random_state)
    scores = cross_val_score(model, X, y, cv=n_folds)
    y_pred = cross_val_predict(model, X, y, cv=n_folds)
    # Single formatted argument per print: valid, with identical output, on Python 2 and 3.
    print("Cross validation accuracy:  %s" % (scores,))
    print("Overall accuracy:  %s" % (scores.mean(),))
    model_perf(y, y_pred)
    return model

In [11]:
def model_perf(y, y_pred):
    """Print the confusion matrix and the per-class precision and recall.

    Parameters
    ----------
    y : true labels, using 0 for alive and 1 for dead.
    y_pred : predicted labels, same encoding and length as y.

    Returns
    -------
    None. The results are only printed.
    """
    # Pin the label order so that index 0 is always "alive" and index 1 is always
    # "dead", even if one of the classes does not occur in y / y_pred.
    labels = [0, 1]
    # Single formatted argument per print: valid, with identical output, on Python 2 and 3.
    print("Confusion matrix: \n%s" % (confusion_matrix(y, y_pred, labels=labels),))
    # fscore and support are computed by the helper but not reported here.
    precision, recall, _, _ = precision_recall_fscore_support(y, y_pred, labels=labels)
    print("precision, recall value (alive):  %s %s" % (precision[0], recall[0]))
    print("precision, recall value (dead):  %s %s" % (precision[1], recall[1]))

In [12]:
# Cross-validated accuracy: the output below has three fold scores, i.e. each run trains on
# about 2/3 of the records and tests on the remaining 1/3, repeated three times.
# NOTE(review): this relies on the number of folds used inside RFC -- confirm it is 3.
model = RFC(X,y)


Cross validation accuracy:  [0.86956522 0.86363636 0.95454545]
Overall accuracy:  0.8959156785243741
Confusion matrix: 
[[58  2]
 [ 5  2]]
precision, recall value (alive):  0.9206349206349206 0.9666666666666667
precision, recall value (dead):  0.5 0.2857142857142857


In [21]:
# Feature importance, sorted in descending order of importance.
# Note: this fits the model on all of X and y (no held-out data).
fi = model.fit(X, y).feature_importances_
# Take the labels from the columns of the X that was actually fitted, so names and
# importances cannot get out of step if the predictor list changes.
feat_imp = pd.DataFrame({'feature': list(X.columns), 'importance': fi},
                        columns=['feature', 'importance'])
# print(...) with a single argument produces the same output on Python 2 and 3.
print(feat_imp.sort_values(by='importance', ascending=False))

                                feature  importance
1                          PT (ISR) (s)    0.135285
5                              pH (ISR)    0.104257
4                       D-Dimer (ug/ml)    0.098946
0                               Patient    0.090922
12                               CT (s)    0.068926
16                             LI45 (%)    0.063393
20  Crystalloid (mL given pre-hospital)    0.057643
17                             LI60 (%)    0.048838
25                             Creatine    0.040179
23                            Potassium    0.035392
8                     CAT Lagtime (min)    0.034445
15                             LI30 (%)    0.032377
2                            Fg (mg/dl)    0.031382
9                      CAT ETP (nM*min)    0.029723
21        Prelab time (estimated) (min)    0.024187
6                       HCO3 (ISR) (mM)    0.023929
11                     CAT ttPeak (min)    0.022841
7                    Lactate (ISR) (mM)    0.013181
14          

## Analysis after dropping records with any missing value in the ISR columns


In [22]:
# Alternative to imputation: discard every record that has at least one missing value
# in the selected columns, then confirm nothing is left missing (expected result: False)
df_isr_drop_null = df.loc[:, cols_isr].dropna(axis=0, how='any')
df_isr_drop_null.isnull().any().any()

False

In [24]:
# Number of complete records (rows) and columns left after dropping incomplete records
df_isr_drop_null.shape

(33, 28)

In [25]:
# Records of alive (0) vs dead (1) among the complete records.
# print(...) with a single argument produces the same output on Python 2 and 3.
print(df_isr_drop_null['Died of wounds (yes 1, no 0)'].value_counts())

0    33
Name: Died of wounds (yes 1, no 0), dtype: int64


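Dropping every record with missing data removes all of the records for patients who died (only 33 records remain, all of them survivors), so there is nothing left to predict and this experiment is not pursued further.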