# CS737 Final Project
Author: Anthony Lipphardt

Date: April 23, 2018

In [9]:
import pandas as pd
import numpy as np
from time import time

In [108]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

## Import and Balance Dataset

In [2]:
literal_text = pd.read_csv('literal-text.csv', index_col=0)
literal_text.head()

Unnamed: 0,DMI,literal text
0,0,GASTRIC CARCINOMA
1,0,SMALL CELL CARCINOMA PROSTATE METASTATIC PROST...
2,0,RESPIRATORY FAILURE METASTATIC ANAL SQUAMOUS C...
3,0,ASPIRATION PNEUMONIA ALZHEIMER DEMENTIA
4,0,PULMONARY EDEMA END STAGE RENAL FAILURE STOPPE...


In [24]:
# Create random undersampling of over represented class and create final balanced dataset

undersample = literal_text[literal_text['DMI'] == 0].sample(n=len(literal_text[literal_text['DMI'] == 1]))
DMIrows = literal_text[literal_text['DMI'] == 1]

final = pd.concat([undersample, DMIrows])
final.groupby('DMI').count()

Unnamed: 0_level_0,literal text
DMI,Unnamed: 1_level_1
0,2495
1,2495


# Find Optimal Values for LSA
Using optimal parameters for CountVectorizer, find values for n_components hyperparameter that retains 85, 90, and 95 percent of explained variance in the dataset.

In [50]:
# Convert data using CountVectorizer with optimal parameters and run dimensionality reduction

cv = CountVectorizer(lowercase=False, binary=True, min_df=3, ngram_range=(1, 2))
tf = cv.fit_transform(final['literal text'])
print("Number of words:",len(cv.get_feature_names()))

Number of words: 3368


In [72]:
# Find values for n_components that find 85%, 90%, and 95% variance

for i in (525, 775, 1180):
    svd = TruncatedSVD(n_components=i)
    svd.fit(tf)
    print(np.sum(svd.explained_variance_ratio_))

0.850446039784
0.902411055327
0.950075847035


## Setup Train and Test Data
Split final balanced dataset into target and test sets using a 75:25 split. Data and targets will be separated.

In [93]:
# Split data into training and testing
train, test, traint, testt = train_test_split(final['literal text'], final['DMI'], test_size=0.25)

## Configure Pipelines and Parameters for Grid Search

In [88]:
# Create pipelines for Naive Bayes and SVM workflows

NB_pipeline = Pipeline([
    ('NBvect', CountVectorizer(lowercase=False,binary=True)),
    ('NBclf', BernoulliNB(binarize=None))
])

SVM1_pipeline = Pipeline([
    ('SVMvect', CountVectorizer(lowercase=False,binary=True)),
    ('SVMclf', SVC(kernel='linear'))
])

SVM2_pipeline = Pipeline([
    ('SVMvect', CountVectorizer(lowercase=False,binary=True)),
    ('SVMdim', TruncatedSVD()),
    ('SVMclf', SVC(kernel='linear'))
])

In [135]:
# Create parameter grids for Naive Bayes and SVM workflows

NB_parameters = {
    
    'NBvect__min_df': (3,5),
    'NBvect__ngram_range': ((1,1),(1,2)),
    
    'NBclf__alpha': (0, 0.1, 0.5, 1)
    
}


SVM1_parameters = {
    
    'SVMvect__min_df': (3,5),
    'SVMvect__ngram_range': ((1,1),(1,2)),
  
    'SVMclf__C': (1, 10, 100, 1000)
    
}

SVM2_parameters = {
    
    'SVMvect__min_df': (3,),    
    'SVMvect__ngram_range': ((1,2),),    

    'SVMdim__n_components': (525, 775, 1180),

    'SVMclf__C': (1,)
    
}


In [126]:
def runTests(data, targets, pipeline, parameters):

    """ Perform grid search with specified pipeline and parameters
        on data training set with targets as labels
        
        Evaluate performance based on precision and print parameters
        for best estimator
        
        grid search object is returned for further analysis"""

    grid_search = GridSearchCV(pipeline, parameters, verbose=1, cv=10, scoring='precision')

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    print(parameters)
    t0 = time()
    grid_search.fit(data, targets)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
        
    return grid_search

## Run Grid Search for Each Pipeline

In [136]:
# Run grid search for Naive Bayes
NB_grid_search = runTests(train, traint, NB_pipeline, NB_parameters)

Performing grid search...
pipeline: ['NBvect', 'NBclf']
parameters:
{'NBvect__min_df': (3, 5), 'NBvect__ngram_range': ((1, 1), (1, 2)), 'NBclf__alpha': (0, 0.1, 0.5, 1)}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' 

  'setting alpha = %.1e' % _ALPHA_MIN)


done in 30.803s

Best score: 0.987
Best parameters set:
	NBclf__alpha: 1
	NBvect__min_df: 3
	NBvect__ngram_range: (1, 1)


[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   30.7s finished


In [128]:
# Run grid search for SVM without dimensionality reduction
SVM1_grid_search = runTests(train, traint, SVM1_pipeline, SVM1_parameters)

Performing grid search...
pipeline: ['SVMvect', 'SVMclf']
parameters:
{'SVMvect__min_df': (3, 5), 'SVMvect__ngram_range': ((1, 1), (1, 2)), 'SVMclf__C': (1, 10, 100, 1000)}
Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  1.1min finished


done in 64.498s

Best score: 0.994
Best parameters set:
	SVMclf__C: 1
	SVMvect__min_df: 3
	SVMvect__ngram_range: (1, 2)


In [131]:
# Run grid search for SVM with dimensionality reduction
SVM2_grid_search = runTests(train, traint, SVM2_pipeline, SVM2_parameters)

Performing grid search...
pipeline: ['SVMvect', 'SVMdim', 'SVMclf']
parameters:
{'SVMvect__min_df': (3,), 'SVMvect__ngram_range': ((1, 2),), 'SVMdim__n_components': (525, 775, 1180), 'SVMclf__C': (1,)}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  3.5min finished


done in 217.647s

Best score: 0.994
Best parameters set:
	SVMclf__C: 1
	SVMdim__n_components: 1180
	SVMvect__min_df: 3
	SVMvect__ngram_range: (1, 2)


# Examine and Export Grid Search Results

In [137]:
# Gather results from grid search
NB_results = pd.DataFrame({'params': NB_grid_search.cv_results_['params'], 'Classifier': 'Naive Bayes', 'mean_test_score': NB_grid_search.cv_results_['mean_test_score']})
SVM1_results = pd.DataFrame({'params': SVM1_grid_search.cv_results_['params'], 'Classifier': 'SVM w/o Reduction', 'mean_test_score': SVM1_grid_search.cv_results_['mean_test_score']}) 
SVM2_results = pd.DataFrame({'params': SVM2_grid_search.cv_results_['params'], 'Classifier': 'SVM with Reduction', 'mean_test_score': SVM2_grid_search.cv_results_['mean_test_score']})
grid_search_results = pd.concat([NB_results, SVM1_results, SVM2_results], ignore_index=True)
grid_search_results

Unnamed: 0,Classifier,mean_test_score,params
0,Naive Bayes,0.972749,"{'NBclf__alpha': 0, 'NBvect__min_df': 3, 'NBve..."
1,Naive Bayes,0.975476,"{'NBclf__alpha': 0, 'NBvect__min_df': 3, 'NBve..."
2,Naive Bayes,0.979112,"{'NBclf__alpha': 0, 'NBvect__min_df': 5, 'NBve..."
3,Naive Bayes,0.980173,"{'NBclf__alpha': 0, 'NBvect__min_df': 5, 'NBve..."
4,Naive Bayes,0.983301,"{'NBclf__alpha': 0.1, 'NBvect__min_df': 3, 'NB..."
5,Naive Bayes,0.984848,"{'NBclf__alpha': 0.1, 'NBvect__min_df': 3, 'NB..."
6,Naive Bayes,0.983265,"{'NBclf__alpha': 0.1, 'NBvect__min_df': 5, 'NB..."
7,Naive Bayes,0.984323,"{'NBclf__alpha': 0.1, 'NBvect__min_df': 5, 'NB..."
8,Naive Bayes,0.985368,"{'NBclf__alpha': 0.5, 'NBvect__min_df': 3, 'NB..."
9,Naive Bayes,0.98589,"{'NBclf__alpha': 0.5, 'NBvect__min_df': 3, 'NB..."


In [146]:
print("Naive Bayes")
print("  Average fit time:",np.mean(NB_grid_search.cv_results_['mean_fit_time']))
print("  Average score time:",np.mean(NB_grid_search.cv_results_['mean_score_time']))

Average fit time: 0.0987403169274
Average score time: 0.0104517951608


In [147]:
print("Support Vector Machine (w/o LSA)")
print("  Average fit time:",np.mean(SVM1_grid_search.cv_results_['mean_fit_time']))
print("  Average score time:",np.mean(SVM1_grid_search.cv_results_['mean_score_time']))

Support Vector Machine (w/o LSA)
  Average fit time: 0.217600390315
  Average score time: 0.0190324887633


In [148]:
print("Support Vector Machine (w/ LSA)")
print("  Average fit time:",np.mean(SVM2_grid_search.cv_results_['mean_fit_time']))
print("  Average score time:",np.mean(SVM2_grid_search.cv_results_['mean_score_time']))

Support Vector Machine (w/ LSA)
  Average fit time: 5.67686701616
  Average score time: 0.132110969226


In [139]:
# Export grid search results to CSV
grid_search_results.to_csv(path_or_buf='grid-search-results.csv',sep=',')

## Run Optimal Classifier Against Test Data
Fit training data to optimal classifier, transform test data, and obtain predictions.

Classification report and confusion matrix will be computed. Focus is on scoring for precision and specificity of drug mention with involvement (DMI) death.

In [166]:
# Fit to target data using optimal parameters in grid search and run on test data
cv = CountVectorizer(lowercase=False,binary=True, min_df=3, ngram_range=(1,2))
tf = cv.fit_transform(train)

svd = TruncatedSVD(n_components=1180)
tf_svd = svd.fit_transform(tf)

clf = SVC(kernel='linear', C=1)
clf.fit(tf_svd, traint)

predicted = clf.predict(svd.transform(cv.transform(test)))

precision = precision_score(testt, predicted, average=None)

print(classification_report(testt, predicted, target_names=['Non-DMI','DMI']))

print("\nConfusion Matrix:")
print(confusion_matrix(testt, predicted))

             precision    recall  f1-score   support

    Non-DMI       0.98      0.99      0.98       621
        DMI       0.99      0.98      0.98       627

avg / total       0.98      0.98      0.98      1248


Confusion Matrix:
[[613   8]
 [ 15 612]]


## Examine Misclassified Records

In [164]:
print("RECORDS MISCLASSIFIED AS DMI")
print("====================================\n")
for record in test[(predicted == 1) & (testt == 0)]:
    print(record,"\n")

RECORDS MISCLASSIFIED AS DMI

APPARENT SUDDEN DEATH DUE DIFLUOROETHANE INHALATION HUFFING DECEASED FOUND DEAD BEDROOM FLOOR MULTIPLE CANS COMPRESSED AIR FOUND NEAR BODY 

CERVICAL SPINE FRACTURE BLUNT IMPACT HEAD FALLEN HITTING HEAD BATHROOM WALL ACUTE ETHANOL INTOXICATION 

ASPHYXIA SUFFOCATION PLASTIC BAG INTENTIONALLY PLACED PLASTIC BAG HEAD SECURED KNOT 

ACUTE RESPIRATORY FAILURE ACUTE CHRONIC AORTIC ILIAC THROMBUS ISCHEMIC FOOT BILATERAL METASTASIS BONE LIVER GENRALIZED WEAKNESS DEHYDRATION SUSPECTED PORTAL VEIN THROMBOSIS METABOLIC ACIDOSIS HYPONATREMIA SYSTEMIC INFLAMMATORY RESPONSE SYNDROME TRANSAMINITIS MODERATE PROTEIN CALORIE MALNUTRITION 

ANOXIC ENCEPHALOPATHY ACUTE ETHANOL INTOXICATION INGESTION LARGE QUANTITIES ALCOHOL BULLOUS EMPHYSEMA CHRONIC ETHANOLISM 

SELF INFLICTED HANDGUN GUNSHOT WOUND HEAD DECEDENT TOOK LIFE FIRING MM HANDGUN BULLET HEAD SUICIDE NOTE RECOVERED SUICIDAL IDEATION PRIOR ATTEMPTS REPORTED DEPRESSION ALCOHOLISM 

ASPHYXIA DUE PLASTIC BAG HEAD FRESHW

In [163]:
print("RECORDS MISCLASSIFIED AS NON-DMI")
print("====================================\n")
for record in test[(predicted == 0) & (testt == 1)]:
    print(record,"\n")

RECORDS MISCLASSIFIED AS NON-DMI

PULMONARY HEMORRHAGE ANTICOAGULATION COUMADIN THERAPY ATRIAL FIBRILLATION LEUKEMIA CONGESTIVE HEART FAILURE 

DROWNING DECEDENT DROWN JACUZZI TUB ELEVATED GABAPENTIN BLOOD CONCENTRATION 

RESPIRATORY ARREST SEPSIS DUE ASPIRATION PNEUMONITIS PNEUMONIA SEVERE CHRONIC OBSTRUCTIVE PULMONARY DISEASE ACUTE CHRONIC HYPOXEMIC HYPERCAPNIC RESPIRATORY FAILURE ENCEPHALOPATHY SECONDARY TOXIC MEDICATIONS HISTORY SEVERE PROTEIN CALORIE MALNUTRITION 

ANATOMICAL CAUSE DEATH UNKNOWN EXTENT DIPHENHYDRAMINE CONTRIBUTED DEATH SUPRA THERAPEUTIC DIPHENHYDRAMINE LEVEL 

ACUTE RENAL FAILURE PROBABLY MEDICATION NEPHROTOXICITY DEMENTIA H PYLORI INFECTION 

ASPHYXIA FRESHWATER DROWING FOLLOWING INJECTED HEROIN DEMTHAMPHETAMINE USAGE DISCOVERED SUBMERGED BATHTUB WITHOUT INFLICTED INJURIES SUBSTANCE ABUSE MANY YEARS 

MASSIVE HEMORRHAGIC SHOCK AORTOENTERIC FISTULA ESOPHAGEAL ULCER BOTOX INJECTION ESOPHAGUS MEDICAL PROCEDURE HYPERTENSION ESOPHAGEAL SPASM 

HEMORRHAGIC STROKE ANTIC