In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, multilabel_confusion_matrix, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.naive_bayes import MultinomialNB

import warnings
# NOTE(review): with no category/module argument this ignores every warning
# raised after this point, so deprecation or convergence warnings from the
# libraries used below will not be shown. Consider narrowing the filter.
warnings.filterwarnings('ignore')

In [8]:
# Read the training CSV and discard its 'Unnamed: 0' column in a single chained
# expression (presumably a row index that was saved with the file - TODO confirm).
train_df = pd.read_csv("input.csv").drop(columns=['Unnamed: 0'])
# NOTE(review): this displays the full frame, including patient_id and note text.
train_df

Unnamed: 0,patient_id,sequence,label
0,Z8399761,"(unclear history), venous insufficiency, PVD, ...",3
1,Z11957588,period. It apparently cuts car accident rates ...,1
2,Z10809276,(P) 5 Hearing: no concerns Vision: needs updat...,2
3,Z8539944,"remission, s/p 1 suicide attempt, 3 IPLOC admi...",3
4,Z12696803,should be pursued at that point in time. He an...,1
5,Z7552077,since the hospital visit. The patient is accom...,2
6,Z11702635,71 y.o. female with has a past medical history...,3
7,Z15548862,in the Memory Diagnostic Clinic at McLean Hosp...,3
8,Z7648831,on back wound with significant output per RN. ...,3
9,Z7701777,lobes bilaterally and left parietal lobe. Thes...,3


In [19]:
# Read the held-out CSV and discard its 'Unnamed: 0' column in a single chained
# expression (presumably a row index that was saved with the file - TODO confirm).
test_df = pd.read_csv("input_test.csv").drop(columns=['Unnamed: 0'])
# NOTE(review): this displays the full frame, including patient_id and note text.
test_df

Unnamed: 0,patient_id,sequence,label
0,Z7766129,(akathisia) diphenhydramine (urinary retention...,2
1,Z12274682,is notable for maternal grandmother with epile...,1
2,Z10490939,Bundle branch block Cataract Nonexudative age-...,3


In [10]:
# Build a default CountVectorizer and learn its vocabulary from the training
# sequences; fit() returns the estimator itself, so the chain binds it to `vect`.
vect = CountVectorizer().fit(train_df["sequence"].tolist())
vect

CountVectorizer()

In [13]:
# vocabulary_ holds (word, idx assigned to word) pairs; report its size first,
# then its full content, each on its own line.
print(
    "Vocab Size: {}".format(len(vect.vocabulary_)),
    "Vocab Content:\n {}".format(vect.vocabulary_),
    sep="\n",
)

Vocab Size: 307
Vocab Content:
 {'unclear': 280, 'history': 125, 'venous': 285, 'insufficiency': 138, 'pvd': 233, 'htn': 128, 'gerd': 114, 'gastric': 112, 'ulcerations': 279, 'dvt': 87, 'obesity': 189, 'and': 25, 'memory': 166, 'changes': 54, 'who': 296, 'presents': 224, 'to': 273, 'churchill': 56, 'clinic': 57, 'for': 109, 'follow': 106, 'up': 282, 'of': 190, 'presumed': 225, 'left': 150, 'thigh': 270, 'abscess': 13, 'she': 255, 'period': 209, 'it': 143, 'apparently': 31, 'cuts': 75, 'car': 51, 'accident': 14, 'rates': 234, 'in': 135, 'half': 118, 'now': 188, 'this': 271, 'new': 182, 'study': 263, 'shows': 257, 'reduces': 237, 'dementia': 79, 'risk': 250, 'by': 50, '50': 10, 'admit': 17, 'sounds': 260, 'too': 274, 'good': 116, 'be': 42, 'true': 277, 'but': 48, 'even': 92, 'if': 132, 'the': 268, 'results': 248, 'are': 34, 'only': 194, 'hearing': 121, 'no': 183, 'concerns': 64, 'vision': 289, 'needs': 177, 'updated': 283, 'eye': 98, 'exam': 95, 'cognition': 60, 'negative': 178, 'evidenc

In [15]:
# Encode every training sequence as word counts using the already-fitted
# vectorizer, then print the result (shown below as "(row, column) count" entries).
bag_of_words = vect.transform(train_df["sequence"].tolist())
print(bag_of_words)

  (0, 13)	1
  (0, 25)	1
  (0, 54)	1
  (0, 56)	1
  (0, 57)	1
  (0, 87)	1
  (0, 106)	1
  (0, 109)	1
  (0, 112)	1
  (0, 114)	1
  (0, 125)	1
  (0, 128)	1
  (0, 138)	1
  (0, 150)	1
  (0, 166)	1
  (0, 189)	1
  (0, 190)	1
  (0, 224)	1
  (0, 225)	1
  (0, 233)	1
  (0, 255)	1
  (0, 270)	1
  (0, 273)	1
  (0, 279)	1
  (0, 280)	1
  :	:
  (16, 275)	1
  (16, 287)	1
  (17, 9)	1
  (17, 22)	1
  (17, 23)	1
  (17, 25)	1
  (17, 27)	2
  (17, 36)	1
  (17, 43)	1
  (17, 49)	1
  (17, 82)	1
  (17, 85)	1
  (17, 149)	1
  (17, 159)	2
  (17, 162)	1
  (17, 174)	1
  (17, 181)	1
  (17, 195)	1
  (17, 206)	1
  (17, 211)	1
  (17, 231)	1
  (17, 238)	1
  (17, 240)	1
  (17, 273)	1
  (17, 301)	1


In [17]:
# Expand the count matrix with toarray() so each sequence prints as a full row
# vector; the printed array is elided with "..." for long rows.
print("Bag of Words as Row Vector:\n {}".format(bag_of_words.toarray()))

Bag of Words as Row Vector:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [18]:
# All unique words in the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise. tolist() turns the returned array into a
# plain list so the displayed output keeps the same form as before.
(
    vect.get_feature_names_out().tolist()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)

['10',
 '11',
 '15',
 '16',
 '18',
 '19',
 '2017',
 '22',
 '30',
 '38611851',
 '50',
 '70',
 '71',
 'abscess',
 'accident',
 'accompanied',
 'admissions',
 'admit',
 'after',
 'age',
 'alcohol',
 'alexander',
 'alzheimer',
 'ambulatory',
 'amyloid',
 'and',
 'angiopathy',
 'anna',
 'anticoagulation',
 'antiplatelets',
 'anxiety',
 'apparently',
 'appendectomy',
 'appetite',
 'are',
 'as',
 'assessment',
 'assistance',
 'at',
 'attempt',
 'awareness',
 'back',
 'be',
 'behavioral',
 'benign',
 'bilaterally',
 'blake',
 'bso',
 'but',
 'bwh',
 'by',
 'car',
 'causing',
 'cerebral',
 'changes',
 'cholecystectomy',
 'churchill',
 'clinic',
 'close',
 'code',
 'cognition',
 'cognitive',
 'comment',
 'complete',
 'concerns',
 'condtion',
 'confirmed',
 'connected',
 'consult',
 'consultation',
 'copd',
 'copy',
 'crab',
 'cues',
 'current',
 'cuts',
 'daily',
 'daughter',
 'decline',
 'dementia',
 'diagnostic',
 'discharge',
 'disease',
 'disorder',
 'disorders',
 'disturbance',
 'documented

In [21]:
# Training split: (re)fit the vectorizer on the training sequences and encode them,
# and collect the matching labels.
X_train = vect.fit_transform(train_df["sequence"].tolist())
y_train = train_df["label"].tolist()

# Test split: encode with the vocabulary fitted above (transform only, no refit),
# and collect the matching labels.
X_test = vect.transform(test_df["sequence"].tolist())
y_test = test_df["label"].tolist()

In [28]:
# Print the encoded training matrix; the output below lists it as
# "(row, column) count" entries.
print(X_train)

  (0, 13)	1
  (0, 25)	1
  (0, 54)	1
  (0, 56)	1
  (0, 57)	1
  (0, 87)	1
  (0, 106)	1
  (0, 109)	1
  (0, 112)	1
  (0, 114)	1
  (0, 125)	1
  (0, 128)	1
  (0, 138)	1
  (0, 150)	1
  (0, 166)	1
  (0, 189)	1
  (0, 190)	1
  (0, 224)	1
  (0, 225)	1
  (0, 233)	1
  (0, 255)	1
  (0, 270)	1
  (0, 273)	1
  (0, 279)	1
  (0, 280)	1
  :	:
  (16, 275)	1
  (16, 287)	1
  (17, 9)	1
  (17, 22)	1
  (17, 23)	1
  (17, 25)	1
  (17, 27)	2
  (17, 36)	1
  (17, 43)	1
  (17, 49)	1
  (17, 82)	1
  (17, 85)	1
  (17, 149)	1
  (17, 159)	2
  (17, 162)	1
  (17, 174)	1
  (17, 181)	1
  (17, 195)	1
  (17, 206)	1
  (17, 211)	1
  (17, 231)	1
  (17, 238)	1
  (17, 240)	1
  (17, 273)	1
  (17, 301)	1


In [27]:
# Fit a LogisticRegression with default settings on the training features;
# fit() returns the estimator, so the chained call binds the trained model.
lgr = LogisticRegression().fit(X_train, y_train)

# Compute the test-set predictions and their confusion matrix up front so the
# reporting lines below only print.
preds = lgr.predict(X_test)
conf_mat = confusion_matrix(y_test, preds)

print("Test Set Score: ", lgr.score(X_test,y_test))
print("Confusion Matrix: \n {}".format(conf_mat))

Test Set Score:  0.3333333333333333
Confusion Matrix: 
 [[0 0 1]
 [0 0 1]
 [0 0 1]]
