In [55]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 30 12:00:24 2017

@author: alec
"""
import sys, os
import pandas as pd
import glob
import sqlite3 as sqlite
import numpy as np
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer

In [56]:
conn= sqlite.connect(os.path.join(os.path.expanduser('~'),'Box Sync',
'Radiology Annotation','Reference Standard','radiology_reports.sqlite'))
cursor=conn.cursor()

#ngram_vectorizer = CountVectorizer(ngram_range=(1,2), token_pattern=r'(?u)\b\w\w+\b', stop_words="english",  min_df=2)

#column names
#(0, 'rowid', 'int', 0, None, 0)

In [57]:
training_notes = pd.read_sql("""SELECT * FROM training_notes""",conn)
#training_notes

In [58]:
training_names = training_notes['name']
training_text = training_notes['text']
training_labels = training_notes['doc_class']
training_content = zip(training_names,training_labels)

In [59]:
train_text = {x[0]:x[1] for x in zip(training_names,training_text)}
train_labels = {x[0]:x[1] for x in zip(training_names,training_labels)}

# Preprocessing

In [60]:
from nltk.corpus import stopwords
stoplist = stopwords.words("english")

In [61]:
#training
documents = [x.lower().strip() for x in training_text]
documents = [re.sub("[^a-zA-Z?]", ' ', x).split() for x in documents]


In [62]:
texts = [[word for word in document if word not in stoplist]
        for document in documents]

In [63]:
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
        
texts = [[token for token in text if frequency[token] > 1] for text in texts]

# Convert to vectors

In [64]:
#with sklearn
vectorizer = CountVectorizer(analyzer = "word",tokenizer=None,preprocessor=None,stop_words=None,max_features=None)

In [65]:
texts = [' '.join(x) for x in texts]

In [66]:
train_data_features = vectorizer.fit_transform(texts).toarray()

In [67]:
vocab = vectorizer.get_feature_names()
print(vocab[:20])

['aaa', 'abcess', 'abd', 'abdm', 'abdomen', 'abdomenal', 'abdominal', 'abdominis', 'abdominopelvic', 'aberrant', 'ablation', 'able', 'abn', 'abnomality', 'abnormal', 'abnormalities', 'abnormality', 'abnormally', 'abo', 'aborted']


In [68]:
#Sum up the counts of each vocabulary word
dist = np.sum(train_data_features,axis=0)
for tag, count in zip(vocab[::100], dist[::100]):
    print(count, tag)

36 aaa
7 aeration
51 ap
5 attenuating
2 breasts
2 centimeter
272 collections
2 cont
5 cutoff
8 details
2 double
3 endocarditis
59 excrete
7 fibroids
38 gallstones
11 heavily
86 hyperdense
24 indeterminate
3 intrab
4 laparotomies
55 located
20 median
11 musculature
2 obliterated
45 overall
3 perfed
32 pm
12 presumed
4 questioned
9 reidentified
51 rising
2 sepsispo
39 smv
2 striking
2 sz
80 tissues
4 typical
220 venous
2 wvomiting


# Implement a Random Forest

In [69]:
from sklearn.ensemble import RandomForestClassifier

In [70]:
forest = RandomForestClassifier(n_estimators = 100)
forest = forest.fit(train_data_features, training_labels)

In [71]:
#testing set
testing_notes = pd.read_sql("""SELECT * FROM testing_notes""",conn)
testing_names = testing_notes['name']
testing_text = testing_notes['text']
testing_labels = testing_notes['doc_class']
testing_content = zip(testing_names,testing_labels)

In [72]:

test_documents = [x.lower().strip() for x in testing_text]
test_documents = [re.sub("[^a-zA-Z?]", ' ', x).split() for x in test_documents]
testing_text = [[word for word in document if word not in stoplist]
        for document in test_documents]

In [73]:
test_frequency = defaultdict(int)
for text in testing_text:
    for token in text:
        test_frequency[token] += 1
        
testing_text = [[token for token in text if test_frequency[token] > 1] for text in testing_text]
testing_text = [' '.join(x) for x in testing_text]

In [74]:
test_data_features = vectorizer.transform(testing_text)
test_data_features.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]])

In [75]:
result = forest.predict(test_data_features)

In [76]:
output = pd.DataFrame(data={"file name":testing_names,"prediction":result,"label":testing_labels})

In [77]:
counter = 0
correct = 0
incorrect = []
for i in range(100):
    if output['prediction'][i] == output['label'][i]:
        correct += 1
        counter += 1
    else:
        incorrect.append(output['file name'][i])
        counter += 1

In [78]:
accuracy = correct/counter
accuracy

0.86

# Implement a Naive Bayes

In [80]:
from sklearn.naive_bayes import MultinomialNB

In [81]:
mnb = MultinomialNB()
mnb.fit(train_data_features, training_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [82]:
result = mnb.predict(test_data_features.toarray())#convert to dense array

In [83]:
len(result)

100

In [84]:
output = pd.DataFrame(data={"file name":testing_names,"prediction":result,"label":testing_labels})

In [85]:
counter = 0
correct = 0
incorrect = []
for i in range(100):
    if output['prediction'][i] == output['label'][i]:
        correct += 1
        counter += 1
    else:
        incorrect.append(output['file name'][i])
        counter += 1
accuracy = correct/counter
accuracy #got .67 with a Gaussian Naive Bayes

0.81