In [188]:
import re
import glob
import json
import random
import multiprocessing
import warnings
warnings.simplefilter('ignore')

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.colors as colors

from bokeh.io import output_notebook
from bokeh.plotting import show, figure

import gensim
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objects as go

from sklearn import utils
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import IncrementalPCA
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

In [190]:
jsons = glob.glob('data/jsons/*.json')
reports = []
for jsn in jsons:
    with open(jsn, 'r') as j:
        reports.append(json.loads(j.read()))

In [191]:
report_df = pd.DataFrame.from_dict(reports)
report_df = report_df.rename(columns = {'ReportText': 'exam'})
report_df = report_df.fillna(" ")
report_df = report_df.drop('ExamName', axis=1)

In [174]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [175]:
def is_alpha(report):
    return [word for word in report if word.isalpha()]
def remove_stopwords(report):
    return [word for word in report if word not in stop_words]
def lemmatize(report):
    return [lemmatizer.lemmatize(word) for word in report if len(word) > 2]
def remove_field_name(report):
    return [report[1:]]
def stringify(report):
    return ' '.join([word for word in report])

In [176]:
def clean_report(column):
    report_df[column] = report_df[column].str.lower()
    report_df[column] = report_df[column].apply(word_tokenize)
    report_df[column] = report_df[column].apply(lambda row: is_alpha(row))
    report_df[column] = report_df[column].apply(lambda row: remove_stopwords(row))
    report_df[column] = report_df[column].apply(lambda row: lemmatize(row))
    return report_df

In [192]:
for column in report_df.columns:
    clean_report(column)

In [193]:
report_df

Unnamed: 0,exam,findings,clinicaldata,impression
0,"[exam, chest, radiography, exam, date, clinica...","[finding, mild, pulmonary, vascular, congestio...","[clinical, history, hypoxia]","[impression, mild, pulmonary, vascular, conges..."
1,"[procedure, chest, comparison, none, indicatio...","[finding, lung, normal, significant, pulmonary...","[indication, acute, onset, cough, three, week,...","[conclusion, normal, examination]"
2,"[exam, chest, view, history, chest, pain, tech...","[finding, confluent, infiltrates, pleural, eff...","[history, chest, pain]","[impression, acute, cardiopulmonary, abnormali..."
3,"[exam, chest, radiography, exam, date, clinica...","[finding, focal, opacity, evident, pleural, ef...","[clinical, history, chest, pain]","[impression, normal, single, view, chest]"
4,"[exam, chest, radiography, exam, date, clinica...","[finding, focal, opacity, evident, pleural, ef...","[clinical, history, shortness, breath]","[impression, normal, single, view, chest]"
...,...,...,...,...
982,"[exam, chest, history, acute, respiratory, ill...","[finding, support, device, unchanged, cardiome...","[history, acute, respiratory, illness]","[impression, acute, cardiopulmonary, process, ..."
983,"[exam, chest, view, history, chest, pain, comp...","[finding, lung, clear, heart, size, within, no...","[history, chest, pain]","[impression, acute, finding, electronically, s..."
984,"[exam, chest, radiography, exam, date, clinica...","[finding, focal, opacity, evident, pleural, ef...","[clinical, history, chest, pain]","[impression, normal, single, view, chest]"
985,"[exam, chest, radiography, exam, date, clinica...","[finding, focal, opacity, evident, pleural, ef...","[clinical, history, cough]","[impression, normal, chest, radiography]"


In [305]:
exams = list(report_df['exam'].values)
exams = [report for report in exams if len(report) > 0]
findings = list(report_df['findings'].values)
findings = [report for report in findings if len(report) > 0]

clinicaldata = list(report_df['clinicaldata'].values)
clinicaldata = [report for report in clinicaldata if len(report) > 0]

impression = list(report_df['impression'].values)
impression = [report for report in impression if len(report) > 0]


texts = exams + findings + clinicaldata + impression
labels = ['exam'] * len(exams) + ['findings'] * len(findings) + ['clinicaldata'] * len(clinicaldata) + ['impression'] * len(impression)

In [323]:
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, stratify=labels)

In [326]:
encoder = LabelEncoder()
y = encoder.fit_transform(y_train)

In [329]:
training = pd.DataFrame(columns=['text', 'label'])
training['text'] = X_train
training['label'] = y
training['len'] = training['text'].apply(lambda row: len(row))

In [333]:
len(training)

3155

In [331]:
model = Word2Vec(sentences=X_train, vector_size=300, workers=2, min_count=1, window=5)

In [317]:
def reduce_dimensions(model):
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv)
    tsne = TSNE(2, random_state=42)
    vectors = tsne.fit_transform(vectors)
    x = [v[0] for v in vectors]
    y = [v[1] for v in vectors]
    tsne = pd.DataFrame(columns=['x', 'y', 'labels'])
    tsne['x'] = x
    tsne['y'] = y
    return tsne

In [318]:
tsne = reduce_dimensions(model)

In [319]:
output_notebook()

plot = figure(title="t-sne clustoring of word2vec model", plot_width=900, plot_height=700)
plot.scatter(x=tsne['x'], y=tsne['y'])
show(plot)


In [350]:
def sentence_embedding(sentence):
    vectors = []
    for word in sentence:
        try:
            vector = model.wv.get_vector(word)
        except KeyError:
            vector = [0] * 300
        vectors.append(vector)
    return sum(vectors) / (len(sentence) + 1e-5)

In [355]:
X = training['text'].apply(lambda row: sentence_embedding(row))
matrix = np.array(X.tolist())

In [356]:
regression = LogisticRegression(multi_class='ovr', solver="liblinear")
regression.fit(matrix, y)

LogisticRegression(multi_class='ovr', solver='liblinear')

In [357]:
X_test = [sentence_embedding(sentence) for sentence in X_test]

In [358]:
X_test

[array([ 0.0188249 ,  0.13225019,  0.08772824,  0.01412583, -0.01230299,
        -0.10816794,  0.18357977,  0.2568228 ,  0.0628721 , -0.11168805,
         0.05038795, -0.08351395,  0.10155453, -0.04393366, -0.10102972,
        -0.14775759,  0.17049491, -0.01189805, -0.01421653, -0.068187  ,
        -0.05774127, -0.06468674,  0.02716522,  0.11907955,  0.03585389,
        -0.03005904, -0.24623902,  0.09425936, -0.20023336, -0.14291035,
        -0.04180857,  0.04033531,  0.12399694, -0.11710308, -0.09120028,
        -0.02473857,  0.09604499, -0.10019661, -0.03410236, -0.09588157,
        -0.17047371,  0.01880915, -0.07563794, -0.02102789,  0.01285414,
         0.1168047 , -0.0069763 ,  0.17211337, -0.01910128,  0.27484635,
         0.10074136,  0.03260199, -0.12779297, -0.01264422, -0.03115668,
         0.11762738,  0.07546429, -0.01354695,  0.02033511,  0.08093242,
        -0.04124631, -0.00225325, -0.00057667,  0.03335144, -0.12391946,
         0.14652824,  0.09387583,  0.05730345, -0.2

In [360]:
y_pred = regression.predict(X_test)

In [361]:
accuracy_score(y_pred, y_test)

0.0

In [363]:
y_test

['clinicaldata',
 'impression',
 'impression',
 'impression',
 'exam',
 'findings',
 'findings',
 'clinicaldata',
 'clinicaldata',
 'findings',
 'exam',
 'findings',
 'findings',
 'impression',
 'clinicaldata',
 'clinicaldata',
 'findings',
 'impression',
 'findings',
 'exam',
 'findings',
 'exam',
 'exam',
 'exam',
 'clinicaldata',
 'findings',
 'clinicaldata',
 'exam',
 'clinicaldata',
 'clinicaldata',
 'impression',
 'exam',
 'clinicaldata',
 'findings',
 'exam',
 'impression',
 'exam',
 'impression',
 'impression',
 'impression',
 'findings',
 'exam',
 'impression',
 'impression',
 'findings',
 'clinicaldata',
 'clinicaldata',
 'findings',
 'exam',
 'exam',
 'impression',
 'impression',
 'findings',
 'findings',
 'clinicaldata',
 'findings',
 'impression',
 'findings',
 'exam',
 'findings',
 'impression',
 'impression',
 'clinicaldata',
 'clinicaldata',
 'clinicaldata',
 'findings',
 'clinicaldata',
 'findings',
 'impression',
 'findings',
 'clinicaldata',
 'exam',
 'impression',
 