<a href="https://colab.research.google.com/github/aleccwilliams810/AI-Healthcare/blob/main/NLP_MIMIC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pin NumPy to an exact release before the remaining packages are installed.
# NOTE(review): the reason for this specific pin is not recorded in the notebook —
# presumably binary compatibility with the NLP packages installed next; confirm.
!pip install numpy==1.24.0

In [None]:
# Install the runtime dependencies of this notebook: the BigQuery client,
# spaCy + scispaCy for named-entity recognition, and gensim for Word2Vec.
# NOTE(review): spacy, scispacy and gensim are installed without version pins,
# while the scispaCy model below is pinned to release 0.4.0 — confirm that the
# resolved spacy/scispacy versions are compatible with that model release.
!pip install --upgrade google-cloud-bigquery
!pip install -U spacy
!pip install scispacy
!pip install gensim
# General-purpose English pipeline, loaded later as "en_core_web_sm".
!python -m spacy download en_core_web_sm
# scispaCy small pipeline, loaded later as "en_core_sci_sm".
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz

In [1]:
from google.colab import auth
from google.cloud import bigquery

import spacy
import scispacy
from spacy import displacy

import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np

from gensim.models import word2vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load both NER pipelines in one pass: the general-purpose English model first,
# then the scispaCy model, so their extracted entities can be compared later.
spacy_model, scispacy_model = (
    spacy.load(pipeline_name)
    for pipeline_name in ("en_core_web_sm", "en_core_sci_sm")
)

In [3]:
# Authenticate the Colab user and open a BigQuery client billed to this project.
auth.authenticate_user()
client = bigquery.Client(project="careful-broker-438616-s1")

# ICD-9 code used to filter MIMIC NoteEvent data: 'Headache' (784.0).
# Change this value to select a different diagnosis; it is bound into the
# query below as a named parameter instead of being hardcoded in the SQL.
icd9_code = '7840'

# Discharge summaries of every admission that carries the selected diagnosis.
query = """
SELECT n.TEXT
FROM `physionet-data.mimiciii_notes.noteevents` n
INNER JOIN `physionet-data.mimiciii_clinical.diagnoses_icd` d
ON d.HADM_ID = n.HADM_ID
WHERE d.ICD9_CODE = @icd9_code
AND n.CATEGORY = 'Discharge summary'
"""

# Bind `icd9_code` as a STRING query parameter so the variable above actually
# drives the filter (and no value is ever spliced into the SQL text).
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ScalarQueryParameter("icd9_code", "STRING", icd9_code),
    ]
)

query_job = client.query(query, job_config=job_config)

# Block until the job finishes and materialise the rows as a DataFrame
# with a single TEXT column.
df = query_job.result().to_dataframe()

In [None]:
# For every note, collect the entity strings found by each pipeline.
# Both results are lists of lists: one inner list of entity texts per note,
# in the same order as the rows of df['TEXT'].
spacy_ents, scispacy_ents = [], []

for note in df['TEXT']:
  # General-purpose English NER.
  spacy_ents.append([span.text for span in spacy_model(note).ents])

  # scispaCy NER.
  scispacy_ents.append([span.text for span in scispacy_model(note).ents])

In [5]:
# Train one Word2Vec model per NER pipeline. Each note's entity list is fed in
# as a "sentence" and each entity string as a single token; min_count=1 keeps
# entities that occur only once in the corpus.
spacy_w2v = word2vec.Word2Vec(sentences=spacy_ents, min_count=1)
scispacy_w2v = word2vec.Word2Vec(sentences=scispacy_ents, min_count=1)

In [6]:
def tsne_plot(model, title):
    """Project a Word2Vec vocabulary to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        model: Trained Word2Vec model; every key in ``model.wv.key_to_index``
            is embedded and annotated on the plot.
        title: Title shown above the figure.

    Raises:
        ValueError: If the vocabulary holds fewer than two entries, since
            there is nothing meaningful to embed.
    """
    labels = list(model.wv.key_to_index.keys())
    if len(labels) < 2:
        raise ValueError("tsne_plot needs at least two vocabulary entries to embed")

    tokens = np.array([model.wv[word] for word in labels])

    # scikit-learn requires perplexity < n_samples; keep the original value of
    # 20 whenever the vocabulary is large enough and shrink it otherwise.
    perplexity = min(20, len(labels) - 1)

    # The iteration count is left at the library default (1000, the value that
    # was previously passed explicitly): the `n_iter` keyword was renamed to
    # `max_iter` in scikit-learn 1.5, so omitting it works on old and new versions.
    tsne_model = TSNE(perplexity=perplexity, early_exaggeration=12, n_components=2, init='pca', random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One scatter call per point so each point takes the next colour of the
    # default colour cycle, matching the annotated label beside it.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.title(title)
    plt.show()


In [None]:
# t-SNE view of the embeddings trained on the general-purpose spaCy entities.
tsne_plot(model=spacy_w2v, title="Spacy Word2Vec")

In [None]:
# t-SNE view of the embeddings trained on the scispaCy entities.
tsne_plot(model=scispacy_w2v, title="SciSpacy Word2Vec")