<a href="https://colab.research.google.com/github/YashvardhanRanawat7/PatientInsights/blob/main/BA820_Project_Jishnu_Moorthy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Importing all neccessary libraries
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
#Connecting to Google Big Query
!pip install google-colab
!pip install google-auth google-auth-oauthlib google-auth-httplib2


Collecting jedi>=0.16 (from ipython==7.34.0->google-colab)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [3]:
from google.colab import auth
from google.cloud import bigquery

# Authenticate and authorize
auth.authenticate_user()

# Create a client to connect to BigQuery
client = bigquery.Client(project='ba820-unsup-ml')

In [4]:
# Constructing the BigQuery query
query = """
    SELECT *
    FROM ba820-unsup-ml.mimic3.noteevents
    ORDER BY SUBJECT_ID
    LIMIT 500000;
"""

# Executing the query and converting the result to a pandas DataFrame
df = client.query(query).to_dataframe()

# Displaying the DataFrame
df = df[df['CATEGORY'] == 'Radiology']

#**Pre-Processing**

In [5]:
# Lowercasing
df['TEXT'] = df['TEXT'].str.lower()

# Removing trailing spaces
df['TEXT'] = df['TEXT'].str.strip()

# Removing punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['TEXT'] = df['TEXT'].apply(remove_punctuation)

In [6]:
df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
3,769079,3,145834,2101-10-24,2101-10-24 16:06:00,NaT,Radiology,CHEST (PORTABLE AP),,,21011024 406 pm\n chest portable ap ...
4,770898,3,,2101-11-15,2101-11-15 09:15:00,NaT,Radiology,ART DUP EXT LO UNI;F/U,,,21011115 915 am\n art dup ext lo unifu clinica...
5,768818,3,145834,2101-10-20,2101-10-20 22:23:00,NaT,Radiology,CHEST (PORTABLE AP),,,21011020 1023 pm\n chest portable ap ...
6,768131,3,,2101-10-11,2101-10-11 21:06:00,NaT,Radiology,CHEST (PRE-OP PA & LAT),,,21011011 906 pm\n chest preop pa lat ...
7,769247,3,145834,2101-10-26,2101-10-26 11:34:00,NaT,Radiology,VIDEO OROPHARYNGEAL SWALLOW,,,21011026 1134 am\n video oropharyngeal swallow...
...,...,...,...,...,...,...,...,...,...,...,...
499881,748320,10993,169806,2143-11-06,2143-11-06 19:53:00,NaT,Radiology,MR-ANGIO HEAD,,,2143116 753 pm\n mrangio head mr reconstructio...
499887,748060,10993,169806,2143-11-03,2143-11-03 08:37:00,NaT,Radiology,CHEST (PA & LAT),,,2143113 837 am\n chest pa lat ...
499889,747836,10993,169806,2143-10-31,2143-10-31 05:57:00,NaT,Radiology,CHEST (PORTABLE AP),,,21431031 557 am\n chest portable ap ...
499910,771763,10994,101721,2141-10-27,2141-10-27 05:17:00,NaT,Radiology,BABYGRAM (CHEST ONLY),,,21411027 517 am\n babygram chest only ...


#**Tokenization**

In [7]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer, RegexpTokenizer
from nltk.tokenize.casual import TweetTokenizer
import nltk
nltk.download('punkt')

tokenized = [word_tokenize(t) for t in df['TEXT']] # word tokenization
# tokenized = [WhitespaceTokenizer().tokenize(t) for t in corpus] # word/punctuation tokenization
# tokenized = [TweetTokenizer().tokenize(t) for t in corpus] #Tweets tokenization
# tokenized = [RegexpTokenizer(r'\d{4}|\d{3}', gaps=False).tokenize(t) for t in corpus] # '\([0-9]{3}\)-[0-9]{3}-[0-9]{4}' #'\d{4}|\d{3}' # Regex tokenization. This keeps phone numbers only
tokenized

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[['21011024',
  '406',
  'pm',
  'chest',
  'portable',
  'ap',
  'clip',
  'clip',
  'number',
  'radiology',
  '69243',
  'reason',
  'please',
  'assess',
  'rij',
  'placement',
  'thankyou',
  'hospital',
  '2',
  'medical',
  'condition',
  '76',
  'year',
  'old',
  'man',
  'with',
  'hypotension',
  'vfib',
  'arrest',
  'lll',
  'infiltrateeffusions',
  'reason',
  'for',
  'this',
  'examination',
  'please',
  'assess',
  'rij',
  'placement',
  'thankyou',
  'final',
  'report',
  'clinical',
  'history',
  'assess',
  'right',
  'jugular',
  'line',
  'placement',
  'chest',
  'the',
  'tip',
  'of',
  'the',
  'right',
  'jugular',
  'line',
  'lies',
  'in',
  'the',
  'proximal',
  'svc',
  'perihilar',
  'edema',
  'and',
  'upper',
  'zone',
  'redistribution',
  'is',
  'still',
  'present',
  'consistent',
  'with',
  'known',
  'failure',
  'it',
  'is',
  'somewhat',
  'improved',
  'since',
  'the',
  'previous',
  'chest',
  'xray',
  '9',
  'hours',
  'previou

#**Bag Of Words**

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
cv_model = CountVectorizer() #tokenizer= lambda x: word_tokenize(x), stop_words='english'

#Fitting the model to the column TEXT
ft_model = cv_model.fit_transform(df['TEXT'])

print('number of `tokens`', len(cv_model.vocabulary_))
cv_model.vocabulary_

number of `tokens` 181192


{'21011024': 17790,
 '406': 66247,
 'pm': 160288,
 'chest': 125960,
 'portable': 161072,
 'ap': 119425,
 'clip': 126922,
 'number': 155257,
 'radiology': 164403,
 '69243': 90026,
 'reason': 164743,
 'please': 160082,
 'assess': 120640,
 'rij': 167081,
 'placement': 159781,
 'thankyou': 174197,
 'hospital': 142853,
 'medical': 151380,
 'condition': 128144,
 '76': 96142,
 'year': 181020,
 'old': 155939,
 'man': 150728,
 'with': 180185,
 'hypotension': 143706,
 'vfib': 178767,
 'arrest': 120016,
 'lll': 149601,
 'infiltrateeffusions': 145441,
 'for': 139140,
 'this': 174472,
 'examination': 136553,
 'final': 138343,
 'report': 165989,
 'clinical': 126899,
 'history': 142645,
 'right': 167004,
 'jugular': 147836,
 'line': 149305,
 'the': 174213,
 'tip': 175093,
 'of': 155844,
 'lies': 149176,
 'in': 144638,
 'proximal': 163293,
 'svc': 172843,
 'perihilar': 158967,
 'edema': 134117,
 'and': 118534,
 'upper': 177611,
 'zone': 181161,
 'redistribution': 165165,
 'is': 147387,
 'still': 17119

In [None]:
bow = pd.DataFrame(ft_model.toarray(), columns=cv_model.get_feature_names_out())
bow

#**TF-IDF**


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_model = TfidfVectorizer(norm=None)

tfidf_model.fit(df['TEXT'])

df_tfidf_transformed = tfidf_model.transform(df['TEXT'])
tfidf_vectors = pd.DataFrame(df_tfidf_transformed.toarray(), columns=tfidf_model.get_feature_names_out())
tfidf_vectors

#**Unsupervised Machine Learning**


In [None]:
X = df['TEXT']
y = df['DESCRIPTION']

# df = pd.DataFrame(df.text.str.lower()) # We can try lower-casing.

df

In [None]:
from sklearn.model_selection import train_test_split

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
vectorizer = CountVectorizer() #lowercase=False

# create the vectorizer.
X_train_counts = vectorizer.fit_transform(X_train)

# vectorize the test set
X_test_counts = vectorizer.transform(X_test)

In [None]:
X_train_counts.toarray().shape