<a href="https://colab.research.google.com/github/YashvardhanRanawat7/PatientInsights/blob/main/BA820_Project_Jishnu_Moorthy(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Importing all neccessary libraries
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [3]:
#Connecting to Google Big Query
!pip install google-colab
!pip install google-auth google-auth-oauthlib google-auth-httplib2




In [4]:
from google.colab import auth
from google.cloud import bigquery

# Authenticate and authorize
auth.authenticate_user()

# Create a client to connect to BigQuery
client = bigquery.Client(project='ba820-unsup-ml')

In [5]:
# Constructing the BigQuery query
query = """
    SELECT *
    FROM ba820-unsup-ml.mimic3.noteevents
    ORDER BY SUBJECT_ID
    LIMIT 500000;
"""

# Executing the query and converting the result to a pandas DataFrame
df = client.query(query).to_dataframe()

#Sampling the dataframe
df = df.sample(frac=0.2, random_state=42)

# Displaying the DataFrame
df = df[df['CATEGORY'] == 'Radiology']

#**Pre-Processing**

In [6]:
# Lowercasing
df['TEXT'] = df['TEXT'].str.lower()

# Removing trailing spaces
df['TEXT'] = df['TEXT'].str.strip()

# Removing punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['TEXT'] = df['TEXT'].apply(remove_punctuation)

In [7]:
df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
442316,858844,9703,120793,2196-01-31,2196-01-31 01:45:00,NaT,Radiology,CHEST (PORTABLE AP),,,2196131 145 am\n chest portable ap ...
6940,762298,132,160192,2115-05-10,2115-05-10 16:23:00,NaT,Radiology,CHEST (PORTABLE AP),,,2115510 423 pm\n chest portable ap ...
230532,987252,5060,,2180-12-24,2180-12-24 16:23:00,NaT,Radiology,CT HEAD W/O CONTRAST,,,21801224 423 pm\n ct head wo contrast ...
452804,848325,9936,147105,2102-11-06,2102-11-06 09:58:00,NaT,Radiology,CHEST (PORTABLE AP),,,2102116 958 am\n chest portable ap ...
255715,906411,5557,164963,2117-03-08,2117-03-08 21:34:00,NaT,Radiology,CHEST (PORTABLE AP),,,211738 934 pm\n chest portable ap 76 by same p...
...,...,...,...,...,...,...,...,...,...,...,...
147120,954657,3127,144842,2115-03-07,2115-03-07 17:03:00,NaT,Radiology,CT PELVIS W/CONTRAST,,,211537 503 pm\n ct abdomen wcontrast ct pelvis...
17922,940555,402,169538,2155-01-07,2155-01-07 07:14:00,NaT,Radiology,CHEST (PORTABLE AP),,,215517 714 am\n chest portable ap ...
220630,856149,4880,136244,2197-01-26,2197-01-26 10:51:00,NaT,Radiology,BY SAME PHYSICIAN,,,2197126 1051 am\n babygram chest abd together...
66361,758550,1374,137853,2170-04-23,2170-04-23 11:31:00,NaT,Radiology,CT HEAD W/O CONTRAST,,,2170423 1131 am\n ct head wo contrast ...


#**Tokenization**

In [8]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer, RegexpTokenizer
from nltk.tokenize.casual import TweetTokenizer
import nltk
nltk.download('punkt')

tokenized = [word_tokenize(t) for t in df['TEXT']] # word tokenization
tokenized

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['2196131',
  '145',
  'am',
  'chest',
  'portable',
  'ap',
  'clip',
  'clip',
  'number',
  'radiology',
  '39100',
  'reason',
  'pt',
  'cont',
  'w',
  'desat',
  'episodes',
  'admitting',
  'diagnosis',
  'stroketelemetry',
  'hospital',
  '2',
  'medical',
  'condition',
  '75m',
  'sp',
  'intubation',
  'with',
  'recent',
  'repositioning',
  'of',
  'et',
  'tube',
  'reason',
  'for',
  'this',
  'examination',
  'pt',
  'cont',
  'w',
  'desat',
  'episodes',
  'final',
  'report',
  'history',
  'recent',
  'repositioning',
  'of',
  'the',
  'et',
  'tube',
  'with',
  'desaturation',
  'findings',
  'the',
  'et',
  'tube',
  'is',
  '3',
  'cm',
  'above',
  'the',
  'carina',
  'there',
  'has',
  'been',
  'no',
  'significant',
  'change',
  'in',
  'the',
  'subclavian',
  'line',
  'with',
  'tip',
  'in',
  'the',
  'svc',
  'or',
  'ng',
  'tube',
  'with',
  'tip',
  'probably',
  'in',
  'the',
  'descending',
  'portion',
  'of',
  'the',
  'duodenum',
  

#**Bag Of Words**

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv_model = CountVectorizer() #tokenizer= lambda x: word_tokenize(x), stop_words='english'

#Fitting the model to the column TEXT
ft_model = cv_model.fit_transform(df['TEXT'])

print('number of `tokens`', len(cv_model.vocabulary_))
cv_model.vocabulary_

number of `tokens` 74447


{'2196131': 23862,
 '145': 3645,
 'am': 46576,
 'chest': 49891,
 'portable': 65324,
 'ap': 47082,
 'clip': 50315,
 'number': 62777,
 'radiology': 66777,
 '39100': 29898,
 'reason': 66933,
 'pt': 66378,
 'cont': 51058,
 'desat': 52252,
 'episodes': 54165,
 'admitting': 46231,
 'diagnosis': 52406,
 'stroketelemetry': 70033,
 'hospital': 57305,
 'medical': 60991,
 'condition': 50841,
 '75m': 39614,
 'sp': 69359,
 'intubation': 59110,
 'with': 74002,
 'recent': 66990,
 'repositioning': 67507,
 'of': 63029,
 'et': 54314,
 'tube': 72329,
 'for': 55675,
 'this': 71376,
 'examination': 54495,
 'final': 55337,
 'report': 67495,
 'history': 57207,
 'the': 71272,
 'desaturation': 52267,
 'findings': 55346,
 'is': 59221,
 'cm': 50370,
 'above': 45847,
 'carina': 49402,
 'there': 71297,
 'has': 56609,
 'been': 48197,
 'no': 62410,
 'significant': 68981,
 'change': 49787,
 'in': 58038,
 'subclavian': 70153,
 'line': 60060,
 'tip': 71666,
 'svc': 70614,
 'or': 63264,
 'ng': 62360,
 'probably': 66027,

In [11]:
bow = pd.DataFrame(ft_model.toarray(), columns=cv_model.get_feature_names_out())
bow

Unnamed: 0,00,000,0001,00035,0007,001,0012,0014,0015,0016,...,zoster,zosyn,zosynvanc,zscore,zxanthrochromia,zygoma,zygomas,zygomatic,zygomaticbuccal,zygomaticomaxillary
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21854,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21856,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21857,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


#**TF-IDF**


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
tfidf_model = TfidfVectorizer(norm=None)

tfidf_model.fit(df['TEXT'])

df_tfidf_transformed = tfidf_model.transform(df['TEXT'])
tfidf_vectors = pd.DataFrame(df_tfidf_transformed.toarray(), columns=tfidf_model.get_feature_names_out())
tfidf_vectors

Unnamed: 0,00,000,0001,00035,0007,001,0012,0014,0015,0016,...,zoster,zosyn,zosynvanc,zscore,zxanthrochromia,zygoma,zygomas,zygomatic,zygomaticbuccal,zygomaticomaxillary
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21854,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21855,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21856,5.447236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21857,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0




#**Unsupervised Machine Learning**


In [None]:
X = df['TEXT']
y = df['DESCRIPTION']

# df = pd.DataFrame(df.text.str.lower()) # We can try lower-casing.

df

In [None]:
from sklearn.model_selection import train_test_split

# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
vectorizer = CountVectorizer() #lowercase=False

# create the vectorizer.
X_train_counts = vectorizer.fit_transform(X_train)

# vectorize the test set
X_test_counts = vectorizer.transform(X_test)

In [None]:
X_train_counts.toarray().shape