# 0. Import data

In [None]:
#@title Run this cell and complete the authentication step!
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pickle

# Colab's authentication helper module.
from google.colab import auth

# Runs the authentication step mentioned in the cell title; presumably needed
# so that the gsutil copies in the next cell can read the bucket (TODO confirm).
auth.authenticate_user()

In [None]:
#@title Then, run this cell to import data files.


!gsutil cp gs://mlhc-mimic/pset2/adult_dc_concepts.csv ./
!gsutil cp gs://mlhc-mimic/pset2/adult_dc_summaries.csv ./
!gsutil cp gs://mlhc-mimic/pset2/cooccurrence_info.p ./
!gsutil cp gs://mlhc-mimic/pset2/male_cooccurrence_info.p ./
!gsutil cp gs://mlhc-mimic/pset2/female_cooccurrence_info.p ./
!gsutil cp gs://mlhc-mimic/pset2/disease_symptom_names.p ./


# Load the two tables copied from the bucket above.
discharge = pd.read_csv('adult_dc_summaries.csv')
concepts = pd.read_csv('adult_dc_concepts.csv')

# Expose each summary's row label as an explicit 'index' column so it can be
# used as the join key (assumes the concepts CSV carries a matching 'index'
# column -- the merge below requires one).
discharge['index'] = discharge.index

# Right join: the result follows the rows of `concepts`, each carrying the
# columns of the discharge summary it joins to.
df = pd.merge(discharge, concepts, how='right', on='index')


---




# Part 1: What are clinical concepts? How do they work?

### Explore the data

In [None]:
# Peek at the first row of the merged frame to see which columns it carries.
df.head(1)

In [None]:
# Show the 'dc_chart' field of the row at position 678
# (presumably the discharge summary text -- TODO confirm).
df.iloc[678]['dc_chart']

In [None]:
# Distinct 'index' values are reported as discharge summaries;
# every row of df is reported as one extracted concept.
print(len(df['index'].value_counts()), 'discharge summaries')
print(len(df), 'extracted concepts')

In [None]:
# Each 'index' group holds a single distinct value, so value_counts() yields
# one row count per discharge summary; .mean() averages those counts.
df.groupby('index')['index'].value_counts().mean()


In [None]:
# Same per-summary counts as in the previous cell, drawn as a 25-bin histogram.
df.groupby('index')['index'].value_counts().hist(bins=25)

In [None]:
# TODO: 1 Calculate the average number of extracted concepts per discharge summary in the dataframe. Round to the nearest integer.

In [None]:
# TODO: 2 Plot a histogram of the number of concepts per discharge summary using 25 bins (bins=25). Select the correct plot.

In [None]:
# Count the concept rows for one ICU stay, then display the first five of
# them restricted to the trigger / semantic type / name / CUI columns.
print(len(df[(df['icustay_id'] == 232593)]), 'extracted concepts for icustay_id = 232593')
df[df['icustay_id'] == 232593][['trigger', 'semtypes', 'preferred_name','cui']].head()

### Difficulty of Concept Extraction

In [None]:
# TODO: 3 CUI C0015967 indicates Fever.
# Give at least 1 additional term that triggers CUI C0015967.
# Please give your answer in lowercase with spaces
# (i.e. with no uppercase letters, even if the trigger has uppercase letters).
# Displays matches 100-109 (by position) of the rows carrying this CUI.
df[df['cui']=='C0015967'][100:110]


In [None]:
# TODO: 4 Several words in the English language may have multiple meanings
# when used in a clinical context. Which of the following best describes the
# relationship between CUIs C0009264, C0009443, and C0009269?
# This cell and the two after it each display every row for one of the three
# CUIs so they can be compared side by side.
df[df['cui']=='C0009264']

In [None]:
# All rows for the second CUI named in TODO 4.
df[df['cui']=='C0009443']


In [None]:
# All rows for the third CUI named in TODO 4.
df[df['cui']=='C0009269']

# Part 2: Relating Symptoms and Diseases

In [None]:
# First row whose semtypes string is exactly '[dsyn]', limited to the
# semantic type, preferred name and CUI columns. The equality test excludes
# any row whose semtypes string is anything other than exactly '[dsyn]'.
df.loc[df['semtypes'] == '[dsyn]'].head(1)[['semtypes', 'preferred_name', 'cui']]

In [None]:
# TODO: 5 Which of the following is the most frequent disease CUI in the dataset?
# By "most frequent", we mean "occurring in the greatest number of rows".
# Row counts per CUI among the '[dsyn]' rows; value_counts() lists the most
# frequent CUI first.
df[df['semtypes']=='[dsyn]']['cui'].value_counts()

In [None]:
# TODO: 6 Which of the following is the most frequent symptom CUI in the dataset?
# By "most frequent", we mean "occurring in the greatest number of rows".
# Same count as above, for rows whose semtypes string is exactly '[sosy]'.
df[df['semtypes']=='[sosy]']['cui'].value_counts()

## Disease-Symptom Co-occurrence 

In [None]:
#@title Function for creating a co-occurrence matrix
def create_cooccurrence_matrix(df, diseases, symptoms):
  """Count, per ICU stay, mentions and co-mentions of diseases and symptoms.

  A name counts as mentioned in a stay when it equals the 'preferred_name'
  of at least one row carrying that stay's 'icustay_id'.

  Args:
    df: DataFrame with 'icustay_id' and 'preferred_name' columns.
    diseases: sequence of disease names to look for (assumed hashable,
      e.g. strings).
    symptoms: sequence of symptom names to look for (same assumption).

  Returns:
    A tuple (cooccur, disease_count, symptom_count, n_stays):
      cooccur: float array of shape (len(diseases), len(symptoms)); entry
        [d][s] is the number of stays mentioning both disease d and symptom s.
      disease_count: float array, number of stays mentioning each disease.
      symptom_count: float array, number of stays mentioning each symptom.
      n_stays: number of distinct values in df['icustay_id'].
  """
  cooccur = np.zeros((len(diseases), len(symptoms)))
  disease_count = np.zeros(len(diseases))
  symptom_count = np.zeros(len(symptoms))

  # unique() counts a missing stay id as one value, while groupby below skips
  # rows without a stay id -- so such rows add to the total but to no count.
  n_stays = len(df['icustay_id'].unique())

  # One pass over the frame instead of re-filtering it once per stay.
  for _, names in df.groupby('icustay_id', sort=False)['preferred_name']:
    # Build the membership set once per stay; each disease/symptom is then
    # tested exactly once instead of once per (disease, symptom) pair.
    mentioned = set(names.values)
    has_disease = np.array([d in mentioned for d in diseases], dtype=bool)
    has_symptom = np.array([s in mentioned for s in symptoms], dtype=bool)

    disease_count += has_disease
    symptom_count += has_symptom
    # The outer product is True exactly where both the disease and the
    # symptom were mentioned in this stay.
    cooccur += np.outer(has_disease, has_symptom)

  return cooccur, disease_count, symptom_count, n_stays

In [None]:
# cooccur is a matrix of shape (num_diseases, num_symptoms) holding the number
# of ICU stays in which a disease and a symptom were both mentioned.
# disease_counts has length num_diseases: the number of ICU stays where each
# disease was mentioned.
# symptom_counts has length num_symptoms: the number of ICU stays where each
# symptom was mentioned.
# N is the size of the cohort (the number of unique ICU stays, if these files
# were produced by create_cooccurrence_matrix -- TODO confirm).
#
# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted source.
# `with` closes each file handle as soon as it has been read.
with open('./cooccurrence_info.p', 'rb') as f:
  cooccur, disease_counts, symptom_counts, N = pickle.load(f)
with open('disease_symptom_names.p', 'rb') as f:
  disease_names, symptom_names = pickle.load(f)

In [None]:
# Sanity check: the number of disease names should equal the number of rows
# of cooccur, and the number of symptom names the length of its first row.
print(len(disease_names))
print(len(cooccur))
print(len(symptom_names))
print(len(cooccur[0]))


In [None]:
# Co-occurrence count for disease 3 and symptom 25.
cooccur[3][25]

In [None]:
# TODO 7 Which of the following is the correct interpretation of cooccur[3][25]?
# Names behind row 3 and column 25 of the matrix.
print(disease_names[3])
print(symptom_names[25])

In [None]:
# The next five cells walk through one pair: disease 1 (name, count),
# symptom 0 (name, count), and finally their co-occurrence count.
disease_names[1]

In [None]:
disease_counts[1]

In [None]:
symptom_names[0]

In [None]:
symptom_counts[0]

In [None]:
cooccur[1][0]

In [None]:
# TODO 8 Fill in the function below to calculate lift. You should return a 
# matrix of size (disease_ct, symptom_ct) containing the lift for each pair.
# What disease-symptom pair has the highest lift?

def calculate_lift(cooccur, disease_ct, symptom_ct, N):
  """Return the lift of every (disease, symptom) pair.

  With probabilities estimated as counts divided by N:

      lift(d, s) = P(d, s) / (P(d) * P(s))
                 = cooccur[d][s] * N / (disease_ct[d] * symptom_ct[s])

  Args:
    cooccur: matrix of shape (len(disease_ct), len(symptom_ct)) with the
      number of times each disease and symptom occurred together.
    disease_ct: per-disease occurrence counts.
    symptom_ct: per-symptom occurrence counts.
    N: total number of observations the counts were taken over.

  Returns:
    A nested list of floats of size (len(disease_ct), len(symptom_ct)).
    Pairs whose disease or symptom count is zero have an undefined lift and
    are reported as 0.0, so that a maximum over the result stays meaningful.
  """
  d_ct = np.asarray(disease_ct, dtype=float)
  s_ct = np.asarray(symptom_ct, dtype=float)

  # disease_ct[d] * symptom_ct[s] for every pair at once.
  denom = np.outer(d_ct, s_ct)
  if denom.size == 0:
    # No diseases or no symptoms: nothing to divide, keep the empty shape.
    return denom.tolist()

  joint = np.asarray(cooccur, dtype=float) * N
  lift = np.zeros_like(denom)
  # Divide only where the denominator is non-zero; the rest stays 0.0.
  np.divide(joint, denom, out=lift, where=denom != 0)
  return lift.tolist()

In [None]:
# Lift for every (disease, symptom) pair over the full cohort.
result=calculate_lift(cooccur,disease_counts,symptom_counts,N)

In [None]:
# NOTE(review): numpy is already imported as `np` in the first cell; this
# plain import provides the `numpy` name used in the next cell.
import numpy

In [None]:
# (row indices, column indices) of every entry equal to the largest lift.
print(numpy.where(result == numpy.amax(result)))

In [None]:
# Hard-coded indices -- presumably copied from the output of the previous
# cell; re-check them if the data changes.
print(disease_names[64])
print(symptom_names[74])

In [None]:
# First, we load the co-occurrence matrices we made separately for men and women.
# Each file unpacks to (co-occurrence matrix, disease counts, symptom counts, N).
#
# NOTE: pickle.load can execute arbitrary code; only load files from a
# trusted source.
# `with` closes each file handle as soon as it has been read.
with open('./male_cooccurrence_info.p', 'rb') as f:
  male_cooccur, male_disease_ct, male_symptom_ct, male_N = pickle.load(f)
with open('./female_cooccurrence_info.p', 'rb') as f:
  female_cooccur, female_disease_ct, female_symptom_ct, female_N = pickle.load(f)

In [None]:
#TODO: 11, 12 Rerun your lift calculation on the male and female cohorts separately.
# Then, look at the lifts between each of the symptoms and heart attack
# (denoted Myocardial Infarction in the disease names list).

# Select the answer which lists the 3 symptoms with the highest lift for the male cohort.
# Select the answer which lists the 3 symptoms with the highest lift for the female cohort.
# One lift matrix per cohort, each computed from that cohort's own counts and N.
male=calculate_lift(male_cooccur,male_disease_ct,male_symptom_ct,male_N)
female=calculate_lift(female_cooccur,female_disease_ct,female_symptom_ct,female_N)

In [None]:
# Displays the disease name at the row index used in the following cells
# (11th from the end) -- presumably 'Myocardial Infarction'; this cell is the check.
disease_names[len(disease_names)-11]

In [None]:
# Pair each symptom's lift (male cohort, disease row 11th from the end of
# disease_names) with its column index, so the symptom can still be
# identified after the pairs are sorted.
l = [[lift, i] for i, lift in enumerate(male[len(disease_names)-11])]

In [None]:
# Ascending sort of the [lift, symptom_index] pairs: the highest lifts are at
# the end of the printed list.
print(sorted(l))

In [None]:
# Hard-coded symptom indices -- presumably the top three read off the sorted
# output above; re-check them if the data changes.
print(symptom_names[86])
print(symptom_names[60])
print(symptom_names[62])

In [None]:
# Pair each symptom's lift (female cohort, disease row 11th from the end of
# disease_names) with its column index, so the symptom can still be
# identified after the pairs are sorted.
l = [[lift, i] for i, lift in enumerate(female[len(disease_names)-11])]

In [None]:
# Ascending sort of the [lift, symptom_index] pairs: the highest lifts are at
# the end of the printed list.
print(sorted(l))

In [None]:
# Hard-coded symptom indices -- presumably the top three read off the sorted
# output above; re-check them if the data changes.
print(symptom_names[58])
print(symptom_names[60])
print(symptom_names[89])