In [1]:
import csv
import re
from collections import defaultdict
from os import listdir
from os.path import isfile, join, basename

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Root folder that holds one sub-directory of text files per document type.
textRoot = "S:/ICD11000193/ICD11000193_Documents/Elliot/AllTextFiles/text"

# Sub-directories of textRoot that are scanned for text files.
documentTypes = (
    "discharge_summary",
    "discharge_summary_-_general",
    "discharge_summary_-_medical",
    "discharge_summary_-_orthopedic_surgery",
    "discharge_summary_-_stroke_neurology",
    "discharge_summary_-_surgery_short",
    "discharge_summary_thoracic_surgery",
    "inpatient_consultation",
    "inpatient_consult_report",
    "inpatient_operative~procedure_report",
    "history_and_physical",
    "history_and_physical_examination",
    "neurological_diagnostics",
    "transfer_summary",
    "urological_diagnostics",
)

# Full directory paths, joined with a forward slash so every entry is the same
# string as if it had been written out in full.
textDirs = [f"{textRoot}/{documentType}" for documentType in documentTypes]

# CSV read to link visit ids to chart numbers.
linkerFile = "linker.csv"
# CSV holding the chart-review labels.
labelsFile = "chart_review.csv"
# NOTE(review): not referenced by any visible cell — confirm whether it is still needed.
outputFile = "out.csv"

In [3]:
# Build a dictionary mapping visit id -> chart number from the linker CSV.
#   * column index 2: the chart number is the first run of digits in the cell
#   * column index 13: the visit id; it is parsed through float() first, so
#     values written like "123.0" are accepted
# Rows that cannot be parsed are reported and skipped instead of aborting the
# whole cell with an AttributeError / IndexError / ValueError.
mapVisitiIdToChartNumber = dict()
# newline='' is what the csv module expects so that quoted line breaks inside
# fields are handled by the reader itself.
with open(linkerFile, 'r', newline='') as file:
    csvReader = csv.reader(file, delimiter=',')
    next(csvReader, None)  # skip the header row (no-op if the file is empty)

    for line in csvReader:
        if len(line) <= 13:
            print(f"Skipping linker line {csvReader.line_num}: expected at least 14 columns, found {len(line)}")
            continue

        matches = re.search(r'\d+', line[2])
        if matches is None:
            print(f"Skipping linker line {csvReader.line_num}: no chart number in {line[2]!r}")
            continue

        try:
            visitId = int(float(line[13]))
        except (ValueError, OverflowError):
            print(f"Skipping linker line {csvReader.line_num}: invalid visit id {line[13]!r}")
            continue

        chartNumber = int(matches.group(0))
        mapVisitiIdToChartNumber[visitId] = chartNumber

In [4]:
# Make a list of paths to every regular file directly inside the text directories.
# os.listdir returns names in arbitrary order, so the names are sorted: this keeps
# the file order (and therefore the order in which the texts of one chart are
# concatenated later in the notebook) reproducible across runs and machines.
textFilePaths = list()
for textDir in textDirs:
    candidatePaths = (join(textDir, name) for name in sorted(listdir(textDir)))
    # Sub-directories and other non-file entries are ignored.
    textFilePaths.extend(path for path in candidatePaths if isfile(path))

print(f"Found {len(textFilePaths)} files in the specified directories.")


Found 5115 files in the specified directories.


In [5]:
# Make a dictionary mapping chart number -> list of file paths (each chart number
# can be associated with multiple files). The visit id is parsed from the file
# name and translated to a chart number through mapVisitiIdToChartNumber.
mapChartNumberToFilePath = defaultdict(list)
# Raw string so that \d is a regex escape, and the dot before "txt" is escaped so
# it only matches a literal '.'.
visitIdPattern = re.compile(r'visitid_(\d+)-docid_\d+\.txt')
for textFilePath in textFilePaths:
    fileName = basename(textFilePath)
    matches = visitIdPattern.search(fileName)
    if matches is None:
        # A file that does not follow the naming scheme is reported and skipped
        # rather than raising AttributeError on matches.group.
        print(f"Could not parse visit id from file {textFilePath}")
        continue

    visitId = int(matches.group(1))
    chartNumber = mapVisitiIdToChartNumber.get(visitId)

    # Compare against None explicitly so a chart number of 0 is not treated as missing.
    if chartNumber is not None:
        mapChartNumberToFilePath[chartNumber].append(textFilePath)
    else:
        print(f"Could not find chart number for file {textFilePath}")

In [6]:
# Load the chart-review labels into a DataFrame.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning (possibly a DtypeWarning about mixed-type columns) — confirm, and if so
# consider passing explicit dtypes for the affected columns.
df = pd.read_csv(filepath_or_buffer=labelsFile)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [7]:
# Optional: count the values of every column and save the counts as a CSV (used to choose the label columns below)
#df.apply(pd.value_counts).to_csv('value_count.csv', index=True)

In [8]:
# Attach to every label row the list of text-file paths for its chart number.
# mapChartNumberToFilePath is a defaultdict(list); Series.map uses a mapping's
# __missing__ default instead of NaN, so a chart number without any file gets an
# empty list rather than NaN.
df['Discharge Summary File Path'] = df['Chart Number (RHRN)'].map(mapChartNumberToFilePath)

In [9]:
# Label columns selected after looking at the value map of all columns.
selectedLabelColumns = [
    'Dyslipidemia present? ',
    'Fluid and electrolyte disorder present?',
    'Obesity present?',
    'Cancer present',
    'Peptic ulcer disease present',
    'Hypertension present',
    'Did this patient have an inpatient I/P visit within 1 year before this admission?',
]

# Look up every column so that a missing header fails here with a KeyError;
# the results of these lookups are not displayed.
for columnName in selectedLabelColumns[:-1]:
    df[columnName]

# Only the last expression of a cell is rendered, so this is the column shown.
df[selectedLabelColumns[-1]]

0        No
1        No
2        No
3        No
4       Yes
       ... 
3068     No
3069     No
3070     No
3071     No
3072     No
Name: Did this patient have an inpatient I/P visit within 1 year before this admission?, Length: 3073, dtype: object

In [10]:
# Keep only the chart number, the file-path list and the selected label columns.
columnsToKeep = [
    'Chart Number (RHRN)',
    'Discharge Summary File Path',
    'Dyslipidemia present? ',
    'Fluid and electrolyte disorder present?',
    'Obesity present?',
    'Cancer present',
    'Peptic ulcer disease present',
    'Hypertension present',
    'Did this patient have an inpatient I/P visit within 1 year before this admission?',
]
df = df.loc[:, columnsToKeep]

In [11]:
# Keep only the rows that have at least one text file (non-empty path list).
hasTextFiles = df['Discharge Summary File Path'].map(len) > 0
df = df[hasTextFiles]

In [13]:
# Read the free text of every chart and store it in a new 'text' column.
def _readNotes(notePaths):
    """Return the contents of the given UTF-8 files, in order, each preceded by a newline."""
    pieces = []
    for notePath in notePaths:
        with open(notePath, 'r', encoding='utf8') as noteFile:
            pieces.append("\n" + noteFile.read())
    return "".join(pieces)


# chart number -> concatenated text of all files listed for that chart
mapChartNumberToFreeText = {
    chartId: _readNotes(notePaths)
    for chartId, notePaths in zip(df['Chart Number (RHRN)'], df['Discharge Summary File Path'])
}
df['text'] = df['Chart Number (RHRN)'].map(mapChartNumberToFreeText)

In [6]:
# Drop the file-path column (no longer needed once the text has been read) and
# replace the long questionnaire headers with short snake_case names.
shortColumnNames = {
    'Chart Number (RHRN)': 'chart',
    'Dyslipidemia present? ': 'dyslipidemia',
    'Fluid and electrolyte disorder present?': 'fluid_electrolyte_disorder',
    'Obesity present?': 'obesity',
    'Cancer present': 'cancer',
    'Peptic ulcer disease present': 'peptic_ulcer',
    'Hypertension present': 'hypertension',
    'Did this patient have an inpatient I/P visit within 1 year before this admission?': 'readmission',
}
df = df.drop(columns='Discharge Summary File Path').rename(columns=shortColumnNames)

In [12]:
# Value distribution for each label column (the chart id and the free text are
# excluded). Each column is counted separately; values that do not occur in a
# column show up as NaN in the combined table.
# The top-level pd.value_counts function is deprecated in recent pandas releases,
# so the equivalent Series method is applied instead.
dist = df.drop(columns=['chart', 'text']).apply(pd.Series.value_counts)
dist.plot.bar()

Unnamed: 0,dyslipidemia,fluid_electrolyte_disorder,obesity,cancer,peptic_ulcer,hypertension,readmission
Maybe,3,,4,11,6,2,
No,1948,2256.0,2240,2078,1988,1512,2199.0
Yes,1011,706.0,718,873,968,1448,763.0


In [8]:
# Write the assembled table to a CSV file, leaving out the DataFrame index.
# NOTE(review): the configuration cell defines outputFile = "out.csv", but this
# cell writes to the hard-coded name 'data.csv' — confirm which one is intended.
df.to_csv(path_or_buf='data.csv', index=False)