# Dependencies

In [2]:
# Download the install script from ollama.com and pipe it to `sh` to install Ollama on this runtime.
!curl -fsSL https://ollama.com/install.sh | sh

>>> Downloading ollama...
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In a terminal, run the following commands to start the Ollama server in the background and then run the `llama3` model:

```
ollama serve &
ollama run llama3
```


In [3]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.3-py3-none-any.whl (2.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m2.1/2.2 MB[0m [31m30.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.0 (from langchain_community)
  Downloading langchain-0.2.2-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.6/973.6 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-c

In [4]:
# Mount Google Drive at /content/drive: the input file for this notebook is read from
# /content/drive/My Drive/ards-cohort-notes/ards-cohort-notes.csv.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Imports

In [5]:
import csv
import random
import re
import time

import pandas as pd
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama
from langchain_core.prompts import PromptTemplate

# Functions to load data, specify LLM prompt, and perform LLM inference

In [6]:
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a DataFrame.

    Every missing value is replaced with an empty string, so downstream code
    can treat text columns as plain strings without checking for NaN.
    """
    return pd.read_csv(file_path).fillna('')

def select_random_start(num_rows, min_rows=15):
    """Pick a random starting row that still leaves ``min_rows`` rows after it.

    Returns an integer drawn uniformly from ``0 .. num_rows - min_rows``
    (both ends included).

    Raises:
        ValueError: if the dataset has fewer than ``min_rows`` rows.
    """
    if num_rows >= min_rows:
        last_valid_start = num_rows - min_rows
        return random.randint(0, last_valid_start)
    raise ValueError(f"The dataset must contain at least {min_rows} rows to process.")

def create_prompt_template():
    """Build the prompt used to ask the model whether a text chunk mentions aspiration.

    The template has a single placeholder, ``discharge_text``, which receives
    the chunk of clinical text to review. The model is told to answer strictly
    'Yes' or 'No', and to answer 'No' when the information is insufficient.
    """
    prompt_sections = [
        "Context: You are a clinician receiving chunks of clinical text for patients in an ICU. Please do the reviewing as quickly as possible.\n",
        "Task: Determine if the patient suffered from aspiration.\n",
        "Instructions: Answer with 'Yes' or 'No'. If there is not enough information, answer 'No'.\n",
        "Discharge Text:\n{discharge_text}\n\n",
        "Query: Does the chunk of text mention that the patient suffered from aspiration? Answer strictly in 'Yes' or 'No'.",
    ]
    return PromptTemplate(
        template="".join(prompt_sections),
        input_variables=["discharge_text"],
    )

def chunk_text(text, chunk_size, overlap):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one, so neighbouring chunks share ``overlap`` characters.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between neighbouring chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of string chunks that together cover the whole of ``text``.

    Raises:
        ValueError: if ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # A step of zero or less would never advance `start` past the end of the
    # text and the loop below would run forever.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be non-negative and smaller than chunk_size.")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    start = 0
    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a chunk reaches the end of the text, any further chunk would lie
        # entirely inside the overlap already emitted, so stop here.
        if end >= text_length:
            break
        start += step
    return chunks

def check_for_aspiration(discharge_text, llm, prompt_template, chunk_size, chunk_overlap):
    """Ask the model, chunk by chunk, whether ``discharge_text`` mentions aspiration.

    The text is split with ``chunk_text``; each chunk is formatted into
    ``prompt_template`` and sent to ``llm.invoke``.

    Args:
        discharge_text: The full text to screen.
        llm: Object exposing ``invoke(prompt)`` that returns the model's reply as a string.
        prompt_template: Object exposing ``format(discharge_text=...)``.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between neighbouring chunks.

    Returns:
        A tuple ``(status, explanation, length)`` where ``length`` is
        ``len(discharge_text)`` and ``status`` is:

        * ``"Yes"`` - some reply contains the word "yes" (any casing);
          ``explanation`` is the first such reply.
        * ``"Error"`` - no reply said yes and at least one model call failed,
          so a negative cannot be concluded; ``explanation`` is the first
          error message.
        * ``"No"`` - every chunk was answered and none said yes;
          ``explanation`` is the first reply, or ``"No sufficient data"`` when
          there were no chunks at all.
    """
    answers = []
    errors = []
    for chunk in chunk_text(discharge_text, chunk_size, chunk_overlap):
        prompt = prompt_template.format(discharge_text=chunk)
        try:
            response = llm.invoke(prompt)
            answers.append(response.strip())
        except Exception as e:
            # Keep failures apart from real answers so an error message is never
            # parsed as a model reply and never silently counted as a "No".
            errors.append(f"Error invoking model: {e}")

    text_length = len(discharge_text)

    # Whole-word, case-insensitive match: accepts "Yes", "yes." or "YES" but not
    # words that merely contain the letters, such as "Yesterday".
    for answer in answers:
        if re.search(r"\byes\b", answer, flags=re.IGNORECASE):
            return "Yes", answer, text_length

    if errors:
        return "Error", errors[0], text_length
    if answers:
        return "No", answers[0], text_length
    return "No", "No sufficient data", text_length

def process_patients(df, start_index, num_patients, llm, prompt_template, chunk_size, chunk_overlap, output_csv_file, progress_report_file):
    """Screen a contiguous run of rows for aspiration and log every result.

    Rows ``start_index`` up to (not including) ``start_index + num_patients``
    are processed in order. If the DataFrame ends sooner, only the available
    rows are processed and a notice is printed.

    For each row the ``discharge_text`` column is passed to
    ``check_for_aspiration``. The outcome is appended as one row to
    ``output_csv_file`` and as one line to ``progress_report_file``. Both files
    are opened in append mode and flushed after every patient.

    Args:
        df: DataFrame with ``hadm_id`` and ``discharge_text`` columns.
        start_index: Positional index of the first row to process.
        num_patients: Number of rows to process.
        llm: Model object forwarded to ``check_for_aspiration``.
        prompt_template: Prompt template forwarded to ``check_for_aspiration``.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Characters shared between neighbouring chunks.
        output_csv_file: Path of the CSV results file (appended to).
        progress_report_file: Path of the plain-text progress log (appended to).

    Returns:
        A list with the elapsed time in seconds for each processed row.

    Raises:
        IndexError: if ``start_index`` is at or beyond the end of ``df``.
    """
    total_rows = len(df)
    if start_index >= total_rows:
        raise IndexError(f"start_index {start_index} is out of range for a dataset with {total_rows} rows.")

    # Clamp the run to the rows that actually exist instead of failing part-way through.
    end_index = min(start_index + num_patients, total_rows)
    if end_index < start_index + num_patients:
        print(f"Only {end_index - start_index} of the requested {num_patients} patients are available from row {start_index}.\n")

    processing_time = []
    fieldnames = ['hadm_id', 'discharge_text_length', 'aspiration_detected', 'time_taken']
    with open(output_csv_file, 'a', newline='') as csvfile, open(progress_report_file, 'a') as report_file:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # The file is opened for appending; only write the header when it is still empty.
        if csvfile.tell() == 0:
            writer.writeheader()

        for i in range(start_index, end_index):
            # Read both fields positionally from row i. Looking the row up again by
            # hadm_id would pick the first row with that id when ids repeat.
            current_hadm_id = df['hadm_id'].iloc[i]
            discharge_text = df['discharge_text'].iloc[i]

            start_time = time.time()
            aspiration_result, explanation, discharge_text_length = check_for_aspiration(discharge_text, llm, prompt_template, chunk_size, chunk_overlap)
            elapsed_time = time.time() - start_time
            processing_time.append(elapsed_time)

            writer.writerow({
                'hadm_id': current_hadm_id,
                'discharge_text_length': discharge_text_length,
                'aspiration_detected': aspiration_result,
                'time_taken': round(elapsed_time)
            })
            csvfile.flush()  # Keep results on disk even if a later patient fails

            report_file.write(f"Patient Number: {i}, HADM ID: {current_hadm_id}, Discharge Text Length: {discharge_text_length}, Aspiration Detected: {aspiration_result}, Time Taken: {round(elapsed_time)}\n")
            report_file.flush()  # Same reason as the CSV flush above

            print(f"Processed Patient Number {i}\n")

    return processing_time

# Main (calls all the functions above)

In [8]:
def main(file_path, model_name, chunk_size, chunk_overlap, output_csv_file, progress_report_file, num_patients, start_index=2001):
    """Load the notes CSV, set up the Ollama model and screen a run of patients.

    Args:
        file_path: Path of the CSV file to load.
        model_name: Name of the Ollama model to use.
        chunk_size: Maximum characters per text chunk sent to the model.
        chunk_overlap: Characters shared between neighbouring chunks.
        output_csv_file: Path of the CSV results file (appended to).
        progress_report_file: Path of the plain-text progress log (appended to).
        num_patients: Number of rows to process.
        start_index: Positional index of the first row to process. Defaults to
            2001. Pass ``None`` to pick a random start that still leaves
            ``num_patients`` rows after it.
    """
    df = load_data(file_path)
    if start_index is None:
        # Reserve num_patients rows after the random start so the run fits in the data.
        start_index = select_random_start(len(df), min_rows=num_patients)
    prompt_template = create_prompt_template()
    # The streaming handler echoes the model's tokens to stdout as they arrive.
    llm = Ollama(model=model_name, callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))
    process_patients(df, start_index, num_patients, llm, prompt_template, chunk_size, chunk_overlap, output_csv_file, progress_report_file)

if __name__ == "__main__":
    # Entry point: screen 20 patients from the cohort notes with the llama3 model.
    main(
        # Input data and model
        file_path='/content/drive/My Drive/ards-cohort-notes/ards-cohort-notes.csv',
        model_name="llama3",
        # Chunking of each discharge text before it is sent to the model
        chunk_size=4096,
        chunk_overlap=100,
        # Size of the run
        num_patients=20,
        # Outputs: CSV results plus a plain-text progress log
        output_csv_file='aspiration-trial-2020.csv',
        progress_report_file='aspiration-trial-2020.txt',
    )


NoNo.NoProcessed Patient Number 2001

No.NoNoYesNo.Processed Patient Number 2002

No.No.YesNo.No.Processed Patient Number 2003

NoNoNoYesYesProcessed Patient Number 2004

NoNo.NoProcessed Patient Number 2005

NoNo.Processed Patient Number 2006

No.NoYesNoNoProcessed Patient Number 2007

No.NoNoNoYesNoNoProcessed Patient Number 2008

NoNoNoNo.No.Processed Patient Number 2009

NoNoYesNoProcessed Patient Number 2010

NoNoNoYesNoNo.NoProcessed Patient Number 2011

NoNo.NoNoNoNoProcessed Patient Number 2012

NoNoNoNoProcessed Patient Number 2013

No.NoNo.Processed Patient Number 2014

No.NoYesNoNo.NoYesNoProcessed Patient Number 2015

NoNo.Processed Patient Number 2016

No.NoNoNoProcessed Patient Number 2017

NoNoNoNoProcessed Patient Number 2018

NoNoNoNoNoNoProcessed Patient Number 2019

No.NoNoProcessed Patient Number 2020

