# Set-up

Import the necessary libraries.

In [17]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import json

Import the data and load the model and the generation pipeline.

In [18]:
# Load the clinical notes table from the CSV file in the working directory.
data = pd.read_csv("clinical_notes_records.csv")
# Hugging Face model identifier shared by the tokenizer and the seq2seq model.
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Wrap model + tokenizer in a text2text-generation pipeline; the functions
# defined in later cells call it with a prompt string.
generate_pipeline = pipeline("text2text-generation", model=gen_model, tokenizer=tokenizer)

Device set to use cuda:0


# Step 1: extract terms

In [19]:
def extract_terms(note):
    """Ask the generation pipeline for the alcohol-related term in a clinical note.

    The model is prompted to answer with comma-separated terms. The reply is
    stripped, lower-cased and split on commas; only the first term is returned.

    Args:
        note: Free text of one clinical note. Values that are not strings
            (e.g. ``None`` or the float NaN pandas uses for empty CSV cells)
            and blank strings are treated as "no note".

    Returns:
        str: The first extracted term, or ``""`` when there is no note text
        or the model reply is empty.
    """
    # A missing cell is not a string, so concatenating it to the prompt would
    # raise TypeError; a blank note has nothing to extract, so skip the model.
    if not isinstance(note, str) or not note.strip():
        return ""

    # The prompt text is kept verbatim (including the "seperated" spelling) so
    # model outputs stay comparable with earlier runs.
    prompt = "Extract the most alcohol-related single term like 'wine', 'beer', or 'vodka', etc., from the text: " + note + "\n. The output contains words seperated by a comma."
    extract = generate_pipeline(prompt, max_length=50, num_return_sequences=1)
    generated_text = extract[0]["generated_text"].strip().lower()
    extracted_terms = [term.strip() for term in generated_text.split(",")]
    print(extracted_terms) #control the extraction
    # Splitting always yields at least one element, so index 0 is safe even
    # for an empty reply (it is then the empty string).
    return extracted_terms[0]


# Step 2: answer the true/false questions




In [20]:
def classify_attributes_generative(note):
    """Answer three yes/no questions about the alcohol use described in a note.

    Each question is sent to the generation pipeline with the note as context.
    An attribute is True only when the stripped, lower-cased reply starts
    with "yes"; any other reply yields False.

    Args:
        note: Text of the clinical note, interpolated into each prompt.

    Returns:
        dict: Maps "Family", "Historic" and "Negation Status" to booleans.
    """
    # (attribute name, question) pairs, asked in this order.
    questions_by_attribute = (
        ("Family", "Does the alcohol use concern family?"),
        ("Historic", "Is the alcohol use historic?"),
        ("Negation Status", "Is the alcohol use negated?"),
    )

    def says_yes(question):
        # One short generation per question; only the leading "yes" matters.
        prompt = f"Context: {note}\nQuestion: {question}\nAnswer (only yes or no):"
        reply = generate_pipeline(prompt, max_length=10, num_return_sequences=1)
        answer = reply[0]["generated_text"].strip().lower()
        return answer.startswith("yes")

    return {
        attribute: says_yes(question)
        for attribute, question in questions_by_attribute
    }

# Step 3: combine the 2 previous steps into 1 function

In [21]:
def process_note_generative(note):
    """Run term extraction and attribute classification on a single note.

    Args:
        note: Text of the clinical note, passed unchanged to both steps.

    Returns:
        dict: "Term" (result of extract_terms), "Concept" ("Alcohol Abuse"
        when the term is truthy, otherwise None), followed by the entries
        returned by classify_attributes_generative.
    """
    # Step 1: extracted term. Step 2: yes/no attribute flags.
    first_term = extract_terms(note)
    flags = classify_attributes_generative(note)

    if first_term:
        concept = "Alcohol Abuse"
    else:
        concept = None

    # Insert keys in the same order as the final record: Term, Concept, flags.
    record = {"Term": first_term, "Concept": concept}
    record.update(flags)
    return record

# Apply everything to the whole dataset

In [22]:
# Apply process_note_generative to every value of the "Clinical Note" column and
# store the returned dict per row in a new "processed" column. Each note triggers
# four pipeline calls (one term extraction plus three yes/no questions).
data["processed"] = data["Clinical Note"].apply(process_note_generative)

['beer']
['whiskey']
['whiskey']
['vodka']
['alcohol']
['alcohol']
['alcohol']
['beers']
['vodka']
['beer']


# Lastly, format JSON output

In [23]:
def format_output(row):
    """Shape one processed row into the record that is written to JSON.

    Args:
        row: Mapping-like row providing "Patient ID", "Note ID" and
            "processed" (the dict produced for that note).

    Returns:
        dict: "Patient ID" and "Note ID" copied from the row, plus
        "Extracted Terms", a one-element list holding a shallow copy of the
        row's "processed" dict.
    """
    record = {}
    record["Patient ID"] = row["Patient ID"]
    record["Note ID"] = row["Note ID"]

    processed = row["processed"]
    # Unpack into a fresh dict so the record does not alias the row's value.
    record["Extracted Terms"] = [{**processed}]
    return record

# Build one JSON-ready dict per DataFrame row (axis=1 passes each row to format_output).
output = data.apply(format_output, axis=1).tolist()

# Write the list of records to output.json, pretty-printed with 4-space indentation.
with open("output.json", "w") as f:
    json.dump(output, f, indent=4)

print("done (check output.json)!")

done (check output.json)!
