### Prompts analysis

In [5]:
import pandas as pd

In [10]:
# Classification task identifiers mapped to human-readable names
# (presumably the labels used when reporting results — confirm against the plots).
tasks = {
    "asthma_classification": "Obesity Co-Morbidity (Asthma)",
    "cohort_alcohol_abuse_classification": "Cohort Alcohol Abuse",
    "cad_classification": "Obesity Co-Morbidity (CAD)",
    "cohort_drug_abuse_classification": "Cohort Drug Abuse",
    "cohort_english_classification": "Cohort English",
    "cohort_make_decisions_classification": "Cohort Make Decisions",
    "cohort_abdominal_classification": "Cohort Abdominal",
    "diabetes_mellitus_classification": "Obesity Co-Morbidity (Diabetes Mellitus)",
    "obesity_classification": "Obesity",
}

# Identifiers of the extraction tasks (no display names are defined for these).
ext_tasks = [
    "drug_extraction",
    "medication_extraction",
    "concept_treatment_extraction",
    "concept_problem_extraction",
    "concept_test_extraction",
    "risk_factor_cad_extraction",
]


# Instruction table, loaded from a path relative to the notebook's working directory.
# Judging by the file name these are the expert-written instructions — confirm.
inst = pd.read_csv("../instructions/instructions_from_experts.csv")


In [12]:
import pandas as pd
import textstat

def calculate_readability(text):
    """Return the Flesch Reading Ease score of ``text``.

    Parameters
    ----------
    text : object
        The value to score. Only ``str`` values are passed on to textstat.

    Returns
    -------
    float or None
        Whatever ``textstat.flesch_reading_ease`` returns for ``text``, or
        ``None`` when ``text`` is not a string (for example the ``NaN`` / ``None``
        of a missing cell) or when textstat raises.
    """
    # Missing or non-text cells cannot be scored. Return early instead of letting
    # textstat fail on them, which printed one error line per affected row.
    if not isinstance(text, str):
        return None
    try:
        return textstat.flesch_reading_ease(text)
    except Exception as e:
        # Best effort: report the problem and keep going with the other rows.
        print(f"Error calculating readability: {e}")
        return None

def process_columns(df):
    """Score every text column of ``df`` and store the scores next to it.

    For each column holding text, a new column named
    ``"<column>_readability_score"`` is added to ``df`` with the result of
    ``calculate_readability`` for every cell, and the scores are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified in place.

    Returns
    -------
    None
    """
    # Snapshot the labels first: the loop adds columns to ``df`` as it goes.
    for column in list(df.columns):
        column_dtype = df[column].dtype
        # Accept the classic ``object`` dtype as before, plus pandas' dedicated
        # string dtypes, which a plain ``== 'object'`` comparison does not match.
        is_text = column_dtype == "object" or pd.api.types.is_string_dtype(column_dtype)
        if not is_text:
            continue
        print(f"Processing column '{column}'")
        readability_scores = df[column].apply(calculate_readability)
        df[f"{column}_readability_score"] = readability_scores
        print(f"Readability scores for column '{column}':")
        print(readability_scores)
        print()

def main(csv_file, output_file="output_with_readability_scores.csv"):
    """Read a CSV file, add readability scores, and write the result to disk.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file to read.
    output_file : str or path-like, optional
        Where the augmented table is written (without the index). Defaults to
        ``"output_with_readability_scores.csv"`` in the working directory, the
        location that used to be hard-coded.

    Returns
    -------
    None
        Any exception raised while reading, processing or writing is caught and
        reported on stdout rather than propagated.
    """
    try:
        df = pd.read_csv(csv_file)
        # Adds the "<column>_readability_score" columns to ``df`` in place.
        process_columns(df)
        df.to_csv(output_file, index=False)
    except Exception as e:
        print(f"Error reading or processing CSV file: {e}")

# Example usage.
# The guard below is true when this code runs as the top-level script; the output
# recorded under this cell shows that it also ran when the cell was executed.
if __name__ == "__main__":
    csv_file = "../instructions/instructions_from_experts.csv"  # Replace with your CSV file path
    main(csv_file)


Processing column 'Name'
Readability scores for column 'Name':
0    -132.59
1    -132.59
2    -132.59
3    -132.59
4    -132.59
5    -132.59
6    -132.59
7    -132.59
8    -132.59
9    -132.59
10   -132.59
11   -132.59
Name: Name, dtype: float64

Processing column 'Cohort Drug Abuse Classification
Input: medical notes of a single patient (assume the notes are given above the instruction.)

Given a corpus of longitudinal medical records of a single patient, your aim is to have the model classify if the patient meets or does not meet the definition of drug abuse.

Please write your instruction in such a way that the model will include one of "Yes" or "No" in the response.'
Readability scores for column 'Cohort Drug Abuse Classification
Input: medical notes of a single patient (assume the notes are given above the instruction.)

Given a corpus of longitudinal medical records of a single patient, your aim is to have the model classify if the patient meets or does not meet the definition of

### Plots for logit-based classification results

In [6]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
from sklearn.metrics import roc_auc_score
import seaborn as sns

import numpy as np

# Classification results table; the code below reads its "probabilities", "dataset",
# "model", "annotator", "gold_class", "instance" and "task_type" columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_results = pd.read_csv("/work/frink/private_datasets/final_classification.csv")


def convert_to_float_list(string_list):
    """Parse the text form of a list of numbers, e.g. ``"[0.1, 0.9]"``, into floats.

    Parameters
    ----------
    string_list : str
        Comma-separated numbers, optionally wrapped in square brackets and
        surrounded by whitespace.

    Returns
    -------
    list of float
        The parsed values in their original order; ``[]`` for an empty list
        such as ``"[]"``.

    Raises
    ------
    ValueError
        If an item between commas is not a valid float literal.
    """
    inner = string_list.strip().strip("[]")
    # Split on the bare comma so that "0.1,0.9" and "0.1, 0.9" both work, and
    # skip blank items so that "[]" or a trailing comma does not reach float("").
    return [float(item) for item in inner.split(",") if item.strip()]


# Parse the stringified probability lists and keep the value stored at index 1,
# which is used below as the score of the positive class.
df_results["probabilities"] = df_results["probabilities"].apply(convert_to_float_list)
df_results["positive_probabilities"] = [p[1] for p in df_results["probabilities"]]
all_datasets = df_results["dataset"].unique()
all_models = df_results["model"].unique()
all_annotators = df_results["annotator"].unique()
df_results = df_results.sort_values(by=["dataset", "model", "annotator"])
all_auroc = []

df_list = []

# Visit only the (dataset, model, annotator) combinations that actually occur.
# Looping over the Cartesian product of the unique values rescanned the whole
# frame for every combination and handed empty subsets to roc_auc_score.
# Note: rows now come out in sorted key order rather than first-appearance order;
# the grouped table below is unaffected because it is sorted by its keys anyway.
for (dataset, model, annotator), df in df_results.groupby(
    ["dataset", "model", "annotator"]
):
    df = df.copy()
    if df["gold_class"].nunique() < 2:
        # AUROC is undefined when only one class is present; record NaN instead
        # of aborting the whole cell.
        auroc = np.nan
    else:
        auroc = roc_auc_score(df["gold_class"], df["positive_probabilities"])
    # Every row of the subset carries the AUROC of its combination.
    df["auroc"] = auroc
    df_list.append(df)

df_results = pd.concat(df_list)
df_results = df_results.drop(["instance", "probabilities"], axis=1)

# Average the numeric columns per combination. ``numeric_only=True`` keeps any
# remaining text column from raising in pandas >= 2.0.
df_results_grouped = (
    df_results.groupby(["task_type", "dataset", "model", "annotator"])
    .mean(numeric_only=True)
    .reset_index()
)

KeyboardInterrupt: 

In [2]:
def get_results_stacked(df_results_grouped, how="max", metric="auroc"):
    """Summarise a metric per model across datasets.

    The metric is first reduced within every (dataset, model) pair using
    ``how``, and those per-dataset values are then averaged for each model.

    Parameters
    ----------
    df_results_grouped : pandas.DataFrame
        Must contain the columns ``"dataset"``, ``"model"`` and ``metric``.
    how : str, optional
        ``"max"`` or ``"min"`` select that reduction within a (dataset, model)
        pair; any other value falls back to the mean.
    metric : str, optional
        Name of the column to summarise. Defaults to ``"auroc"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model`` with a single column named ``metric``.
    """
    # Anything that is not "max"/"min" is treated as "mean", as it always was.
    aggregation = how if how in ("max", "min") else "mean"
    per_dataset = (
        df_results_grouped[["dataset", "model", metric]]
        .groupby(["dataset", "model"])
        .agg(aggregation)
        .reset_index()
    )
    # Average over datasets so that each dataset weighs the same for a model.
    return per_dataset[["model", metric]].groupby("model").mean()

In [3]:
# Narrow the grouped table to its identifying columns plus the AUROC metric.
df_results_grouped = df_results_grouped.loc[
    :, ["dataset", "model", "annotator", "auroc"]
]

# Distinct values of each key column, in order of first appearance.
annotators, datasets, models = (
    df_results_grouped[key].unique() for key in ("annotator", "dataset", "model")
)

# Next step: for each (model, dataset) pair, find the best-scoring annotator.
# NOTE(review): the earlier note said "highest f1", but this table holds AUROC.

In [4]:
# Per-model AUROC summaries, each tagged with the prompt type it represents.
# NOTE(review): the rows tagged "Median" are computed with the mean aggregation.
df_results_grouped_max_cnt = get_results_stacked(df_results_grouped, "max").assign(
    type="Best"
)
df_results_grouped_min_cnt = get_results_stacked(df_results_grouped, "min").assign(
    type="Worst"
)
df_results_grouped_mean_cnt = get_results_stacked(df_results_grouped, "mean").assign(
    type="Median"
)

# Rank given to each model identifier; used only to sort the stacked rows.
ORDER_MAP = {
    "mistral-7b": 0,
    "Llama-2-13b-chat": 1,
    "Llama-2-7b-chat-hf": 2,
    "alpaca-7b": 3,
    "clinical-camel-7b": 5,
    "asclepius": 6,
    "medalpaca-7b": 7,
}

# Stack best / median / worst, then sort by model rank and prompt type.
stacked_tables = [
    df_results_grouped_max_cnt,
    df_results_grouped_mean_cnt,
    df_results_grouped_min_cnt,
]
results = pd.concat(stacked_tables)
results = results.assign(order=results.index.map(ORDER_MAP)).sort_values(
    by=["order", "type"]
)

In [8]:
# NOTE(review): `colors` is not defined anywhere in the visible part of the notebook,
# and this cell's execution count (8) is out of sequence with its neighbours. It
# looks like a leftover inspection cell — confirm where `colors` is meant to come from.
colors

In [5]:
# Model identifiers as they appear in the results, mapped to display labels.
MODEL_MAP = {
    "mistral-7b": "Mistral (7b)",
    "Llama-2-7b-chat-hf": "Llama-2 (7b)",
    "asclepius": "Asclepius (7b)",
    "Llama-2-13b-chat": "Llama-2 (13b)",
    "clinical-camel-7b": "Clin-Camel (13b)",
    "alpaca-7b": "Alpaca (7b)",
    "medalpaca-7b": "MedAlpaca (7b)",
}

# Two groups of display labels, derived from the mapping above.
models_a = [
    MODEL_MAP[model_id]
    for model_id in ("mistral-7b", "Llama-2-7b-chat-hf", "Llama-2-13b-chat", "alpaca-7b")
]
models_b = [
    MODEL_MAP[model_id]
    for model_id in ("asclepius", "clinical-camel-7b", "medalpaca-7b")
]

# Turn the model index into a regular column and swap in the display labels.
results = results.reset_index()
results["model"] = results["model"].map(MODEL_MAP)

# Positional rename: relies on the column order model, auroc, type, order.
results.columns = ["Model", "AUROC", "Prompt", "order"]

# EXTRACTION

In [11]:
# Load the extraction results, discard the saved index column and expose the
# score column under the name "F1".
df_results_grouped_extraction = (
    pd.read_csv("/work/frink/private_datasets/extraction.csv")
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"f1_score": "F1"})
)

# Append the extraction rows below the rows already in df_results_grouped.
df_results_grouped = pd.concat([df_results_grouped, df_results_grouped_extraction])

# Show which model identifiers the extraction results cover.
df_results_grouped_extraction["model"].unique()

array(['Llama-2-7b-chat-hf', 'Llama-2-13b-chat', 'alpaca-7b', 'asclepius',
       'clinical-camel-7b', 'medalpaca-7b', 'mistral-7b'], dtype=object)

In [12]:
def get_results_stacked(df_results_grouped, how="max", metric="F1"):
    """Summarise a metric per model across datasets.

    The metric is first reduced within every (dataset, model) pair using
    ``how``, and those per-dataset values are then averaged for each model.

    Parameters
    ----------
    df_results_grouped : pandas.DataFrame
        Must contain the columns ``"dataset"``, ``"model"`` and ``metric``.
    how : str, optional
        ``"max"`` or ``"min"`` select that reduction within a (dataset, model)
        pair; any other value falls back to the mean.
    metric : str, optional
        Name of the column to summarise. Defaults to ``"F1"``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``model`` with a single column named ``metric``.
    """
    # Anything that is not "max"/"min" is treated as "mean", as it always was.
    aggregation = how if how in ("max", "min") else "mean"
    per_dataset = (
        df_results_grouped[["dataset", "model", metric]]
        .groupby(["dataset", "model"])
        .agg(aggregation)
        .reset_index()
    )
    # Average over datasets so that each dataset weighs the same for a model.
    return per_dataset[["model", metric]].groupby("model").mean()

In [13]:
columns = ["dataset", "model", "annotator", "F1"]

# Reload the extraction results: discard the saved index column, rename the
# score column to "F1" and add a fresh positional "index" column.
df_results_grouped_extraction = (
    pd.read_csv("/work/frink/private_datasets/extraction.csv")
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"f1_score": "F1"})
    .reset_index()
)

# Per-model F1 summaries, each tagged with the prompt type it represents.
# NOTE(review): the rows tagged "Median" are computed with the mean aggregation.
df_results_grouped_max_cnt = get_results_stacked(
    df_results_grouped_extraction, "max"
).assign(type="Best")
df_results_grouped_min_cnt = get_results_stacked(
    df_results_grouped_extraction, "min"
).assign(type="Worst")
df_results_grouped_mean_cnt = get_results_stacked(
    df_results_grouped_extraction, "mean"
).assign(type="Median")

# Stack best / median / worst into one table and display it.
results = pd.concat(
    [
        df_results_grouped_max_cnt,
        df_results_grouped_mean_cnt,
        df_results_grouped_min_cnt,
    ]
)
results

Unnamed: 0_level_0,F1,type
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Llama-2-13b-chat,0.38788,Best
Llama-2-7b-chat-hf,0.431393,Best
alpaca-7b,0.230052,Best
asclepius,0.347388,Best
clinical-camel-7b,0.414732,Best
medalpaca-7b,0.344597,Best
mistral-7b,0.360897,Best
Llama-2-13b-chat,0.296618,Median
Llama-2-7b-chat-hf,0.352255,Median
alpaca-7b,0.134112,Median


In [14]:
# Model identifiers as they appear in the results, mapped to display labels.
MODEL_MAP = {
    "Llama-2-7b-chat-hf": "Llama-2 (7b)",
    "asclepius": "Asclepius (7b)",
    "Llama-2-13b-chat": "Llama-2 (13b)",
    "clinical-camel-7b": "Clin-Camel (13b)",
    "alpaca-7b": "Alpaca (7b)",
    "medalpaca-7b": "MedAlpaca (7b)",
    "mistral-7b": "Mistral (7b)",
}

# Two groups of display labels, derived from the mapping above.
models_a = [
    MODEL_MAP[model_id]
    for model_id in ("mistral-7b", "Llama-2-7b-chat-hf", "Llama-2-13b-chat", "alpaca-7b")
]
models_b = [
    MODEL_MAP[model_id]
    for model_id in ("asclepius", "clinical-camel-7b", "medalpaca-7b")
]

# Rank every row through ORDER_MAP (defined in an earlier cell) and sort by
# that rank, then by prompt type.
results = results.assign(order=results.index.map(ORDER_MAP)).sort_values(
    by=["order", "type"]
)

# Turn the model index into a regular column and swap in the display labels.
results = results.reset_index()
results["model"] = results["model"].map(MODEL_MAP)

# Positional rename: relies on the column order model, F1, type, order.
results.columns = ["Model", "F1", "Prompt", "order"]