In [42]:
import os
import openai
import pandas as pd
import pdfplumber
import json
import numpy as np

import config

In [136]:
from openpyxl import load_workbook

# Ground-truth spreadsheet that the extracted values are scored against.
file_path = './truth_excel/test_all_studies_final.xlsx'
workbook = load_workbook(file_path)
sheet = workbook.active  # only the workbook's active sheet is read

# Convert to DataFrame
data = sheet.values  # row-by-row cell values of the sheet
columns = next(data)  # consume the first row and use it as the header
df_truth = pd.DataFrame(data, columns=columns)  # remaining rows become the records

In [44]:
import openai
from openai import OpenAI
client = OpenAI()

In [45]:
def get_file_names(directory):
    """Return the names of the regular files found directly in a directory.

    Sub-directories are not descended into and are not listed themselves.

    Args:
        directory (str): Path of the directory to scan.

    Returns:
        list: Entry names (not full paths) of every entry for which
            ``is_file()`` is true, in the order ``os.scandir`` yields them.
    """
    return [item.name for item in os.scandir(directory) if item.is_file()]

# Folder holding the study PDFs (relative path); list the files it contains.
directory_path = "../Desktop/Testing"
file_names = get_file_names(directory_path)
print(file_names)

['rynn2008.pdf', 'lennox2003.pdf', 'kasper2014.pdf', 'hartford2007.pdf', 'boyer2004.pdf', 'merideth2012.pdf', 'mahablesh2013.pdf', 'davidson2004.pdf', 'pollock2001.pdf', 'nicolini2009.pdf', 'allgulander2004.pdf', 'pollock2008a.pdf', 'wu2011.pdf', 'bose2008.pdf', 'rickels2003.pdf', 'rothschild2012.pdf', 'stein2008.pdf', 'khan2011.pdf', 'alaka2014.pdf', 'ball2015.pdf', 'nimatoudis2004.pdf']


# Pollock2008a was omitted due to heavy data discrepancies

In [76]:
from pdf2image import convert_from_path
import pdfplumber
import pytesseract
import pandas as pd

# Function to extract text and tables
def extract_pdf_content(file_path):
    """Extract the text and every detected table from a PDF with pdfplumber.

    Args:
        file_path (str): Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        tuple: ``(text, tables)``. ``text`` is the text of each page followed
            by a newline, concatenated in page order. ``tables`` is a list
            holding one ``pandas.DataFrame`` per table, in page order.
    """
    page_texts = []
    tables = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # extract_text() may give None for a page without a text layer
            # (e.g. a scanned page); treat that as empty text instead of
            # failing on ``None + "\n"``.
            page_texts.append((page.extract_text() or "") + "\n")

            tables.extend(pd.DataFrame(table) for table in page.extract_tables())
    return "".join(page_texts), tables

In [74]:
from pdf2image import convert_from_path
import base64
from io import BytesIO
from openai import OpenAI

# Initialize OpenAI client
client = OpenAI()

def convert_pdf_to_base64_images(pdf_path, dpi=200, image_format='JPEG'):
    """Render each page of a PDF and return the pages as base64 strings.

    Args:
        pdf_path (str): Path of the PDF, passed to ``convert_from_path``.
        dpi (int): Rendering resolution forwarded to ``convert_from_path``.
        image_format (str): Format name handed to each image's ``save``.

    Returns:
        list: One base64-encoded (UTF-8 decoded) string per rendered page,
            in the order ``convert_from_path`` returns the pages.
    """
    def _encode(page_image):
        # Serialise the page into memory, then base64-encode the raw bytes.
        stream = BytesIO()
        page_image.save(stream, format=image_format)
        return base64.b64encode(stream.getvalue()).decode('utf-8')

    return [_encode(page_image) for page_image in convert_from_path(pdf_path, dpi=dpi)]

def query_gpt4_full(text, tables, pdf_path):
    """Ask the chat model to extract per-intervention trial characteristics.

    The request holds one user message made of the extraction prompt (which
    embeds ``text`` and ``tables``) followed by every page of the PDF as an
    inline base64 image.

    Args:
        text: Extracted report text, interpolated into the prompt.
        tables: Extracted tables, interpolated into the prompt via their
            string form.
        pdf_path (str): PDF whose pages are rendered by
            ``convert_pdf_to_base64_images`` and attached as images.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    # Render the pages first so a rendering failure surfaces before any
    # request is built.
    encoded_pages = convert_pdf_to_base64_images(pdf_path)

    prompt = (
        f"Clinical Trial Report Analysis:\n\n"
        f"Extracted Text:\n{text}\n\n"
        f"Extracted Tables:\n{tables}\n\n"
        f"This is a clinical trial report. For EACH intervention in the trial (including placebo), "
        f"please extract the following characteristics and format the response as valid JSON using this exact example structure:\n\n"
        f"Example format:\n"
        f"{{\n"
        f'    "Last Name of Main Author and Year": "Doe et al., 2021",\n'
        f'    "Full Population Sample Size": "451",\n'
        f'    "Intervention": "Duloxetine: 50 mg/day",\n'
        f'    "Main Race": "White",\n'
        f'    "Percent of Intervention Population that is Female (%)": "61.5",\n'
        f'    "Mean HAMA Score": "24.5",\n'
        f'    "Mean Population Age (Year)": "43.2",\n'
        f'    "Attrition Rate (%)": "30.2",\n'
        f'    "Full Sponsor Name": "ABC Pharmaceuticals",\n'
        f'    "Follow-up Time (Weeks)": "10",\n'
        f'    "Diagnostic Criteria": "DSM-IV"\n'
        f"}}\n\n"
        f"'Full Population Sample Size' should refer to the TOTAL population enrolled in the study, across all interventions and groups, not just the population size for the specific intervention.\n"
        f"'Intervention' should be in mg/day, not any other unit of measurement.\n"
        f"'Follow-up Time' should refer to total length of the treatment period, omitting washout periods.\n"
        f"'Mean HAMA' should be the mean HAMA score at the beginning of the study for the specific intervention.\n"
        f"'Attrition Rate' should be % of patients who failed to complete the treatment after assignment\n"
        f" Make sure each JSON object follows this format exactly. For any missing or unavailable data, input 'NA'. If you are unsure about an answer, input 'NA'."
    )

    # User content: the prompt as a text part, then one image part per page.
    page_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded_page}"},
        }
        for encoded_page in encoded_pages
    ]
    user_content = [{"type": "text", "text": prompt}] + page_parts

    system_message = {
        "role": "system",
        "content": "You are a data extraction assistant extracting data from clinical trial reports.",
    }
    user_message = {"role": "user", "content": user_content}

    return client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
    )

In [47]:
# def query_gpt4_text(text, tables, ocr_results):
#     structured_prompt = (
#         f"Clinical Trial Report Analysis:\n\n"
#         f"Extracted Text:\n{text}\n\n"
#         f"Extracted OCR Text from Images:\n{ocr_results}\n\n"
#         f"Extracted Tables:\n{tables}\n\n"
#         f"This is a clinical trial report. For EACH intervention in the trial (including placebo), please extract the following characteristics and format the response as valid JSON using this exact example structure:\n\n"
#         f"Example format:\n"
#         f"{{\n"
#         f'    "Last Name of Main Author and Year": "Doe et al., 2021",\n'
#         f'    "Full Population Sample Size": "451",\n'
#         f'    "Intervention": "Duloxetine: 50 mg/day",\n'
#         f'    "Main Race": "White",\n'
#         f'    "Percent of Intervention Population that is Female (%)": "61.5",\n'
#         f'    "Mean HAMA Score": "24.5",\n'
#         f'    "Mean Population Age (Year)": "43.2",\n'
#         f'    "Attrition Rate (%)": "30.2",\n'
#         f'    "Full Sponsor Name": "ABC Pharmaceuticals",\n'
#         f'    "Follow-up Time (Weeks)": "10",\n'
#         f'    "Diagnostic Criteria": "DSM-IV"\n'
#         f"}}\n\n"
#         f"'Full Population Sample Size' should refer to the TOTAL population enrolled in the study, across all interventions and groups, not just the population size for the specific intervention.\n"
#         f"'Intervention' should be in mg/day, not any other unit of measurement.\n"
#         f"'Follow-up Time' should refer to total length of the treatment period, omitting washout periods.\n"
#         f"'Mean HAMA' should be the mean HAMA score at the beginning of the study for the specific intervention.\n"
#         f"'Attrition Rate' should be % of patients who failed to complete the treatment after assignment\n"
#         f" Make sure each JSON object follows this format exactly. For any missing or unavailable data, input 'NA'. If you are unsure about an answer, input 'NA'."
#     )
#     response = completion = client.chat.completions.create(
#         model="gpt-4o",
#         messages=[{"role": "system", "content": "You are a data extraction assistant extracting data from clinical trial reports."},
#                   {"role": "user", "content": structured_prompt}],
#     )
#     return response

In [48]:
def rename_columns(df):
    """Return ``df`` with the model's output headers renamed to the truth-sheet headers.

    Headers missing from ``df`` are ignored; columns not listed in the
    mapping keep their names. The input frame is not modified.

    Args:
        df (pd.DataFrame): Frame using the header names requested in the prompt.

    Returns:
        pd.DataFrame: New frame using the ``df_truth`` column names.
    """
    # Model output header -> ground-truth header.
    truth_names = {
        "Last Name of Main Author and Year": "References",
        "Full Population Sample Size": "Sample size",
        "Main Race": "Main race",
        "Intervention": "Interventions",
        "Percent of Study Population that is Female (%)": "Female (%)",
        "Percent of Intervention Population that is Female (%)": "Female (%)",
        "Mean HAMA Score": "Mean HAMA",
        "Mean Population Age (Year)": "Mean age (Year)",
        "Attrition Rate (%)": "Attrition rate (%)",
        "Full Sponsor Name": "Sponsor",
        "Follow-up Time (Weeks)": "Follow-up time (weeks)",
        "Diagnostic Criteria": "Diagnosis criteria",
    }
    return df.rename(columns=truth_names)

In [49]:
def preprocess_numerical_columns(df, numerical_cols):
    """Convert the listed columns to numbers while keeping 'NA' markers as text.

    Works on a copy, so the caller's frame is left untouched (matching
    ``preprocess_interventions_and_diag``).

    Args:
        df (pd.DataFrame): Frame to convert.
        numerical_cols (list): Column names to convert; names missing from
            ``df`` are skipped.

    Returns:
        pd.DataFrame: Copy of ``df`` in which every value of the listed
            columns is either the original 'NA' value (compared
            case-insensitively after trimming) or the result of
            ``pd.to_numeric(..., errors="coerce")`` (NaN when unparsable).
    """
    df = df.copy()

    def _to_number(value):
        # 'NA' is a deliberate "not reported" marker and must survive as text.
        if str(value).strip().lower() == "na":
            return value
        return pd.to_numeric(value, errors="coerce")

    for col in numerical_cols:
        if col in df.columns:
            df[col] = df[col].apply(_to_number)
    return df

# Update the numerical columns list
# Truth-sheet column names passed to preprocess_numerical_columns for conversion.
numerical_cols = ["Sample size", "Female (%)", "Mean HAMA", "Mean age (Year)", "Attrition rate (%)", "Follow-up time (weeks)"]


In [109]:
def preprocess_interventions_and_diag(df):
    """Normalise the 'Interventions' and 'Diagnosis criteria' text columns.

    Both columns are trimmed, lower-cased, and have en/em dashes replaced by
    a hyphen and non-breaking spaces by regular spaces. 'Interventions'
    additionally gets a space inserted after a colon that is directly
    followed by a digit, and the release suffixes ' sr', ' xr' and ' xl'
    removed when they directly precede a colon.

    Args:
        df (pd.DataFrame): Frame containing both columns.

    Returns:
        pd.DataFrame: A copy of ``df`` with the two columns normalised.
    """
    def _tidy(series):
        # Shared clean-up: whitespace, case, dash variants, non-breaking spaces.
        series = series.str.strip().str.lower()
        for dash in ("–", "—"):
            series = series.str.replace(dash, "-", regex=False)
        return series.str.replace("\u00a0", " ")

    out = df.copy()

    interventions = _tidy(out["Interventions"])
    # "drug:50" -> "drug: 50"
    interventions = interventions.str.replace(r":(?=\d)", ": ", regex=True)
    # Drop release-form suffixes so "venlafaxine xr:" lines up with "venlafaxine:".
    for suffix in (r" sr:", r" xr:", r" xl:"):
        interventions = interventions.str.replace(suffix, ":", regex = True)
    out["Interventions"] = interventions

    out["Diagnosis criteria"] = _tidy(out["Diagnosis criteria"])
    return out

In [110]:
def testing(df_truth, df):
    grouped_truth = df_truth.groupby("References")
    grouped_pred = df.groupby("References")
    
    results = []
    
    for reference in grouped_truth.groups:
        if reference in grouped_pred.groups:
            # Get groups
            truth_group = grouped_truth.get_group(reference)
            pred_group = grouped_pred.get_group(reference)
    
            # Normalize and find common interventions
            truth_interventions = set(truth_group["Interventions"].tolist())
            pred_interventions = set(pred_group["Interventions"].tolist())
            common_interventions = truth_interventions & pred_interventions  # Only common interventions
    
            for intervention in common_interventions:
                # Filter for the specific intervention
                truth_row = truth_group[truth_group["Interventions"] == intervention]
                pred_row = pred_group[pred_group["Interventions"] == intervention]
    
                if not truth_row.empty and not pred_row.empty:
                    truth_row = truth_row.iloc[0]
                    pred_row = pred_row.iloc[0]
    
                    # Categorical columns
                    categorical_cols = ["Main race"]
                    categorical_match = {
                        col: truth_row[col].strip().lower() == pred_row[col].strip().lower()
                        for col in categorical_cols
                    }
    
                    # Special handling for "Diagnosis criteria"
                    diagnosis_criteria_truth = truth_row["Diagnosis criteria"].strip().lower()
                    diagnosis_criteria_pred = pred_row["Diagnosis criteria"].strip().lower()
                    categorical_match["Diagnosis criteria"] = diagnosis_criteria_truth in diagnosis_criteria_pred

                    # Special handling for "Sponsor"
                    sponsor_truth = truth_row["Sponsor"].strip().lower()
                    sponsor_pred = pred_row["Sponsor"].strip().lower()
                    categorical_match["Sponsor"] = sponsor_truth in sponsor_pred

                    # Numerical columns
                    numerical_cols = ["Sample size", "Female (%)", "Mean HAMA", "Mean age (Year)", "Attrition rate (%)", "Follow-up time (weeks)"]
                    numerical_match = {}
                    for col in numerical_cols:
                        truth_val = truth_row[col]
                        pred_val = pred_row[col]
                        truth_is_na = str(truth_val).strip().lower() == "na"
                        pred_is_na = str(pred_val).strip().lower() == "na"

                        # Handle different scenarios for NA
                        if truth_is_na and pred_is_na:
                            numerical_match[col] = True  # Both are NA, match is True
                        elif truth_is_na or pred_is_na:
                            numerical_match[col] = False  # Only one is NA, match is False
                        else:
                            # Perform numerical comparison
                            numerical_match[col] = np.isclose(
                                float(truth_val), float(pred_val), atol=0.5, equal_nan=True
                            )
    
                    # Collect results
                    results.append({
                        "References": reference,
                        "Interventions": intervention,
                        **categorical_match,
                        **numerical_match,
                    })
    results_df = pd.DataFrame(results)
    return results_df

In [77]:
def json_parsing(output):
    """Turn the model's reply into a DataFrame with one row per JSON record.

    The reply may be a JSON list, a single object, or several objects one
    after another (with or without commas or blank lines between them). It
    may also be wrapped in ``` fences and surrounded by prose. When fences
    are present only the fenced content is parsed.

    Args:
        output: Chat completion response; the text is read from
            ``output.choices[0].message.content``.

    Returns:
        pd.DataFrame: One row per parsed record. Empty when the reply has no
            content, holds no JSON, or holds malformed JSON (a message is
            printed in the last two cases).
    """
    raw = output.choices[0].message.content or ""

    # Odd-numbered pieces of a split on ``` are the fenced sections.
    pieces = raw.split("```")
    payload = "\n".join(pieces[1::2]) if len(pieces) > 1 else raw

    decoder = json.JSONDecoder()
    records = []
    cursor = 0
    try:
        while True:
            # Jump to the next JSON value; this skips fence language tags
            # and whatever separates consecutive objects.
            starts = [
                pos
                for pos in (payload.find("{", cursor), payload.find("[", cursor))
                if pos != -1
            ]
            if not starts:
                break
            value, cursor = decoder.raw_decode(payload, min(starts))
            if isinstance(value, list):
                records.extend(value)
            else:
                records.append(value)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        records = []
    else:
        if not records:
            print("Error parsing JSON: no JSON records found in the response")

    return pd.DataFrame(records)

In [78]:
def ensure_all_truth_interventions_present(results_df, df_truth):
    """Add an all-False row for every truth intervention missing from the results.

    Only references that already appear in ``results_df`` are considered.
    Truth interventions are normalised with
    ``preprocess_interventions_and_diag`` before being compared.

    Args:
        results_df (pd.DataFrame): Output of ``testing``.
        df_truth (pd.DataFrame): Ground-truth rows.

    Returns:
        pd.DataFrame: ``results_df`` followed by one row per missing
            (reference, intervention) pair, with False in every metric
            column. Returned unchanged when nothing is missing, or when
            ``results_df`` is empty or lacks the two key columns.
    """
    key_cols = ["References", "Interventions"]

    # testing() returns a frame without columns when nothing matched; there
    # is then no reference to anchor the missing rows on.
    if results_df.empty or not set(key_cols).issubset(results_df.columns):
        return results_df

    # Normalize `Interventions` in `df_truth` to match `results_df`
    df_truth = preprocess_interventions_and_diag(df_truth)
    df_truth = df_truth[df_truth["References"].isin(results_df["References"])]

    metric_cols = [col for col in results_df.columns if col not in key_cols]
    scored_pairs = set(zip(results_df["References"], results_df["Interventions"]))

    missing_rows = []
    for reference, truth_group in df_truth.groupby("References"):
        # dict.fromkeys keeps the truth sheet's order and drops duplicates.
        for intervention in dict.fromkeys(truth_group["Interventions"].tolist()):
            if (reference, intervention) in scored_pairs:
                continue
            missing_rows.append({
                "References": reference,
                "Interventions": intervention,
                **{col: False for col in metric_cols},
            })

    if missing_rows:
        results_df = pd.concat([results_df, pd.DataFrame(missing_rows)], ignore_index=True)

    return results_df

In [137]:
# Bring the truth sheet into the same normalised form as the extracted frames.
df_truth = (
    df_truth
    .pipe(preprocess_numerical_columns, numerical_cols)
    .pipe(preprocess_interventions_and_diag)
)

# Rynn 2008

In [75]:
file_path = "../Desktop/Testing/rynn2008.pdf"

# Pull the text layer and the tables out of the PDF
rynn_2008_text, rynn_2008_tables = extract_pdf_content(file_path)

# Send text, tables and page images to the model (same PDF path reused)
structured_data_rynn_2008 = query_gpt4_full(rynn_2008_text, rynn_2008_tables, file_path)

In [81]:
rynn_2008_df = json_parsing(structured_data_rynn_2008)

In [85]:
rynn_2008_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Rynn et al., 2008",327,Duloxetine: 60-120 mg/day,Caucasian,61.3,22.6,42.2,44.6,Eli Lilly and Company; Boehringer Ingelheim,10,DSM-IV
1,"Rynn et al., 2008",327,Placebo,Caucasian,62.3,23.5,41.0,31.4,Eli Lilly and Company; Boehringer Ingelheim,10,DSM-IV


In [86]:
df_truth[df_truth["References"] == "Rynn et al., 2008"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
78,"Rynn et al., 2008",327,duloxetine: 60-120 mg/day,Caucasian,61.3,22.6,42.2,44.6,Eli Lilly,10,dsm-iv
79,"Rynn et al., 2008",327,placebo,Caucasian,62.3,23.5,41.0,31.4,Eli Lilly,10,dsm-iv


In [87]:
# Align headers with the truth sheet, then normalise text and numeric fields.
rynn_2008_df = (
    rynn_2008_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [88]:
rynn2008_results = testing(df_truth, rynn_2008_df)

In [89]:
rynn2008_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rynn et al., 2008",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Rynn et al., 2008",placebo,True,True,True,True,True,True,True,True,True


In [90]:
rynn2008_results_final = ensure_all_truth_interventions_present(rynn2008_results,df_truth)

In [92]:
rynn2008_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rynn et al., 2008",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Rynn et al., 2008",placebo,True,True,True,True,True,True,True,True,True


In [93]:
# Save the extracted rows and their per-field scores. The output folders are
# created first because to_csv does not create missing directories.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "rynn2008data.csv")
rynn_2008_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "rynn2008results.csv")
rynn2008_results_final.to_csv(file_path, index=False)


# Lennox 2003

In [94]:
file_path = "../Desktop/Testing/lennox2003.pdf"

# Pull the text layer and the tables out of the PDF
lennox_2003_text, lennox_2003_tables = extract_pdf_content(file_path)

# Send text, tables and page images to the model (same PDF path reused)
structured_data_lennox_2003 = query_gpt4_full(lennox_2003_text, lennox_2003_tables, file_path)

In [95]:
lennox_2003_df = json_parsing(structured_data_lennox_2003)

In [96]:
lennox_2003_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Lenox-Smith et al., 2003",244,Venlafaxine XL: 75-150 mg/day,,61.5,28,48,12.3,Wyeth Pharmaceuticals,24,DSM-IV
1,"Lenox-Smith et al., 2003",244,Placebo,,56.6,28,46,20.5,Wyeth Pharmaceuticals,24,DSM-IV


In [117]:
df_truth[df_truth["References"] == "Lenox-Smith et al., 2003"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Lenox-Smith et al., 2003",244,venlafaxine: 75-150 mg/day,,61.5,28,48,12.3,Wyeth,24,dsm-iv
1,"Lenox-Smith et al., 2003",244,placebo,,56.6,28,46,20.5,Wyeth,24,dsm-iv


In [111]:
# Align headers with the truth sheet, then normalise text and numeric fields.
lennox_2003_df = (
    lennox_2003_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [112]:
lennox_2003_df

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Lenox-Smith et al., 2003",244,venlafaxine: 75-150 mg/day,,61.5,28,48,12.3,Wyeth Pharmaceuticals,24,dsm-iv
1,"Lenox-Smith et al., 2003",244,placebo,,56.6,28,46,20.5,Wyeth Pharmaceuticals,24,dsm-iv


In [118]:
lennox2003_results = testing(df_truth, lennox_2003_df)

In [119]:
lennox2003_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Lenox-Smith et al., 2003",venlafaxine: 75-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Lenox-Smith et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [120]:
lennox2003_results_final = ensure_all_truth_interventions_present(lennox2003_results,df_truth)

In [121]:
lennox2003_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Lenox-Smith et al., 2003",venlafaxine: 75-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Lenox-Smith et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [122]:
# Save the extracted rows and their per-field scores. The output folders are
# created first because to_csv does not create missing directories.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "lennox2003data.csv")
lennox_2003_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "lennox2003results.csv")
lennox2003_results_final.to_csv(file_path, index=False)


# Kasper 2014

In [123]:
file_path = "../Desktop/Testing/kasper2014.pdf"

# Pull the text layer and the tables out of the PDF
kasper_2014_text, kasper_2014_tables = extract_pdf_content(file_path)

# Send text, tables and page images to the model (same PDF path reused)
structured_data_kasper_2014 = query_gpt4_full(kasper_2014_text, kasper_2014_tables, file_path)

In [124]:
kasper_2014_df = json_parsing(structured_data_kasper_2014)

In [125]:
kasper_2014_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Kasper et al., 2014",539,Silexan: 160 mg/day,Caucasian,73.6,26.0,47.1,17.9,Dr Willmar Schwabe GmbH & Co. KG,10,DSM-5
1,"Kasper et al., 2014",539,Silexan: 80 mg/day,Caucasian,70.4,25.8,45.7,11.9,Dr Willmar Schwabe GmbH & Co. KG,10,DSM-5
2,"Kasper et al., 2014",539,Paroxetine: 20 mg/day,Caucasian,77.3,25.8,45.8,21.2,Dr Willmar Schwabe GmbH & Co. KG,10,DSM-5
3,"Kasper et al., 2014",539,Placebo,Caucasian,73.3,25.1,44.6,13.2,Dr Willmar Schwabe GmbH & Co. KG,10,DSM-5


In [138]:
df_truth[df_truth["References"] == "Kasper et al., 2014"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
67,"Kasper et al., 2014",539,silexan: 80 mg/day,Caucasian,70.4,25.8,45.7,11.9,Schwabe,10,dsm-5
68,"Kasper et al., 2014",539,silexan: 160 mg/day,Caucasian,73.6,26.0,47.1,18.0,Schwabe,10,dsm-5
69,"Kasper et al., 2014",539,paroxetine: 20 mg/day,Caucasian,77.3,25.8,45.8,21.2,Schwabe,10,dsm-5
70,"Kasper et al., 2014",539,placebo,Caucasian,73.7,25.1,44.6,13.2,Schwabe,10,dsm-5


In [139]:
# Align headers with the truth sheet, then normalise text and numeric fields.
kasper_2014_df = (
    kasper_2014_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [140]:
kasper2014_results = testing(df_truth, kasper_2014_df)

In [141]:
kasper2014_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Kasper et al., 2014",silexan: 160 mg/day,True,True,True,True,True,True,True,True,True
1,"Kasper et al., 2014",silexan: 80 mg/day,True,True,True,True,True,True,True,True,True
2,"Kasper et al., 2014",placebo,True,True,True,True,True,True,True,True,True
3,"Kasper et al., 2014",paroxetine: 20 mg/day,True,True,True,True,True,True,True,True,True


In [142]:
kasper2014_results_final = ensure_all_truth_interventions_present(kasper2014_results,df_truth)

In [143]:
kasper2014_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Kasper et al., 2014",silexan: 160 mg/day,True,True,True,True,True,True,True,True,True
1,"Kasper et al., 2014",silexan: 80 mg/day,True,True,True,True,True,True,True,True,True
2,"Kasper et al., 2014",placebo,True,True,True,True,True,True,True,True,True
3,"Kasper et al., 2014",paroxetine: 20 mg/day,True,True,True,True,True,True,True,True,True


In [144]:
# Save the extracted rows and their per-field scores. The output folders are
# created first because to_csv does not create missing directories.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "kasper2014data.csv")
kasper_2014_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "kasper2014results.csv")
kasper2014_results_final.to_csv(file_path, index=False)


# Hartford 2007

In [145]:
file_path = "../Desktop/Testing/hartford2007.pdf"

# Pull the text layer and the tables out of the PDF
hartford_2007_text, hartford_2007_tables = extract_pdf_content(file_path)

# Send text, tables and page images to the model (same PDF path reused)
structured_data_hartford_2007 = query_gpt4_full(hartford_2007_text, hartford_2007_tables, file_path)

In [146]:
hartford_2007_df = json_parsing(structured_data_hartford_2007)

In [147]:
hartford_2007_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Hartford et al., 2007",487,Duloxetine: 60-120 mg/day,Caucasian,64.2,25.6,40.4,45.7,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV
1,"Hartford et al., 2007",487,Venlafaxine XR: 75-225 mg/day,Caucasian,62.2,24.9,40.1,37.8,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV
2,"Hartford et al., 2007",487,Placebo,Caucasian,61.5,25.0,41.9,38.5,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV


In [148]:
df_truth[df_truth["References"] == "Hartford et al., 2007"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
18,"Hartford et al., 2007",487,duloxetine: 60-120 mg/day,Caucasian,64.2,25.6,40.4,45.7,Eli Lilly,10,dsm-iv
19,"Hartford et al., 2007",487,venlafaxine: 75-225 mg/day,Caucasian,62.2,24.9,40.1,37.8,Eli Lilly,10,dsm-iv
20,"Hartford et al., 2007",487,placebo,Caucasian,61.5,25.0,41.9,38.5,Eli Lilly,10,dsm-iv


In [149]:
# Align headers with the truth sheet, then normalise text and numeric fields.
hartford_2007_df = (
    hartford_2007_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [150]:
hartford2007_results = testing(df_truth, hartford_2007_df)

In [151]:
hartford2007_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Hartford et al., 2007",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Hartford et al., 2007",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
2,"Hartford et al., 2007",placebo,True,True,True,True,True,True,True,True,True


In [152]:
hartford2007_results_final = ensure_all_truth_interventions_present(hartford2007_results,df_truth)

In [153]:
hartford2007_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Hartford et al., 2007",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Hartford et al., 2007",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
2,"Hartford et al., 2007",placebo,True,True,True,True,True,True,True,True,True


In [154]:
# Save the extracted rows and their per-field scores. The output folders are
# created first because to_csv does not create missing directories.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "hartford2007data.csv")
hartford_2007_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "hartford2007results.csv")
hartford2007_results_final.to_csv(file_path, index=False)


# Boyer 2004

In [155]:
file_path = "../Desktop/Testing/boyer2004.pdf"

# Pull the text layer and the tables out of the PDF
boyer_2004_text, boyer_2004_tables = extract_pdf_content(file_path)

# Send text, tables and page images to the model (same PDF path reused)
structured_data_boyer_2004 = query_gpt4_full(boyer_2004_text, boyer_2004_tables, file_path)