In [1]:
import os
import openai
import pandas as pd
import pdfplumber
import json
import numpy as np

import config

In [276]:
from openpyxl import load_workbook

# Load the ground-truth spreadsheet with openpyxl and read its active sheet.
file_path = './truth_excel/test_all_studies_final.xlsx'
workbook = load_workbook(file_path)
sheet = workbook.active

# Convert to DataFrame: the first item produced by `sheet.values` is consumed
# as the header row; the remaining items become the DataFrame rows.
data = sheet.values
columns = next(data)
df_truth = pd.DataFrame(data, columns=columns)

In [3]:
import openai
from openai import OpenAI
client = OpenAI()

In [4]:
def get_file_names(directory):
    """Return the names of the regular files located directly in a directory.

    Only entries for which ``is_file()`` is true are reported; sub-directories
    are neither listed nor descended into.

    Args:
        directory (str): The path to the directory to list.

    Returns:
        list: Bare file names (without the directory prefix), in the order
        ``os.scandir`` yields them.
    """
    return [item.name for item in os.scandir(directory) if item.is_file()]

# List the files in the study folder and print them as a sanity check
# of what is available to process.
directory_path = "../Desktop/Testing"
file_names = get_file_names(directory_path)
print(file_names)

['rynn2008.pdf', 'lennox2003.pdf', 'kasper2014.pdf', 'hartford2007.pdf', 'boyer2004.pdf', 'merideth2012.pdf', 'mahablesh2013.pdf', 'davidson2004.pdf', 'pollock2001.pdf', 'nicolini2009.pdf', 'allgulander2004.pdf', 'pollock2008a.pdf', 'wu2011.pdf', 'bose2008.pdf', 'rickels2003.pdf', 'rothschild2012.pdf', 'stein2008.pdf', 'khan2011.pdf', 'alaka2014.pdf', 'ball2015.pdf', 'nimatoudis2004.pdf']


# Pollock2008a was omitted due to heavy data discrepancies, meaning the author likely obtained these data by contacting the author of Pollock2008a.

# Nicolini 2009 was omitted because its patient characteristics are in a linked reference file, which GPT-4o could not access.

In [5]:
from pdf2image import convert_from_path
import pdfplumber
import pytesseract
import pandas as pd

# Function to extract text and tables
def extract_pdf_content(file_path):
    """Extract the text and every detected table from a PDF.

    Args:
        file_path (str): Path of the PDF, opened with ``pdfplumber``.

    Returns:
        tuple: ``(text, tables)``. ``text`` is the extracted text of every
        page, each page followed by a newline. ``tables`` is a list holding
        one ``pandas.DataFrame`` per table found, in page order.
    """
    page_texts = []
    tables = []
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # A page without extractable text (e.g. a scanned image) can give
            # None instead of a string; treat it as empty text rather than
            # failing on `None + "\n"`.
            page_texts.append((page.extract_text() or "") + "\n")

            # One DataFrame per table, built from its list-of-rows form.
            tables.extend(pd.DataFrame(table) for table in page.extract_tables())
    return "".join(page_texts), tables

In [6]:
from pdf2image import convert_from_path
import base64
from io import BytesIO
from openai import OpenAI

# Initialize OpenAI client
client = OpenAI()

def convert_pdf_to_base64_images(pdf_path, dpi=200, image_format='JPEG'):
    """Render each page of a PDF to an image and base64-encode it.

    Args:
        pdf_path (str): Path of the PDF handed to ``convert_from_path``.
        dpi (int): Resolution forwarded to ``convert_from_path``.
        image_format (str): Format name passed to each image's ``save``.

    Returns:
        list: One base64 string (UTF-8 decoded) per rendered page, in the
        order ``convert_from_path`` returns the pages.
    """
    def _encode(page_image):
        # Serialize the page into memory, then base64-encode the raw bytes.
        stream = BytesIO()
        page_image.save(stream, format=image_format)
        return base64.b64encode(stream.getvalue()).decode('utf-8')

    return [_encode(page_image) for page_image in convert_from_path(pdf_path, dpi=dpi)]

def query_gpt4_full(text, tables, pdf_path):
    """Ask the "gpt-4o" chat model to extract per-intervention trial characteristics.

    The request contains one text part (instructions plus the extracted text
    and tables) followed by one image part per PDF page, and is sent through
    the module-level ``client``.

    Args:
        text: Extracted report text; interpolated into the prompt.
        tables: Extracted tables; interpolated into the prompt through the
            f-string, i.e. in their ``str()`` form.
        pdf_path: Path of the PDF whose pages are rendered by
            ``convert_pdf_to_base64_images`` and attached as images.

    Returns:
        The object returned by ``client.chat.completions.create``.
    """
    # Convert all PDF pages to base64-encoded images
    base64_images = convert_pdf_to_base64_images(pdf_path)
    
    # Prompt layout: extracted text and tables, then a JSON example object
    # ({{ and }} are escaped braces in the f-string), then per-field rules.
    structured_prompt_text = (
        f"Clinical Trial Report Analysis:\n\n"
        f"Extracted Text:\n{text}\n\n"
        f"Extracted Tables:\n{tables}\n\n"
        f"This is a clinical trial report. For EACH intervention in the trial (including placebo), "
        f"please extract the following characteristics and format the response as valid JSON using this exact example structure:\n\n"
        f"Example format:\n"
        f"{{\n"
        f'    "Last Name of Main Author and Year": "Doe et al., 2021",\n'
        f'    "Full Population Sample Size": "451",\n'
        f'    "Intervention": "Duloxetine: 50 mg/day",\n'
        f'    "Main Race": "White",\n'
        f'    "Percent of Intervention Population that is Female (%)": "61.5",\n'
        f'    "Mean HAMA Score": "24.5",\n'
        f'    "Mean Population Age (Year)": "43.2",\n'
        f'    "Attrition Rate (%)": "30.2",\n'
        f'    "Full Sponsor Name": "ABC Pharmaceuticals",\n'
        f'    "Follow-up Time (Weeks)": "10",\n'
        f'    "Diagnostic Criteria": "DSM-IV"\n'
        f"}}\n\n"
        f"'Full Population Sample Size' should refer to the TOTAL population enrolled in the study and randomized, across all interventions and groups, not just the population size for the specific intervention.\n"
        f"'Intervention' should be in mg/day, not any other unit of measurement. If a single value is provided, extract it as a single value (e.g., “duloxetine: 75 mg/day”). If a range is specified, extract the full range in the format “lower value-upper value mg/day” (e.g., “duloxetine: 70-150 mg/day”). Do not combine separate interventions into a range. \n"
        f"'Follow-up Time' should refer to total length of the treatment period and any follow-ups, omitting washout periods.\n"
        f"'Mean HAMA' should be the mean HAMA score at the beginning of the study for the specific intervention.\n"
        f"'Attrition Rate' should be % of patients who failed to complete the treatment after assignment\n"
        f" Make sure each JSON object follows this format exactly. If a characteristic is only reported at the study-wide level and not linked to a specific intervention, input ‘NA’ for that intervention. Only include characteristics under specific interventions if the information explicitly ties them to that intervention. If you are unsure about an answer, input ‘NA’."
    )

    # Construct the message content as a list: first the text, then the images
    message_content = []
    message_content.append({
        "type": "text",
        "text": structured_prompt_text
    })

    # Add each page of the PDF as an image message.
    # The data URL is labelled image/jpeg, which matches the default
    # image_format ('JPEG') of convert_pdf_to_base64_images used above.
    for img_b64 in base64_images:
        message_content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{img_b64}"
            }
        })

    # Create the chat completion request (system role line + the multi-part user message)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You are a data extraction assistant extracting data from clinical trial reports."},
            {"role": "user", "content": message_content}
        ],
    )

    # The raw response is returned; callers read choices[0].message.content.
    return response

In [7]:
# def query_gpt4_text(text, tables, ocr_results):
#     structured_prompt = (
#         f"Clinical Trial Report Analysis:\n\n"
#         f"Extracted Text:\n{text}\n\n"
#         f"Extracted OCR Text from Images:\n{ocr_results}\n\n"
#         f"Extracted Tables:\n{tables}\n\n"
#         f"This is a clinical trial report. For EACH intervention in the trial (including placebo), please extract the following characteristics and format the response as valid JSON using this exact example structure:\n\n"
#         f"Example format:\n"
#         f"{{\n"
#         f'    "Last Name of Main Author and Year": "Doe et al., 2021",\n'
#         f'    "Full Population Sample Size": "451",\n'
#         f'    "Intervention": "Duloxetine: 50 mg/day",\n'
#         f'    "Main Race": "White",\n'
#         f'    "Percent of Intervention Population that is Female (%)": "61.5",\n'
#         f'    "Mean HAMA Score": "24.5",\n'
#         f'    "Mean Population Age (Year)": "43.2",\n'
#         f'    "Attrition Rate (%)": "30.2",\n'
#         f'    "Full Sponsor Name": "ABC Pharmaceuticals",\n'
#         f'    "Follow-up Time (Weeks)": "10",\n'
#         f'    "Diagnostic Criteria": "DSM-IV"\n'
#         f"}}\n\n"
#         f"'Full Population Sample Size' should refer to the TOTAL population enrolled in the study, across all interventions and groups, not just the population size for the specific intervention.\n"
#         f"'Intervention' should be in mg/day, not any other unit of measurement.\n"
#         f"'Follow-up Time' should refer to total length of the treatment period, omitting washout periods.\n"
#         f"'Mean HAMA' should be the mean HAMA score at the beginning of the study for the specific intervention.\n"
#         f"'Attrition Rate' should be % of patients who failed to complete the treatment after assignment\n"
#         f" Make sure each JSON object follows this format exactly. For any missing or unavailable data, input 'NA'. If you are unsure about an answer, input 'NA'."
#     )
#     response = completion = client.chat.completions.create(
#         model="gpt-4o",
#         messages=[{"role": "system", "content": "You are a data extraction assistant extracting data from clinical trial reports."},
#                   {"role": "user", "content": structured_prompt}],
#     )
#     return response

In [8]:
def rename_columns(df):
    """Rename the model-output columns to the names used in ``df_truth``.

    Columns not listed in the mapping are left untouched, and the input
    DataFrame is not modified (``DataFrame.rename`` returns a new frame).

    Both "Percent of Study Population that is Female (%)" and
    "Percent of Intervention Population that is Female (%)" map to
    "Female (%)"; if an input carries both, the result has two columns with
    that name.

    Args:
        df (pandas.DataFrame): Frame whose columns use the prompt's field names.

    Returns:
        pandas.DataFrame: A renamed copy of ``df``.
    """
    # Prompt field name -> truth-table column name.
    column_map = {
        "Last Name of Main Author and Year": "References",
        "Full Population Sample Size": "Sample size",
        "Main Race": "Main race",
        "Intervention": "Interventions",
        "Percent of Study Population that is Female (%)": "Female (%)",
        "Percent of Intervention Population that is Female (%)": "Female (%)",
        "Mean HAMA Score": "Mean HAMA",
        "Mean Population Age (Year)": "Mean age (Year)",
        "Attrition Rate (%)": "Attrition rate (%)",
        "Full Sponsor Name": "Sponsor",
        "Follow-up Time (Weeks)": "Follow-up time (weeks)",
        "Diagnostic Criteria": "Diagnosis criteria",
    }
    return df.rename(columns=column_map)

In [9]:
def preprocess_numerical_columns(df, numerical_cols):
    """Convert the listed columns to numbers while keeping 'NA' markers.

    Works on a copy, so the caller's DataFrame is left untouched (the same
    convention as ``preprocess_interventions_and_diag``).

    Args:
        df (pandas.DataFrame): Frame to convert.
        numerical_cols (list): Column names to convert; names that are not
            present in ``df`` are skipped.

    Returns:
        pandas.DataFrame: A copy of ``df`` in which every cell of the listed
        columns is either the original value (when it reads "na" after
        stripping and lower-casing) or the result of
        ``pd.to_numeric(..., errors="coerce")`` (NaN when not parseable).
    """
    def _to_number_or_na(value):
        # Keep the explicit 'NA' marker as-is so it is not turned into NaN.
        if str(value).strip().lower() == "na":
            return value
        return pd.to_numeric(value, errors="coerce")

    converted = df.copy()
    for col in numerical_cols:
        if col in converted.columns:
            converted[col] = converted[col].apply(_to_number_or_na)
    return converted

# Truth-table column names (i.e. after rename_columns) that are treated as
# numeric; this list is passed to preprocess_numerical_columns.
numerical_cols = ["Sample size", "Female (%)", "Mean HAMA", "Mean age (Year)", "Attrition rate (%)", "Follow-up time (weeks)"]


In [10]:
def preprocess_interventions_and_diag(df):
    """Normalize the 'Interventions' and 'Diagnosis criteria' text columns.

    Both columns are stripped, lower-cased, have en/em dashes replaced by a
    plain hyphen and non-breaking spaces replaced by regular spaces. The
    'Interventions' column additionally gets a space inserted after a colon
    that is directly followed by a digit, and the " sr:", " xr:", " xl:" and
    " er:" suffixes collapsed to ":".

    Returns a modified copy; the input DataFrame is left unchanged.
    """
    def _clean(column):
        # Cleaning steps shared by both text columns.
        column = column.str.strip().str.lower()
        column = column.str.replace("–", "-", regex=False)  # en dash -> hyphen
        column = column.str.replace("—", "-", regex=False)  # em dash -> hyphen
        return column.str.replace("\u00a0", " ")  # non-breaking space -> space

    normalized = df.copy()

    interventions = _clean(normalized["Interventions"])
    interventions = interventions.str.replace(r":(?=\d)", ": ", regex=True)
    # Drop the release-formulation suffix that directly precedes the colon.
    for release_suffix in (r" sr:", r" xr:", r" xl:", r" er:"):
        interventions = interventions.str.replace(release_suffix, ":", regex = True)
    normalized["Interventions"] = interventions

    normalized["Diagnosis criteria"] = _clean(normalized["Diagnosis criteria"])
    return normalized

In [11]:
# def testing(df_truth, df):
#     grouped_truth = df_truth.groupby("References")
#     grouped_pred = df.groupby("References")
    
#     results = []
    
#     for reference in grouped_truth.groups:
#         if reference in grouped_pred.groups:
#             # Get groups
#             truth_group = grouped_truth.get_group(reference)
#             pred_group = grouped_pred.get_group(reference)
    
#             # Normalize and find common interventions
#             truth_interventions = set(truth_group["Interventions"].tolist())
#             pred_interventions = set(pred_group["Interventions"].tolist())
#             common_interventions = truth_interventions & pred_interventions  # Only common interventions
    
#             for intervention in common_interventions:
#                 # Filter for the specific intervention
#                 truth_row = truth_group[truth_group["Interventions"] == intervention]
#                 pred_row = pred_group[pred_group["Interventions"] == intervention]
    
#                 if not truth_row.empty and not pred_row.empty:
#                     truth_row = truth_row.iloc[0]
#                     pred_row = pred_row.iloc[0]
    
#                     # Categorical columns
#                     categorical_cols = ["Main race"]
#                     categorical_match = {
#                         col: truth_row[col].strip().lower() == pred_row[col].strip().lower()
#                         for col in categorical_cols
#                     }
    
#                     # Special handling for "Diagnosis criteria"
#                     diagnosis_criteria_truth = truth_row["Diagnosis criteria"].strip().lower()
#                     diagnosis_criteria_pred = pred_row["Diagnosis criteria"].strip().lower()
#                     categorical_match["Diagnosis criteria"] = diagnosis_criteria_truth in diagnosis_criteria_pred

#                     # Special handling for "Sponsor"
#                     sponsor_truth = truth_row["Sponsor"].strip().lower()
#                     sponsor_pred = pred_row["Sponsor"].strip().lower()
#                     categorical_match["Sponsor"] = sponsor_truth in sponsor_pred

#                     # Numerical columns
#                     numerical_cols = ["Sample size", "Female (%)", "Mean HAMA", "Mean age (Year)", "Attrition rate (%)", "Follow-up time (weeks)"]
#                     numerical_match = {}
#                     for col in numerical_cols:
#                         truth_val = truth_row[col]
#                         pred_val = pred_row[col]
#                         truth_is_na = str(truth_val).strip().lower() == "na"
#                         pred_is_na = str(pred_val).strip().lower() == "na"

#                         # Handle different scenarios for NA
#                         if truth_is_na and pred_is_na:
#                             numerical_match[col] = True  # Both are NA, match is True
#                         elif truth_is_na or pred_is_na:
#                             numerical_match[col] = False  # Only one is NA, match is False
#                         else:
#                             # Perform numerical comparison
#                             numerical_match[col] = np.isclose(
#                                 float(truth_val), float(pred_val), atol=0.5, equal_nan=True
#                             )
    
#                     # Collect results
#                     results.append({
#                         "References": reference,
#                         "Interventions": intervention,
#                         **categorical_match,
#                         **numerical_match,
#                     })
#     results_df = pd.DataFrame(results)
#     return results_df

In [197]:
def testing(df_truth, df):
    grouped_truth = df_truth.groupby("References")
    grouped_pred = df.groupby("References")
    
    results = []
    
    for reference in grouped_truth.groups:
        if reference in grouped_pred.groups:
            # Get groups
            truth_group = grouped_truth.get_group(reference)
            pred_group = grouped_pred.get_group(reference)
    
            # Normalize and find common interventions
            truth_interventions = set(truth_group["Interventions"].tolist())
            pred_interventions = set(pred_group["Interventions"].tolist())
            common_interventions = truth_interventions & pred_interventions  # Only common interventions
    
            for intervention in common_interventions:
                # Filter for the specific intervention
                truth_row = truth_group[truth_group["Interventions"] == intervention]
                pred_row = pred_group[pred_group["Interventions"] == intervention]
    
                if not truth_row.empty and not pred_row.empty:
                    truth_row = truth_row.iloc[0]
                    pred_row = pred_row.iloc[0]
                    categorical_match = {}
                    # Categorical columns
                    # Special handling for "Main race"
                    main_race_truth = truth_row["Main race"].strip().lower()
                    main_race_pred = pred_row["Main race"].strip().lower()
                    categorical_match["Main race"] = main_race_truth in main_race_pred
    
                    # Special handling for "Diagnosis criteria"
                    diagnosis_criteria_truth = truth_row["Diagnosis criteria"].strip().lower()
                    diagnosis_criteria_pred = pred_row["Diagnosis criteria"].strip().lower()
                    categorical_match["Diagnosis criteria"] = diagnosis_criteria_truth in diagnosis_criteria_pred

                    # Special handling for "Sponsor"
                    sponsor_truth = truth_row["Sponsor"].strip().lower()
                    sponsor_pred = pred_row["Sponsor"].strip().lower()
                    categorical_match["Sponsor"] = sponsor_truth in sponsor_pred

                    # Numerical columns
                    numerical_cols = ["Sample size", "Female (%)", "Mean HAMA", "Mean age (Year)", "Attrition rate (%)", "Follow-up time (weeks)"]
                    numerical_match = {}
                    for col in numerical_cols:
                        truth_val = truth_row[col]
                        pred_val = pred_row[col]
                        truth_is_na = str(truth_val).strip().lower() == "na"
                        pred_is_na = str(pred_val).strip().lower() == "na"

                        # Handle different scenarios for NA
                        if truth_is_na and pred_is_na:
                            numerical_match[col] = True  # Both are NA, match is True
                        elif truth_is_na or pred_is_na:
                            numerical_match[col] = False  # Only one is NA, match is False
                        else:
                            # Perform numerical comparison
                            numerical_match[col] = np.isclose(
                                float(truth_val), float(pred_val), atol=0.5, equal_nan=True
                            )
    
                    # Collect results
                    results.append({
                        "References": reference,
                        "Interventions": intervention,
                        **categorical_match,
                        **numerical_match,
                    })
    results_df = pd.DataFrame(results)
    return results_df

In [12]:
def _fenced_blocks(text):
    """Return the contents of the ``` fenced blocks in text, or [text] if there is no fence."""
    parts = text.split("```")
    if len(parts) < 2:
        return [text]
    blocks = []
    # After splitting on the fence marker, odd-indexed parts lie inside fences.
    for chunk in parts[1::2]:
        body = chunk.lstrip()
        if body[:4].lower() == "json":  # drop the language tag of ```json
            body = body[4:]
        blocks.append(body)
    return blocks


def _decode_concatenated(block):
    """Decode one or more JSON values written back to back (optionally comma separated)."""
    decoder = json.JSONDecoder()
    records = []
    pos = 0
    end = len(block)
    while True:
        while pos < end and (block[pos].isspace() or block[pos] == ","):
            pos += 1
        if pos >= end:
            return records
        value, pos = decoder.raw_decode(block, pos)
        if isinstance(value, list):
            records.extend(value)
        else:
            records.append(value)


def json_parsing(output):
    """Turn the JSON in a chat-completion response into a DataFrame.

    Reads ``output.choices[0].message.content`` and accepts these layouts:
    a single JSON list, a single JSON object, several objects written one
    after another, and any of those inside one or more ``` fenced blocks
    (with or without surrounding prose). Objects are collected in order of
    appearance.

    Args:
        output: Response object exposing ``choices[0].message.content``.

    Returns:
        pandas.DataFrame: One row per decoded JSON object. If any block fails
        to decode, the error is printed and an empty DataFrame is returned.
    """
    text = output.choices[0].message.content or ""

    data = []
    try:
        for block in _fenced_blocks(text):
            data.extend(_decode_concatenated(block))
    except json.JSONDecodeError as e:
        # Best-effort: report the problem and fall back to an empty result.
        print(f"Error parsing JSON: {e}")
        data = []

    return pd.DataFrame(data)

In [13]:
def ensure_all_truth_interventions_present(results_df, df_truth):
    """
    Pads `results_df` so every truth intervention of its references has a row.

    `df_truth` is first normalized with `preprocess_interventions_and_diag`
    and restricted to the references found in `results_df`. For each of those
    references, every truth intervention without a row in `results_df` is
    appended with False in all columns other than "References" and
    "Interventions". When nothing is missing, `results_df` itself is returned.
    """
    truth = preprocess_interventions_and_diag(df_truth)
    truth = truth[truth["References"].isin(results_df["References"])]

    # Every column except the two identifiers is a metric to be set to False.
    metric_cols = [
        col for col in results_df.columns if col not in ["References", "Interventions"]
    ]

    results_by_ref = results_df.groupby("References")
    padding = []

    for reference, truth_rows in truth.groupby("References"):
        expected = set(truth_rows["Interventions"].tolist())

        if reference in results_by_ref.groups:
            found = set(results_by_ref.get_group(reference)["Interventions"].tolist())
        else:
            found = set()

        # One all-False row per truth intervention that was not scored.
        for intervention in expected - found:
            padding.append({
                "References": reference,
                "Interventions": intervention,
                **dict.fromkeys(metric_cols, False),
            })

    if not padding:
        return results_df
    return pd.concat([results_df, pd.DataFrame(padding)], ignore_index=True)

In [277]:
# Put the truth table through the same numeric conversion and text
# normalization that each model output goes through, so the two are comparable.
df_truth = preprocess_numerical_columns(df_truth, numerical_cols)
df_truth = preprocess_interventions_and_diag(df_truth)

# Rynn 2008

In [15]:
# Path of this study's PDF; used for both the text/table extraction and the
# page images rendered inside query_gpt4_full, so the two cannot drift apart.
file_path = "../Desktop/Testing/rynn2008.pdf"

# Extract content from PDF
rynn_2008_text, rynn_2008_tables = extract_pdf_content(file_path)

# Query GPT-4
structured_data_rynn_2008 = query_gpt4_full(rynn_2008_text, rynn_2008_tables, file_path)

In [16]:
text = structured_data_rynn_2008.choices[0].message.content

In [17]:
rynn_2008_df = json_parsing(structured_data_rynn_2008)

In [18]:
rynn_2008_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Rynn et al., 2008",327,Duloxetine: 60-120 mg/day,Caucasian,61.3,22.6,42.2,44.6,Eli Lilly and Company; Boehringer Ingelheim,10,DSM-IV
1,"Rynn et al., 2008",327,Placebo,Caucasian,62.3,23.5,41.0,31.4,Eli Lilly and Company; Boehringer Ingelheim,10,DSM-IV


In [19]:
df_truth[df_truth["References"] == "Rynn et al., 2008"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
78,"Rynn et al., 2008",327,duloxetine: 60-120 mg/day,Caucasian,61.3,22.6,42.2,44.6,Eli Lilly,10,dsm-iv
79,"Rynn et al., 2008",327,placebo,Caucasian,62.3,23.5,41.0,31.4,Eli Lilly,10,dsm-iv


In [20]:
# Align the model output with the truth table: rename the columns, normalize
# the intervention/diagnosis text, then convert the numeric columns.
rynn_2008_df = rename_columns(rynn_2008_df)
rynn_2008_df = preprocess_interventions_and_diag(rynn_2008_df)
rynn_2008_df = preprocess_numerical_columns(rynn_2008_df, numerical_cols)

In [21]:
rynn_2008_df

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Rynn et al., 2008",327,duloxetine: 60-120 mg/day,Caucasian,61.3,22.6,42.2,44.6,Eli Lilly and Company; Boehringer Ingelheim,10,dsm-iv
1,"Rynn et al., 2008",327,placebo,Caucasian,62.3,23.5,41.0,31.4,Eli Lilly and Company; Boehringer Ingelheim,10,dsm-iv


In [22]:
rynn2008_results = testing(df_truth, rynn_2008_df)

In [23]:
rynn2008_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rynn et al., 2008",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Rynn et al., 2008",placebo,True,True,True,True,True,True,True,True,True


In [24]:
rynn2008_results_final = ensure_all_truth_interventions_present(rynn2008_results,df_truth)

In [25]:
rynn2008_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rynn et al., 2008",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Rynn et al., 2008",placebo,True,True,True,True,True,True,True,True,True


In [26]:
# Save the normalized model extraction for this study.
# NOTE(review): assumes the "data_test" and "results_test" directories already exist — confirm.
file_path = os.path.join("data_test", "rynn2008data.csv")  
rynn_2008_df.to_csv(file_path, index=False)  

# Save the per-intervention match results (file_path is reassigned here).
file_path = os.path.join("results_test", "rynn2008results.csv")  
rynn2008_results_final.to_csv(file_path, index=False)  


# Lennox 2003

In [27]:
# Path of this study's PDF; used for both the text/table extraction and the
# page images rendered inside query_gpt4_full, so the two cannot drift apart.
file_path = "../Desktop/Testing/lennox2003.pdf"

# Extract content from PDF
lennox_2003_text, lennox_2003_tables = extract_pdf_content(file_path)

# Query GPT-4
structured_data_lennox_2003 = query_gpt4_full(lennox_2003_text, lennox_2003_tables, file_path)

In [28]:
lennox_2003_df = json_parsing(structured_data_lennox_2003)

In [29]:
lennox_2003_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Lenox-Smith et al., 2003",244,Venlafaxine XL: 75-150 mg/day,,61.5,28,48,12.3,Wyeth Pharmaceuticals,24,DSM-IV
1,"Lenox-Smith et al., 2003",244,Placebo,,56.6,28,46,20.5,Wyeth Pharmaceuticals,24,DSM-IV


In [30]:
df_truth[df_truth["References"] == "Lenox-Smith et al., 2003"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Lenox-Smith et al., 2003",244,venlafaxine: 75-150 mg/day,,61.5,28,48,12.3,Wyeth,24,dsm-iv
1,"Lenox-Smith et al., 2003",244,placebo,,56.6,28,46,20.5,Wyeth,24,dsm-iv


In [31]:
# Align the model output with the truth table: rename the columns, normalize
# the intervention/diagnosis text, then convert the numeric columns.
lennox_2003_df = rename_columns(lennox_2003_df)
lennox_2003_df = preprocess_interventions_and_diag(lennox_2003_df)
lennox_2003_df = preprocess_numerical_columns(lennox_2003_df, numerical_cols)

In [32]:
lennox_2003_df

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Lenox-Smith et al., 2003",244,venlafaxine: 75-150 mg/day,,61.5,28,48,12.3,Wyeth Pharmaceuticals,24,dsm-iv
1,"Lenox-Smith et al., 2003",244,placebo,,56.6,28,46,20.5,Wyeth Pharmaceuticals,24,dsm-iv


In [33]:
lennox2003_results = testing(df_truth, lennox_2003_df)

In [34]:
lennox2003_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Lenox-Smith et al., 2003",venlafaxine: 75-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Lenox-Smith et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [35]:
lennox2003_results_final = ensure_all_truth_interventions_present(lennox2003_results,df_truth)

In [36]:
lennox2003_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Lenox-Smith et al., 2003",venlafaxine: 75-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Lenox-Smith et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [37]:
# Save the normalized model extraction for this study.
# NOTE(review): assumes the "data_test" and "results_test" directories already exist — confirm.
file_path = os.path.join("data_test", "lennox2003data.csv")  
lennox_2003_df.to_csv(file_path, index=False)  

# Save the per-intervention match results (file_path is reassigned here).
file_path = os.path.join("results_test", "lennox2003results.csv")  
lennox2003_results_final.to_csv(file_path, index=False)  


# Kasper 2014

In [38]:
# Path of this study's PDF; used for both the text/table extraction and the
# page images rendered inside query_gpt4_full, so the two cannot drift apart.
file_path = "../Desktop/Testing/kasper2014.pdf"

# Extract content from PDF
kasper_2014_text, kasper_2014_tables = extract_pdf_content(file_path)

# Query GPT-4
structured_data_kasper_2014 = query_gpt4_full(kasper_2014_text, kasper_2014_tables, file_path)

In [39]:
kasper_2014_df = json_parsing(structured_data_kasper_2014)

In [40]:
kasper_2014_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Kasper et al., 2014",539,Silexan: 160 mg/day,Caucasian,73.6,26.0,47.1,18.0,Dr. Willmar Schwabe GmbH & Co. KG,10,DSM-IV-TR
1,"Kasper et al., 2014",539,Silexan: 80 mg/day,Caucasian,70.4,25.8,45.7,11.9,Dr. Willmar Schwabe GmbH & Co. KG,10,DSM-IV-TR
2,"Kasper et al., 2014",539,Paroxetine: 20 mg/day,Caucasian,77.3,25.8,45.8,21.2,Dr. Willmar Schwabe GmbH & Co. KG,10,DSM-IV-TR
3,"Kasper et al., 2014",539,Placebo,Caucasian,73.3,25.1,44.6,13.2,Dr. Willmar Schwabe GmbH & Co. KG,10,DSM-IV-TR


In [41]:
df_truth[df_truth["References"] == "Kasper et al., 2014"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
67,"Kasper et al., 2014",539,silexan: 80 mg/day,Caucasian,70.4,25.8,45.7,11.9,Schwabe,10,dsm-5
68,"Kasper et al., 2014",539,silexan: 160 mg/day,Caucasian,73.6,26.0,47.1,18.0,Schwabe,10,dsm-5
69,"Kasper et al., 2014",539,paroxetine: 20 mg/day,Caucasian,77.3,25.8,45.8,21.2,Schwabe,10,dsm-5
70,"Kasper et al., 2014",539,placebo,Caucasian,73.7,25.1,44.6,13.2,Schwabe,10,dsm-5


In [42]:
# Align the model output with the truth table: rename the columns, normalize
# the intervention/diagnosis text, then convert the numeric columns.
kasper_2014_df = rename_columns(kasper_2014_df)
kasper_2014_df = preprocess_interventions_and_diag(kasper_2014_df)
kasper_2014_df = preprocess_numerical_columns(kasper_2014_df, numerical_cols)

In [43]:
kasper2014_results = testing(df_truth, kasper_2014_df)

In [44]:
kasper2014_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Kasper et al., 2014",paroxetine: 20 mg/day,True,False,True,True,True,True,True,True,True
1,"Kasper et al., 2014",silexan: 80 mg/day,True,False,True,True,True,True,True,True,True
2,"Kasper et al., 2014",silexan: 160 mg/day,True,False,True,True,True,True,True,True,True
3,"Kasper et al., 2014",placebo,True,False,True,True,True,True,True,True,True


In [45]:
kasper2014_results_final = ensure_all_truth_interventions_present(kasper2014_results,df_truth)

In [46]:
kasper2014_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Kasper et al., 2014",paroxetine: 20 mg/day,True,False,True,True,True,True,True,True,True
1,"Kasper et al., 2014",silexan: 80 mg/day,True,False,True,True,True,True,True,True,True
2,"Kasper et al., 2014",silexan: 160 mg/day,True,False,True,True,True,True,True,True,True
3,"Kasper et al., 2014",placebo,True,False,True,True,True,True,True,True,True


In [47]:
# Save the normalized model extraction for this study.
# NOTE(review): assumes the "data_test" and "results_test" directories already exist — confirm.
file_path = os.path.join("data_test", "kasper2014data.csv")  
kasper_2014_df.to_csv(file_path, index=False)  

# Save the per-intervention match results (file_path is reassigned here).
file_path = os.path.join("results_test", "kasper2014results.csv")  
kasper2014_results_final.to_csv(file_path, index=False)  


# Hartford 2007

In [48]:
# Path of this study's PDF; used for both the text/table extraction and the
# page images rendered inside query_gpt4_full, so the two cannot drift apart.
file_path = "../Desktop/Testing/hartford2007.pdf"

# Extract content from PDF
hartford_2007_text, hartford_2007_tables = extract_pdf_content(file_path)

# Query GPT-4
structured_data_hartford_2007 = query_gpt4_full(hartford_2007_text, hartford_2007_tables, file_path)

In [49]:
hartford_2007_df = json_parsing(structured_data_hartford_2007)

In [50]:
hartford_2007_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Hartford et al., 2007",487,Duloxetine: 60-120 mg/day,Caucasian,64.2,25.6,40.4,45.7,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV
1,"Hartford et al., 2007",487,Venlafaxine XR: 75-225 mg/day,Caucasian,62.2,24.9,40.1,37.8,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV
2,"Hartford et al., 2007",487,Placebo,Caucasian,61.5,25.0,41.9,38.5,Eli Lilly and Company and Boehringer Ingelheim,10,DSM-IV


In [51]:
df_truth[df_truth["References"] == "Hartford et al., 2007"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
18,"Hartford et al., 2007",487,duloxetine: 60-120 mg/day,Caucasian,64.2,25.6,40.4,45.7,Eli Lilly,10,dsm-iv
19,"Hartford et al., 2007",487,venlafaxine: 75-225 mg/day,Caucasian,62.2,24.9,40.1,37.8,Eli Lilly,10,dsm-iv
20,"Hartford et al., 2007",487,placebo,Caucasian,61.5,25.0,41.9,38.5,Eli Lilly,10,dsm-iv


In [52]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
hartford_2007_df = (
    hartford_2007_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [53]:
hartford2007_results = testing(df_truth, hartford_2007_df)

In [54]:
hartford2007_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Hartford et al., 2007",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Hartford et al., 2007",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
2,"Hartford et al., 2007",placebo,True,True,True,True,True,True,True,True,True


In [55]:
hartford2007_results_final = ensure_all_truth_interventions_present(hartford2007_results,df_truth)

In [56]:
hartford2007_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Hartford et al., 2007",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Hartford et al., 2007",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
2,"Hartford et al., 2007",placebo,True,True,True,True,True,True,True,True,True


In [57]:
# Persist the preprocessed Hartford 2007 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "hartford2007data.csv")
hartford_2007_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "hartford2007results.csv")
hartford2007_results_final.to_csv(file_path, index=False)


# Boyer 2004

In [58]:
file_path = "../Desktop/Testing/boyer2004.pdf"

# Pull the page text and the tables out of the Boyer 2004 PDF.
boyer_2004_text, boyer_2004_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_boyer_2004 = query_gpt4_full(
    boyer_2004_text,
    boyer_2004_tables,
    file_path,
)

In [59]:
boyer_2004_df = json_parsing(structured_data_boyer_2004)

In [60]:
boyer_2004_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Boyer et al., 2004",544,Placebo,,42,26.7,46,,Wyeth Research,24,DSM-IV
1,"Boyer et al., 2004",544,Venlafaxine ER: 37.5 mg/day,,42,26.6,45,,Wyeth Research,24,DSM-IV
2,"Boyer et al., 2004",544,Venlafaxine ER: 75 mg/day,,39,26.3,44,,Wyeth Research,24,DSM-IV
3,"Boyer et al., 2004",544,Venlafaxine ER: 150 mg/day,,35,26.3,45,,Wyeth Research,24,DSM-IV


In [61]:
df_truth[df_truth["References"] == "Boyer et al., 2004"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
54,"Boyer et al., 2004",544,venlafaxine: 37.5 mg/day,,42,26.6,45,,Wyeth,24,dsm-iv
55,"Boyer et al., 2004",544,venlafaxine: 75 mg/day,,39,26.3,44,,Wyeth,24,dsm-iv
56,"Boyer et al., 2004",544,venlafaxine: 150 mg/day,,35,26.3,45,,Wyeth,24,dsm-iv
57,"Boyer et al., 2004",544,placebo,,42,26.7,46,,Wyeth,24,dsm-iv


In [62]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
boyer_2004_df = (
    boyer_2004_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [63]:
boyer2004_results = testing(df_truth, boyer_2004_df)

In [64]:
boyer2004_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Boyer et al., 2004",venlafaxine: 75 mg/day,True,True,True,True,True,True,True,True,True
1,"Boyer et al., 2004",venlafaxine: 37.5 mg/day,True,True,True,True,True,True,True,True,True
2,"Boyer et al., 2004",venlafaxine: 150 mg/day,True,True,True,True,True,True,True,True,True
3,"Boyer et al., 2004",placebo,True,True,True,True,True,True,True,True,True


In [65]:
boyer2004_results_final = ensure_all_truth_interventions_present(boyer2004_results,df_truth)

In [66]:
boyer2004_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Boyer et al., 2004",venlafaxine: 75 mg/day,True,True,True,True,True,True,True,True,True
1,"Boyer et al., 2004",venlafaxine: 37.5 mg/day,True,True,True,True,True,True,True,True,True
2,"Boyer et al., 2004",venlafaxine: 150 mg/day,True,True,True,True,True,True,True,True,True
3,"Boyer et al., 2004",placebo,True,True,True,True,True,True,True,True,True


In [67]:
# Persist the preprocessed Boyer 2004 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "boyer2004data.csv")
boyer_2004_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "boyer2004results.csv")
boyer2004_results_final.to_csv(file_path, index=False)

# Merideth 2012

In [68]:
file_path = "../Desktop/Testing/merideth2012.pdf"

# Pull the page text and the tables out of the Merideth 2012 PDF.
merideth_2012_text, merideth_2012_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_merideth_2012 = query_gpt4_full(
    merideth_2012_text,
    merideth_2012_tables,
    file_path,
)

In [69]:
merideth_2012_df = json_parsing(structured_data_merideth_2012)

In [70]:
merideth_2012_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Merideth et al., 2011",854,Quetiapine XR: 150 mg/day,White,68,25.0,38.2,28.8,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
1,"Merideth et al., 2011",854,Quetiapine XR: 300 mg/day,White,71,25.2,39.0,39.1,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
2,"Merideth et al., 2011",854,Escitalopram: 10 mg/day,White,66,24.6,40.4,27.7,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
3,"Merideth et al., 2011",854,Placebo,White,64,25.3,36.6,21.4,AstraZeneca Pharmaceuticals,10,DSM-IV-TR


In [71]:
df_truth[df_truth["References"] == "Merideth et al., 2012"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
28,"Merideth et al., 2012",854,quetiapine: 150 mg/day,White,68,25.0,38.2,28.8,AstraZeneca,10,dsm-iv
29,"Merideth et al., 2012",854,quetiapine: 300 mg/day,White,71,25.2,39.0,39.1,AstraZeneca,10,dsm-iv
30,"Merideth et al., 2012",854,escitalopram: 10 mg/day,White,66,24.6,40.4,27.7,AstraZeneca,10,dsm-iv
31,"Merideth et al., 2012",854,placebo,White,64,25.3,36.6,21.4,AstraZeneca,10,dsm-iv


In [72]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
merideth_2012_df = (
    merideth_2012_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [86]:
merideth2012_results = testing(df_truth, merideth_2012_df)

In [87]:
merideth2012_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Merideth et al., 2011",quetiapine: 150 mg/day,True,True,True,True,True,True,True,True,True
1,"Merideth et al., 2011",quetiapine: 300 mg/day,True,True,True,True,True,True,True,True,True
2,"Merideth et al., 2011",escitalopram: 10 mg/day,True,True,True,True,True,True,True,True,True
3,"Merideth et al., 2011",placebo,True,True,True,True,True,True,True,True,True


In [88]:
merideth2012_results_final = ensure_all_truth_interventions_present(merideth2012_results,df_truth)

In [89]:
merideth2012_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Merideth et al., 2011",quetiapine: 150 mg/day,True,True,True,True,True,True,True,True,True
1,"Merideth et al., 2011",quetiapine: 300 mg/day,True,True,True,True,True,True,True,True,True
2,"Merideth et al., 2011",escitalopram: 10 mg/day,True,True,True,True,True,True,True,True,True
3,"Merideth et al., 2011",placebo,True,True,True,True,True,True,True,True,True


In [90]:
# Persist the preprocessed Merideth 2012 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "merideth2012data.csv")
merideth_2012_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "merideth2012results.csv")
merideth2012_results_final.to_csv(file_path, index=False)

# Mahablesh 2013

In [91]:
file_path = "../Desktop/Testing/mahablesh2013.pdf"

# Pull the page text and the tables out of the Mahableshwarkar 2013 PDF.
mahablesh_2013_text, mahablesh_2013_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_mahablesh_2013 = query_gpt4_full(
    mahablesh_2013_text,
    mahablesh_2013_tables,
    file_path,
)

In [92]:
mahablesh_2013_df = json_parsing(structured_data_mahablesh_2013)

In [93]:
mahablesh_2013_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Mahableshwarkar et al., 2014",781,Placebo,White,65.0,24.4,36.8,23.1,"Takeda Pharmaceutical Company, Ltd",12,DSM-IV
1,"Mahableshwarkar et al., 2014",781,Vortioxetine: 2.5 mg/day,White,69.9,25.3,39.2,23.1,"Takeda Pharmaceutical Company, Ltd",12,DSM-IV
2,"Mahableshwarkar et al., 2014",781,Vortioxetine: 5 mg/day,White,64.1,25.0,37.7,28.8,"Takeda Pharmaceutical Company, Ltd",12,DSM-IV
3,"Mahableshwarkar et al., 2014",781,Vortioxetine: 10 mg/day,White,67.3,25.3,39.8,28.2,"Takeda Pharmaceutical Company, Ltd",12,DSM-IV
4,"Mahableshwarkar et al., 2014",781,Duloxetine: 60 mg/day,White,72.4,25.0,39.5,32.1,"Takeda Pharmaceutical Company, Ltd",12,DSM-IV


In [94]:
df_truth[df_truth["References"] == "Mahableshwarkar et al., 2013"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
2,"Mahableshwarkar et al., 2013",781,vortioxetine: 2.5 mg/day,White,69.9,25.3,39.2,23.1,Takeda,8,dsm-iv
3,"Mahableshwarkar et al., 2013",781,vortioxetine: 5 mg/day,White,64.1,25.0,37.7,25.0,Takeda,8,dsm-iv
4,"Mahableshwarkar et al., 2013",781,vortioxetine: 10 mg/day,White,67.3,25.3,39.8,28.8,Takeda,8,dsm-iv
5,"Mahableshwarkar et al., 2013",781,duloxetine: 60 mg/day,White,72.4,25.0,39.5,32.1,Takeda,8,dsm-iv
6,"Mahableshwarkar et al., 2013",781,placebo,White,65.0,24.4,36.8,22.9,Takeda,8,dsm-iv


In [95]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
mahablesh_2013_df = (
    mahablesh_2013_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [101]:
mahablesh2013_results = testing(df_truth, mahablesh_2013_df)

In [102]:
mahablesh2013_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Mahableshwarkar et al., 2014",vortioxetine: 10 mg/day,True,True,True,True,True,True,True,False,False
1,"Mahableshwarkar et al., 2014",vortioxetine: 2.5 mg/day,True,True,True,True,True,True,True,True,False
2,"Mahableshwarkar et al., 2014",vortioxetine: 5 mg/day,True,True,True,True,True,True,True,False,False
3,"Mahableshwarkar et al., 2014",placebo,True,True,True,True,True,True,True,True,False
4,"Mahableshwarkar et al., 2014",duloxetine: 60 mg/day,True,True,True,True,True,True,True,True,False


In [103]:
mahablesh2013_results_final = ensure_all_truth_interventions_present(mahablesh2013_results,df_truth)

In [104]:
mahablesh2013_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Mahableshwarkar et al., 2014",vortioxetine: 10 mg/day,True,True,True,True,True,True,True,False,False
1,"Mahableshwarkar et al., 2014",vortioxetine: 2.5 mg/day,True,True,True,True,True,True,True,True,False
2,"Mahableshwarkar et al., 2014",vortioxetine: 5 mg/day,True,True,True,True,True,True,True,False,False
3,"Mahableshwarkar et al., 2014",placebo,True,True,True,True,True,True,True,True,False
4,"Mahableshwarkar et al., 2014",duloxetine: 60 mg/day,True,True,True,True,True,True,True,True,False


In [105]:
# Persist the preprocessed Mahableshwarkar 2013 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "mahablesh2013data.csv")
mahablesh_2013_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "mahablesh2013results.csv")
mahablesh2013_results_final.to_csv(file_path, index=False)

# Davidson 2004

In [106]:
file_path = "../Desktop/Testing/davidson2004.pdf"

# Pull the page text and the tables out of the Davidson 2004 PDF.
davidson_2004_text, davidson_2004_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_davidson_2004 = query_gpt4_full(
    davidson_2004_text,
    davidson_2004_tables,
    file_path,
)

In [107]:
davidson_2004_df = json_parsing(structured_data_davidson_2004)

In [108]:
davidson_2004_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Davidson et al., 2004",315,Escitalopram: 10-20 mg/day,Caucasian,52.5,23.6,39.5,25,"Forest Laboratories, Inc.",8,DSM-IV
1,"Davidson et al., 2004",315,Placebo,Caucasian,52.9,23.2,39.5,22,"Forest Laboratories, Inc.",8,DSM-IV


In [109]:
df_truth[df_truth["References"] == "Davidson et al., 2004"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
36,"Davidson et al., 2004",315,escitalopram: 10-20 mg/day,Caucasian,52.5,23.6,39.5,25,Forest laboratories,8,dsm-iv
37,"Davidson et al., 2004",315,placebo,Caucasian,52.9,23.2,39.5,22,Forest laboratories,8,dsm-iv


In [110]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
davidson_2004_df = (
    davidson_2004_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [111]:
davidson2004_results = testing(df_truth, davidson_2004_df)

In [112]:
davidson2004_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Davidson et al., 2004",placebo,True,True,True,True,True,True,True,True,True
1,"Davidson et al., 2004",escitalopram: 10-20 mg/day,True,True,True,True,True,True,True,True,True


In [113]:
davidson2004_results_final = ensure_all_truth_interventions_present(davidson2004_results,df_truth)

In [114]:
davidson2004_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Davidson et al., 2004",placebo,True,True,True,True,True,True,True,True,True
1,"Davidson et al., 2004",escitalopram: 10-20 mg/day,True,True,True,True,True,True,True,True,True


In [115]:
# Persist the preprocessed Davidson 2004 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "davidson2004data.csv")
davidson_2004_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "davidson2004results.csv")
davidson2004_results_final.to_csv(file_path, index=False)

# Pollock 2001

In [116]:
file_path = "../Desktop/Testing/pollock2001.pdf"

# Pull the page text and the tables out of the pollock2001 PDF.
pollock_2001_text, pollock_2001_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_pollock_2001 = query_gpt4_full(
    pollock_2001_text,
    pollock_2001_tables,
    file_path,
)

In [117]:
pollock_2001_df = json_parsing(structured_data_pollock_2001)

In [118]:
pollock_2001_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Pollack et al., 2001",324,Paroxetine: 20-50 mg/day,White,60.9,24.2,39.7,21.1,GlaxoSmithKline,8,DSM-IV
1,"Pollack et al., 2001",324,Placebo,White,66.3,24.1,41.3,18.4,GlaxoSmithKline,8,DSM-IV


In [119]:
df_truth[df_truth["References"] == "Pollack et al., 2001"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
80,"Pollack et al., 2001",324,paroxetine: 20-50 mg/day,White,60.9,24.2,39.7,21.1,GlaxoSmithKline,8,dsm-iv
81,"Pollack et al., 2001",324,placebo,White,66.3,24.1,41.3,18.4,GlaxoSmithKline,8,dsm-iv


In [120]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
pollock_2001_df = (
    pollock_2001_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [121]:
pollock2001_results = testing(df_truth, pollock_2001_df)

In [122]:
pollock2001_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Pollack et al., 2001",paroxetine: 20-50 mg/day,True,True,True,True,True,True,True,True,True
1,"Pollack et al., 2001",placebo,True,True,True,True,True,True,True,True,True


In [123]:
pollock2001_results_final = ensure_all_truth_interventions_present(pollock2001_results,df_truth)

In [124]:
pollock2001_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Pollack et al., 2001",paroxetine: 20-50 mg/day,True,True,True,True,True,True,True,True,True
1,"Pollack et al., 2001",placebo,True,True,True,True,True,True,True,True,True


In [125]:
# Persist the preprocessed pollock2001 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "pollock2001data.csv")
pollock_2001_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "pollock2001results.csv")
pollock2001_results_final.to_csv(file_path, index=False)

# Nicolini 2009 was omitted because its patient characteristics are provided in a separate linked reference file, which GPT-4o could not access.

# Allgulander 2004

In [138]:
file_path = "../Desktop/Testing/allgulander2004.pdf"

# Pull the page text and the tables out of the Allgulander 2004 PDF.
allgulander_2004_text, allgulander_2004_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_allgulander_2004 = query_gpt4_full(
    allgulander_2004_text,
    allgulander_2004_tables,
    file_path,
)

In [139]:
allgulander_2004_df = json_parsing(structured_data_allgulander_2004)

In [140]:
allgulander_2004_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Allgulander et al., 2004",378,Sertraline: 50-150 mg/day,White,59,24.6,40.3,20,"Pfizer, Inc.",12,DSM-IV
1,"Allgulander et al., 2004",378,Placebo,White,51,25.0,42.4,27,"Pfizer, Inc.",12,DSM-IV


In [141]:
df_truth[df_truth["References"] == "Allgulander et al., 2004"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
34,"Allgulander et al., 2004",378,sertraline: 50-150 mg/day,White,59,24.6,40.3,20,Pfizer,12,dsm-iv
35,"Allgulander et al., 2004",378,placebo,White,51,25.0,42.4,27,Pfizer,12,dsm-iv


In [142]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
allgulander_2004_df = (
    allgulander_2004_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [143]:
allgulander2004_results = testing(df_truth, allgulander_2004_df)

In [144]:
allgulander2004_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Allgulander et al., 2004",sertraline: 50-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Allgulander et al., 2004",placebo,True,True,True,True,True,True,True,True,True


In [145]:
allgulander2004_results_final = ensure_all_truth_interventions_present(allgulander2004_results,df_truth)

In [146]:
allgulander2004_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Allgulander et al., 2004",sertraline: 50-150 mg/day,True,True,True,True,True,True,True,True,True
1,"Allgulander et al., 2004",placebo,True,True,True,True,True,True,True,True,True


In [147]:
# Persist the preprocessed Allgulander 2004 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "allgulander2004data.csv")
allgulander_2004_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "allgulander2004results.csv")
allgulander2004_results_final.to_csv(file_path, index=False)

# Wu 2011

In [148]:
file_path = "../Desktop/Testing/wu2011.pdf"

# Pull the page text and the tables out of the Wu 2011 PDF.
wu_2011_text, wu_2011_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_wu_2011 = query_gpt4_full(
    wu_2011_text,
    wu_2011_tables,
    file_path,
)

In [149]:
wu_2011_df = json_parsing(structured_data_wu_2011)

In [150]:
wu_2011_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Wu et al., 2011",210,Duloxetine: 60-120 mg/day,Chinese,46.3,24.5,37.3,24.1,Eli Lilly and Company,15,DSM-IV
1,"Wu et al., 2011",210,Placebo,Chinese,54.9,24.2,38.0,27.5,Eli Lilly and Company,15,DSM-IV


In [151]:
df_truth[df_truth["References"] == "Wu et al., 2011"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
73,"Wu et al., 2011",210,duloxetine: 60-120 mg/day,Chinese,46.3,24.5,37.3,24.1,Eli Lilly,15,dsm-iv
74,"Wu et al., 2011",210,placebo,Chinese,54.9,24.2,38.0,27.5,Eli Lilly,15,dsm-iv


In [152]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
wu_2011_df = (
    wu_2011_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [153]:
wu2011_results = testing(df_truth, wu_2011_df)

In [154]:
wu2011_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Wu et al., 2011",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Wu et al., 2011",placebo,True,True,True,True,True,True,True,True,True


In [155]:
wu2011_results_final = ensure_all_truth_interventions_present(wu2011_results,df_truth)

In [156]:
wu2011_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Wu et al., 2011",duloxetine: 60-120 mg/day,True,True,True,True,True,True,True,True,True
1,"Wu et al., 2011",placebo,True,True,True,True,True,True,True,True,True


In [157]:
# Persist the preprocessed Wu 2011 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "wu2011data.csv")
wu_2011_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "wu2011results.csv")
wu2011_results_final.to_csv(file_path, index=False)

# Bose 2008

In [158]:
file_path = "../Desktop/Testing/bose2008.pdf"

# Pull the page text and the tables out of the Bose 2008 PDF.
bose_2008_text, bose_2008_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_bose_2008 = query_gpt4_full(
    bose_2008_text,
    bose_2008_tables,
    file_path,
)

In [159]:
bose_2008_df = json_parsing(structured_data_bose_2008)

In [160]:
bose_2008_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Bose et al., 2008",404,Escitalopram: 10-20 mg/day,White,64.6,24.2,38.2,19.7,Forest Laboratories,8,DSM-IV
1,"Bose et al., 2008",404,Venlafaxine XR: 75-225 mg/day,White,59.7,23.8,37.1,25.6,Forest Laboratories,8,DSM-IV
2,"Bose et al., 2008",404,Placebo,White,62.5,23.7,37.6,23.5,Forest Laboratories,8,DSM-IV


In [168]:
df_truth[df_truth["References"] == "Bose et al., 2008"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
49,"Bose et al., 2008",404,escitalopram: 10-20 mg/day,White,64.6,24.2,38.2,19.7,Forest laboratories,8,dsm-iv
50,"Bose et al., 2008",404,venlafaxine: 75-225 mg/day,White,59.7,23.8,37.1,25.6,Forest laboratories,8,dsm-iv
51,"Bose et al., 2008",404,placebo,White,62.5,23.7,37.6,23.5,Forest laboratories,8,dsm-iv


In [169]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
bose_2008_df = (
    bose_2008_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [170]:
bose_2008_df

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
0,"Bose et al., 2008",404,escitalopram: 10-20 mg/day,White,64.6,24.2,38.2,19.7,Forest Laboratories,8,dsm-iv
1,"Bose et al., 2008",404,venlafaxine: 75-225 mg/day,White,59.7,23.8,37.1,25.6,Forest Laboratories,8,dsm-iv
2,"Bose et al., 2008",404,placebo,White,62.5,23.7,37.6,23.5,Forest Laboratories,8,dsm-iv


In [171]:
bose2008_results = testing(df_truth, bose_2008_df)

In [172]:
bose2008_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Bose et al., 2008",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Bose et al., 2008",placebo,True,True,True,True,True,True,True,True,True
2,"Bose et al., 2008",escitalopram: 10-20 mg/day,True,True,True,True,True,True,True,True,True


In [173]:
bose2008_results_final = ensure_all_truth_interventions_present(bose2008_results,df_truth)

In [174]:
bose2008_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Bose et al., 2008",venlafaxine: 75-225 mg/day,True,True,True,True,True,True,True,True,True
1,"Bose et al., 2008",placebo,True,True,True,True,True,True,True,True,True
2,"Bose et al., 2008",escitalopram: 10-20 mg/day,True,True,True,True,True,True,True,True,True


In [175]:
# Persist the preprocessed Bose 2008 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "bose2008data.csv")
bose_2008_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "bose2008results.csv")
bose2008_results_final.to_csv(file_path, index=False)

# Rickels 2003

In [176]:
file_path = "../Desktop/Testing/rickels2003.pdf"

# Pull the page text and the tables out of the Rickels 2003 PDF.
rickels_2003_text, rickels_2003_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_rickels_2003 = query_gpt4_full(
    rickels_2003_text,
    rickels_2003_tables,
    file_path,
)

In [177]:
rickels_2003_df = json_parsing(structured_data_rickels_2003)

In [178]:
rickels_2003_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Rickels et al., 2003",566,Paroxetine: 20 mg/day,White,54,24.1,40.2,23.9,GlaxoSmithKline,8,DSM-IV
1,"Rickels et al., 2003",566,Paroxetine: 40 mg/day,White,56,23.8,40.5,27.4,GlaxoSmithKline,8,DSM-IV
2,"Rickels et al., 2003",566,Placebo,White,56,24.4,40.8,22.2,GlaxoSmithKline,8,DSM-IV


In [179]:
df_truth[df_truth["References"] == "Rickels et al., 2003"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
42,"Rickels et al., 2003",566,paroxetine: 20 mg/day,White,54,24.1,40.2,23.9,GlaxoSmithKline,8,dsm-iv
43,"Rickels et al., 2003",566,paroxetine: 40 mg/day,White,56,23.8,40.5,27.4,GlaxoSmithKline,8,dsm-iv
44,"Rickels et al., 2003",566,placebo,White,56,24.4,40.8,22.2,GlaxoSmithKline,8,dsm-iv


In [180]:
# Apply the shared clean-up helpers in order (column renaming, intervention /
# diagnosis text, numeric columns); each step receives the previous step's result.
rickels_2003_df = (
    rickels_2003_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [181]:
rickels2003_results = testing(df_truth, rickels_2003_df)

In [182]:
rickels2003_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rickels et al., 2003",paroxetine: 20 mg/day,True,True,True,True,True,True,True,True,True
1,"Rickels et al., 2003",paroxetine: 40 mg/day,True,True,True,True,True,True,True,True,True
2,"Rickels et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [183]:
rickels2003_results_final = ensure_all_truth_interventions_present(rickels2003_results,df_truth)

In [184]:
rickels2003_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rickels et al., 2003",paroxetine: 20 mg/day,True,True,True,True,True,True,True,True,True
1,"Rickels et al., 2003",paroxetine: 40 mg/day,True,True,True,True,True,True,True,True,True
2,"Rickels et al., 2003",placebo,True,True,True,True,True,True,True,True,True


In [185]:
# Persist the preprocessed Rickels 2003 extraction and its comparison table as CSVs.
# The output folders are created first because DataFrame.to_csv raises OSError
# when the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "rickels2003data.csv")
rickels_2003_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "rickels2003results.csv")
rickels2003_results_final.to_csv(file_path, index=False)

# Rothschild 2012

In [186]:
file_path = "../Desktop/Testing/rothschild2012.pdf"

# Pull the page text and the tables out of the Rothschild 2012 PDF.
rothschild_2012_text, rothschild_2012_tables = extract_pdf_content(file_path)

# Send the extracted content, together with the path of the source PDF,
# to the GPT query helper.
structured_data_rothschild_2012 = query_gpt4_full(
    rothschild_2012_text,
    rothschild_2012_tables,
    file_path,
)

In [187]:
rothschild_2012_df = json_parsing(structured_data_rothschild_2012)

In [188]:
rothschild_2012_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Rothschild et al., 2012",304,Vortioxetine: 5 mg/day,"Caucasian (White, including Hispanic)",67.8,24.7,41.0,17.8,"Takeda Pharmaceutical Company, Ltd.",8,DSM-IV
1,"Rothschild et al., 2012",304,Placebo,"Caucasian (White, including Hispanic)",63.8,24.6,41.4,25.0,"Takeda Pharmaceutical Company, Ltd.",8,DSM-IV


In [189]:
df_truth[df_truth["References"] == "Rothschild et al., 2012"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
71,"Rothschild et al., 2012",304,vortioxetine: 5 mg/day,Caucasian,67.8,24.7,41.0,17.8,Takeda,8,dsm-iv
72,"Rothschild et al., 2012",304,placebo,Caucasian,63.8,24.6,41.4,25.0,Takeda,8,dsm-iv


In [190]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites rothschild_2012_df, so re-running this cell feeds
# already-processed data back through the helpers.
rothschild_2012_df = (
    rothschild_2012_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [198]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
rothschild2012_results = testing(df_truth, rothschild_2012_df)

In [199]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
rothschild2012_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rothschild et al., 2012",vortioxetine: 5 mg/day,True,True,True,True,True,True,True,True,True
1,"Rothschild et al., 2012",placebo,True,True,True,True,True,True,True,True,True


In [200]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
rothschild2012_results_final = ensure_all_truth_interventions_present(
    rothschild2012_results,
    df_truth,
)

In [201]:
# Show the final comparison table that the next cell writes to results_test/.
rothschild2012_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Rothschild et al., 2012",vortioxetine: 5 mg/day,True,True,True,True,True,True,True,True,True
1,"Rothschild et al., 2012",placebo,True,True,True,True,True,True,True,True,True


In [202]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "rothschild2012data.csv")
rothschild_2012_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "rothschild2012results.csv")
rothschild2012_results_final.to_csv(file_path, index=False)

# Stein 2008

In [210]:
# Single definition of this study's PDF location; it is reused below so the
# extraction step and the model query can never point at different files.
file_path = "../Desktop/Testing/stein2008.pdf"

# Extract text and tables from the PDF
stein_2008_text, stein_2008_tables = extract_pdf_content(file_path)

# Query GPT-4 with the extracted text and tables; the PDF path is the third argument
structured_data_stein_2008 = query_gpt4_full(
    stein_2008_text,
    stein_2008_tables,
    file_path,
)

In [211]:
# Turn the model's structured response into tabular form (displayed in the next cell).
stein_2008_df = json_parsing(structured_data_stein_2008)

In [212]:
# Inspect the parsed extraction (original column names, before preprocessing).
stein_2008_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Stein et al., 2008",121,Agomelatine: 25-50 mg/day,,68.3,29.0,41.7,7.4,Servier,12,DSM-IV
1,"Stein et al., 2008",121,Placebo,,68.9,28.6,41.7,6.9,Servier,12,DSM-IV


In [213]:
# Ground-truth rows for this study, for side-by-side comparison with the extraction above.
df_truth.loc[df_truth["References"] == "Stein et al., 2008"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
32,"Stein et al., 2008",121,agomelatine: 25-50 mg/day,,,29.0,,8.0,Servier,12,dsm-iv
33,"Stein et al., 2008",121,placebo,,,28.6,,6.9,Servier,12,dsm-iv


In [214]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites stein_2008_df, so re-running this cell feeds
# already-processed data back through the helpers.
stein_2008_df = (
    stein_2008_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [215]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
# NOTE(review): in the recorded run, Female (%) and Mean age (Year) come out False where the
# truth rows are empty but the extraction has values (68.3/68.9 and 41.7); attrition for
# agomelatine also differs (7.4 extracted vs 8.0 in truth). Confirm whether an empty truth
# value should count as a mismatch.
stein2008_results = testing(df_truth, stein_2008_df)

In [216]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
stein2008_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Stein et al., 2008",agomelatine: 25-50 mg/day,True,True,True,True,False,True,False,False,True
1,"Stein et al., 2008",placebo,True,True,True,True,False,True,False,True,True


In [217]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
stein2008_results_final = ensure_all_truth_interventions_present(
    stein2008_results,
    df_truth,
)

In [218]:
# Show the final comparison table that the next cell writes to results_test/.
stein2008_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Stein et al., 2008",agomelatine: 25-50 mg/day,True,True,True,True,False,True,False,False,True
1,"Stein et al., 2008",placebo,True,True,True,True,False,True,False,True,True


In [219]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "stein2008data.csv")
stein_2008_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "stein2008results.csv")
stein2008_results_final.to_csv(file_path, index=False)

# Khan 2011

In [220]:
# Single definition of this study's PDF location; it is reused below so the
# extraction step and the model query can never point at different files.
file_path = "../Desktop/Testing/khan2011.pdf"

# Extract text and tables from the PDF
khan_2011_text, khan_2011_tables = extract_pdf_content(file_path)

# Query GPT-4 with the extracted text and tables; the PDF path is the third argument
structured_data_khan_2011 = query_gpt4_full(
    khan_2011_text,
    khan_2011_tables,
    file_path,
)

In [221]:
# Turn the model's structured response into tabular form (displayed in the next cell).
khan_2011_df = json_parsing(structured_data_khan_2011)

In [222]:
# Inspect the parsed extraction (original column names, before preprocessing).
khan_2011_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Khan et al., 2011",951,Quetiapine XR: 50 mg/day,,,,,15.4,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
1,"Khan et al., 2011",951,Quetiapine XR: 150 mg/day,,,,,15.6,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
2,"Khan et al., 2011",951,Quetiapine XR: 300 mg/day,,,,,18.5,AstraZeneca Pharmaceuticals,10,DSM-IV-TR
3,"Khan et al., 2011",951,Placebo,,,,,13.9,AstraZeneca Pharmaceuticals,10,DSM-IV-TR


In [226]:
# Ground-truth rows for this study, for side-by-side comparison with the extraction above.
df_truth.loc[df_truth["References"] == "Khan et al., 2011"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
7,"Khan et al., 2011",951,quetiapine: 50 mg/day,,,,,,AstraZeneca,8,dsm-iv
8,"Khan et al., 2011",951,quetiapine: 150 mg/day,,,,,,AstraZeneca,8,dsm-iv
9,"Khan et al., 2011",951,quetiapine: 300 mg/day,,,,,,AstraZeneca,8,dsm-iv
10,"Khan et al., 2011",951,placebo,,,,,,AstraZeneca,8,dsm-iv


In [227]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites khan_2011_df, so re-running this cell feeds
# already-processed data back through the helpers.
khan_2011_df = (
    khan_2011_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [228]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
# NOTE(review): in the recorded run, Attrition rate (%) is False for every arm (truth is empty,
# extraction has 13.9-18.5) and Follow-up time (weeks) is False (10 extracted vs 8 in truth).
khan2011_results = testing(df_truth, khan_2011_df)

In [229]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
khan2011_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Khan et al., 2011",quetiapine: 150 mg/day,True,True,True,True,True,True,True,False,False
1,"Khan et al., 2011",quetiapine: 300 mg/day,True,True,True,True,True,True,True,False,False
2,"Khan et al., 2011",quetiapine: 50 mg/day,True,True,True,True,True,True,True,False,False
3,"Khan et al., 2011",placebo,True,True,True,True,True,True,True,False,False


In [230]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
khan2011_results_final = ensure_all_truth_interventions_present(
    khan2011_results,
    df_truth,
)

In [231]:
# Show the final comparison table that the next cell writes to results_test/.
khan2011_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Khan et al., 2011",quetiapine: 150 mg/day,True,True,True,True,True,True,True,False,False
1,"Khan et al., 2011",quetiapine: 300 mg/day,True,True,True,True,True,True,True,False,False
2,"Khan et al., 2011",quetiapine: 50 mg/day,True,True,True,True,True,True,True,False,False
3,"Khan et al., 2011",placebo,True,True,True,True,True,True,True,False,False


In [232]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "khan2011data.csv")
khan_2011_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "khan2011results.csv")
khan2011_results_final.to_csv(file_path, index=False)

# Alaka 2014

In [233]:
# Single definition of this study's PDF location; it is reused below so the
# extraction step and the model query can never point at different files.
file_path = "../Desktop/Testing/alaka2014.pdf"

# Extract text and tables from the PDF
alaka_2014_text, alaka_2014_tables = extract_pdf_content(file_path)

# Query GPT-4 with the extracted text and tables; the PDF path is the third argument
structured_data_alaka_2014 = query_gpt4_full(
    alaka_2014_text,
    alaka_2014_tables,
    file_path,
)

In [234]:
# Turn the model's structured response into tabular form (displayed in the next cell).
alaka_2014_df = json_parsing(structured_data_alaka_2014)

In [235]:
# Inspect the parsed extraction (original column names, before preprocessing).
alaka_2014_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Alaka et al., 2014",291,Duloxetine: 30-120 mg/day,Caucasian,75.5,24.6,71.4,24,Eli Lilly and Company,10,DSM-IV
1,"Alaka et al., 2014",291,Placebo,Caucasian,80.0,24.4,71.7,25,Eli Lilly and Company,10,DSM-IV


In [236]:
# Ground-truth rows for this study, for side-by-side comparison with the extraction above.
df_truth.loc[df_truth["References"] == "Alaka et al., 2014"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
24,"Alaka et al., 2014",291,duloxetine: 60-120 mg/day,Caucasian,75.5,24.6,71.4,25,Eli Lilly,10,dsm-v
25,"Alaka et al., 2014",291,placebo,Caucasian,80.0,24.4,71.7,24,Eli Lilly,10,dsm-v


In [252]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites alaka_2014_df, so re-running this cell feeds
# already-processed data back through the helpers.
alaka_2014_df = (
    alaka_2014_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [253]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
# NOTE(review): the recorded result is True in every column, yet the truth rows displayed
# earlier list "duloxetine: 60-120 mg/day", "dsm-v" and attrition 25/24, while the extraction
# has 30-120 mg/day, DSM-IV and attrition 24/25. The execution counter also jumps from 236 to
# 252 before this point, so df_truth or the extraction may have been changed in cells that are
# no longer in the notebook. Re-run from a fresh kernel and confirm the matching tolerance
# before trusting this row.
alaka2014_results = testing(df_truth, alaka_2014_df)

In [254]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
alaka2014_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Alaka et al., 2014",placebo,True,True,True,True,True,True,True,True,True
1,"Alaka et al., 2014",duloxetine: 30-120 mg/day,True,True,True,True,True,True,True,True,True


In [255]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
alaka2014_results_final = ensure_all_truth_interventions_present(
    alaka2014_results,
    df_truth,
)

In [256]:
# Show the final comparison table that the next cell writes to results_test/.
alaka2014_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Alaka et al., 2014",placebo,True,True,True,True,True,True,True,True,True
1,"Alaka et al., 2014",duloxetine: 30-120 mg/day,True,True,True,True,True,True,True,True,True


In [257]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "alaka2014data.csv")
alaka_2014_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "alaka2014results.csv")
alaka2014_results_final.to_csv(file_path, index=False)

# Ball 2015

In [258]:
# Single definition of this study's PDF location; it is reused below so the
# extraction step and the model query can never point at different files.
file_path = "../Desktop/Testing/ball2015.pdf"

# Extract text and tables from the PDF
ball_2015_text, ball_2015_tables = extract_pdf_content(file_path)

# Query GPT-4 with the extracted text and tables; the PDF path is the third argument
structured_data_ball_2015 = query_gpt4_full(
    ball_2015_text,
    ball_2015_tables,
    file_path,
)

In [259]:
# Turn the model's structured response into tabular form (displayed in the next cell).
ball_2015_df = json_parsing(structured_data_ball_2015)

In [260]:
# Inspect the parsed extraction (original column names, before preprocessing).
ball_2015_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Ball et al., 2015",291,Duloxetine: 30-120 mg/day,Caucasian,77.7,24.5,71.6,,Eli Lilly and Company,10,DSM-IV TR
1,"Ball et al., 2015",291,Placebo,Caucasian,77.7,24.5,71.6,,Eli Lilly and Company,10,DSM-IV TR


In [261]:
# Ground-truth rows for this study, for side-by-side comparison with the extraction above.
df_truth.loc[df_truth["References"] == "Ball et al., 2015"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
58,"Ball et al., 2015",291,duloxetine: 30-120 mg/day,Caucasian,,,,,Eli Lilly,10,dsm-iv
59,"Ball et al., 2015",291,placebo,Caucasian,,,,,Eli Lilly,10,dsm-iv


In [262]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites ball_2015_df, so re-running this cell feeds
# already-processed data back through the helpers.
ball_2015_df = (
    ball_2015_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [263]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
# NOTE(review): in the recorded run, Female (%), Mean HAMA and Mean age (Year) are False where
# the truth rows are empty. The extraction reports identical values (77.7, 24.5, 71.6) for both
# arms - possibly whole-sample figures rather than per-arm ones; verify against the paper.
ball2015_results = testing(df_truth, ball_2015_df)

In [264]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
ball2015_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Ball et al., 2015",placebo,True,True,True,True,False,False,False,True,True
1,"Ball et al., 2015",duloxetine: 30-120 mg/day,True,True,True,True,False,False,False,True,True


In [265]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
ball2015_results_final = ensure_all_truth_interventions_present(
    ball2015_results,
    df_truth,
)

In [266]:
# Show the final comparison table that the next cell writes to results_test/.
ball2015_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Ball et al., 2015",placebo,True,True,True,True,False,False,False,True,True
1,"Ball et al., 2015",duloxetine: 30-120 mg/day,True,True,True,True,False,False,False,True,True


In [267]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "ball2015data.csv")
ball_2015_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "ball2015results.csv")
ball2015_results_final.to_csv(file_path, index=False)

# Nimatoudis 2004

In [268]:
# Single definition of this study's PDF location; it is reused below so the
# extraction step and the model query can never point at different files.
file_path = "../Desktop/Testing/nimatoudis2004.pdf"

# Extract text and tables from the PDF
nimatoudis_2004_text, nimatoudis_2004_tables = extract_pdf_content(file_path)

# Query GPT-4 with the extracted text and tables; the PDF path is the third argument
structured_data_nimatoudis_2004 = query_gpt4_full(
    nimatoudis_2004_text,
    nimatoudis_2004_tables,
    file_path,
)

In [269]:
# Turn the model's structured response into tabular form (displayed in the next cell).
nimatoudis_2004_df = json_parsing(structured_data_nimatoudis_2004)

In [270]:
# Inspect the parsed extraction (original column names, before preprocessing).
nimatoudis_2004_df

Unnamed: 0,Last Name of Main Author and Year,Full Population Sample Size,Intervention,Main Race,Percent of Intervention Population that is Female (%),Mean HAMA Score,Mean Population Age (Year),Attrition Rate (%),Full Sponsor Name,Follow-up Time (Weeks),Diagnostic Criteria
0,"Nimatoudis et al., 2004",46,Venlafaxine XR: 75-150 mg/day,,66.7,27.1,41,20.8,Wyeth Hellas,8,DSM-IV
1,"Nimatoudis et al., 2004",46,Placebo,,68.2,28.5,44,50.0,Wyeth Hellas,8,DSM-IV


In [271]:
# Ground-truth rows for this study, for side-by-side comparison with the extraction above.
df_truth.loc[df_truth["References"] == "Nimatoudis et al., 2004"]

Unnamed: 0,References,Sample size,Interventions,Main race,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Sponsor,Follow-up time (weeks),Diagnosis criteria
52,"Nimatoudis et al., 2004",46,venlafaxine: 75 mg/day,,66.7,27.1,41,21,,8,dsm-iv
53,"Nimatoudis et al., 2004",46,placebo,,68.2,28.5,44,50,,8,dsm-iv


In [278]:
# Run the three preprocessing helpers in their original order.
# NOTE: the result overwrites nimatoudis_2004_df, so re-running this cell feeds
# already-processed data back through the helpers.
nimatoudis_2004_df = (
    nimatoudis_2004_df
    .pipe(rename_columns)
    .pipe(preprocess_interventions_and_diag)
    .pipe(preprocess_numerical_columns, numerical_cols)
)

In [279]:
# Compare the preprocessed extraction with df_truth; the resulting table is shown in the next cell.
# NOTE(review): in the recorded run only Sponsor is False (truth is empty, extraction has
# "Wyeth Hellas"). The result row is labelled "venlafaxine: 75-150 mg/day" while the truth row
# displayed earlier reads "venlafaxine: 75 mg/day" - confirm how interventions are matched.
nimatoudis2004_results = testing(df_truth, nimatoudis_2004_df)

In [280]:
# Comparison flags returned by testing(): one row per intervention, one boolean per compared column.
nimatoudis2004_results

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Nimatoudis et al., 2004",venlafaxine: 75-150 mg/day,True,True,False,True,True,True,True,True,True
1,"Nimatoudis et al., 2004",placebo,True,True,False,True,True,True,True,True,True


In [281]:
# Build the final comparison table from the testing() result and df_truth.
# (Presumably adds rows for truth interventions missing from the extraction -
# confirm against the helper's definition.)
nimatoudis2004_results_final = ensure_all_truth_interventions_present(
    nimatoudis2004_results,
    df_truth,
)

In [282]:
# Show the final comparison table that the next cell writes to results_test/.
nimatoudis2004_results_final

Unnamed: 0,References,Interventions,Main race,Diagnosis criteria,Sponsor,Sample size,Female (%),Mean HAMA,Mean age (Year),Attrition rate (%),Follow-up time (weeks)
0,"Nimatoudis et al., 2004",venlafaxine: 75-150 mg/day,True,True,False,True,True,True,True,True,True
1,"Nimatoudis et al., 2004",placebo,True,True,False,True,True,True,True,True,True


In [283]:
# Persist this study's processed extraction and its comparison table as CSV.
# DataFrame.to_csv does not create missing parent folders, so make sure each
# output directory exists before writing into it.
os.makedirs("data_test", exist_ok=True)
file_path = os.path.join("data_test", "nimatoudis2004data.csv")
nimatoudis_2004_df.to_csv(file_path, index=False)

os.makedirs("results_test", exist_ok=True)
file_path = os.path.join("results_test", "nimatoudis2004results.csv")
nimatoudis2004_results_final.to_csv(file_path, index=False)