In [1]:
from cns_obsidian.utils import load_variable_json
import pandas as pd
import json

In [2]:
path = "/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/{}/full_journal_dataset_both_test.json"
nsgy_df = pd.read_json(path.format("Neurosurgery"))
opns_df = pd.read_json(path.format("Operative_Neurosurgery"))
nspr_df = pd.read_json(path.format("Neurosurgery_Practice"))
ns_df = pd.concat([nsgy_df, opns_df, nspr_df])

sans_questions = pd.read_csv("/gpfs/data/oermannlab/private_data/SANS_Questions/Full_set_SANS_Modules_Gen_Indications_ABNS_local.csv",
                            index_col=0)

In [3]:
gpt_df = ns_df[(ns_df["mode"] == "mc") & (ns_df["source"] == "gpt")]
claude_df = ns_df[(ns_df["mode"] == "mc") & (ns_df["source"] == "claude")]

In [4]:
def expand_row(row):
    question = json.loads(row['question'].replace("\n", ""))
    
    # Handle the case where 'b' is a list and needs to be unpacked
    if isinstance(question.get('answer_choices'), list):
        if len(question['answer_choices']) != 5: return None
        letters = ["A", "B", "C", "D", "E"]
        question.update({f'answer_choice_{letters[i]}': value for i, value in enumerate(question['answer_choices'])})
        del question['answer_choices']
    
    # Create a new DataFrame from the expanded dictionary
    expanded_df = pd.DataFrame([question])
    
    # Add other columns from the original DataFrame
    for col in ["image", "source"]:
        expanded_df[col] = row[col]
    
    return expanded_df

In [5]:
claude_exp = []
for index, row in claude_df.iterrows():
    claude_exp.append(expand_row(row))

claude_exp = pd.concat(claude_exp, ignore_index=True)

gpt_exp = []
for index, row in gpt_df.iterrows():
    gpt_exp.append(expand_row(row))
gpt_exp = pd.concat(gpt_exp, ignore_index=True)

In [6]:
renaming = {"Question Text": "question_stem",
            "Answer Choice1": "answer_choice_A",
            "Answer Choice2": "answer_choice_B",
            "Answer Choice3": "answer_choice_C",
            "Answer Choice4": "answer_choice_D",
            "Answer Choice5": "answer_choice_E",
            "Correct Answer": "correct_answer",
            "Discussion": "discussion",
            "Question Assets": "image"}

sans_questions = sans_questions[renaming.keys()]
sans_questions = sans_questions.rename(columns=renaming)


In [7]:
sans_exp = sans_questions[~((sans_questions["image"].isna()) | (sans_questions["image"].str.contains(',')))]
sans_exp["image"] = sans_exp["image"].apply(lambda x: x + ".jpg")
sans_exp["source"] = "sans"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sans_exp["image"] = sans_exp["image"].apply(lambda x: x + ".jpg")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sans_exp["source"] = "sans"


In [8]:
gpt_exp.tail();

In [9]:
def extend_image_path(row):
    if row['source'] == 'sans':
        new_path = "/gpfs/data/oermannlab/private_data/SANS_Questions/question_assets/" + row['image']
    elif row['image'].split('_')[3] == 'NSGY':
        new_path = "/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/Neurosurgery/images/" + row['image']
    elif row['image'].split('_')[3] == 'OPNSGY':
        new_path = "/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/Operative_Neurosurgery/images/" + row['image']
    elif row['image'].split('_')[3] == 'PRAC':
        new_path = "/gpfs/data/oermannlab/private_data/TheMedScrolls/FiguresJadenTextract/Neurosurgery_Practice/images/" + row['image']
    else:
        raise RuntimeError
    return new_path

In [10]:
gpt_path = gpt_exp.copy()
gpt_path['image'] = gpt_path.apply(extend_image_path, axis=1)
claude_path = claude_exp.copy()
claude_path['image'] = claude_path.apply(extend_image_path, axis=1)
sans_path = sans_exp.copy()
sans_path['image'] = sans_path.apply(extend_image_path, axis=1)

In [11]:
import random
import numpy as np
random.seed(314)
np.random.seed(314)
gpt_shuffled = gpt_path.sample(frac=1).reset_index(drop=True)[:200]
claude_shuffled = claude_path.sample(frac=1).reset_index(drop=True)[:200]
sans_shuffled = sans_path.sample(frac=1).reset_index(drop=True)[:200]
gpt_shuffled['fold'] = [i for i in range(20)] * 10
claude_shuffled['fold'] = [i for i in range(20)] * 10
sans_shuffled['fold'] = [i for i in range(20)] * 10

In [12]:
all_shuffled = pd.concat([gpt_shuffled, claude_shuffled, sans_shuffled])
all_shuffled['rand'] = np.random.randint(0, 10**10, 600)

In [13]:
final_df = all_shuffled.reset_index().sort_values(['fold', 'rand']).reset_index(drop=True).drop(columns=['index']).reset_index()
final_df['index'] = final_df['index'] * 314
final_df = final_df.drop(columns=['rand'])[:150]
final_df;

In [14]:
import shutil

for i, row in final_df.iterrows():
    src = row['image']
    dst = '/gpfs/data/oermannlab/users/alyaka01/important-obsidian-data/mcq_eval/images' + '0' * (6 - len(str(row['index']))) + str(row['index']) + '.jpg'
    shutil.copy(src, dst)

In [15]:
final_df.to_csv('/gpfs/data/oermannlab/users/alyaka01/important-obsidian-data/mcq_eval/questions.csv')