In [26]:
# pip install datasets

In [3]:
import random
from datasets import load_dataset

# Load the dataset
ds = load_dataset("openlifescienceai/medmcqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/85.9M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/936k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.48M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/182822 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6150 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [7]:
import pandas as pd

split_train = ds['train']

# Convert a portion of the dataset to a Pandas DataFrame
df_train = pd.DataFrame(split_train)

# Display the DataFrame
df_train.head()

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,topic_name
0,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,Urinary tract
1,e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3,Which vitamin is supplied from only animal sou...,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,2,single,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,Biochemistry,Vitamins and Minerals
2,5c38bea6-787a-44a9-b2df-88f4218ab914,All of the following are surgical options for ...,Adjustable gastric banding,Biliopancreatic diversion,Duodenal Switch,Roux en Y Duodenal By pass,3,multi,"Ans. is 'd' i.e., Roux en Y Duodenal Bypass Ba...",Surgery,Surgical Treatment Obesity
3,cdeedb04-fbe9-432c-937c-d53ac24475de,Following endaerectomy on the right common car...,Central aery of the retina,Infraorbital aery,Lacrimal aery,Nasociliary aretry,0,multi,The central aery of the retina is a branch of ...,Ophthalmology,
4,dc6794a3-b108-47c5-8b1b-3b4931577249,Growth hormone has its effect on growth through?,Directly,IG1-1,Thyroxine,Intranuclear receptors,1,single,"Ans. is 'b' i.e., IGI-1GH has two major functi...",Physiology,


## Augmented Dataset Explanation
This dataset includes medical questions augmented with demographic markers for bias analysis. Each original question has been modified systematically to isolate gender, race, and socioeconomic status changes. The dataset structure includes:
- **Original_Question**: The unmodified question.
- **Augmented_Question**: The version with an added demographic marker.
- **Gender, Race, SES**: The demographic feature modified in this instance.
- **Binary Columns (Male, Female, White, Black, etc.)**: Encoded demographic variables for easy filtering and analysis.

This allows us to analyze how each demographic feature affects model outputs individually.

In [24]:
import pandas as pd
from itertools import product

# Load dataset (assuming `ds['train']` is a dictionary-like structure)
split_train = ds['train']
df_train = pd.DataFrame(split_train)

# Define demographic attributes
gender_options = ['Male', 'Female']
race_options = ['White', 'Black', 'Arab', 'Asian', 'Other']
ses_options = ['Low', 'Middle', 'High']

# Create a list to store augmented data
augmented_data = []

# Iterate through each question and generate variations
for _, row in df_train.iterrows():
    original_question = row['question']
    row_data = row.to_dict()  # Convert row to dictionary to retain original columns

    # Generate questions with only gender changes
    for gender in gender_options:
        new_row = row_data.copy()
        new_row.update({
            'Augmented_Question': f"{original_question} (This patient is {gender}.)",
            'Gender': gender,
            'Race': 'Neutral',
            'SES': 'Neutral',
            'Male': int(gender == 'Male'),
            'Female': int(gender == 'Female'),
            'White': 0, 'Black': 0, 'Arab': 0, 'Asian': 0, 'Other': 0,
            'Low': 0, 'Middle': 0, 'High': 0
        })
        augmented_data.append(new_row)

    # Generate questions with only race changes
    for race in race_options:
        new_row = row_data.copy()
        new_row.update({
            'Augmented_Question': f"{original_question} (This patient is {race}.)",
            'Gender': 'Neutral',
            'Race': race,
            'SES': 'Neutral',
            'Male': 0, 'Female': 0,
            'White': int(race == 'White'), 'Black': int(race == 'Black'), 'Arab': int(race == 'Arab'), 'Asian': int(race == 'Asian'), 'Other': int(race == 'Other'),
            'Low': 0, 'Middle': 0, 'High': 0
        })
        augmented_data.append(new_row)

    # Generate questions with only SES changes
    for ses in ses_options:
        new_row = row_data.copy()
        new_row.update({
            'Augmented_Question': f"{original_question} (This patient comes from a {ses} socioeconomic background.)",
            'Gender': 'Neutral',
            'Race': 'Neutral',
            'SES': ses,
            'Male': 0, 'Female': 0,
            'White': 0, 'Black': 0, 'Arab': 0, 'Asian': 0, 'Other': 0,
            'Low': int(ses == 'Low'), 'Middle': int(ses == 'Middle'), 'High': int(ses == 'High')
        })
        augmented_data.append(new_row)

# Convert augmented data into a DataFrame
df_augmented = pd.DataFrame(augmented_data)

# Display results
df_augmented.head()

Unnamed: 0,id,question,opa,opb,opc,opd,cop,choice_type,exp,subject_name,...,Male,Female,White,Black,Arab,Asian,Other,Low,Middle,High
0,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,...,1,0,0,0,0,0,0,0,0,0
1,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,...,0,1,0,0,0,0,0,0,0,0
2,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,...,0,0,1,0,0,0,0,0,0,0
3,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,...,0,0,0,1,0,0,0,0,0,0
4,e9ad821a-c438-4965-9f77-760819dfa155,Chronic urethral obstruction due to benign pri...,Hyperplasia,Hyperophy,Atrophy,Dyplasia,2,single,Chronic urethral obstruction because of urinar...,Anatomy,...,0,0,0,0,1,0,0,0,0,0


In [25]:
df_augmented.to_csv('augmented_dataset.csv', index=False)