In [1]:
# PHASE ONE - LOCAL TESTING / DRY RUN
# Author: Arturo Valle
# Goal: 
# Load the CuraiHealth dataset and confirm that Ollama is installed and responding

In [2]:
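# Dependencies: uncomment on a fresh environment (%pip installs into the active kernel's environment)
# %pip install -q datasets pandas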

In [3]:
# Imports
from datasets import load_dataset   # load datasets directly from Hugging Face
import pandas as pd                 # work with tables


In [4]:
# Load dataset
# Fetch the CuraiHealth medical question pairs dataset from the Hugging Face Hub.
# The result is kept in `ds`, which later cells read the "train" split from.
ds = load_dataset(path="curaihealth/medical_questions_pairs")

# Leave `ds` as the cell's last expression so its summary is rendered
ds


DatasetDict({
    train: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 3048
    })
})

In [5]:
# Convert the "train" split of the Hugging Face dataset to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; it replaces
# pd.DataFrame(ds["train"]), which builds the frame by iterating the split
# as Python row dicts.
df = ds["train"].to_pandas()

# Show the first 5 rows to confirm the conversion worked
df.head()


Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [6]:
# Inspect data
# Overall shape as (rows, columns)
print("Shape:", df.shape)

# Column dtypes and non-null counts
df.info()

# Count missing values per column
print("\nMissing values per column:")
print(df.isna().sum())

# Spot-check 5 sampled rows. The fixed random_state makes the sample
# identical on every Restart & Run All instead of changing per execution.
print("\nSample rows:")
display(df.sample(5, random_state=42))


Shape: (3048, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   dr_id       3048 non-null   int64 
 1   question_1  3048 non-null   object
 2   question_2  3048 non-null   object
 3   label       3048 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 95.4+ KB

Missing values per column:
dr_id         0
question_1    0
question_2    0
label         0
dtype: int64

Sample rows:


Unnamed: 0,dr_id,question_1,question_2,label
762,3,"Posterior vitreous detachments eye-p.v.d., wha...",How posterior viterous detachment of eye is tr...,0
2914,11,My first sonogram said I was 7 weeks pregnant ...,Hello doctor I had sex on 14th june with ex a...,0
2081,8,Is it possible to be suffering allergic reacti...,I got a rash on my belly. I am allergic to dog...,0
1746,7,I didn't get my period for a month and now hav...,I have PCOS and my cycles are irregular. I mis...,0
2382,9,My daughter was diagnosed w a murmur@her4month...,My 4 month old daughter has been diagnsoed wit...,1


In [7]:
# Rename the two question columns (old name -> new name)
renamed_columns = {
    "question_1": "question_type_1",
    "question_2": "question_type_2",
}
df = df.rename(columns=renamed_columns)

# Preview the frame to confirm the new column names
df.head()


Unnamed: 0,dr_id,question_type_1,question_type_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [8]:
# Quick stats
# Number of distinct values in the dr_id column
n_doctors = df["dr_id"].nunique()
print("Unique doctors in dataset:", n_doctors)

# Row count for each label value
label_counts = df["label"].value_counts()
print("\nLabel counts:")
print(label_counts)


Unique doctors in dataset: 11

Label counts:
label
1    1524
0    1524
Name: count, dtype: int64


In [1]:
# Ollama Test
# This approach had to be abandoned because my computers could not run Ollama well.
