In [1]:
# PHASE ONE - LOCAL TESTING / DRY RUN
# Author: Arturo Valle
# Goal:
# Load the CuraiHealth medical question pairs dataset, inspect it, and confirm
# that Ollama is installed and responding.

In [None]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install -q datasets pandas

In [3]:
# Imports
from datasets import load_dataset   # load datasets directly from Hugging Face
import pandas as pd                 # work with tables


In [4]:
# Load dataset
# Hugging Face repository that hosts the CuraiHealth medical question pairs
DATASET_REPO = "curaihealth/medical_questions_pairs"

# Download the dataset (all available splits) from the Hub
ds = load_dataset(DATASET_REPO)

# Leave the dataset as the last expression so the notebook renders its summary
ds


README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/314k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3048 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['dr_id', 'question_1', 'question_2', 'label'],
        num_rows: 3048
    })
})

In [5]:
# Convert the "train" split of the Hugging Face dataset to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; it avoids having pandas
# iterate over the dataset one row at a time, as pd.DataFrame(ds["train"]) does.
df = ds["train"].to_pandas()

# Fail early if the dataset no longer has the columns the rest of the notebook uses
expected_columns = {"dr_id", "question_1", "question_2", "label"}
assert expected_columns <= set(df.columns), "missing expected columns"

# Show the first 5 rows to confirm
df.head()


Unnamed: 0,dr_id,question_1,question_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [None]:
# Inspect data
# Fixed seed so the sampled preview below is identical on every run
SAMPLE_SEED = 42

# Check overall shape (rows, columns)
print("Shape:", df.shape)

# Get info about datatypes and nulls
df.info()

# Count missing values per column
print("\nMissing values per column:")
print(df.isna().sum())

# Randomly sample up to 5 rows to inspect data.
# The size is capped at len(df) because sample() raises when asked for more
# rows than the frame contains.
print("\nSample rows:")
display(df.sample(n=min(5, len(df)), random_state=SAMPLE_SEED))


Shape: (3048, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3048 entries, 0 to 3047
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   dr_id       3048 non-null   int64 
 1   question_1  3048 non-null   object
 2   question_2  3048 non-null   object
 3   label       3048 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 95.4+ KB

Missing values per column:
dr_id         0
question_1    0
question_2    0
label         0
dtype: int64

Sample rows:


Unnamed: 0,dr_id,question_1,question_2,label
2178,8,What types of exercises are recommended for re...,Do you have any ideas about exercises that can...,1
2189,8,Will bodybuiling at sixteen and a half slow my...,I am sixteen and half year old and my height i...,0
1622,6,What is the use of the drug lovenox (enoxaparin)?,How is Lovenox used?,1
232,2,How can you heal chronic knee bursitis?,How is chronic knee bursitis diagnosed? Is MRI...,0
2603,10,Interested in yoga but don't know which one to...,"Yoga is helping me with relaxation, could you ...",0


In [7]:
# Rename columns
# Old name -> new name for the two question columns
question_column_names = {
    "question_1": "question_type_1",
    "question_2": "question_type_2",
}
df = df.rename(columns=question_column_names)

# Preview the frame to confirm the new column names
df.head()


Unnamed: 0,dr_id,question_type_1,question_type_2,label
0,1,After how many hour from drinking an antibioti...,I have a party tonight and I took my last dose...,1
1,1,After how many hour from drinking an antibioti...,I vomited this morning and I am not sure if it...,0
2,1,Am I over weight (192.9) for my age (39)?,I am a 39 y/o male currently weighing about 19...,1
3,1,Am I over weight (192.9) for my age (39)?,What diet is good for losing weight? Keto or v...,0
4,1,Aspirin allergy - is it worth getting a bracelet?,How much Aspirin can I take for my headache wi...,0


In [8]:
# Quick stats
# Number of distinct values in the dr_id column
doctor_count = df["dr_id"].nunique()
print("Unique doctors in dataset:", doctor_count)

# Number of rows for each value of the label column
label_counts = df["label"].value_counts()
print("\nLabel counts:")
print(label_counts)


Unique doctors in dataset: 11

Label counts:
label
1    1524
0    1524
Name: count, dtype: int64


In [None]:
# Ollama Test
# This checks if Ollama is installed and can respond to a simple question.
# It does NOT save anything yet — it's just a dry run, be patient!

import subprocess

# Model to query (can change 'deepseek' to 'gemma' just to test the different models)
OLLAMA_MODEL = "deepseek"
# Upper bound on how long to wait for a reply, so the cell cannot hang forever
OLLAMA_TIMEOUT_SECONDS = 300

test_prompt = "What is a common cold?"


def run_ollama_prompt(model, prompt, timeout=OLLAMA_TIMEOUT_SECONDS, executable="ollama"):
    """Run ``<executable> run <model> <prompt>`` and report the outcome.

    Args:
        model: Name of the Ollama model to run.
        prompt: Text passed to the model as a single command-line argument.
        timeout: Maximum number of seconds to wait for the command to finish.
        executable: Name or path of the Ollama command-line program.

    Returns:
        A ``subprocess.CompletedProcess`` with text ``stdout`` and ``stderr``.
        If the executable cannot be found, or the command does not finish
        within ``timeout``, a synthetic result is returned instead of raising:
        ``stdout`` is empty, ``stderr`` explains the problem, and
        ``returncode`` is 127 (not found) or 124 (timed out).
    """
    command = [executable, "run", model, prompt]
    try:
        return subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except FileNotFoundError:
        # The check exists to find out whether Ollama is installed, so a
        # missing binary is reported as a result rather than a traceback.
        return subprocess.CompletedProcess(
            command,
            127,
            stdout="",
            stderr=f"'{executable}' was not found on PATH. Is Ollama installed?",
        )
    except subprocess.TimeoutExpired:
        return subprocess.CompletedProcess(
            command,
            124,
            stdout="",
            stderr=f"No response from model '{model}' within {timeout} seconds.",
        )


# Run the model once with the test prompt
result = run_ollama_prompt(OLLAMA_MODEL, test_prompt)

print("\nOllama model test output:\n")
if result.returncode == 0 and result.stdout.strip():
    print(result.stdout)
else:
    # A non-zero exit code counts as a failure even if something was printed
    print("Error or no response:\n", result.stderr)
    print("Exit code:", result.returncode)
    if result.stdout.strip():
        print("Partial output:\n", result.stdout)
