In [3]:
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from collections import Counter, defaultdict

In [4]:
# Location of the JSON file to load.
file_path = 'reasoned_qa_output/all_reasoned_qa.json'

# The loader's result is indexed by split name; keep only the 'train' entry.
dataset = load_dataset('json', data_files=file_path)['train']
dataset

Dataset({
    features: ['Queue', 'Question', 'Answer', 'Reasoning', 'Chapter', 'Type'],
    num_rows: 598
})

In [5]:
all_data = list(dataset)

# Stratify on Chapter only, so the chapter labels drive both splits below.
stratify_labels = [str(d['Chapter']) for d in all_data]

# Target proportions are 80% train / 10% validation / 10% test, reached in two steps:
#   step 1 holds out 10% of all rows as the test set;
#   step 2 takes 1/9 of the remaining 90% (i.e. 10% of all rows) as the validation set.
SPLIT_SEED = 614
TEST_FRACTION = 0.1
VAL_FRACTION_OF_REMAINDER = 1/9

# Step 1: separate the test set from the combined train+validation pool.
# The pool gets its own name so `train_data` only ever refers to the final training set.
train_val_data, test_data, train_val_label, test_label = train_test_split(
    all_data,
    stratify_labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=stratify_labels
)

# Step 2: carve the validation set out of the pool.
# The labels are captured as well so `train_label` stays aligned with `train_data`.
train_data, val_data, train_label, val_label = train_test_split(
    train_val_data,
    train_val_label,
    test_size=VAL_FRACTION_OF_REMAINDER,
    random_state=SPLIT_SEED,
    stratify=train_val_label
)

# Sanity checks: no record was lost or duplicated, and labels match their split.
assert len(train_data) + len(val_data) + len(test_data) == len(all_data)
assert len(train_label) == len(train_data)
assert len(val_label) == len(val_data)
assert len(test_label) == len(test_data)

print("--- Simplified Split Results ---")
print(f"Training set total: {len(train_data)}")
print(f"Validation set total: {len(val_data)}")
print(f"Test set total: {len(test_data)}")
print("\nSplitting complete!")

--- Simplified Split Results ---
Training set total: 478
Validation set total: 60
Test set total: 60

Splitting complete!


In [6]:
def count_by_chapter(records):
    """Return a Counter mapping each record's "Chapter" value to how many records carry it.

    Args:
        records: iterable of mappings, each with a "Chapter" key.

    Returns:
        collections.Counter keyed by chapter; a chapter that never occurs reads as 0.
    """
    return Counter(record["Chapter"] for record in records)


# Per-chapter record counts for each split, keyed by 'Chapter' only.
train_counts = count_by_chapter(train_data)
val_counts = count_by_chapter(val_data)
test_counts = count_by_chapter(test_data)


In [7]:
import pandas as pd

# Split sizes; these are the denominators for the percentage columns.
total_train, total_val, total_test = len(train_data), len(val_data), len(test_data)

# Every chapter that appears in at least one split, in sorted order.
all_chapters = sorted(set(train_counts) | set(val_counts) | set(test_counts))


def _share(count, total):
    """Return `count` as a percentage of `total`, or 0 when `total` is not positive."""
    if total > 0:
        return (count / total) * 100
    return 0


# One row per chapter: formatted percentage per split plus the raw counts.
results_list = []
for chapter in all_chapters:
    n_train = train_counts.get(chapter, 0)
    n_val = val_counts.get(chapter, 0)
    n_test = test_counts.get(chapter, 0)

    row = {
        "Chapter": chapter,
        "Train %": f"{_share(n_train, total_train):.2f}%",
        "Val %": f"{_share(n_val, total_val):.2f}%",
        "Test %": f"{_share(n_test, total_test):.2f}%",
        "Raw Counts (Tr/V/T)": f"{n_train}/{n_val}/{n_test}",
    }
    results_list.append(row)

# Assemble the table and print it under a header that restates the split sizes.
df_distribution = pd.DataFrame(results_list)
print("\n--- Distribution of Chapters Across Datasets ---")
print(f"Total Sizes -> Train: {total_train}, Validation: {total_val}, Test: {total_test}\n")
print(df_distribution)


--- Distribution of Chapters Across Datasets ---
Total Sizes -> Train: 478, Validation: 60, Test: 60

    Chapter Train %  Val % Test % Raw Counts (Tr/V/T)
0         1   3.35%  3.33%  3.33%              16/2/2
1         2   3.35%  3.33%  3.33%              16/2/2
2         3   3.35%  3.33%  3.33%              16/2/2
3         4   3.35%  3.33%  3.33%              16/2/2
4         5   3.14%  3.33%  3.33%              15/2/2
5         6   3.35%  3.33%  3.33%              16/2/2
6         7   3.35%  3.33%  3.33%              16/2/2
7         8   3.14%  3.33%  3.33%              15/2/2
8         9   3.35%  3.33%  3.33%              16/2/2
9        10   3.35%  3.33%  3.33%              16/2/2
10       11   3.35%  3.33%  3.33%              16/2/2
11       12   3.35%  3.33%  3.33%              16/2/2
12       13   3.35%  3.33%  3.33%              16/2/2
13       14   3.35%  3.33%  3.33%              16/2/2
14       15   3.35%  3.33%  3.33%              16/2/2
15       16   3.35%  3.33%  3.33%

In [9]:
# Build one DataFrame per split, keeping only the fields chosen for fine-tuning
# (question, answer, reasoning).
finetune_columns = ['Question', 'Answer', 'Reasoning']
df_train = pd.DataFrame(train_data)[finetune_columns]
df_val = pd.DataFrame(val_data)[finetune_columns]
df_test = pd.DataFrame(test_data)[finetune_columns]

# Store each split as record-oriented JSON with one record per line.
split_outputs = (
    (df_train, "reasoned_qa_output/allfours_train_data.json"),
    (df_val, "reasoned_qa_output/allfours_val_data.json"),
    (df_test, "reasoned_qa_output/allfours_test_data.json"),
)
for frame, out_path in split_outputs:
    frame.to_json(out_path, orient="records", lines=True, force_ascii=False)


# Report only after every file has been written.
for split_name in ("train", "val", "test"):
    print(f"Successfully saved {split_name}_data to allfours_{split_name}_data.json!")

Successfully saved train_data to allfours_train_data.json!
Successfully saved val_data to allfours_val_data.json!
Successfully saved test_data to allfours_test_data.json!
