### Install and Import Packages

In [6]:
!pip install -r ..\requirements.txt



In [1]:
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from collections import Counter, defaultdict

### Data Loading

In [2]:
# Location of the question/answer fine-tuning file. This is a relative path, so it
# is resolved against the directory the notebook kernel runs in.
sunrise_path = "fine_tuning_sunrise.json"

In [25]:
# Read the JSON file with the `datasets` JSON loader. As the displayed result shows,
# every record lands in a single split named 'train', independent of the records'
# own `set` field.
dataset = load_dataset('json', data_files=sunrise_path)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'type', 'chapter', 'set', 'question', 'answer'],
        num_rows: 600
    })
})

In [31]:
# Pull the three categorical columns out of the 'train' split in one pass so the
# cells below can tally them.
dataset_chapter, dataset_type, dataset_set = (
    dataset['train'][column_name] for column_name in ('chapter', 'type', 'set')
)

# One column per output line. NOTE(review): this prints every value of each column
# (600 rows per the dataset summary), so the output is long.
print(dataset_chapter, dataset_type, dataset_set, sep="\n")

['Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4'

In [24]:
# Number of examples per chapter label.
# `Counter` (already imported at the top of the notebook) does the tally; converting
# to a plain dict keeps first-seen order and prints as an ordinary `{...}` mapping.
chapter_counts = dict(Counter(dataset_chapter))
print(chapter_counts)

{'Chapter 1': 20, 'Chapter 2': 20, 'Chapter 3': 20, 'Chapter 4': 20, 'Chapter 5': 20, 'Chapter 6': 20, 'Chapter 7': 20, 'Chapter 8': 20, 'Chapter 9': 20, 'Chapter 10': 20, 'Chapter 11': 20, 'Chapter 12': 20, 'Chapter 13': 20, 'Chapter 14': 20, 'Chapter 15': 20, 'Chapter 16': 20, 'Chapter 17': 20, 'Chapter 18': 20, 'Chapter 19': 20, 'Chapter 20': 20, 'Chapter 21': 20, 'Chapter 22': 20, 'Chapter 23': 20, 'Chapter 24': 20, 'Chapter 25': 20, 'Chapter 26': 20, 'Chapter 27': 20, 'Chapter 28': 20, 'Full Book': 40}


In [27]:
# Number of examples per question type.
# Tallied with `Counter` instead of a `for type in ...` loop: a loop variable named
# `type` would rebind the built-in `type` for every later cell in this kernel.
type_counts = dict(Counter(dataset_type))
print(type_counts)

{'Summary & Inference': 300, 'Detail Extraction': 300}


In [29]:
# Number of examples per predefined split label (the `set` column).
# Tallied with `Counter` instead of a `for set in ...` loop: a loop variable named
# `set` would rebind the built-in `set` for every later cell in this kernel.
set_counts = dict(Counter(dataset_set))
print(set_counts)

{'training': 480, 'validation': 60, 'testing': 60}


### Data Splitting

In [36]:
# The loader put all rows in the 'train' split; take it as the full pool that is
# divided by the `set` column further down.
tuning_set = dataset['train']
tuning_set

Dataset({
    features: ['id', 'type', 'chapter', 'set', 'question', 'answer'],
    num_rows: 600
})

In [37]:
# Re-select the full pool so this cell does not depend on the previous one.
tuning_set = dataset['train']

# Copy the dataset columns into a pandas DataFrame, one DataFrame column per
# dataset feature, in the same order as the feature list.
tuning_df = pd.DataFrame(
    {
        column_name: tuning_set[column_name]
        for column_name in ('id', 'type', 'chapter', 'set', 'question', 'answer')
    }
)
tuning_df

Unnamed: 0,id,type,chapter,set,question,answer
0,Ch1_SumInf_001_Train,Summary & Inference,Chapter 1,training,What is Haymitch's initial attitude towards hi...,{'chain_of_thought': ['Identify Haymitch's ini...
1,Ch1_SumInf_002_Train,Summary & Inference,Chapter 1,training,Haymitch plans to spend his afternoon after fi...,{'chain_of_thought': ['Identify the two things...
2,Ch1_SumInf_003_Train,Summary & Inference,Chapter 1,training,How does Lenore Dove's perspective on the Mead...,{'chain_of_thought': ['Identify how 'most peop...
3,Ch1_SumInf_004_Train,Summary & Inference,Chapter 1,training,"Lenore Dove's name is explained by her, connec...",{'chain_of_thought': ['Recall Lenore Dove's ex...
4,Ch1_SumInf_005_Train,Summary & Inference,Chapter 1,training,"Haymitch states, 'I didn't say it was just bec...",{'chain_of_thought': ['Identify Lenore Dove's ...
...,...,...,...,...,...,...
595,FullBook_DetExt_016_Train,Detail Extraction,Full Book,training,What is the symbolic implication of the 'mecha...,{'chain_of_thought': ['Identify the sound: 'A ...
596,FullBook_DetExt_017_Validation,Detail Extraction,Full Book,validation,What specific item is Maysilee seen using to r...,{'chain_of_thought': ['Locate the scene after ...
597,FullBook_DetExt_018_Validation,Detail Extraction,Full Book,validation,What is the nickname for the Capitol store bee...,{'chain_of_thought': ['Locate the scene where ...
598,FullBook_DetExt_019_Testing,Detail Extraction,Full Book,testing,What is the primary method of movement for the...,{'chain_of_thought': ['Locate the description ...


In [39]:
# Partition the rows by their pre-assigned `set` label; each result keeps the
# original row index of `tuning_df`.
# NOTE(review): `traning_df` is a misspelling of `training_df`; the name is kept
# because the export cell below reads exactly this name.
traning_df, val_df, test_df = (
    tuning_df.loc[tuning_df['set'].eq(split_label)]
    for split_label in ('training', 'validation', 'testing')
)

### Store to JSON

In [45]:
# Export each split to its own JSON Lines file (`orient="records", lines=True`
# writes one record per line); `force_ascii=False` keeps non-ASCII text unescaped.
# NOTE(review): `traning_df` is spelled the way the splitting cell defines it.
split_frames = {
    "train": traning_df,
    "val": val_df,
    "test": test_df,
}

for split_name, split_df in split_frames.items():
    output_path = f"{split_name}_data.json"
    split_df.to_json(output_path, orient="records", lines=True, force_ascii=False)
    # Report right after the write returns, so a message is never shown for a
    # file whose export raised, and the text always matches the real file name.
    print(f"Successfully saved {split_name}_data to {output_path}!")

Successfully saved train_data to train_data.json!
Successfully saved val_data to val_data.json!
Successfully saved test_data to test_data.json!
