### Install and Import Packages

In [6]:
!pip install -r ..\requirements.txt



In [1]:
import json
import random
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from collections import Counter, defaultdict

### Data Loading

In [2]:
sunrise_path = "fine_tuning_sunrise.json"

In [3]:
dataset = load_dataset('json', data_files=sunrise_path)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'type', 'chapter', 'set', 'question', 'answer'],
        num_rows: 600
    })
})

In [4]:
dataset_chapter = dataset['train']['chapter']
dataset_type = dataset['train']['type']
dataset_set = dataset['train']['set']
print(dataset_chapter)
print(dataset_type)
print(dataset_set)

['Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 1', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 2', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 3', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4', 'Chapter 4'

In [5]:
chapter_counts  = {}
for chapter in dataset_chapter:
    chapter_counts[chapter] = chapter_counts.get(chapter, 0) + 1
print(chapter_counts)

{'Chapter 1': 20, 'Chapter 2': 20, 'Chapter 3': 20, 'Chapter 4': 20, 'Chapter 5': 20, 'Chapter 6': 20, 'Chapter 7': 20, 'Chapter 8': 20, 'Chapter 9': 20, 'Chapter 10': 20, 'Chapter 11': 20, 'Chapter 12': 20, 'Chapter 13': 20, 'Chapter 14': 20, 'Chapter 15': 20, 'Chapter 16': 20, 'Chapter 17': 20, 'Chapter 18': 20, 'Chapter 19': 20, 'Chapter 20': 20, 'Chapter 21': 20, 'Chapter 22': 20, 'Chapter 23': 20, 'Chapter 24': 20, 'Chapter 25': 20, 'Chapter 26': 20, 'Chapter 27': 20, 'Chapter 28': 20, 'Full Book': 40}


In [6]:
type_counts  = {}
for type in dataset_type:
    type_counts[type] = type_counts.get(type, 0) + 1
print(type_counts)

{'Summary & Inference': 300, 'Detail Extraction': 300}


In [7]:
set_counts  = {}
for set in dataset_set:
    set_counts[set] = set_counts.get(set, 0) + 1
print(set_counts)

{'training': 480, 'validation': 60, 'testing': 60}


### Data Splitting

In [8]:
tuning_set = dataset['train']
tuning_set

Dataset({
    features: ['id', 'type', 'chapter', 'set', 'question', 'answer'],
    num_rows: 600
})

In [28]:
tuning_set = dataset['train']
tuning_df = pd.DataFrame(
    {
        'Set':tuning_set['set'],
        'Question': tuning_set['question'],
        'Answer':[answer['final_answer'] for answer in tuning_set['answer']],
        'Reasoning': [cot['chain_of_thought'] for cot in tuning_set['answer']],
    }
)
tuning_df

Unnamed: 0,Set,Question,Answer,Reasoning
0,training,What is Haymitch's initial attitude towards hi...,Haymitch initially views the upside of his bir...,[Identify Haymitch's initial feeling about his...
1,training,Haymitch plans to spend his afternoon after fi...,Haymitch hopes to devote his afternoon to 'was...,[Identify the two things Haymitch loves doing:...
2,training,How does Lenore Dove's perspective on the Mead...,"While 'most people comment on its beauty' , Le...",[Identify how 'most people' view the Meadow: T...
3,training,"Lenore Dove's name is explained by her, connec...",Lenore Dove explains her name by saying 'Dove ...,[Recall Lenore Dove's explanation of her name:...
4,training,"Haymitch states, 'I didn't say it was just bec...","Lenore Dove accuses Haymitch of faulty logic, ...",[Identify Lenore Dove's logical argument: 'You...
...,...,...,...,...
595,training,What is the symbolic implication of the 'mecha...,The 'mechanical buzz of protest' from below af...,[Identify the sound: 'A mechanical buzz of pro...
596,validation,What specific item is Maysilee seen using to r...,Maysilee uses 'a wince' and her fingers to rem...,[Locate the scene after the ladybug attack whe...
597,validation,What is the nickname for the Capitol store bee...,The nickname for the Capitol store beer that H...,[Locate the scene where Haymitch is brainstorm...
598,testing,What is the primary method of movement for the...,The squirrel mutts primarily move by 'bouncing...,[Locate the description of the squirrel mutts....


In [46]:
training_df = tuning_df[tuning_df['Set'] == 'training']
val_df = tuning_df[tuning_df['Set'] == 'validation']
test_df = tuning_df[tuning_df['Set'] == 'testing']

training_df.drop(columns = ['Set'], inplace= True)
training_df.reset_index(inplace=True)
training_df.drop(columns = ['index'], inplace= True)

val_df.drop(columns = ['Set'], inplace= True)
val_df.reset_index(inplace=True)
val_df.drop(columns = ['index'], inplace= True)

test_df.drop(columns = ['Set'], inplace= True)
test_df.reset_index(inplace=True)
test_df.drop(columns = ['index'], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df.drop(columns = ['Set'], inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df.drop(columns = ['index'], inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.drop(columns = ['Set'], inplace= True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df.drop(column

In [48]:
training_df

Unnamed: 0,Question,Answer,Reasoning
0,What is Haymitch's initial attitude towards hi...,Haymitch initially views the upside of his bir...,[Identify Haymitch's initial feeling about his...
1,Haymitch plans to spend his afternoon after fi...,Haymitch hopes to devote his afternoon to 'was...,[Identify the two things Haymitch loves doing:...
2,How does Lenore Dove's perspective on the Mead...,"While 'most people comment on its beauty' , Le...",[Identify how 'most people' view the Meadow: T...
3,"Lenore Dove's name is explained by her, connec...",Lenore Dove explains her name by saying 'Dove ...,[Recall Lenore Dove's explanation of her name:...
4,"Haymitch states, 'I didn't say it was just bec...","Lenore Dove accuses Haymitch of faulty logic, ...",[Identify Lenore Dove's logical argument: 'You...
...,...,...,...
475,What are the approximate dimensions (diameter ...,The berms observed by Haymitch are 'about eigh...,"[Locate Haymitch's examination of the berms., ..."
476,What specific advice did Mamaw give about char...,Mamaw's specific advice about charcoal tablets...,[Locate the flashback to Mamaw and charcoal ta...
477,What is the specific color and pattern of Drus...,Drusilla's jumpsuit during the interviews is '...,[Locate the description of Drusilla's outfit d...
478,What specific item is Maysilee wearing that is...,The 'braided cord' from Maysilee's token neckl...,[Locate the scene where Haymitch unravels Mays...


In [49]:
val_df

Unnamed: 0,Question,Answer,Reasoning
0,What is the implied psychological manipulation...,The Capitol uses a 'five-minute hold on the br...,[Identify the explicit detail: A 'five-minute ...
1,What is the primary product of District 12's i...,The primary product of District 12's industry ...,[Recall details about Haymitch's father's deat...
2,When Maysilee throws a literal punch at Drusil...,When Maysilee throws a literal 'wallop' back ...,[Recall the action: Maysilee 'slaps her right ...
3,What is the name of the District 3 victor who ...,The District 3 victor who won the previous Hun...,[Identify the District 3 victor introduced as ...
4,Haymitch believes that 'Woodbine no longer see...,Haymitch's dark reflection that 'Woodbine no l...,[Recall Woodbine's death: Shot in the head try...
5,What is the name of the District 3 victor who ...,The District 3 victor who is Ampert's father i...,"[Identify Ampert's father's name: 'Beetee'., C..."
6,What is the implied effect of the Gamemakers' ...,The Gamemakers' unprecedented decision to give...,"[Recall Haymitch's score: 'I get a one'., Reca..."
7,What is the specific material used to coat the...,The District 9 sunflower tokens are 'coated wi...,[Locate the explanation about the District 9 s...
8,What is the implied purpose of the Capitol col...,The implied purpose of the Capitol collecting ...,[Identify the fact: Trackers are 'electronic d...
9,What color are the District 3 tributes' outfit...,The District 3 tributes' outfits are 'electric...,[Locate the description of District 3 tributes...


In [50]:
test_df

Unnamed: 0,Question,Answer,Reasoning
0,How does Haymitch's act of covering the camera...,Haymitch's act of throwing 'a towel over the c...,[Identify the action: Haymitch throws 'a towel...
1,What are the two items Haymitch gives to his m...,"Before leaving for the Games, Haymitch empties...",[Locate the scene where Haymitch empties his p...
2,How does the Gamemakers' choice of Wiress and ...,The Gamemakers' choice of Wiress and Mags as m...,[Recall District 12's mentor situation: 'the o...
3,What happens to the tributes' clothing after t...,"After tributes strip for the communal shower, ...",[Locate the scene where boys are told to strip...
4,What kind of 'dreams' does Haymitch have in th...,Haymitch's dreams in this chapter are of 'fear...,[Identify the nature of his dreams: 'fearful t...
5,What are the two animals that form the design ...,"The golden staircase and eagle motif, found in...",[Locate the description of the staircase in Pl...
6,How does Lou Lou's reaction to the 'dark cresc...,"Lou Lou's intense, emotional reaction to the '...",[Recall Lou Lou's reaction: She stares in 'fas...
7,What is the primary content of the District 12...,The primary content of the District 12 tribute...,[Locate descriptions of food in the tribute ap...
8,Haymitch's act of applauding President Snow af...,Haymitch's act of applauding President Snow af...,[Recall Haymitch's action: He 'gesture[s] to h...
9,"What item, resembling a metal device, is Maysi...",Maysilee is seen knuckle-rolling a 'scrip coin...,[Locate the scene where Maysilee is on the tra...


### Store to JSON

In [51]:
# to data frame

# stroe to json files
training_df.to_json("sunrise_train_data.json", orient="records", lines=False, force_ascii=False)
val_df.to_json("sunrise_val_data.json", orient="records", lines=False, force_ascii=False)
test_df.to_json("sunrise_test_data.json", orient="records", lines=False, force_ascii=False)


print("Successfully saved train_data to train_data.json!")
print("Successfully saved val_data to val_data.json!")
print("Successfully saved test_data to test_data.json!")

Successfully saved train_data to train_data.json!
Successfully saved val_data to val_data.json!
Successfully saved test_data to test_data.json!


### Load Sunrise on the Reaping