In [1]:
# import boto3
import glob
import json
import argparse
from tqdm import tqdm
# from botocore.config import Config

def read_txt_file(file_path):
    """
    Load a text file and hand back its entire content as one string.

    Failures are never raised to the caller; a human-readable message
    string is returned in place of the content instead.

    :param file_path: Location of the .txt file to load.
    :return: The file's text, or an error-message string if reading failed.
    """
    try:
        with open(file_path, 'r') as handle:
            return handle.read()
    except FileNotFoundError:
        # Missing file gets its own fixed message.
        return "The file was not found."
    except Exception as err:
        # Anything else (permissions, decoding, ...) is reported generically.
        return f"An error occurred: {err}"

def read_json_file(file_path):
    """
    Parse a JSON file and return the decoded value.

    Failures are never raised to the caller; a human-readable message
    string is returned in place of the parsed content instead.

    :param file_path: Location of the .json file to load.
    :return: Whatever ``json.load`` decodes from the file, or an
             error-message string if reading or parsing failed.
    """
    try:
        with open(file_path, 'r') as handle:
            return json.load(handle)
    except FileNotFoundError:
        return "The file was not found."
    except json.JSONDecodeError:
        # The file exists but its content could not be decoded as JSON.
        return "The file is not a valid JSON file."
    except Exception as err:
        return f"An error occurred: {err}"

In [3]:
import tiktoken

# Tokenizer that tiktoken associates with the "gpt-4" model; shared by the
# token-counting code in this notebook.
encoding = tiktoken.encoding_for_model("gpt-4")

# For each JSON file in a directory: load it and sum the token counts of the
# 'Question' and 'Options' fields of every record it contains.

def count_tokens_in_json_files(directory):
    """
    Counts the total number of tokens in the 'Question' and 'Options' fields of all JSON files in the specified directory.

    Each ``*.json`` file directly inside ``directory`` is loaded with
    ``read_json_file``. A file may hold either a list of question records or a
    dict whose values are question records. Files that could not be read
    (``read_json_file`` then returns an error-message string) and entries that
    are not dicts are skipped.

    :param directory: Path to the directory containing the JSON files.
    :return: Total number of tokens in the 'Question' and 'Options' fields of all JSON files in the directory.
    """
    json_files = glob.glob(f"{directory}/*.json")
    total_tokens = 0

    for file in tqdm(json_files):
        content = read_json_file(file)
        if isinstance(content, dict):
            # NOTE(review): assumes a dict maps ids to question records — confirm against the data
            questions = content.values()
        elif isinstance(content, list):
            questions = content
        else:
            # read_json_file signals failure by returning a message string
            continue

        for question in questions:
            if not isinstance(question, dict):
                continue
            question_text = question.get('Question', '')
            # 'Options' may be a sequence of strings; join it into one string
            # (a plain string passes through "".join unchanged).
            options_text = "".join(question.get('Options', ''))
            # Encoding objects are not callable; encode() returns the token ids.
            total_tokens += len(encoding.encode(question_text + options_text))
    return total_tokens

In [33]:
# Location of the NovelQA question files: one JSON file per novel.
directory = "_data/hub/datasets--NovelQA--NovelQA/snapshots/47755b7f1d5810dc01ea9d6074d45ad126019f45/Data"
json_files = glob.glob(f"{directory}/*.json")

# Running totals over the whole dataset.
total_tokens = 0
total_nr_of_questions = 0
nr_of_novels = 0

for novel_path in tqdm(json_files):
    nr_of_novels += 1
    for entry in read_json_file(novel_path):
        total_nr_of_questions += 1
        # Question text followed by all of its options, joined without a separator.
        prompt_text = entry.get('Question', '') + "".join(entry.get('Options', ''))
        total_tokens += len(encoding.encode(prompt_text))

print("Total tokens for all question and options", total_tokens)
print("Total number of questions", total_nr_of_questions)
print("Total number of novels", nr_of_novels)

100%|██████████| 88/88 [00:00<00:00, 1205.85it/s]

Total tokens for all question and options 110094
Total number of questions 2283
Total number of novels 88





In [31]:
# Fixed per-novel prompt overhead: the instruction text plus the per-question
# template scaffolding. The prompt text is kept verbatim, since its exact
# wording is what gets tokenized.
starter_prompt = """You are a literature professor. I will provide you with the full text of a novel
along with a series of questions and corresponding choices pertaining to it.
Please thoroughly analyze the novel ’s content to accurately respond to each of the following questions.
Book title: ; Book Content: ; Book ends. Questions start here:
; Questions end here.
Try your best to select the correct choice to each question based on the given full text the novel.
Your should output the choice to each question with the format
’Answer0: <choice> Answer1: <choice>... Answern: <choice>’
(only the choice index is required), each answer in one line without outputing the questions and other info."""

# Tokens in the instruction text itself.
total_tokens_main_prompt = len(encoding.encode(starter_prompt))

# Average number of questions per novel times the tokens of one empty
# question template.
avg_questions_per_novel = total_nr_of_questions / nr_of_novels
template_tokens = len(encoding.encode("Question:  Choices: 0:  1:  2:  3: "))
total_tokens_questions_template = avg_questions_per_novel * template_tokens

# The printed figure is the sum of both parts, so it can be fractional.
print("Token count of starter prompt", total_tokens_main_prompt + total_tokens_questions_template)

Token count of starter prompt 690.8068181818181
