In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re



In [3]:
# Constants and Patterns

# Regex for matching a whole string as a roman numeral (1-3999), ignoring case.
# Capture groups 1-4 are the thousands, hundreds, tens and units parts.
# Every one of those groups can match nothing, so the leading lookahead is what
# stops the pattern from accepting the empty string.
ROMAN_NUMERAL_PATTERN = re.compile(
    r'^(?=[MDCLXVI])(M{0,3})(CM|CD|D?C{0,3})'
    r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE
)

# Lower-case English words that spell out a number or a count.
SPELLED_NUMBERS = [
    # Cardinals
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    "hundred", "thousand", "million", "billion",
    # Ordinals
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth",
    "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth",
    "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth",
    "hundredth", "thousandth", "millionth", "billionth",
    # Frequency adverbs
    "twice", "thrice", "once",
    # Multiples
    "single", "double", "triple", "quadruple", "quintuple", "sextuple", "septuple", "octuple", "nonuple", "decuple",
    # Named quantities and periods
    "dozen", "fortnight", "score", "century", "millennium",
]


In [4]:

# File functions
def load_json_data(file_path):
    """
    Read a JSON file into a DataFrame.

    The file is parsed with ``orient="records"`` and ``lines=False``, i.e. as a
    single JSON document rather than one JSON object per line.

    Args:
        file_path: Location of the JSON file; handed unchanged to ``pd.read_json``.

    Returns:
        The parsed DataFrame, or None when ``pd.read_json`` raises ``ValueError``
        (in which case the error is printed). Other exceptions propagate.
    """
    try:
        return pd.read_json(file_path, orient="records", lines=False)
    except ValueError as err:
        print(f"Error loading JSON data: {err}")
    return None


# Two alternatives, compiled once:
#   <[^>]*>   an HTML tag: '<', anything up to the next '>' (newlines included,
#             so a tag broken across lines is still matched), then '>'.
#   &...;     a semicolon-terminated character reference: named (&amp;,
#             &Eacute;), decimal (&#39;) or hexadecimal (&#x27;, &#XAB;).
#             Entity names and hex digits may be upper- or lower-case.
_HTML_TAG_OR_ENTITY = re.compile(
    r'<[^>]*>'
    r'|&(?:[A-Za-z0-9]+|#[0-9]{1,6}|#[xX][0-9A-Fa-f]{1,6});'
)


def strip_html_tags(text):
    """
    Strips HTML tags and character references from a string.

    Note that character references are removed, not decoded: "A &amp; B"
    becomes "A  B", not "A & B".

    Args:
        text (str): The input string potentially containing HTML markup.

    Returns:
        str: The string with tags and character references removed.
    """
    return _HTML_TAG_OR_ENTITY.sub('', text)

def strip_quotes(text):
    """
    Remove one pair of wrapping quotes from the text.

    The first and last characters are dropped when the text both starts and
    ends with a double quote, or both starts and ends with a single quote.
    Any other text is returned unchanged.
    """
    for quote_char in ('"', "'"):
        is_wrapped = text.startswith(quote_char) and text.endswith(quote_char)
        if is_wrapped:
            return text[1:-1]
    return text

In [5]:
# Load the dataset
JSON_FILE_PATH = './dataset/JEOPARDY_QUESTIONS1.json'
df = load_json_data(JSON_FILE_PATH)

# load_json_data signals a parse failure by returning None. Stop here with a
# clear message rather than failing later on the first column access.
if df is None:
    raise RuntimeError(f"Could not load dataset from {JSON_FILE_PATH}")

In [6]:
# Pre-cleaning steps

# Keep the raw text in 'original_question'. The snapshot is taken only the
# first time this cell runs, so re-running it cannot replace the raw text with
# text that has already been cleaned.
if 'original_question' not in df.columns:
    df['original_question'] = df['question'].copy()

# Always derive 'question' from the raw text (HTML tags stripped first, then
# wrapping quotes). This keeps the cell idempotent: running it again does not
# strip a second layer of quotes.
df['question'] = (
    df['original_question']
    .apply(strip_html_tags)
    .apply(strip_quotes)
)

In [7]:
# Flag questions that mention a number in one of three forms.

# Spelled-out numbers: match whole words (optionally pluralised with "s"),
# ignoring case, so that "one" is not found inside "money" or "ten" inside
# "often".
SPELLED_NUMBER_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in SPELLED_NUMBERS) + r')s?\b',
    re.IGNORECASE,
)

# Punctuation at either edge of a whitespace-separated token, plus a trailing
# possessive 's, so that "VIII," "(II)" and "VIII's" are reduced to the numeral.
TOKEN_EDGE_RE = re.compile(r"^\W+|(?:['\u2019]s)?\W*$")

# A run of digits standing on its own: "1891," "$200" and "1,000" count, while
# digits glued to letters ("MP3", "1960s") do not.
NUMBER_RE = re.compile(r'\b\d+\b')


def _has_roman_numeral(text):
    """Return True if any whitespace-separated token of text is a roman numeral.

    Known limitation: the match is case-insensitive and context-free, so the
    pronoun "I" and words such as "mix" are still reported as numerals.
    """
    for raw_token in text.split():
        token = TOKEN_EDGE_RE.sub('', raw_token)
        # Skip tokens that were punctuation only; an empty string must never
        # count as a numeral.
        if token and ROMAN_NUMERAL_PATTERN.match(token):
            return True
    return False


# Check for spelled numbers in the 'question' column
df['has_spelled_number'] = df['question'].apply(
    lambda x: bool(SPELLED_NUMBER_RE.search(x))
)

# Check for roman numerals in the 'question' column
df['has_roman_numeral'] = df['question'].apply(_has_roman_numeral)

# Check for numerical values in the 'question' column
df['has_numerical_value'] = df['question'].apply(
    lambda x: bool(NUMBER_RE.search(x))
)

In [23]:
# Categories that occur more than 100 times, most frequent first.
df['category'].value_counts().loc[lambda counts: counts > 100]


category
BEFORE & AFTER      547
SCIENCE             519
LITERATURE          496
AMERICAN HISTORY    418
POTPOURRI           401
                   ... 
THE OSCARS          102
FAMOUS WOMEN        102
PLAYWRIGHTS         102
ANCIENT HISTORY     102
LIBRARIES           101
Name: count, Length: 145, dtype: int64

In [24]:
# Print a few random example questions for each detection flag.
SAMPLE_SIZE = 5
SAMPLE_SEED = 42  # fixed so the printed examples are the same on every run


def print_flagged_samples(frame, flag_column, label, n=SAMPLE_SIZE, seed=SAMPLE_SEED):
    """
    Print a header and up to ``n`` randomly chosen flagged questions.

    Args:
        frame: DataFrame with 'category' and 'question' columns and a boolean
            column named ``flag_column``.
        flag_column: Name of the boolean column selecting the rows to sample.
        label: Text inserted into the "Sample questions with ...:" header.
        n: Maximum number of rows to print; fewer are printed when fewer rows
            are flagged.
        seed: Random state passed to ``DataFrame.sample``.
    """
    print(f"Sample questions with {label}:")
    flagged = frame.loc[frame[flag_column], ['category', 'question']]
    # Cap the sample size so a flag with few matches does not raise.
    picked = flagged.sample(min(n, len(flagged)), random_state=seed)
    for idx, category, question in picked.itertuples(name=None):
        print(f"{idx}. Category: {category}, Question: {question}")


for flag_column, label in [
    ('has_spelled_number', 'spelled numbers'),
    ('has_roman_numeral', 'roman numerals'),
    ('has_numerical_value', 'numerical values'),
]:
    print_flagged_samples(df, flag_column, label)


Sample questions with spelled numbers:
79046. Category: "P" IS FOR PHYSICS, Question: A cube is an example of a rectangular one of these; A Pink Floyd cover displays a triangular one
177400. Category: "K" RATIONS, Question: In 1891 African-American jockey Isaac Murphy became the first man to win this horse race 3 times
130831. Category: NAME ANAGRAMS, Question: Stewart pulls one out when a fly goes by
79664. Category: GERMAN SCIENCE, Question: Around 1850 Helmholtz clocked impulses moving along these body parts at about 90 feet per second
94227. Category: SHAKESPEARE TITLES IN OTHER WORDS, Question: "A Subjugation for One Small Burrowing Mammal"
Sample questions with roman numerals:
151379. Category: THEY COME IN SEVENS, Question: Can I get an amen for Pope Gregory, who thankfully reduced the list of these to 7
42201. Category: IF I HAD A HAMMER..., Question: William Wallace's nemesis, Edward I of England, was called the hammer of these people
177781. Category: ROBERT FROST BITES, Ques