In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import logging

from nltk.corpus import wordnet, words, stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the notebook needs; quiet=True is passed to each download call.
for _nltk_resource in (
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
    'wordnet',
    'words',
    'stopwords',
):
    nltk.download(_nltk_resource, quiet=True)

# Single lemmatizer instance shared by the helper functions below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Constants and Patterns

# Regex for matching Roman numerals (case-insensitive, anchored with ^ and $).
# The four groups are, in order: thousands (up to MMM), hundreds, tens, units.
# NOTE: every group is optional, so the empty string also satisfies this pattern;
# callers that must reject "" have to check for it themselves.
ROMAN_NUMERAL_PATTERN = re.compile(
    r'^(M{0,3})(CM|CD|D?C{0,3})'
    r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE
)

# Spelled-out numbers
# Flattened from the groups below; the list keeps the groups (and the words
# inside each group) in exactly the order they are written here.
SPELLED_NUMBERS = [
    term
    for group in (
        # Cardinals
        ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"),
        ("eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty"),
        ("thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"),
        ("hundred", "thousand", "million", "billion"),
        # Ordinals
        ('first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth'),
        ('eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth'),
        ('twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth'),
        ('hundredth', 'thousandth', 'millionth', 'billionth'),
        # Frequency adverbs
        ('twice', 'thrice', 'once'),
        # Multiplicatives
        ('single', 'double', 'triple', 'quadruple', 'quintuple', 'sextuple', 'septuple', 'octuple', 'nonuple', 'decuple'),
        # Named quantities
        ('dozen', 'fortnight', 'score', 'century', 'millennium'),
    )
    for term in group
]

# English stopwords from the NLTK corpus, held as a set.
STOPWORDS = {*stopwords.words("english")}

# Every entry of the NLTK `words` corpus, lower-cased, held as a set.
ENGLISH_VOCAB = {entry.lower() for entry in words.words()}

# Presumably the number of sample rows to display; not referenced in the visible cells.
NUMBER_OF_SAMPLES_TO_SHOW = 3



In [23]:

# File functions
# Load JSON data from a file
def load_json_data(file_path):
    '''
    Loads JSON data from the specified file path into a pandas DataFrame.

    The file is read with ``orient="records"`` and ``lines=False``, i.e. it is
    expected to hold one JSON array of record objects.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        pd.DataFrame: DataFrame containing the loaded JSON data, or None if loading fails
        (malformed JSON, or a file that is missing or cannot be read). The error is printed.
    '''
    try:
        return pd.read_json(file_path, orient="records", lines=False)
    except (ValueError, OSError) as e:
        # ValueError covers malformed JSON; OSError covers a missing or unreadable
        # file (FileNotFoundError, PermissionError, ...). Both are "loading failed",
        # which the contract above reports by returning None.
        print(f"Error loading JSON data: {e}")
        return None


def strip_html_tags(text):
    """
    Removes HTML tags and HTML character references from a string.

    Args:
        text (str): The input string potentially containing HTML markup.

    Returns:
        str: The string with every match deleted. Matches are removed, not
        decoded, so an entity such as "&amp;" leaves nothing behind.
    """
    # Two alternatives, tried left to right at each position:
    #   <.*?>  - the shortest run from '<' up to the next '>' (non-greedy)
    #   &...;  - a character reference: a lower-case alphanumeric name,
    #            a decimal code (#NNN) or a lower-case hex code (#xHH)
    markup_pattern = '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(markup_pattern).sub('', text)

def display_sample_questions(df, lookup_column = None, extra_details_col = None, sample_count = 3, random_state = None):
    """
    Displays a random sample of questions from the rows where lookup_column is True.

    Args:
        df (pd.DataFrame): The DataFrame containing the questions. It must have
            'category' and 'question' columns.
        lookup_column (str, optional): Name of the boolean column used to select rows.
        extra_details_col (str, optional): Name of an additional column whose value is
            printed after each question as "(Extra: ...)".
        sample_count (int, optional): Maximum number of questions to display. Default is 3.
            When fewer rows match, all matching rows are shown.
        random_state (optional): Forwarded to DataFrame.sample so a sample can be
            reproduced. Default is None (a different sample on each call).

    Raises:
        ValueError: If lookup_column is None, if df is None, or if lookup_column or
            extra_details_col is not a column of df.

    Prints:
        A header line, then the index, category and question text (plus the extra
        column, when requested) for each sampled row, then a blank line.
    """
    if lookup_column is None:
        raise ValueError("lookup_column argument cannot be empty or None")

    if df is None or lookup_column not in df.columns:
        raise ValueError(f"DataFrame is None or lookup_column '{lookup_column}' does not exist in DataFrame")

    if extra_details_col is not None and extra_details_col not in df.columns:
        raise ValueError(f"extra_details_col '{extra_details_col}' does not exist in DataFrame")

    if extra_details_col is None:
        print(f"Sampling questions with '{lookup_column}' set to True:")
    else:
        print(f"Sampling questions with '{lookup_column}' set to True, showing '{extra_details_col}':")

    matching_rows = df[df[lookup_column]]
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    rows_to_show = min(sample_count, len(matching_rows))
    sampled = matching_rows.sample(rows_to_show, random_state=random_state)

    for idx, row in sampled.iterrows():
        line = f"{idx} - [{row['category']}] {row['question']}"
        if extra_details_col is not None:
            line += f" (Extra: {row[extra_details_col]})"
        print(line)

    print()

def strip_quotes(text):
    """
    Strips one pair of wrapping quotes from the text.

    The first and last characters are removed only when the text is at least
    two characters long and both are the same quote character (either " or ').
    A lone quote character is therefore returned unchanged rather than being
    treated as its own opening and closing quote.

    Args:
        text (str): The input string.

    Returns:
        str: The text without its wrapping quotes, or the text unchanged.
    """
    if len(text) >= 2 and text[0] == text[-1] and text[0] in ('"', "'"):
        return text[1:-1]
    return text

def is_valid_roman(string):
    """
    Checks if the input string is a valid Roman numeral.

    Args:
        string (str): The input string to check.

    Returns:
        bool: True if the whole string is a Roman numeral according to
        ROMAN_NUMERAL_PATTERN, False otherwise. Empty input is never valid.
    """
    # Every group of the pattern is optional, so "" would match; reject it up front.
    if not string:
        return False
    # fullmatch (rather than match) so that the pattern's trailing '$' cannot
    # accept a string that ends with a newline.
    return ROMAN_NUMERAL_PATTERN.fullmatch(string) is not None

def is_noun_roman_bigram(bigram):
    """
    Checks if the bigram is a noun followed by a valid Roman numeral.

    Args:
        bigram (tuple): A tuple containing two words (bigram).

    Returns:
        bool: True if the first word is a noun and the second word is a valid Roman numeral, False otherwise.
    """
    word, numeral = bigram[0], bigram[1]
    # Run the regex check first: when it fails, the POS-tagger call is skipped entirely.
    return is_valid_roman(numeral) and is_noun(word)

def find_valid_roman_numerals(input_text, debug=False):
    """
    Parses the input text to find and return valid Roman numerals.

    The text is split on whitespace and punctuation surrounding each token is
    ignored, so "VIII," and "(XIV)" are examined as "VIII" and "XIV". Only
    tokens written entirely in upper case are considered:

    - Tokens longer than two characters are accepted when they are valid
      Roman numerals.
    - Tokens of one or two characters are accepted only when they are not
      the first token, the previous token does not end in ',', '.', ';' or ':',
      and the previous word (symbols removed) is tagged as a noun.

    Args:
        input_text (str): The input string to parse.
        debug (bool, optional): If True, prints a trace of the decisions. Default is False.

    Returns:
        list: The Roman numerals found, in order of appearance, without
        surrounding punctuation.
    """
    tokens = input_text.split()
    valid_roman_numerals = []

    for idx, raw_token in enumerate(tokens):
        if debug:
            print(f"Checking token: {raw_token} at index {idx}")

        # Drop leading/trailing punctuation so numerals next to commas, periods,
        # brackets or quotes are still recognised.
        token = re.sub(r'^\W+|\W+$', '', raw_token)

        # If entire token is not uppercase (or nothing is left), skip it
        if not token.isupper():
            continue

        if len(token) > 2:
            if is_valid_roman(token):
                valid_roman_numerals.append(token)
            continue

        # Short tokens ("I", "V", "MD", ...) are ambiguous; require a preceding noun.
        if idx == 0:
            continue  # Short words at the start are likely not valid Roman numerals

        previous_word = tokens[idx - 1]
        # A clause or sentence boundary before the token means the two words
        # do not form a "noun + numeral" unit.
        if previous_word.endswith((',', '.', ';', ':')):
            if debug:
                print(f"Skipping due to punctuation: {previous_word}")
            continue

        # Remove any symbols from the previous word
        previous_word = re.sub(r'[^\w\s]', '', previous_word)
        if not previous_word:
            continue  # nothing left to tag, so it cannot be a noun

        if is_noun_roman_bigram((previous_word, token)):
            if debug:
                print(f"Found valid Roman numeral: {previous_word} {token} -- adding {token}")
            valid_roman_numerals.append(token)

    if debug:
        print(f"Original input: {input_text}")
        print(f"Valid Roman numerals found: {valid_roman_numerals}")

    return valid_roman_numerals

def is_noun(word):
    """
    Checks whether a single word, tagged on its own, is labelled as a noun.

    Args:
        word (str): The word to tag.

    Returns:
        bool: True if the tag assigned by nltk.pos_tag starts with 'NN', False otherwise.
    """
    # The tagger works on token lists, so wrap the word and read back its (word, tag) pair.
    tagged = nltk.pos_tag([word])
    _, tag = tagged[0]
    return tag.startswith('NN')


# def find_non_english_word(input_text :str, method="wordnet", stopword_list=STOPWORDS, lemmatizer=lemmatizer, debug=False) -> list[str]:
def find_non_english_word(
        input_text: str,
        method: str = "wordnet",
        stopword_list: set[str] | list[str] = set(),
        lemmatizer: "WordNetLemmatizer" = lemmatizer,
        debug: bool = False
    ) -> list[str]:

    """
    Identify non-English words in the input text using POS tagging and lemmatization.

    Args:
        input_text (str): The input string to analyze.
        method (str, optional): Method to determine if a word is English.
            - "wordnet": Uses WordNet synsets (default).
            - "en_dict": Uses the NLTK English vocabulary word list (ENGLISH_VOCAB).
            - "combined": A word is English if either of the two checks accepts it.
        stopword_list (set or list of str, optional): Lower-case stopwords to ignore
            (e.g., set(stopwords.words("english"))). Defaults to an empty set, which
            disables stopword filtering.
        lemmatizer (WordNetLemmatizer, optional): Lemmatizer to use. Defaults to the
            notebook-level `lemmatizer` instance.
        debug (bool, optional): If True, prints debug information. Default is False.

    Returns:
        list: The words (in their original casing and order) that were not
        recognised as English.

    Raises:
        ValueError: If method is not one of "wordnet", "en_dict" or "combined".

    Notes:
        - Only alphabetic, non-stopword, and non-function-tagged words are checked.
        - POS tags excluded: NNP, NNPS, IN, DT, WP, WP$, WRB, PRP, PRP$, CC, TO, MD, EX, UH.
        - Words are lower-cased and lemmatized (with the lemmatizer's default part of
          speech) before the lookup.
    """
    valid_methods = ("wordnet", "en_dict", "combined")
    if method not in valid_methods:
        # An unknown method used to fall through every branch and report "no
        # non-English words" for every input, hiding the mistake.
        raise ValueError(f"Unknown method '{method}'; expected one of {valid_methods}")

    # Tags only used for tagged-based filtering
    excluded_pos_tags = {'NNP', 'NNPS', 'IN', 'DT', 'WP', 'WP$', 'WRB', 'PRP', 'PRP$', 'CC', 'TO', 'MD', 'EX', 'UH'}

    # Membership is tested once per token, so make sure it is a set lookup.
    if isinstance(stopword_list, (set, frozenset)):
        stopword_set = stopword_list
    else:
        stopword_set = set(stopword_list)

    # Tokenize and POS tag the full sentence
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(input_text))

    non_english_words = []
    for word, tag in tagged_tokens:
        lowered_word = word.lower()
        # Skip tokens that are not alphabetic, are stopwords, or carry an excluded tag
        if not word.isalpha() or lowered_word in stopword_set or tag in excluded_pos_tags:
            continue

        lemma_word = lemmatizer.lemmatize(lowered_word)

        # Each lookup runs only when the chosen method (or the debug trace) needs it.
        in_wordnet = None
        in_en_dict = None
        if debug or method != "en_dict":
            in_wordnet = bool(wordnet.synsets(lemma_word))
        if debug or method != "wordnet":
            # ENGLISH_VOCAB is built from lower-cased entries, so the lemma (derived
            # from the lower-cased word) and the lower-cased surface form are the
            # only candidates worth probing.
            in_en_dict = lemma_word in ENGLISH_VOCAB or lowered_word in ENGLISH_VOCAB

        if method == "wordnet":
            is_english = in_wordnet
        elif method == "en_dict":
            is_english = in_en_dict
        else:  # "combined"
            is_english = in_wordnet or in_en_dict

        if not is_english:
            non_english_words.append(word)

        if debug:
            print(f"Word: {word}, Tag: {tag}, Lemma: {lemma_word}, In WordNet: {in_wordnet}, In English Dict: {in_en_dict}")

    if debug:
        print(f"Original input: {input_text}")
        print(f"Non-English words found: {non_english_words}")

    return non_english_words

In [4]:
# Trace find_non_english_word on a single clue with debug output switched on.
# stopword_list is left at its default (an empty set), so stopwords are not filtered here.
find_non_english_word(
    input_text="This soccer player whose real first name is Mariel is one of 4 women to have scored over 100 goals in international play.",
    debug=True,
)

# nltk.pos_tag(['Richard'])[0][1] == 'NNP'

# lemmatizer.lemmatize('praciticing', 'v')

Word: soccer, Tag: NN, Lemma: soccer, In WordNet: True, In English Dict: True
Word: player, Tag: NN, Lemma: player, In WordNet: True, In English Dict: True
Word: real, Tag: JJ, Lemma: real, In WordNet: True, In English Dict: True
Word: first, Tag: JJ, Lemma: first, In WordNet: True, In English Dict: True
Word: name, Tag: NN, Lemma: name, In WordNet: True, In English Dict: True
Word: is, Tag: VBZ, Lemma: is, In WordNet: True, In English Dict: True
Word: is, Tag: VBZ, Lemma: is, In WordNet: True, In English Dict: True
Word: one, Tag: CD, Lemma: one, In WordNet: True, In English Dict: True
Word: women, Tag: NNS, Lemma: woman, In WordNet: True, In English Dict: True
Word: have, Tag: VB, Lemma: have, In WordNet: True, In English Dict: True
Word: scored, Tag: VBN, Lemma: scored, In WordNet: True, In English Dict: True
Word: goals, Tag: NNS, Lemma: goal, In WordNet: True, In English Dict: True
Word: international, Tag: JJ, Lemma: international, In WordNet: True, In English Dict: True
Word: pl

[]

In [5]:
# Load the raw dataset
JSON_FILE_PATH = './dataset/JEOPARDY_QUESTIONS1.json'
df = load_json_data(JSON_FILE_PATH)

# load_json_data signals failure by returning None; stop here with a clear message
# instead of letting the next cell fail with a TypeError on `df['question']`.
if df is None:
    raise RuntimeError(f"Could not load the dataset from '{JSON_FILE_PATH}'")

In [6]:
# Pre-cleaning steps

# Keep the raw text in original_question. The guard makes the cell safe to re-run:
# without it, a second run would overwrite the raw text with the already-cleaned text.
if 'original_question' not in df.columns:
    df['original_question'] = df['question'].copy()

# Always derive the cleaned text from the raw text:
# first strip HTML tags, then strip quotes wrapping the whole question.
df['question'] = (
    df['original_question']
    .apply(strip_html_tags)
    .apply(strip_quotes)
)

In [7]:
# Check for spelled numbers in the 'question' column
# (set copy of SPELLED_NUMBERS so each token lookup is O(1) instead of a list scan)
_spelled_number_set = set(SPELLED_NUMBERS)
df['has_spelled_number'] = df['question'].apply(
    lambda text: not _spelled_number_set.isdisjoint(nltk.word_tokenize(text.lower()))
)

# Extract roman numerals in the 'question' column
df['roman_text'] = df['question'].apply(find_valid_roman_numerals)
df['has_roman_numeral'] = df['roman_text'].map(len) > 0

# Check for numerical values in the 'question' column.
# A question has a token containing a digit exactly when the question text
# itself contains a digit, so no tokenisation is needed for this flag.
df['has_numerical_value'] = df['question'].str.contains(r'\d', regex=True)

# has_number: any of the three forms above
df['has_number'] = df['has_spelled_number'] | df['has_numerical_value'] | df['has_roman_numeral']

In [19]:
# Output samples of filtered numbers.
# The sample size comes from the notebook-level constant instead of the
# function's hard-coded default, so it can be tuned in one place.
for _flag_column in ('has_spelled_number', 'has_roman_numeral', 'has_numerical_value'):
    display_sample_questions(df, _flag_column, sample_count=NUMBER_OF_SAMPLES_TO_SHOW)


Sampling questions with 'has_spelled_number' set to True:
127425 - [AUTHOR-ITATIVE INFORMATION] In 1913 he published his first book of poems as well as "Sons and Lovers"
214396 - [QUOTATIONS] (Jon of the Clue Crew reports from the Freud Museum in Vienna.) Despite all the research that's been done on Freud, no one can definitely say if, or when, he made the attributed remark "sometimes a" this "is just a" this
78634 - [WELCOME BACK, QATAR] This satellite TV news service based in Qatar has become one of the most important Middle East broadcasters

Sampling questions with 'has_roman_numeral' set to True:
156466 - [1492] Casimir IV was succeeded as king of this country by his son John Albert
128942 - [YOU TAKE A MILE] You might be MIA at MIA, this city's international airport; its concourse D is 1.3 miles long, end to end
209451 - [WORLD HISTORY] In 1488 this king of Spain sent 100 Moorish slaves to Pope Innocent VIII who gave them as gifts to Cardinals

Sampling questions with 'has_numeri

In [None]:
# Collect, per question, the words not recognised by the "combined" check.
print("Parsing questions for non-English words... (this may take a while)", end=' ')

df['non_english_words'] = df['question'].apply(
    lambda question_text: find_non_english_word(
        question_text,
        method="combined",
        stopword_list=STOPWORDS,
        lemmatizer=lemmatizer,
        debug=False,
    )
)

print("Done.")

# A question is flagged when its list of unrecognised words is non-empty.
df['has_non_english_word'] = df['non_english_words'].apply(
    lambda found_words: bool(found_words)
)

Parsing questions for non-English words... (this may take a while)


In [24]:
# Show ten sampled questions flagged as containing non-English words,
# together with the words that triggered the flag.
display_sample_questions(
    df,
    'has_non_english_word',
    extra_details_col='non_english_words',
    sample_count=10,
)


Sampling questions with 'has_non_english_word' set to True, showing 'non_english_words':
174221 - [NO. 1 HITS OF THE '70s] This No. 1 hit by the Bee Gees says, "What you doin' and you're laying on your back, aah" (Extra: ['doin'])
102332 - [SWORDS] The name of this highly curved, highly deadly sword comes from the Persian shamshir (Extra: ['shamshir'])
34270 - [CHRISTMAS CUISINE] At Christmastime you might have a "burning" desire to make a buche de noel, a cake shaped like this (Extra: ['buche'])
54287 - [MY OFF-SHOW WARDROBE] Hey, there, Daddy-O--it's the rhymin' name of the threads I'm stylin' here (Extra: ['rhymin', 'stylin'])
82097 - ["RED" SCIENCE] This salad ingredient, capsicum annuum grossum, has been allowed to ripen on the vine (Extra: ['annuum', 'grossum'])
183869 - [SHE INVENTED WHAT?] Rachel Brown & Elizabeth Hazen invented nystatin, an anitbiotic used on people & on this "Dutch" tree disease (Extra: ['anitbiotic'])
71354 - [BRITCOMS] This 1980-84 britcom about a strugglin

In [11]:
# How many questions were flagged versus not flagged
df['has_non_english_word'].value_counts()

has_non_english_word
False    204598
True      12332
Name: count, dtype: int64

dictionary
has_non_english_word -- en_dict limited checks
False    118159
True      98771
Name: count, dtype: int64

tagg
has_non_english_word using tagg
False    198435
True      18495
Name: count, dtype: int64

In [12]:
# Check whether the plural form "others" is present in the lower-cased NLTK word list
print("others" in ENGLISH_VOCAB)
# nltk.pos_tag(['Run'])[0][1] == 'NNP'

# Trace the "en_dict" method on the question stored at positional index 175864
find_non_english_word(
    df['question'].iloc[175864],
    method="en_dict",
    stopword_list=STOPWORDS,
    lemmatizer=lemmatizer,
    debug=True,
)

False
Word: testing, Tag: VBG, Lemma: testing, In WordNet: True, In English Dict: True
Word: new, Tag: JJ, Lemma: new, In WordNet: True, In English Dict: True
Word: drugs, Tag: NNS, Lemma: drug, In WordNet: True, In English Dict: True
Word: control, Tag: NN, Lemma: control, In WordNet: True, In English Dict: True
Word: group, Tag: NN, Lemma: group, In WordNet: True, In English Dict: True
Word: given, Tag: VBN, Lemma: given, In WordNet: True, In English Dict: True
Word: nonmedical, Tag: JJ, Lemma: nonmedical, In WordNet: False, In English Dict: True
Word: substance, Tag: NN, Lemma: substance, In WordNet: True, In English Dict: True
Word: made, Tag: VBN, Lemma: made, In WordNet: True, In English Dict: True
Word: sugar, Tag: NN, Lemma: sugar, In WordNet: True, In English Dict: True
Word: saline, Tag: JJ, Lemma: saline, In WordNet: True, In English Dict: True
Word: solution, Tag: NN, Lemma: solution, In WordNet: True, In English Dict: True
Original input: When testing new drugs, a control 

[]