In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import wordnet, words, stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('words', quiet=True)
nltk.download('stopwords', quiet=True)

lemmatizer = WordNetLemmatizer()

In [42]:
# Constants and Patterns

# Regex for matching roman numerals
ROMAN_NUMERAL_PATTERN = re.compile(
    r'^(M{0,3})(CM|CD|D?C{0,3})'
    r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE
)

# Spelled-out numbers and their variations
SPELLED_NUMBERS = [
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    "hundred", "thousand", "million", "billion"
]

SPELLED_NUMBERS += ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
SPELLED_NUMBERS += ['eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth']
SPELLED_NUMBERS += ['twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth']
SPELLED_NUMBERS += ['hundredth', 'thousandth', 'millionth', 'billionth']

SPELLED_NUMBERS += ['twice', 'thrice', 'once']
SPELLED_NUMBERS += ['single', 'double', 'triple', 'quadruple', 'quintuple', 'sextuple', 'septuple', 'octuple', 'nonuple', 'decuple']
SPELLED_NUMBERS += ['dozen', 'fortnight', 'score', 'century', 'millennium']

# Set of spelled-out numbers for quick lookup
SPELLED_NUMBERS_SET = set(SPELLED_NUMBERS)

# Set of stopwords from NLTK
STOPWORDS = set(stopwords.words("english"))

# English vocabulary set for filtering unusual proper nouns
ENGLISH_VOCAB = set(w.lower() for w in words.words())

# Number of samples to show in the display function
NUMBER_OF_SAMPLES_TO_SHOW = 3

# Threshold for low frequency words, used in filtering unusual proper nouns
LOW_FREQUENCY_THRESHOLD = 2



In [None]:

# File functions
# Load JSON data from a file
def load_json_data(file_path):
    '''
    Loads JSON data from the specified file path into a pandas DataFrame.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        pd.DataFrame: DataFrame containing the loaded JSON data, or None if loading fails.
    '''
    try:
        data = pd.read_json(file_path, orient="records", lines=False)
        return data
    except ValueError as e:
        print(f"Error loading JSON data: {e}")
        return None


def strip_html_tags(text):
    """
    Strips HTML tags from a string.

    Args:
        text (str): The input string potentially containing HTML tags.

    Returns:
        str: The string with HTML tags removed.
    """
    # Regex to find any HTML tag: < followed by any characters, then >
    # The '?' makes it non-greedy, matching the shortest possible string.
    # The '|' handles self-closing tags like <br/> or <img src="...">
    # and also comments (though less common in user-generated text usually)
    clean = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(clean, '', text)

def display_sample_questions(df, lookup_column = None, extra_details_col = None, sample_count = 3):
    """
    Displays a sample of questions from the DataFrame where the specified lookup_column is True.

    Args:
        df (pd.DataFrame): The DataFrame containing the questions.
        lookup_column (str, optional): The column name to use as a boolean filter for selecting rows.
        sample_count (int, optional): The number of sample questions to display. Default is 3.

    Raises:
        ValueError: If lookup_column is None or empty.

    Prints:
        The index, category, and question text for each sampled row.
    """
    if lookup_column is None:
        raise ValueError("lookup_column argument cannot be empty or None")
    
    if df is None or lookup_column not in df.columns:
        raise ValueError(f"DataFrame is None or lookup_column '{lookup_column}' does not exist in DataFrame")
    
    if extra_details_col is not None and extra_details_col not in df.columns:
        raise ValueError(f"show_supplementary_column '{extra_details_col}' does not exist in DataFrame")
    
    if extra_details_col is None:
        print(f"Sampling questions with '{lookup_column}' set to True:")

        sample_list = df[df[lookup_column]].sample(sample_count)[['category', 'question']]
        for idx, row in sample_list.iterrows():
            print(f"{idx} - [{row['category']}] {row['question']}")

    else:
        print(f"Sampling questions with '{lookup_column}' set to True, showing '{extra_details_col}':")

        sample_list = df[df[lookup_column]].sample(sample_count)[['category', 'question', extra_details_col]]

        for idx, row in sample_list.iterrows():
            print(f"{idx} - [{row['category']}] {row['question']} (Extra: {row[extra_details_col]})")
        
    

    print()

def strip_quotes(text):
    """
    Strips quotes only if it appears at the start and end of the argument text.
    """
    if text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    elif text.startswith("'") and text.endswith("'"):
        return text[1:-1]
    return text

def is_valid_roman(string):
    """
    Checks if the input text is a valid Roman numeral.

    Args:
        text (str): The input string to check.

    Returns:
        bool: True if the text is a valid Roman numeral, False otherwise.
    """
    return bool(ROMAN_NUMERAL_PATTERN.match(string))

def is_noun_roman_bigram(bigram):
    """
    Checks if the bigram contains a valid Roman numeral and a noun.

    Args:
        bigram (tuple): A tuple containing two words (bigram).

    Returns:
        bool: True if the first word is a valid Roman numeral and the second word is a noun, False otherwise.
    """
    return is_noun(bigram[0]) and is_valid_roman(bigram[1])

def find_valid_roman_numerals(input_text, debug=False):
    """
    Parses the input text to find and return valid Roman numerals.

    Args:
        input_text (str): The input string to parse.

    Returns:
        list: A list of valid Roman numerals found in the input text.
    """
    # Split the input text into words
    # input_tokens = nltk.word_tokenize(input_text)
    input_tokens = input_text.split()
    valid_roman_numerals = []

    for idx in range(len(input_tokens)):
        # Check if the word is a valid Roman numeral
        if debug:
            print(f"Checking token: {input_tokens[idx]} at index {idx}")
        
        # If entire token is not uppercase, skip it
        if not input_tokens[idx].isupper():
            continue

        if len(input_tokens[idx]) <= 2:
            if idx == 0:
                continue  # Short words at the start are likely not valid Roman numerals

            first_word, second_word = input_tokens[idx-1], input_tokens[idx]
            # If first word ends in comma or period, skip it
            if first_word.endswith((',', '.', ';', ':')):
                if debug:
                    print(f"Skipping due to punctuation: {first_word}")
                continue

            # Remove any symbols from first word
            first_word = re.sub(r'[^\w\s]', '', first_word)

            # Check if previous word is a noun followed by a valid Roman numeral
            if is_noun_roman_bigram((first_word, second_word)):
                # If the previous word is a noun and the current word is a valid Roman numeral
                if debug:
                    print(f"Found valid Roman numeral: {first_word} {second_word} -- adding {input_tokens[idx]}")
                valid_roman_numerals.append(input_tokens[idx])
        else: 
            if is_valid_roman(input_tokens[idx]):
                valid_roman_numerals.append(input_tokens[idx])

    if debug:
        print(f"Original input: {input_text}")
        print(f"Valid Roman numerals found: {valid_roman_numerals}")

    
    # Filter and return only valid Roman numerals
    return valid_roman_numerals

def is_noun(word):
    # NLTK POS tagger expects a list of tokens
    pos = nltk.pos_tag([word])[0][1]
    # Nouns in Penn Treebank tagset start with 'NN'
    return pos.startswith('NN')


# def find_non_english_word(input_text :str, method="wordnet", stopword_list=STOPWORDS, lemmatizer=lemmatizer, debug=False) -> list[str]:
def find_non_english_word(
        input_text: str,
        method: str = "wordnet",
        stopword_list: set[str] | list[str] = set(),
        lemmatizer: "WordNetLemmatizer" = lemmatizer,
        debug: bool = False
    ) -> list[str]:

    """
    Identify non-English words in the input text using POS tagging and lemmatization.

    Args:
        input_text (str): The input string to analyze.
        method (str, optional): Method to determine if a word is English.
            - "wordnet": Uses WordNet synsets (default).
            - "en_dict": Uses the NLTK English vocabulary word list.
        stopword_list (set or list of str): Required. A set or list of stopwords to ignore (e.g., set(stopwords.words("english"))).
        lemmatizer (WordNetLemmatizer): Required. An instance of WordNetLemmatizer must be provided by the caller.
        debug (bool, optional): If True, prints debug information. Default is False.

    Returns:
        list: A list of non-English words found in the input text.

    Notes:
        - Only alphabetic, non-stopword, and non-function-tagged words are checked.
        - POS tags excluded: NNP, NNPS, IN, DT, WP, WP$, WRB, PRP, PRP$, CC, TO, MD, EX, UH.
        - The method parameter controls the English word check:
            * "wordnet": A word is considered English if it has WordNet synsets.
            * "en_dict": A word is considered English if it is in the NLTK English vocabulary.
    """
    # Tags only used for tagged-based filtering
    excluded_pos_tags = ['NNP', 'NNPS', 'IN', 'DT', 'WP', 'WP$', 'WRB', 'PRP', 'PRP$', 'CC', 'TO', 'MD', 'EX', 'UH']

    # Tokenize and POS tag the full sentence
    input_tokens = nltk.word_tokenize(input_text)
    tagged_tokens = nltk.pos_tag(input_tokens)

    # Filter out tokens that are not alphabetic or are stopwords or in one of the predefined function tags
    filtered_tokens = [
        (word, tag) for word, tag in tagged_tokens
        if word.isalpha() and word.lower() not in stopword_list and tag not in excluded_pos_tags
    ]

    non_english_words = []
    for word, tag in filtered_tokens:
        lemma_word = lemmatizer.lemmatize(word.lower())
        if method == "wordnet" and not wordnet.synsets(lemma_word):
            # Check lemma words in WordNet only
            non_english_words.append(word)
        elif method == "en_dict" and not (lemma_word in ENGLISH_VOCAB or lemma_word.lower() in ENGLISH_VOCAB or lemma_word.upper() in ENGLISH_VOCAB or word.lower() in ENGLISH_VOCAB):
            # Check lemma words in the NLTK English vocabulary
            non_english_words.append(word)
        elif method == "combined":
            # Check both WordNet and NLTK English vocabulary
            in_wordnet = bool(wordnet.synsets(lemma_word))
            in_en_dict = lemma_word in ENGLISH_VOCAB or lemma_word.lower() in ENGLISH_VOCAB or lemma_word.upper() in ENGLISH_VOCAB or word.lower() in ENGLISH_VOCAB
            if not (in_wordnet or in_en_dict):
                non_english_words.append(word)

        if debug:
            print(f"Word: {word}, Tag: {tag}, Lemma: {lemma_word}, In WordNet: {bool(wordnet.synsets(lemma_word))}, In English Dict: {lemma_word in ENGLISH_VOCAB or lemma_word.lower() in ENGLISH_VOCAB or lemma_word.upper() in ENGLISH_VOCAB or word.lower() in ENGLISH_VOCAB}")

    if debug:
        print(f"Original input: {input_text}")
        print(f"Non-English words found: {non_english_words}")

    return non_english_words


def find_proper_nouns(text: str, debug: str = False) -> list[str]:
    """
    Extract all proper nouns (NNP, NNPS) from the input text.

    Args:
        text (str): The input string to analyze.

    Returns:
        list[str]: A list of proper nouns found in the text.
    """
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)

    if debug:
        print(f"Input text: {text}")
        for word, tag in tagged:
            print(f"Word: {word}, Tag: {tag}")

    return [word for word, tag in tagged if tag in ("NNP", "NNPS")]

def generate_word_frequency(df: pd.DataFrame, column: str):
    if column not in df.columns:
        raise ValueError(f"Column '{column}' does not exist in DataFrame")
    
    # Concatenate all questions into a single string
    all_text = ' '.join(df[column].astype(str).tolist())

    # Convert to lowercase and remove HTML tags
    all_text = all_text.lower()
    all_text = strip_html_tags(all_text)

    # Tokenize the text into words
    words = nltk.word_tokenize(all_text)
    # Create a frequency distribution of the words
    freq_dist = nltk.FreqDist(words)

    # Convert the frequency distribution to a DataFrame
    freq_df = pd.DataFrame(freq_dist.items(), columns=['word', 'frequency'])

    # Sort the DataFrame by frequency in descending order
    freq_df = freq_df.sort_values(by='frequency', ascending=False).reset_index(drop=True)

    return freq_df

In [15]:
# find_non_english_word("This soccer player whose real first name is Mariel is one of 4 women to have scored over 100 goals in international play.", debug=True)

# nltk.pos_tag(['Richard'])[0][1] == 'NNP'

# lemmatizer.lemmatize('praciticing', 'v')

proper_nouns = find_proper_nouns("Barack Obama was the 44th President of the United States.")
print(proper_nouns)  # Output: ['Barack', 'Obama', 'United', 'States']

['Barack', 'Obama', 'President', 'United', 'States']


In [5]:
# preprocess the data
JSON_FILE_PATH = './dataset/JEOPARDY_QUESTIONS1.json'
df = load_json_data(JSON_FILE_PATH)

In [6]:
# Pre-cleaning steps

# create copy of question to original_question
df['original_question'] = df['question'].copy()

# Strip HTML tags from the 'question' column
df['question'] = df['question'].apply(strip_html_tags)

# Strip quotes from the 'question' column
df['question'] = df['question'].apply(strip_quotes)

In [None]:
# Check for spelled numbers in the 'question' column
df['has_spelled_number'] = df['question'].apply(
    lambda x: bool(set(nltk.word_tokenize(x.lower())) & SPELLED_NUMBERS_SET)
)

# Extract roman numerals in the 'question' column
df['roman_text'] = df['question'].apply(
    lambda x: find_valid_roman_numerals(x)
)
df['has_roman_numeral'] = df['roman_text'].apply(
    lambda x: len(x) > 0
)

# Check for numerical values in the 'question' column
df['has_numerical_value'] = df['question'].apply(
    lambda x: any(re.search(r'\d', token) for token in nltk.word_tokenize(x))
)

# has_number 
df['has_number'] = df['has_spelled_number'] | df['has_numerical_value'] | df['has_roman_numeral']

In [8]:
# Output samples of filtered numbers
display_sample_questions(df, 'has_spelled_number')
display_sample_questions(df, 'has_roman_numeral')
display_sample_questions(df, 'has_numerical_value')


Sampling questions with 'has_spelled_number' set to True:
44099 - ["PEACE", BRO] The first  volunteers in this U.S. government humanitarian force went to Ghana & Tanzania in 1961
19452 - [LEX' LAB] Lex traded in his optical one for a scanning electron one & increased his magnification 50x
167080 - [CINEMA TRIVIA] Harrison Ford famously shot a swordsman in this first Indiana Jones film because he was too ill to do a fight scene

Sampling questions with 'has_roman_numeral' set to True:
17419 - [DYNASTY] The House of Savoy-Carignano was shuttered up in this country in 1946 when Humbert II left the throne
77487 - [FOOD & DRINK] Antonin Careme created Charlotte Russe & this sour-creamed meat dish for Czar Alexander I
8413 - [SHAKESPEAREAN WORDS] This word in "Henry VI Part 2" meant blase & world-weary, not having to do with nephrite

Sampling questions with 'has_numerical_value' set to True:
66788 - [STAR LIGHT, STAR BRIGHT] A spectral class G yellow star, it has an apparent magnitude of ab

In [None]:
print("Parsing questions for non-English words (this may take few minutes)...", end=' ')

df['non_english_words'] = df['question'].apply(
    lambda x: find_non_english_word(x, method="combined", stopword_list=STOPWORDS, lemmatizer=lemmatizer, debug=False)
)

print("Parsing finished.", end='\n')

df['has_non_english_word'] = df['non_english_words'].apply(
    lambda x: len(x) > 0
)

Parsing questions for non-English words... (this may take a while) Done.


In [10]:
# Show sample questions with non-English words
display_sample_questions(df, lookup_column='has_non_english_word', extra_details_col='non_english_words', sample_count=10)


Sampling questions with 'has_non_english_word' set to True, showing 'non_english_words':
86846 - [THE 1870s] Due to growing responsibilities of the Attorney General, Congress established this Cabinet dept. on June 22, 1870 (Extra: ['dept'])
101649 - [WHERE AM I?] (Cheryl of the Clue Crew will stands in front of a building that extends partway over a river.)  I'm at the library of this president who says it symbolizes his efforts to build bridges from yesterday to tomorrow (Extra: ['partway'])
183490 - [SITCOMS] Jon Cryer plays an agent who moved up from mailroom clerk at the Unlimited Talent Agency in this CBS sitcom (Extra: ['mailroom'])
29587 - [FLOWERS & CANDY] (Kelly of the Clue Crew shows some candy at the See's Candy Factory.)  A cookbook from the days of the Roman Empire has a recipe for "nucatum" which is the Latin root of this confection still popular today (Extra: ['nucatum'])
155063 - [TV CATCHPHRASES] "What you talkin' 'bout, Willis?" (Extra: ['talkin'])
208894 - [ONE FACT 

In [20]:
# # test if 'richard' is nnp
# print("others" in ENGLISH_VOCAB)
# # nltk.pos_tag(['Run'])[0][1] == 'NNP'

# find_non_english_word(df.iloc[175864].question, method="en_dict", stopword_list=STOPWORDS, lemmatizer=lemmatizer, debug=True)

In [25]:
# Unusual Proper Nouns

df['proper_nouns'] = df['question'].apply(
    lambda x: find_proper_nouns(x, debug=False)
)



In [26]:
# Generate word frequency DataFrame for the 'question' column
frequency_df = generate_word_frequency(df, 'question')


In [43]:
# Filter frequency to 1 and extract all the words
unusual_words = set(frequency_df[frequency_df['frequency'] <= LOW_FREQUENCY_THRESHOLD]['word'].tolist())

df['has_unusual_proper_noun'] = df['proper_nouns'].apply(
    lambda x: bool(set(word.lower() for word in x) & unusual_words)
)

In [44]:
# count the number of has_unusual_proper_noun
df['has_unusual_proper_noun'].value_counts()

has_unusual_proper_noun
False    189531
True      27399
Name: count, dtype: int64

In [45]:
display_sample_questions(df, lookup_column='has_unusual_proper_noun', extra_details_col='proper_nouns', sample_count=10)

Sampling questions with 'has_unusual_proper_noun' set to True, showing 'proper_nouns':
63367 - [DON'T LISTEN TO THEM!] In 1839 French painter Paul Delaroche said, "From today painting is dead" after an exhibition of these (Extra: ['Paul', 'Delaroche'])
143857 - [A MATTER OF TASTE] Despite the name of this brand of spread, chef Anthony Bourdain says, "I can" (Extra: ['Anthony', 'Bourdain'])
19053 - [GOOD OLD SOUTHERN EATS] A popular recipe for this pie was created by the wife of a sales executive at Karo syrup (Extra: ['Karo'])
117017 - [ROLLING STONE'S 20 MOST ANNOYING SONGS] This Sisqo song about a piece of clothing mentioned another song on the list, "Livin' La Vida Loca" (Extra: ['Sisqo', 'Livin', 'La', 'Vida', 'Loca'])
204818 - [SCOTLAND] This Scottish seaport is the seat of the Strathclyde state (Extra: ['Strathclyde'])
99261 - [HAPPY 200th, LINCOLN & DARWIN] Born in Kentucky Feb.12, 1809, Abe Lincoln lived from age 7 to 21 in this state before the move to Illinois (Extra: ['Kentu