In [210]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
import logging

from nltk.corpus import wordnet, words, stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('words', quiet=True)
nltk.download('stopwords', quiet=True)

lemmatizer = WordNetLemmatizer()

In [212]:
# Constants and Patterns

# Regex for matching roman numerals
ROMAN_NUMERAL_PATTERN = re.compile(
    r'^(M{0,3})(CM|CD|D?C{0,3})'
    r'(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$', re.IGNORECASE
)

# Spelled-out numbers
SPELLED_NUMBERS = [
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty",
    "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    "hundred", "thousand", "million", "billion"
]

SPELLED_NUMBERS += ['first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth']
SPELLED_NUMBERS += ['eleventh', 'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth', 'eighteenth', 'nineteenth']
SPELLED_NUMBERS += ['twentieth', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth']
SPELLED_NUMBERS += ['hundredth', 'thousandth', 'millionth', 'billionth']

SPELLED_NUMBERS += ['twice', 'thrice', 'once']
SPELLED_NUMBERS += ['single', 'double', 'triple', 'quadruple', 'quintuple', 'sextuple', 'septuple', 'octuple', 'nonuple', 'decuple']
SPELLED_NUMBERS += ['dozen', 'fortnight', 'score', 'century', 'millennium']

STOPWORDS = set(stopwords.words("english"))

ENGLISH_VOCAB = set(w.lower() for w in words.words())



In [290]:

# File functions
# Load JSON data from a file
def load_json_data(file_path):
    try:
        data = pd.read_json(file_path, orient="records", lines=False)
        return data
    except ValueError as e:
        print(f"Error loading JSON data: {e}")
        return None


def strip_html_tags(text):
    """
    Strips HTML tags from a string.

    Args:
        text (str): The input string potentially containing HTML tags.

    Returns:
        str: The string with HTML tags removed.
    """
    # Regex to find any HTML tag: < followed by any characters, then >
    # The '?' makes it non-greedy, matching the shortest possible string.
    # The '|' handles self-closing tags like <br/> or <img src="...">
    # and also comments (though less common in user-generated text usually)
    clean = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(clean, '', text)

def strip_quotes(text):
    """
    Strips quotes only if it appears at the start and end of the argument text.
    """
    if text.startswith('"') and text.endswith('"'):
        return text[1:-1]
    elif text.startswith("'") and text.endswith("'"):
        return text[1:-1]
    return text

def is_valid_roman(string):
    """
    Checks if the input text is a valid Roman numeral.

    Args:
        text (str): The input string to check.

    Returns:
        bool: True if the text is a valid Roman numeral, False otherwise.
    """
    return bool(ROMAN_NUMERAL_PATTERN.match(string))

def is_noun_roman_bigram(bigram):
    """
    Checks if the bigram contains a valid Roman numeral and a noun.

    Args:
        bigram (tuple): A tuple containing two words (bigram).

    Returns:
        bool: True if the first word is a valid Roman numeral and the second word is a noun, False otherwise.
    """
    return is_noun(bigram[0]) and is_valid_roman(bigram[1])

def find_valid_roman_numerals(input_text, debug=False):
    """
    Parses the input text to find and return valid Roman numerals.

    Args:
        input_text (str): The input string to parse.

    Returns:
        list: A list of valid Roman numerals found in the input text.
    """
    # Split the input text into words
    # input_tokens = nltk.word_tokenize(input_text)
    input_tokens = input_text.split()
    valid_roman_numerals = []

    for idx in range(len(input_tokens)):
        # Check if the word is a valid Roman numeral
        if debug:
            print(f"Checking token: {input_tokens[idx]} at index {idx}")
        
        # If entire token is not uppercase, skip it
        if not input_tokens[idx].isupper():
            continue

        if len(input_tokens[idx]) <= 2:
            if idx == 0:
                continue  # Short words at the start are likely not valid Roman numerals

            first_word, second_word = input_tokens[idx-1], input_tokens[idx]
            # If first word ends in comma or period, skip it
            if first_word.endswith((',', '.', ';', ':')):
                if debug:
                    print(f"Skipping due to punctuation: {first_word}")
                continue

            # Remove any symbols from first word
            first_word = re.sub(r'[^\w\s]', '', first_word)

            # Check if previous word is a noun followed by a valid Roman numeral
            if is_noun_roman_bigram((first_word, second_word)):
                # If the previous word is a noun and the current word is a valid Roman numeral
                if debug:
                    print(f"Found valid Roman numeral: {first_word} {second_word} -- adding {input_tokens[idx]}")
                valid_roman_numerals.append(input_tokens[idx])
        else: 
            if is_valid_roman(input_tokens[idx]):
                valid_roman_numerals.append(input_tokens[idx])

    if debug:
        print(f"Original input: {input_text}")
        print(f"Valid Roman numerals found: {valid_roman_numerals}")

    
    # Filter and return only valid Roman numerals
    return valid_roman_numerals

def is_noun(word):
    # NLTK POS tagger expects a list of tokens
    pos = nltk.pos_tag([word])[0][1]
    # Nouns in Penn Treebank tagset start with 'NN'
    return pos.startswith('NN')


def find_non_english_word(input_text, filter="wordnet", stopword_list=STOPWORDS, lemmatizer=lemmatizer, debug=False):
    """
        Finds non-English words in the input text using POS tagging and lemmatization.

        Args:
            input_text (str): The input string to parse.
            filter (str, optional): Method to determine English words. 
                "wordnet" uses WordNet synsets, "dict" uses ENGLISH_VOCAB. Default is "wordnet".
            stopword_list (set, optional): Set of stopwords to ignore. Default is STOPWORDS.
            lemmatizer (WordNetLemmatizer, optional): Lemmatizer for reducing words to base form. Default is lemmatizer.
            debug (bool, optional): If True, prints debug information. Default is False.

        Returns:
            list: A list of non-English words found in the input text.
        """
    # Tags only used for tagged-based filtering
    function_tags = ['NNP', 'NNPS', 'IN', 'DT', 'WP', 'WP$', 'WRB', 'PRP', 'PRP$', 'CC', 'TO', 'MD', 'EX', 'UH']

    # Tokenize and POS tag the full sentence
    input_tokens = nltk.word_tokenize(input_text)
    tagged_tokens = nltk.pos_tag(input_tokens)

    # Filter out tokens that are not alphabetic or are stopwords or have a function tag
    filtered_tokens = [
        (word, tag) for word, tag in tagged_tokens
        if word.isalpha() and word.lower() not in stopword_list and tag not in function_tags
    ]

    non_english_words = []
    for word, tag in filtered_tokens:
        lemma_word = lemmatizer.lemmatize(word.lower())
        if filter == "wordnet" and not wordnet.synsets(lemma_word):
            non_english_words.append(word)
        elif filter == "dict" and not (lemma_word in ENGLISH_VOCAB or word.lower() in ENGLISH_VOCAB):
            non_english_words.append(word)

        if debug:
            print(f"Word: {word}, Tag: {tag}, Lemma: {lemma_word}, Is English: {bool(wordnet.synsets(lemma_word))}")

    if debug:
        print(f"Original input: {input_text}")
        print(f"Non-English words found: {non_english_words}")

    return non_english_words

In [283]:
find_non_english_word("This soccer player whose real first name is Mariel is one of 4 women to have scored over 100 goals in international play.", debug=True)

# nltk.pos_tag(['Richard'])[0][1] == 'NNP'

# lemmatizer.lemmatize('praciticing', 'v')

Word: soccer, Tag: NN, Lemma: soccer, Is English: True
Word: player, Tag: NN, Lemma: player, Is English: True
Word: real, Tag: JJ, Lemma: real, Is English: True
Word: first, Tag: JJ, Lemma: first, Is English: True
Word: name, Tag: NN, Lemma: name, Is English: True
Word: one, Tag: CD, Lemma: one, Is English: True
Word: women, Tag: NNS, Lemma: woman, Is English: True
Word: scored, Tag: VBN, Lemma: scored, Is English: True
Word: goals, Tag: NNS, Lemma: goal, Is English: True
Word: international, Tag: JJ, Lemma: international, Is English: True
Word: play, Tag: NN, Lemma: play, Is English: True
Original input: This soccer player whose real first name is Mariel is one of 4 women to have scored over 100 goals in international play.
Non-English words found: []


[]

In [180]:
# preprocess the data
JSON_FILE_PATH = './dataset/JEOPARDY_QUESTIONS1.json'
df = load_json_data(JSON_FILE_PATH)

In [181]:
# Pre-cleaning steps

# create copy of question to original_question
df['original_question'] = df['question'].copy()

# Strip HTML tags from the 'question' column
df['question'] = df['question'].apply(strip_html_tags)

# Strip quotes from the 'question' column
df['question'] = df['question'].apply(strip_quotes)

In [None]:
# Check for spelled numbers in the 'question' column
df['has_spelled_number'] = df['question'].apply(
    lambda x: any(word in SPELLED_NUMBERS for word in nltk.word_tokenize(x.lower()))
)

# Extract roman numerals in the 'question' column
df['roman_text'] = df['question'].apply(
    lambda x: find_valid_roman_numerals(x)
)
df['has_roman_numeral'] = df['roman_text'].apply(
    lambda x: len(x) > 0
)

# Check for numerical values in the 'question' column
df['has_numerical_value'] = df['question'].apply(
    lambda x: any(re.search(r'\d', token) for token in nltk.word_tokenize(x))
)

# has_number 
df['has_number'] = df['has_spelled_number'] | df['has_numerical_value'] | df['has_roman_numeral']

In [183]:
# Output samples of spelling numbers
print("Sample questions with spelled numbers:")
sample_list = df[df['has_spelled_number']].sample(5)[['category', 'question']]

for idx, row in sample_list.iterrows():
    print(f"{idx}. Category: {row['category']}, Question: {row['question']}")

# Output samples of roman numerals
print("Sample questions with roman numerals:")
sample_list = df[df['has_roman_numeral']].sample(5)[['category', 'question']]
for idx, row in sample_list.iterrows():
    print(f"{idx}. Category: {row['category']}, Question: {row['question']}")

# Output samples of numerical values
print("Sample questions with numerical values:")
sample_list = df[df['has_numerical_value']].sample(5)[['category', 'question']]
for idx, row in sample_list.iterrows():
    print(f"{idx}. Category: {row['category']}, Question: {row['question']}")


Sample questions with spelled numbers:
173009. Category: GOING HORSE, Question: This capital of Canada's Yukon Territory was once home to the world's largest Tungsten reserve
162543. Category: SPEAK UP!, Question: An Irish wit & playwright:"Bigamy is having one wife too many.  Monogamy is the same"
180058. Category: MAGAZINES, Question: Yachting magazine celebrates its 100th anniversary in 2007; the first issue featured an article by this tea merchant
75069. Category: & NEVER THE TWAINS SHALL MEET, Question: Mark first visited this city in 1872; in 1999 Shania played the Prince's Trust concert in Hyde Park there
151329. Category: LOTTO FEVER, Question: 13 workers at one of this coffee chain's stores in California became stars when they won big bucks in 2000...$87 million
Sample questions with roman numerals:
126213. Category: FELONIOUS MONKS, Question: In 1589 a fanatical monk named Jacques Clement assassinated this country's reigning King Henry III
163639. Category: BALLET, Question: 

In [261]:
df['non_english_words'] = df['question'].apply(
    lambda x: find_non_english_word(x)
)

df['has_non_english_word'] = df['non_english_words'].apply(
    lambda x: len(x) > 0
)

In [284]:
# Show sample questions with non-English words
print("Sample questions with non-English words:")
sample_list = df[df['has_non_english_word']].sample(10)[['category', 'question', 'non_english_words']]
for idx, row in sample_list.iterrows():
    print(f"{idx}. Q: {row['question']}, Non-EN: {row['non_english_words']}")

Sample questions with non-English words:
154842. Q: This name of an airport runway surface comes from a substance in it & inventor McAdam, Non-EN: ['McAdam']
99382. Q: Like some gouramis, several grunts are famous for doing this when 2 of them meet, Non-EN: ['gouramis']
168539. Q: It's where Finns traditionally go to feel the loyly, "steam heat", & get hikinen, "sweaty", Non-EN: ['loyly', 'hikinen', 'sweaty']
207978. Q: In 787, during Pope Adrian I's reign, the 2nd Council of this "creed" city tried to resolve the iconoclatic controversy, Non-EN: ['iconoclatic']
88900. Q: "Please, Lily, understand, everything I did was to keep you and the twins safe.""Cane, you pretended that you were dead and made me think that I was sleeping with a ghost.  I ended up in an insane asylum.  You're..."...this 10-letter adjective from the Latin for "look down on" (& favored by Daffy Duck on occasion), Non-EN: ['everything']
34014. Q: Phoenix's firefighting museum isn't called the Hall of Fame but the Hal

In [292]:
# test if 'richard' is nnp
"everything" in ENGLISH_VOCAB
# nltk.pos_tag(['Run'])[0][1] == 'NNP'

find_non_english_word(df.iloc[88900].question, debug=True)

Word: understand, Tag: NN, Lemma: understand, Is English: True
Word: everything, Tag: NN, Lemma: everything, Is English: False
Word: keep, Tag: VB, Lemma: keep, Is English: True
Word: twins, Tag: NNS, Lemma: twin, Is English: True
Word: safe, Tag: JJ, Lemma: safe, Is English: True
Word: pretended, Tag: VBD, Lemma: pretended, Is English: True
Word: dead, Tag: JJ, Lemma: dead, Is English: True
Word: made, Tag: VBD, Lemma: made, Is English: True
Word: think, Tag: VB, Lemma: think, Is English: True
Word: sleeping, Tag: VBG, Lemma: sleeping, Is English: True
Word: ghost, Tag: NN, Lemma: ghost, Is English: True
Word: ended, Tag: VBD, Lemma: ended, Is English: True
Word: insane, Tag: NN, Lemma: insane, Is English: True
Word: asylum, Tag: NN, Lemma: asylum, Is English: True
Word: adjective, Tag: NN, Lemma: adjective, Is English: True
Word: look, Tag: VB, Lemma: look, Is English: True
Word: favored, Tag: VBN, Lemma: favored, Is English: True
Word: occasion, Tag: NN, Lemma: occasion, Is English:

['everything']