In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import re

# Folder that holds the raw text files making up the corpus
folder_path = 'data-raw/'

# Build a corpus reader over every .txt file found in that folder.
# The file-id pattern is a regular expression, so it is written as a raw
# string: in an ordinary string literal '\.' is an invalid escape sequence
# and triggers a warning on recent Python versions.
pokemon_corpus = PlaintextCorpusReader(folder_path, r'.*\.txt')

## 1. Load text files & Tokenize text into words within sentences

In [None]:
# Get the file IDs (names) in the corpus
file_ids = pokemon_corpus.fileids()

# Symbols stripped from the text before tokenizing: '#', '(', ')' and the
# '‚Äù' sequence. The previous pattern '(#|\(|\)‚Äù)' had no '|' between '\)'
# and '‚Äù', so it only removed a ')' that was directly followed by that
# sequence and left every other ')' in the text. Written as a raw string so
# the backslash escapes reach the regex engine untouched.
# NOTE(review): '‚Äù' looks like a mis-decoded right double quotation mark
# (U+201D) - confirm which form actually occurs in the raw files.
symbol_pattern = re.compile(r'#|\(|\)|‚Äù')

# One entry per file; each entry is a list of sentences, each sentence a list of tokens
tokenized_corpus = []

for file_id in file_ids:
    # Raw text of the file, lower-cased so later keyword matching is case-insensitive
    file_content = pokemon_corpus.raw(file_id)
    file_content_lower = file_content.lower()

    # Drop the unwanted symbols
    file_content_cleaned = symbol_pattern.sub('', file_content_lower)

    # Split into sentences first, then into word tokens within each sentence
    sentences = sent_tokenize(file_content_cleaned)
    tokens = [word_tokenize(sentence) for sentence in sentences]

    tokenized_corpus.append(tokens)

In [None]:
# Sanity check: display the token list of the first sentence of the first file
tokenized_corpus[0][0]

In [None]:
# tokenized_corpus is a list (files) of lists (sentences) of lists (tokens)
num_files = len(tokenized_corpus)  # Number of files

# Each file is tokenized independently, so files need not contain the same
# number of sentences and the corpus has no single rectangular "shape".
# num_sents keeps its original meaning (sentences in the first file, 0 for an
# empty corpus); total_sents counts sentences across all files.
num_sents = len(tokenized_corpus[0]) if tokenized_corpus else 0
total_sents = sum(len(file_sentences) for file_sentences in tokenized_corpus)

print(f"Tokenized corpus: {num_files} files, {total_sents} sentences in total "
      f"({num_sents} in the first file)")

## 2. Subset to only sentences containing the words "evolve / evolved / evolving"

In [None]:
# Word forms of "evolve" that make a sentence relevant
evolve_forms = {'evolve', 'evolved', 'evolving'}

# Tokens that disqualify a sentence when they come directly after an "evolve" form
excluded_followers = {'from', 'to', 'into', 'him', 'it', '.', '...', ',', ';', ':', '?', '!', '&', '-', '*'}


def _has_excluded_follower(sentence_tokens):
    """Return True if any "evolve" form is immediately followed by an excluded token."""
    for previous_token, current_token in zip(sentence_tokens, sentence_tokens[1:]):
        if previous_token in evolve_forms and current_token in excluded_followers:
            return True
    return False


# Keep every sentence (from every file) that mentions an "evolve" form and
# in which no such form is directly followed by an excluded token
tokenized_corpus_evolve = [
    sentence_tokens
    for file in tokenized_corpus
    for sentence_tokens in file
    if any(token in evolve_forms for token in sentence_tokens)
    and not _has_excluded_follower(sentence_tokens)
]

In [None]:
# Report how many sentences survived the "evolve" filter
evolve_sentence_count = len(tokenized_corpus_evolve)
print('No. of sentences in corpus containing the word "evolve/evolved/evolving":', evolve_sentence_count)

In [None]:
# Concordance to get surrounding context to check that "evolve to" etc. are not in the data
from nltk.text import ConcordanceIndex

# Flatten the filtered sentences into a single stream of words
words = [word for sentence in tokenized_corpus_evolve for word in sentence]

# Wrap the word stream in an NLTK Text object for contextual analysis
evolve_text = nltk.Text(words)

# concordance() prints the matching lines itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" after
# each listing.
evolve_text.concordance(["evolve", "to"], width=100, lines=20)
evolve_text.concordance(["evolve", "an"], width=100, lines=20)

In [None]:
# Frequency analysis
from nltk.corpus import stopwords
import string
from collections import Counter

# Collapse the sentence structure into one flat token stream
flattened_tokens = []
for sublist in tokenized_corpus_evolve:
    flattened_tokens.extend(sublist)

# Tokens to ignore: English stopwords, single punctuation characters and the
# "evolve" forms themselves (present in every sentence by construction)
stop_words = set(stopwords.words('english'))
stop_words.update(string.punctuation)
stop_words.update(['evolve', 'evolved', 'evolving'])
filtered_tokens = [token for token in flattened_tokens if token not in stop_words]

# Count how often each remaining token occurs
word_freq = Counter(filtered_tokens)

# Show the five most frequent tokens
print("Most common words and their frequencies:")
top_words = word_freq.most_common(5)
for word, freq in top_words:
    print(f"{word}: {freq}")

In [None]:
# Save checkpoint
import csv

import os

# Write each filtered sentence as one CSV row (one token per field)
checkpoint_path = 'data-processed/tokenized_corpus_evolve-checkpoint.csv'

# Create the output folder if it is missing so the cell also works on a
# fresh checkout; open() alone raises FileNotFoundError in that case.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

with open(checkpoint_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(tokenized_corpus_evolve)

## 3. Subset corpus to sentences that contain a Pokemon name close to "evolve"

In [None]:
# Import a pokemon name list

import pandas as pd
import re

# Read only the two name columns from the Pokemon list
columns_to_read = ['Name', 'Aliases']
pokemon_list = pd.read_csv("pokemon_list.csv", usecols = columns_to_read)

# Lower-cased official names, and lower-cased aliases (rows without an alias are dropped)
pokemon_name_singular = pokemon_list['Name'].str.lower().tolist()
pokemon_name_singular_alias = pokemon_list['Aliases'].dropna().str.lower().tolist()

# Word endings that take "es" rather than "s" in the plural
sibilant_endings = ['s', 'sh', 'ch', 'x', 'z']


def _pluralize(names):
    """Return the plural form of each name, in the same order as `names`.

    Names ending in one of `sibilant_endings` get "es" appended; every other
    name gets "s".
    """
    endings = tuple(sibilant_endings)  # str.endswith accepts a tuple of suffixes
    return [name + ('es' if name.endswith(endings) else 's') for name in names]


# Plural forms, index-aligned with their singular lists
pokemon_name_plural = _pluralize(pokemon_name_singular)
pokemon_name_plural_alias = _pluralize(pokemon_name_singular_alias)

# Every surface form that counts as a Pokemon name
pokemon_name_list = pokemon_name_singular + pokemon_name_plural + pokemon_name_singular_alias + pokemon_name_plural_alias
pokemon_name_list[:5]

In [None]:
# Keep only sentences with a Pokemon name within `search_window` tokens to the
# right of "evolve/d/ing", and extract for each kept sentence: the Pokemon
# name, the "evolve" word form, and the tokens between the two (the frame).
target_words = ["evolve", "evolved", "evolving"]
name_list = pokemon_name_list
name_lookup = set(name_list)  # set membership instead of scanning the list for every token
search_window = 5             # how many tokens after the target word are inspected

# Parallel lists: entry k of each list describes the same kept sentence
evolve_pokemon_name_in_text = []
evolve_word_form = []
evolve_frame = []
evolve_pokemon_sentences = []


def _find_target_and_name(sentence):
    """Locate the first target word that has a Pokemon name shortly after it.

    Returns a tuple (target_index, name_index) for the first target word in
    `sentence` that is followed by a known name within `search_window`
    tokens, or None when there is no such pair.
    """
    for i, word in enumerate(sentence):
        if word not in target_words:
            continue
        # Inspect the following tokens up to and including the last token of
        # the sentence. The bound used to be len(sentence)-1, which silently
        # skipped the final token.
        for j in range(i + 1, min(i + 1 + search_window, len(sentence))):
            if sentence[j] in name_lookup:
                return i, j
    return None


for sentence in tokenized_corpus_evolve:
    match = _find_target_and_name(sentence)
    if match is None:
        continue
    target_index, name_index = match
    evolve_pokemon_name_in_text.append(sentence[name_index])
    evolve_word_form.append(sentence[target_index])
    evolve_frame.append(sentence[target_index + 1:name_index])
    evolve_pokemon_sentences.append(sentence)

# evolve_pokemon_sentences now contains the sentences that meet the criteria

In [None]:
# Print the length of each extracted list, one per line, so they can be compared by eye
extracted_lists = (
    evolve_pokemon_name_in_text,
    evolve_word_form,
    evolve_frame,
    evolve_pokemon_sentences,
)
for extracted in extracted_lists:
    print(len(extracted))

In [None]:
# Concordance to get surrounding context
from nltk.text import ConcordanceIndex

# Flatten the kept sentences into a single stream of words
words = []
for sentence in evolve_pokemon_sentences:
    words.extend(sentence)

# Wrap the stream in an NLTK Text object and list the contexts of the phrase
evolve_text = nltk.Text(words)
evolve_text.concordance(["evolve", "my", "first"], width = 100, lines=20)

In [None]:
# Standardize the Pokemon names found in the text to one lemma per Pokemon

# Hand-written corrections from forms seen in the text to the standard name
additional_mapping = {
    "weeping": "weepinbell",
    "weepingbell": "weepinbell",
    "graveller": "graveler",
    "magicarp": "magikarp",
    "garados": "gyarados",
    "gyrados": "gyarados",
    "ladyba": "ledyba",
    "alteria": "altaria"
}

# Plural -> singular for the official names, then the corrections on top
# (a correction wins if the same key appears in both)
plural_mapping = {
    **dict(zip(pokemon_name_plural, pokemon_name_singular)),
    **additional_mapping,
}

# Names without an entry in the mapping are kept unchanged
evolve_pokemon_lemma = [
    plural_mapping[name] if name in plural_mapping else name
    for name in evolve_pokemon_name_in_text
]


In [None]:
# Wrap each frame as ['EVOLVE', <tokens between verb and name>, 'POKEMON'].
# The cell overwrites evolve_frame with a function of itself, so running it
# twice used to wrap the markers twice. Corpus text is lower-cased before
# tokenizing, so a frame that already starts with 'EVOLVE' and ends with
# 'POKEMON' can only come from an earlier run of this cell and is left as is.
evolve_frame = [
    sublist
    if sublist[:1] == ['EVOLVE'] and sublist[-1:] == ['POKEMON']
    else ['EVOLVE'] + sublist + ['POKEMON']
    for sublist in evolve_frame
]

In [None]:
# Assemble one row per kept sentence from the parallel result lists
evolve_columns = ['pokemon_lemma', 'evolve_word_form', 'evolve_frame', 'pokemon_name_in_text', 'sentence']
evolve_rows = list(zip(
    evolve_pokemon_lemma,
    evolve_word_form,
    evolve_frame,
    evolve_pokemon_name_in_text,
    evolve_pokemon_sentences,
))
evolve_df = pd.DataFrame(evolve_rows, columns=evolve_columns)

In [None]:
# Display the assembled table for a visual check
evolve_df

In [None]:
# Export the final table without the DataFrame index column
evolve_output_path = 'data-processed/pokemon_evolve.csv'
evolve_df.to_csv(evolve_output_path, index=False)