In [1]:
import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import re

# Define the folder path containing the text files
folder_path = 'data-raw/'

# Instantiate PlaintextCorpusReader with the folder path
pokemon_corpus = PlaintextCorpusReader(folder_path, '.*\.txt')

In [2]:
# Get the file IDs (names) in the corpus
file_ids = pokemon_corpus.fileids()

# Initialize an empty list to store tokenized contents of all files
tokenized_corpus = []

# Tokenize each file in the corpus
for file_id in file_ids:
    # Get raw text content of the file
    file_content = pokemon_corpus.raw(file_id)

    # Convert text to lowercase
    file_content_lower = file_content.lower()

    # Remove symbols using regular expressions
    file_content_cleaned = re.sub('(#|\(|\)”)', '', file_content_lower)

    # Tokenize the cleaned text content into sentences
    sentences = sent_tokenize(file_content_cleaned)
    
    # Tokenize the text content inside sentences
    tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    # Append tokenized content to the tokenized_corpus list
    tokenized_corpus.append(tokens)

In [3]:
tokenized_corpus[0][0]

['post',
 '1',
 'title',
 ':',
 'thank',
 'you',
 'created',
 ':',
 '2018-05-19',
 '23:50:29',
 'id',
 ':',
 '8koqq2',
 'original',
 'post',
 'hello',
 'everybody',
 '.']

In [4]:
# Assuming tokenized_corpus is your list of lists of lists
num_files = len(tokenized_corpus)  # Number of files

# Number of rows in each layer (assuming all inner lists have the same length)
num_sents = len(tokenized_corpus[0]) if tokenized_corpus else 0

print("Shape of the tokenized corpus: ", num_files, "files x", num_sents, "sentences")

Shape of the tokenized corpus:  7 files x 238959 sentences


In [5]:
# Define a list to store sentences containing the words "evolve/evolved/evolving"
tokenized_corpus_evolve = []

# Iterate over each document in the tokenized_corpus
for file in tokenized_corpus:
    # Iterate over each sentence in the document
    for sentence_tokens in file:
        # Check if any of the keywords are present in the sentence
        if any(token in ['evolve', 'evolved', 'evolving'] for token in sentence_tokens):
            # Check if "to" or "into" immediately follow the key tokens
            if not any((sentence_tokens[i] in ['from', 'to', 'into', 'him', 'it', '.', '...', ',', ';', ':', '?', '!', '&', '-', '*'] and sentence_tokens[i-1] in ['evolve', 'evolved', 'evolving']) for i in range(1, len(sentence_tokens))):
                tokenized_corpus_evolve.append(sentence_tokens)

In [6]:
print("No. of sentences in corpus:", len(tokenized_corpus_evolve))
tokenized_corpus_evolve[5]

No. of sentences in corpus: 12557


['to',
 'evolve',
 'i',
 'need',
 '100.',
 'i',
 'have',
 '30.',
 'someone',
 'do',
 'math',
 'i',
 'ca',
 "n't",
 'think',
 'that',
 'hard',
 '.']

In [7]:
# Concordance to get surrounding context to check that "evolve to" etc. are not in the data
from nltk.text import ConcordanceIndex

# Convert the list of tokenized sentences into a list of words
words = [word for sentence in tokenized_corpus_evolve for word in sentence]

# Convert the filtered tokens into an NLTK Text object for contextual analysis
evolve_text = nltk.Text(words)

# Apply concordance method to the NLTK Text object
print(evolve_text.concordance(["evolve", "to"], width = 100, lines=20))
print(evolve_text.concordance(["evolve", "an"], width = 100, lines=20))

no matches
None
Displaying 20 of 88 matches:
 be easy might be able to evolve a kingdra . evolve an alteria . etcor that you evolve a seadra int
ght away . it 's frustrating enough when you evolve an eevee and do n't get the one you want and th
 you need the name trick if you 're going to evolve an old eevee . it seems you have to catch new e
speon and tamao for umbreonwhenever i try to evolve an evee the game jsut freezes so i have to clos
would one want to use 25 jigglypuff candy to evolve an igglybuff to get 1 more jigglypuff ? : d28 s
 but i 'm going to wait for 20 more candy to evolve an 84 % one . hopefully i 'll walk enough to ev
 i 'm stoked to use him as my buddy until he evolve an arcanine this list is subject to change , bu
n eclipse ? i ’ ll have to set a reminder to evolve an eevee at that exact moment . yes , its my bi
rmal wild spawn.the real question is can you evolve an event pikachu into alola raichu and still ke
aught is probably omanyte and enough abra to evolve an 

In [8]:
# Frequency analysis
from nltk.corpus import stopwords
import string
from collections import Counter

# Flatten the list of sentences
flattened_tokens = [token for sublist in tokenized_corpus_evolve for token in sublist]

# Remove stopwords + punctuation
stop_words = set(stopwords.words('english') + list(string.punctuation) + ['evolve', 'evolved', 'evolving'])
filtered_tokens = [word for word in flattened_tokens if word not in stop_words]

# Perform frequency analysis
word_freq = Counter(filtered_tokens)

# Print the most common words and their frequencies
print("Most common words and their frequencies:")
for word, freq in word_freq.most_common(25):
    print(f"{word}: {freq}")

Most common words and their frequencies:
pokemon: 3070
n't: 2604
one: 2377
's: 2177
get: 2044
candy: 1796
candies: 1397
cp: 1344
...: 1333
level: 1296
'm: 1235
like: 1230
egg: 1207
would: 1163
lucky: 1143
've: 1098
got: 999
catch: 935
enough: 907
time: 856
eevee: 855
'': 755
xp: 752
caught: 739
still: 738


In [9]:
# Save checkpoint
import csv

# Write data to a CSV file
with open('data-processed/tokenized_corpus_evolve-checkpoint.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows(tokenized_corpus_evolve)

In [80]:
import pandas as pd
import re

# Read the CSV file into a pandas DataFrame
columns_to_read = ['Name', 'Aliases']
pokemon_list = pd.read_csv("pokemon_list.csv", usecols = columns_to_read)

# Read the columns of interest and flatten the lists into a single list. 
pokemon_name_singular = pokemon_list['Name'].str.lower().tolist()
pokemon_name_singular_alias = pokemon_list['Aliases'].dropna().str.lower().tolist()

# Add plural forms to list
# Define a list of sibilant endings
sibilant_endings = ['s', 'sh', 'ch', 'x', 'z']

# Initialize an empty list to store the plural forms
pokemon_name_plural = []
pokemon_name_plural_alias = []

# Loop through each word in the flattened list
for word in pokemon_name_singular:
    # Check if the word ends with a sibilant
    if any(word.endswith(ending) for ending in sibilant_endings):
        # Add "es" to the word
        plural_word = word + 'es'
    else:
        # Add "s" to the word
        plural_word = word + 's'   
    # Append the plural form to the list
    pokemon_name_plural.append(plural_word)

for word in pokemon_name_singular_alias:
    # Check if the word ends with a sibilant
    if any(word.endswith(ending) for ending in sibilant_endings):
        # Add "es" to the word
        plural_word = word + 'es'
    else:
        # Add "s" to the word
        plural_word = word + 's'   
    # Append the plural form to the list
    pokemon_name_plural_alias.append(plural_word)

pokemon_name_list = pokemon_name_singular + pokemon_name_plural + pokemon_name_singular_alias + pokemon_name_plural_alias
pokemon_name_list

['bulbasaur',
 'ivysaur',
 'venusaur',
 'mega-venusaur',
 'charmander',
 'charmeleon',
 'charizard',
 'mega-charizard-x',
 'mega-charizard-y',
 'squirtle',
 'wartortle',
 'blastoise',
 'mega-blastoise',
 'caterpie',
 'metapod',
 'butterfree',
 'weedle',
 'kakuna',
 'beedrill',
 'mega-beedrill',
 'pidgey',
 'pidgeotto',
 'pidgeot',
 'mega-pidgeot',
 'rattata',
 'raticate',
 'spearow',
 'fearow',
 'ekans',
 'arbok',
 'pikachu',
 'raichu',
 'sandshrew',
 'sandslash',
 'nidoran-male',
 'nidorina',
 'nidoqueen',
 'nidoran-female',
 'nidorino',
 'nidoking',
 'clefairy',
 'clefable',
 'vulpix',
 'ninetales',
 'jigglypuff',
 'wigglytuff',
 'zubat',
 'golbat',
 'oddish',
 'gloom',
 'vileplume',
 'paras',
 'parasect',
 'venonat',
 'venomoth',
 'diglett',
 'dugtrio',
 'meowth',
 'persian',
 'psyduck',
 'golduck',
 'mankey',
 'primeape',
 'growlithe',
 'arcanine',
 'poliwag',
 'poliwhirl',
 'poliwrath',
 'abra',
 'kadabra',
 'alakazam',
 'mega-alakazam',
 'machop',
 'machoke',
 'machamp',
 'bellsp

In [11]:
# Concordance to get surrounding context
from nltk.text import ConcordanceIndex

# Convert the list of tokenized sentences into a list of words
# Convert the filtered tokens into an NLTK Text object for contextual analysis
# Apply concordance method to the NLTK Text object

words = [word for sentence in tokenized_corpus_evolve for word in sentence]
evolve_text = nltk.Text(words)
evolve_text.concordance(["evolved"], width = 100, lines=20)

Displaying 20 of 3765 matches:
i 've evolved 7 garados , caught 1 , & have over 700 magica
i saw him on the radar from my house . i just evolved my very first dragonite . to evolve i need 10
even keep certain good iv pokemon anymore . i evolved my 100 % weedle two weeks ago . only reason t
 exist is because that 's how it came when it evolved on cd and has n't been changed via tm yet.you
ut `` evo '' in front of ones that need to be evolved and `` l '' on the end of lucky ones . commen
 i clicked it it was a marowak , it must have evolved since your comment.there you are ! you got a 
nly works once for each one , unfortunately.i evolved 2 with the name trickmust have been a coincid
t before trying to evolve your eevee . [ just evolved vaporeon , 100 % ] https : //imgur.com/3cedek
there is , the dust requirement increases for evolved forms . * * edit * * i apparently am incorrec
etting to maximum power , irrespective of how evolved that pokemon is ? this is incorrect . i actua
e anyway 

In [12]:
from nltk import FreqDist

# Initialize an empty list to store words following "evolve", "evolved", and "evolving"
words_following_evolve = []

# Define the target words
target_words = ["evolve", "evolved", "evolving"]

# Iterate over the tokenized corpus
for i in range(len(tokenized_corpus_evolve) - 1):
    for j in range(len(tokenized_corpus_evolve[i])):
        # Check if the current token is one of the target words
        if tokenized_corpus_evolve[i][j] in target_words and j < len(tokenized_corpus_evolve[i]) - 1:
            # Append the word immediately following the target word
            words_following_evolve.append(tokenized_corpus_evolve[i][j + 1])

# Calculate frequency distribution of words following the target words
fdist = FreqDist(words_following_evolve)

# Get the most common words
most_common_words = fdist.most_common()

# Print the most common words vertically
for word, frequency in most_common_words:
    print(f"{word}: {frequency}")

a: 1532
my: 822
them: 757
and: 583
pokemon: 509
one: 496
the: 489
your: 196
an: 189
all: 189
for: 185
with: 181
forms: 175
form: 168
in: 156
that: 140
at: 139
or: 137
as: 112
): 109
is: 101
during: 99
another: 91
3: 91
'': 89
eevee: 88
pokémon: 87
mine: 81
two: 77
anything: 74
some: 73
any: 73
pidgeys: 70
2: 69
so: 69
like: 67
4: 65
button: 65
everything: 64
i: 63
something: 63
more: 63
but: 59
when: 58
5: 58
this: 51
on: 49
first: 46
before: 40
those: 38
both: 37
about: 37
if: 36
because: 36
three: 36
while: 34
6: 33
their: 32
just: 32
magikarp: 32
stuff: 31
pidgey: 31
you: 31
then: 31
by: 30
once: 30
things: 30
spree: 30
gives: 30
mons: 29
every: 29
1: 29
until: 29
ones: 29
will: 28
up: 28
session: 27
after: 26
only: 26
now: 26
eevees: 26
versions: 26
does: 25
animation: 25
right: 25
over: 24
..: 24
his: 24
high: 24
fodder: 23
60: 23
without: 23
version: 22
yet: 21
pikachu: 20
each: 20
though: 20
10: 20
would: 19
too: 17
items: 17
twice: 17
8: 17
are: 17
again: 16
gyarados: 16
starte

In [13]:
# Get frequency of most common pokemon names that appear immediately after "evolve/d/ing"
most_common_pokemon = [name for name in most_common_words if name[0] in pokemon_name_list]

# Print the most common Pokemon names vertically
for pokemon, frequency in most_common_pokemon:
    print(f"{pokemon}: {frequency}")

eevee: 88
pidgeys: 70
magikarp: 32
pidgey: 31
eevees: 26
pikachu: 20
gyarados: 16
espeon: 11
umbreon: 9
weedles: 9
nincada: 8
charmander: 8
pidgeotto: 8
squirtle: 7
dratini: 7
blastoise: 7
dragonite: 6
pidgeottos: 6
scyther: 5
cubone: 5
gyrados: 5
golbat: 5
meowth: 4
vaporeon: 4
trapinch: 4
magicarp: 4
togepi: 4
tyrogue: 4
onix: 4
golbats: 4
hypno: 4
zubats: 4
pidgeot: 4
drowzees: 4
seadra: 3
shellder: 3
machamp: 3
raichu: 3
machoke: 3
porygon: 3
swablu: 3
gengar: 3
dragonair: 3
gloom: 3
bellossom: 3
slowpokes: 3
wailmer: 3
machop: 3
rattata: 3
pichu: 3
oddish: 3
snorlax: 3
zubat: 3
weedle: 3
rattatas: 3
golems: 2
aron: 2
horsea: 2
bulbasaur: 2
muk: 2
charizard: 2
mareep: 2
ekans: 2
swinub: 2
chansey: 2
exeggutor: 2
kingdra: 2
kabuto: 2
pikachus: 2
flareon: 2
jolteon: 2
feraligatr: 2
porygon2: 2
grimer: 2
pichus: 2
gastlys: 2
dodrio: 2
sunkern: 2
mareeps: 2
ampharos: 2
caterpie: 2
hypnos: 2
caterpies: 2
steelix: 2
alakazam: 2
kakunas: 2
venusaur: 2
chikorita: 2
gravelers: 1
starmie: 1


In [70]:
# Subset the data so that the sentences only contain a Pokemon name within 5 words to the right of "evolve/d/ing"
# Define the target words
target_words = ["evolve", "evolved", "evolving"]
name_list = pokemon_name_list

# Initialize an empty list to store the filtered sentences and names
evolve_pokemon_name_in_text = []
evolve_word_form = []
evolve_frame = []
evolve_pokemon_sentences = []

# Iterate over each sentence in the tokenized_corpus
for sentence in tokenized_corpus_evolve:
    # Flag to indicate if the sentence contains any of the target words
    contains_target_word = False
    
    # Iterate over each word index in the sentence
    for i, word in enumerate(sentence):
        # Check if the word is one of the target words
        if word in target_words:
            # Check the next five words after the target word, or till the end of the sentence
            for j in range(i + 1, min(i + 6, len(sentence)-1)):
                # Check if any of the following words are in the Pokemon name list
                if sentence[j] in name_list:
                    # If any of the following words are in the Pokemon name list, add the pokemon name and the sentence
                    evolve_pokemon_name_in_text.append(sentence[j])
                    evolve_word_form.append(word)
                    evolve_frame.append(sentence[i+1:j])
                    evolve_pokemon_sentences.append(sentence)
                    contains_target_word = True
                    break  # Break out of the inner loop
        
        if contains_target_word:
            break  # Break out of the outer loop if the sentence is included

# filtered_sentences now contain sentences that meet the criteria

In [71]:
# Check the list lengths are identical
print(len(evolve_pokemon_name_in_text))
print(len(evolve_word_form))
print(len(evolve_frame))
print(len(evolve_pokemon_sentences))

3686
3686
3686
3686


In [72]:
# Concordance to get surrounding context
from nltk.text import ConcordanceIndex

# Convert the list of tokenized sentences into a list of words
# Convert the filtered tokens into an NLTK Text object for contextual analysis
# Apply concordance method to the NLTK Text object

words = [word for sentence in evolve_pokemon_sentences for word in sentence]
evolve_text = nltk.Text(words)
evolve_text.concordance(["evolve", "my", "first"], width = 100, lines=20)

Displaying 20 of 23 matches:
into vaporeon . finally scraped enough to evolve my first eevee , got a 700cp jolteon . i 've only 
efore tonight it took me since release to evolve my first gengar 2 days ago . edit2 : as someone po
a lot of walking to get enough candies to evolve my first larvitar , only for my tyranitar to know 
. i 've only just scraped enough candy to evolve my first piloswine : p i play a lot ) not anymore 
 'm unhappy that i 'll finally be able to evolve my first dragonite a little sooner . good to know 
felt when i finally had enough candies to evolve my first charizard . alolan raichu comes from evol
 the tweet . i just got enough candies to evolve my first dragonite less than a week ago . i 'm gon
 evolve 2 omastars . enough karp candy to evolve my first gyrados , ive got a 93 % er thats going t
quest . finally was able to get enough to evolve my first magikarp 3 months ago . i ’ m frustrated 
p ! i needed to catch one more dratini to evolve my first dragonite , d

In [73]:
# Lemmatize / Standardize pokemon names

# Mapping dictionary for lemmatization
plural_mapping = dict(zip(pokemon_name_plural, pokemon_name_singular))
additional_mapping = {
    "weeping": "weepinbell",
    "weepingbell": "weepinbell",
    "graveller": "graveler",
    "magicarp": "magikarp",
    "garados": "gyarados",
    "gyrados": "gyarados",
    "ladyba": "ledyba",
    "alteria": "altaria"
}
plural_mapping.update(additional_mapping)

evolve_pokemon_lemma = [plural_mapping.get(name, name) for name in evolve_pokemon_names]


In [74]:
evolve_frame = [["EVOLVE'] + sublist + ['POKEMON'] for sublist in evolve_frame]

In [78]:
evolve_df = pd.DataFrame(list(zip(evolve_pokemon_lemma, evolve_word_form, evolve_frame, evolve_pokemon_names, evolve_pokemon_sentences)),
               columns =['pokemon_lemma', 'evolve_word_form', 'evolve_frame', 'pokemon_name_in_text', 'sentence'])

In [79]:
evolve_df

Unnamed: 0,pokemon_lemma,evolve_word_form,evolve_frame,pokemon_name_in_text,sentence
0,gyarados,evolved,"[EVOLVE, 7, POKEMON]",garados,"[i, 've, evolved, 7, garados, ,, caught, 1, ,,..."
1,dratini,evolve,"[EVOLVE, my, very, first, POKEMON]",dratini,"[i, 'm, about, to, evolve, my, very, first, dr..."
2,dragonite,evolve,"[EVOLVE, a, second, POKEMON]",dragonite,"[i, had, been, waiting, forever, for, my, fina..."
3,dragonite,evolved,"[EVOLVE, my, very, first, POKEMON]",dragonite,"[i, just, evolved, my, very, first, dragonite, .]"
4,geodude,evolve,"[EVOLVE, a, 35, POKEMON]",geodude,"[ivs, matter, ,, sure, ,, but, there, are, way..."
...,...,...,...,...,...
3681,eevee,evolving,"[EVOLVE, a, 550, POKEMON]",eevee,"[i, 'm, level, 22, now, and, the, average, eev..."
3682,eevee,evolve,"[EVOLVE, an, POKEMON]",eevee,"[reading, your, confident, statement, ,, is, t..."
3683,squirtle,evolve,"[EVOLVE, one, from, POKEMON]",squirtle,"[meanwhile, i, have, squirtle, through, blasto..."
3684,weedle,evolving,"[EVOLVE, POKEMON]",weedles,"[i, keep, evolving, weedles, stronger, then, m..."


In [77]:
evolve_df.to_csv('data-processed/pokemon_evolve.csv', index = False)