# Importing our wordlists

Here we import all of our wordlists and add them to an array which we can merge at the end.

These wordlists should not be filtered at this point. However, they should all contain the same columns to make merging easier later.

In [None]:
# Accumulator for the individual wordlist DataFrames; they are merged at the end.
wordlists = list()

## Dictcc

#### Download the dictionary from http://www.dict.cc/?s=about%3Awordlist

#### Print out the first 20 lines of the dictionary

In [None]:
# IPython shell escape: print the first 20 lines of the raw dict.cc export
# to inspect its layout before parsing it.
!head -n 20 de-en.txt

#### Use pandas library to import csv file

In [None]:
import pandas as pd


# Parse the tab-separated dict.cc export. The first 8 lines are skipped and,
# because no header row is read from the file, the three columns are named here.
dictcc_df = pd.read_csv(
    "de-en.txt",
    sep="\t",
    skiprows=8,
    header=None,
    names=["GermanWord", "Word", "WordType"],
)

#### Preview a few entries of the wordlist

In [None]:
# Show rows 90..99 (positional slice) as a quick sanity check of the parse.
dictcc_df.iloc[90:100]

#### We only need "Word" and "WordType" column

In [None]:
# Keep only the `Word` and `WordType` columns (the `GermanWord` column is dropped);
# `.copy()` gives an independent frame that later cells can modify safely.
dictcc_df = dictcc_df.loc[:, ["Word", "WordType"]].copy()

#### Convert WordType Column to a pandas.Categorical

In [None]:
# Re-type the WordType column as a pandas categorical.
word_types = dictcc_df["WordType"].astype("category")
dictcc_df = dictcc_df.assign(WordType=word_types)
# Display the dtype of every column in the dataframe.
dictcc_df.dtypes

#### List the current distribution of word types in dictcc dataframe

In [None]:
# Upper-case the part-of-speech tags for nltk's TaggedCorpusReader, which is
# used later in this notebook to load the exported wordlist.
# `.str.upper()` on a categorical column returns a plain object-dtype Series,
# so the result is converted back to `category` to keep the column categorical.
dictcc_df["WordType"] = dictcc_df["WordType"].str.upper().astype("category")
# Show the most frequent word types.
dictcc_df["WordType"].value_counts().head()

#### Add dictcc corpus to our wordlists array

In [None]:
# NOTE(review): in the visible notebook order `wordlist_filtered` is first
# assigned in the later "Filter for results that we want" section, so this cell
# fails on a fresh top-to-bottom run -- confirm the intended cell position.
# Keep only the first row for every distinct `Word`.
wordlist_filtered = wordlist_filtered.drop_duplicates(subset="Word")
# Not the last expression of the cell, so this summary is computed but not displayed.
wordlist_filtered.describe()
# Distribution of word types after de-duplication (this is the cell output).
wordlist_filtered["WordType"].value_counts()

### Load our wordlists into nltk

In [None]:
# Export as `word/TAG` lines: the forward slash '/' is the separator that
# TaggedCorpusReader uses between a word and its part-of-speech tag (WordType).
# Neither the index nor a header row is written.
wordlist_filtered.to_csv(
    "dictcc_moby.csv",
    index=False,
    sep="/",
    header=None,
)

In [None]:
from nltk.corpus import TaggedCorpusReader
from nltk.tokenize import WhitespaceTokenizer
# Read the exported wordlist from the current directory as a tagged corpus.
nltk_wordlist = TaggedCorpusReader(root="./", fileids="dictcc_moby.csv")

# NLTK

- Use NLTK to help us merge our wordlists

In [None]:
# Our custom wordlist: for every tag, a frequency distribution of the words
# carrying that tag, restricted to purely alphabetic words shorter than 9 characters.
import nltk
custom_cfd = nltk.ConditionalFreqDist(
    (tag, word)
    for (word, tag) in nltk_wordlist.tagged_words()
    # `isalpha` has to be *called*: the bare bound method is always truthy,
    # which silently disabled the alphabetic filter.
    if word.isalpha() and len(word) < 9
)

In [None]:
# Brown Corpus: for every tag, a frequency distribution of the purely
# alphabetic words shorter than 9 characters that carry it.
import nltk
brown_cfd = nltk.ConditionalFreqDist(
    (pos_tag, token)
    for token, pos_tag in nltk.corpus.brown.tagged_words()
    if len(token) < 9 and token.isalpha()
)

In [None]:
# Union of the Brown entries tagged "NN" / "NP" and the custom entries tagged "NOUN".
nouns = set().union(brown_cfd["NN"], brown_cfd["NP"], custom_cfd["NOUN"])
# Fold everything to lower case so case variants collapse into one entry.
nouns = {noun.lower() for noun in nouns}
print("Total nouns count: " + str(len(nouns)))

In [None]:
# Union of the Brown entries tagged "VB" / "VBD" and the custom entries tagged "VERB".
verbs = set().union(brown_cfd["VB"], brown_cfd["VBD"], custom_cfd["VERB"])
# Fold everything to lower case so case variants collapse into one entry.
verbs = {verb.lower() for verb in verbs}
print("Total verbs count: " + str(len(verbs)))

In [None]:
# Union of the Brown entries tagged "JJ" and the custom entries tagged "ADJ".
adjectives = set().union(brown_cfd["JJ"], custom_cfd["ADJ"])
# Fold everything to lower case so case variants collapse into one entry.
adjectives = {adjective.lower() for adjective in adjectives}
print("Total adjectives count: " + str(len(adjectives)))

# Make Some Placewords Magic Happen

In [None]:
# Register the dict.cc frame for the final merge.
# Re-running this cell adds the frame a second time.
wordlists += [dictcc_df]

## Moby

#### Download the corpus from http://icon.shef.ac.uk/Moby/mpos.html

#### Perform some basic cleanup on the wordlist

In [None]:
# The readme file in `nltk/corpora/moby/mpos` gives some information on how to parse the file.

# Collects the lines captured from the shell pipeline below.
result = []
# IPython `!` capture of a shell pipeline: re-encode the file from ISO-8859-1
# to UTF-8, turn carriage returns '\r' into newlines, and turn '×' into '/'.
# The later cells match the part-of-speech codes with '/...$' patterns.
# NOTE(review): `tr -s` also squeezes runs of the translated character --
# confirm that this is intended.
moby_words = !cat nltk/corpora/moby/mpos/mobyposi.i | iconv --from-code=ISO88591 --to-code=UTF8 | tr -s '\r' '\n' | tr -s '×' '/'
result.extend(moby_words)
# One row per captured line, in a single "Word" column.
moby_df = pd.DataFrame(data = result, columns = ['Word'])

In [None]:
# Show the last 10 rows to check how the end of the file was parsed.
moby_df.iloc[-10:]

- sort out the nouns, verbs and adjectives

In [None]:
# Matches nouns
nouns = moby_df[moby_df["Word"].str.contains('/[Np]$')].copy()
nouns["WordType"] = "NOUN"
# Matches verbs
verbs = moby_df[moby_df["Word"].str.contains('/[Vti]$')].copy()
verbs["WordType"] = "VERB"
# Magtches adjectives
adjectives = moby_df[moby_df["Word"].str.contains('/A$')].copy()
adjectives["WordType"] = "ADJ"

- remove the trailing stuff and concatenate the nouns, verbs and adjectives

In [None]:
# Strip the trailing "/<code>" part-of-speech marker so only the bare word remains.
# - `regex=True` is passed explicitly: pandas 2.0 changed the default of
#   `Series.str.replace` to literal matching, which would leave every suffix in place.
# - Nouns are selected with '/[Np]$', so the same character class is stripped here;
#   stripping only '/N$' left the "/p" suffix on the remaining noun entries.
nouns["Word"] = nouns["Word"].str.replace(r'/[Np]$', '', regex=True)
verbs["Word"] = verbs["Word"].str.replace(r'/[Vti]$', '', regex=True)
adjectives["Word"] = adjectives["Word"].str.replace(r'/A$', '', regex=True)
# Merge nouns, verbs and adjectives into one dataframe
moby_df = pd.concat([nouns, verbs, adjectives])

#### Add moby corpus to wordlists array

In [None]:
# Register the Moby frame for the final merge.
# Re-running this cell adds the frame a second time.
wordlists += [moby_df]

## Combine all wordlists

In [None]:
# Stack every collected wordlist row-wise into one frame; the original row
# labels of each source frame are kept as-is.
wordlist = pd.concat(objs=wordlists)

# Filter for results that we want

- We want to remove words that aren't associated with a type (null WordType)

In [None]:
# Keep only the rows that have a WordType.
wordlist_filtered = wordlist.loc[wordlist["WordType"].notna()]

- We want to remove words that contain non-word characters (whitespace, hyphens, etc.)

In [None]:
# Only lower-case [a-z] is accepted (deliberately not [A-Za-z]): words that
# start with an upper-case character must _not_ match.
# The `^to\s` alternative lets the infinitive verbs from `dictcc` through.
word_chars = r'^[a-z]+$|^to\s'
# Missing words (NaN) count as non-matching.
is_word_chars = wordlist_filtered["Word"].str.contains(pat=word_chars, na=False)
wordlist_filtered = wordlist_filtered.loc[is_word_chars]
# Not the last expression of the cell, so this summary is computed but not displayed.
wordlist_filtered.describe()
# Distribution of word types after the character filter (this is the cell output).
wordlist_filtered["WordType"].value_counts()

- We want results that are fewer than 'x' letters long (x+3 for verbs, since they are in their infinitive form in the dictcc wordlist)

In [None]:
# Keep a word when it is shorter than 9 characters, or when it is a
# "to <word> ..." phrase shorter than 11 characters.
# The pattern is a raw string: `\s` and `\w` are invalid escape sequences in a
# normal string literal (a SyntaxWarning on recent Python versions).
# NOTE(review): the notebook text describes the verb limit as "x+3" (i.e. 12),
# and the pattern requires whitespace *after* the word following "to", so a
# plain "to <word>" entry never gets the larger limit -- confirm the intent.
word_lengths = wordlist_filtered["Word"].str.len()
is_to_phrase = wordlist_filtered["Word"].str.contains(r'^to\s\w+\s')
lt_x_letters = (word_lengths < 9) | (is_to_phrase & (word_lengths < 11))
wordlist_filtered = wordlist_filtered[lt_x_letters]
wordlist_filtered.describe()

- We want to remove all duplicates