# Word Corpus

In [1]:
# regular expressions
import re
import nltk
# Build the working vocabulary from NLTK's English word corpus,
# keeping only the entries that are entirely lowercase
english_words = nltk.corpus.words.words('en')
wordlist = [entry for entry in english_words if entry.islower()]

In [3]:
# Number of lowercase entries kept in wordlist
len(wordlist)

210687

In [7]:
# Peek at the first ten entries of the vocabulary
first_ten_words = wordlist[:10]
print(first_ten_words)

['a', 'aa', 'aal', 'aalii', 'aam', 'aardvark', 'aardwolf', 'aba', 'abac', 'abaca']


#### Many tasks involve finding words that match some pattern. For example, to find adverbs we can first look for words that end with 'ly'. Regular expressions are used to describe such patterns.

In [9]:
# re.search(pattern, string) scans the string for the first place the pattern matches.
# '$' is a metacharacter that anchors the match at the end of the string,
# so 'ly$' keeps only the words that end in 'ly'.
ly_suffix = re.compile('ly$')
ly_words = [w for w in wordlist if ly_suffix.search(w)]
print(ly_words[:10])

['abactinally', 'abandonedly', 'abasedly', 'abashedly', 'abashlessly', 'abbreviately', 'abdominally', 'abhorrently', 'abidingly', 'abiogenetically']


In [10]:
# '^' anchors the match at the start of the string: words beginning with 'twi'
twi_prefix = re.compile('^twi')
print(list(filter(twi_prefix.search, wordlist))[:10])

['twibil', 'twibilled', 'twice', 'twicer', 'twicet', 'twichild', 'twick', 'twiddle', 'twiddler', 'twiddling']


In [11]:
# A slightly more complex pattern: '.' stands for any single character, and the
# '^'/'$' anchors force the whole word to have exactly this eight-character shape
eight_letter_shape = re.compile('^..il.g.t$')
shape_matches = [w for w in wordlist if eight_letter_shape.search(w)]
print(shape_matches[:10])

['twilight']


In [12]:
# No anchors here, so the eight-character shape may occur anywhere inside the word
j_t_shape = re.compile('..j..t..')
print(list(filter(j_t_shape.search, wordlist))[:10])

['abjectedness', 'abjection', 'abjective', 'abjectly', 'abjectness', 'adjection', 'adjectional', 'adjectival', 'adjectivally', 'adjective']


In [13]:
# [..] matches one character from the set: two-letter words whose first letter
# is g, h or i and whose second letter is m, n or o
two_letter_sets = re.compile('^[ghi][mno]$')
two_letter_hits = [w for w in wordlist if two_letter_sets.search(w)]
print(two_letter_hits[:10])

['go', 'ho', 'in', 'io']


# Chat Words

In [14]:
# Unique tokens of the NPS Chat corpus in sorted order.
# set() consumes the corpus word sequence directly; wrapping it in a
# generator expression that yields every item unchanged adds nothing.
chat_words = sorted(set(nltk.corpus.nps_chat.words()))

In [16]:
# '+' matches one or more occurrences of the preceding character
# '*' matches zero or more occurrences of the preceding character
# '-' inside square brackets denotes a range, for example [1-9]
stretched_mine = re.compile('^m+i+n+e+$')
print([tok for tok in chat_words if stretched_mine.search(tok)])

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee', 'miiiiiinnnnnnnnnneeeeeeee', 'mine', 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']


In [20]:
# A leading '^' inside brackets negates the set: keep tokens made up entirely of
# non-vowel characters, then show every 10th hit among the first 200
no_vowels = re.compile('^[^aeiouAIEOU]+$')
vowel_free_tokens = [tok for tok in chat_words if no_vowels.search(tok)]
print(vowel_free_tokens[:200:10])

['!', '!!!!!!!!!!!', '!!!!....', '"', '&', '(', '((((((((((((', '(((((((((((((((((((((((((', '))))))))', ')))))))))))))))))))))', '++', '--', '-s', '.. .', '...........', '.:', '10', '12%', '16', '1985']


# Treebank

In [21]:
# Unique tokens of the Treebank corpus in sorted order
treebank_tokens = nltk.corpus.treebank.words()
corp = sorted(set(treebank_tokens))

In [23]:
# A backslash escapes the following character so that it is matched literally:
# '\.' matches an actual dot, whereas an unescaped '.' matches any character.
# The pattern is written as a raw string (r'...') so that Python hands the
# backslash to the regex engine untouched; in a normal string literal '\.' is an
# invalid escape sequence and triggers a warning on recent Python versions.

print([w for w in corp if re.search(r'^[0-9]+\.[0-9]+$', w)][:200:10])

['0.0085', '0.50', '1.01', '1.24', '1.5', '1.65', '1.916', '11.57', '118.6', '121.6', '13.65', '14.00', '14.99', '150.00', '16.9', '18.95', '2.1', '2.4', '2.7', '21.1']


In [34]:
# Exercise: get the numbers whose integer part is exactly 1 and that have
# exactly two digits after the decimal point.
# Raw string so that '\.' reaches the regex engine as an escaped (literal) dot
# instead of being parsed as an invalid string escape.
print([w for w in corp if re.search(r'^1\.[0-9]{2}$', w)])

['1.01', '1.14', '1.17', '1.18', '1.19', '1.20', '1.24', '1.25', '1.26', '1.28', '1.35', '1.39', '1.46', '1.49', '1.50', '1.55', '1.56', '1.61', '1.64', '1.65', '1.75', '1.76', '1.82', '1.85', '1.92']


In [37]:
# {4} means the preceding item must repeat exactly four times; together with
# the '^' and '$' anchors this keeps tokens made of exactly four digits
four_digits = re.compile('^[0-9]{4}$')
four_digit_tokens = [tok for tok in corp if four_digits.search(tok)]
print(four_digit_tokens[:10])

['1614', '1637', '1787', '1901', '1903', '1917', '1925', '1929', '1933', '1934']


In [39]:
# {m,n} repeats the preceding item between m and n times, inclusive:
#   {3,5} -> three to five times, {5,} -> five or more times,
#   {,6}  -> anywhere from zero up to six times
hyphenated_triple = re.compile('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$')
print(list(filter(hyphenated_triple.search, corp)))

['black-and-white', 'bread-and-butter', 'father-in-law', 'machine-gun-toting', 'savings-and-loan']


In [40]:
# (ed|ing) is an alternation: keep tokens that end in either 'ed' or 'ing'
ed_or_ing = re.compile('(ed|ing)$')
suffix_hits = [tok for tok in corp if ed_or_ing.search(tok)]
print(suffix_hits[:10])

['62%-owned', 'Absorbed', 'According', 'Adopting', 'Advanced', 'Advancing', 'Alfred', 'Allied', 'Annualized', 'Anything']


# Download any HTML page (code from last class)
   1. Identify all types of HTML tags (what does an HTML tag look like?)
   1. Count the number of occurrences of articles.
   1. Count how many words end with 'ing'.

# Other applications of REGEX

In [42]:
# re.findall returns every non-overlapping match of the pattern in the string,
# i.e. the matched pieces themselves rather than the word that contains them
word = 'GradvalleyDataScience-NLPCourse-Week2'
single_vowel = re.compile(r'[aeiou]')
single_vowel.findall(word)

['a', 'a', 'e', 'a', 'a', 'i', 'e', 'e', 'o', 'u', 'e', 'e', 'e']

In [48]:
# re.search stops at the first match and returns a Match object
# (position and matched text) instead of a list of all matches
word = 'GradvalleyDataScience-NLPCourse-Week2'
first_vowel = re.search(r'[aeiou]', word)
print(first_vowel)

<re.Match object; span=(2, 3), match='a'>


## Frequency Distribution

In [56]:
# Count how often each run of two or more consecutive lowercase vowels occurs
# across the unique Treebank tokens
words = sorted(set(nltk.corpus.treebank.words()))
vowel_run = re.compile(r'[aieou]{2,}')
vowel_sequences = [seq for token in words for seq in vowel_run.findall(token)]
fd = nltk.FreqDist(vowel_sequences)

In [58]:
# (vowel sequence, count) pairs collected in fd
fd.items()

dict_items([('ea', 476), ('oi', 65), ('ou', 329), ('io', 549), ('ee', 217), ('ie', 331), ('ui', 95), ('ua', 109), ('ai', 261), ('ue', 105), ('ia', 253), ('ei', 86), ('iai', 1), ('oo', 174), ('au', 106), ('eau', 10), ('oa', 59), ('oei', 1), ('oe', 15), ('eo', 39), ('uu', 1), ('eu', 18), ('iu', 14), ('aii', 1), ('aiia', 1), ('ae', 11), ('aa', 3), ('oui', 6), ('ieu', 3), ('ao', 6), ('iou', 27), ('uee', 4), ('eou', 5), ('aia', 1), ('uie', 3), ('iao', 1), ('eei', 2), ('uo', 8), ('uou', 5), ('eea', 1), ('ueui', 1), ('ioa', 1), ('ooi', 1)])

## ToolBox

In [59]:
# Read the word entries of 'rotokas.dic' through NLTK's toolbox corpus reader.
# NOTE: this rebinds `words`, which earlier held the Treebank vocabulary.
words = nltk.corpus.toolbox.words('rotokas.dic')

In [60]:
# Check which container type the toolbox reader returned
type(words)

list

In [62]:
# Peek at the first ten entries read from the toolbox file
first_ten_entries = words[:10]
print(first_ten_entries)

['kaa', 'kaa', 'kaa', 'kaakaaro', 'kaakaaviko', 'kaakaavo', 'kaakaoko', 'kaakasi', 'kaakau', 'kaakauko']


In [64]:
# Every match is a two-character string: one of the consonants p/t/k/s/v/r
# followed by a vowel. ConditionalFreqDist unpacks each string into
# (condition, sample), so the table shows vowel counts per consonant.
consonant_vowel = re.compile(r'[ptksvr][aieou]')
patterns = [pair for entry in words for pair in consonant_vowel.findall(entry)]
cond_dict = nltk.ConditionalFreqDist(patterns)
cond_dict.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 
