In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # to run all arguments and not just the last one

In [2]:
import nltk
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
emma = nltk.corpus.gutenberg.words("austen-emma.txt")

In [4]:
len(emma)

192427

In [5]:
# we had done concordance in chapter 1, but that can only be performed on nltk text so we'll have to convert emma to nltk text
from nltk.corpus import gutenberg
emma = nltk.Text(emma)

type(emma)

nltk.text.Text

In [6]:
# Summarise every Gutenberg text with three rounded statistics:
#   * raw characters per token (average word length, whitespace included),
#   * tokens per sentence (average sentence length),
#   * tokens per distinct lower-cased token (lexical diversity).
for fileid in gutenberg.fileids():
    words = gutenberg.words(fileid)  # read the token view once and reuse it for both counts below
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(words)
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len({w.lower() for w in words})
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)

5 25 26 austen-emma.txt
5 26 17 austen-persuasion.txt
5 28 22 austen-sense.txt
4 34 79 bible-kjv.txt
5 19 5 blake-poems.txt
4 19 14 bryant-stories.txt
4 18 12 burgess-busterbrown.txt
4 20 13 carroll-alice.txt
5 20 12 chesterton-ball.txt
5 23 11 chesterton-brown.txt
5 18 11 chesterton-thursday.txt
4 21 25 edgeworth-parents.txt
5 26 15 melville-moby_dick.txt
5 52 11 milton-paradise.txt
4 12 9 shakespeare-caesar.txt
4 12 8 shakespeare-hamlet.txt
4 12 7 shakespeare-macbeth.txt
5 36 12 whitman-leaves.txt


In [7]:
# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], "...")

firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...


In [8]:
# there is also a corpus of im chat sessions
from nltk.corpus import nps_chat
chatroom = nps_chat.posts("10-19-20s_706posts.xml")
chatroom[123]

['i',
 'do',
 "n't",
 'want',
 'hot',
 'pics',
 'of',
 'a',
 'female',
 ',',
 'I',
 'can',
 'look',
 'in',
 'a',
 'mirror',
 '.']

In [9]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [10]:
brown.words(categories="news")
brown.words(fileids=["cg22"])
brown.sents(categories=["news", "editorial", "reviews"])

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [11]:
# The Brown corpus is a convenient resource for stylistics, i.e. the study of systematic
# differences between genres. Here we count how often each modal verb occurs in the news genre.
news_text = brown.words(categories="news")
fdist = nltk.FreqDist(map(str.lower, news_text))  # case-insensitive counts
modals = ["can", "could", "may", "might", "must", "will"]
for m in modals:
    # end=" " keeps all the counts on a single output line
    print(f"{m}:", fdist[m], end=" ")

can: 94 could: 87 may: 93 might: 38 must: 53 will: 389 

In [12]:
# Obtain the counts for each genre of interest with a conditional frequency distribution:
# the condition is the genre and the event is the word.
modals = ["can", "could", "may", "might", "must", "will"]
genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]

cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)

In [13]:
cfd.tabulate(conditions=genres, samples=modals) # again args and kwargs

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 


In [14]:
# Reuters corpus: has 10,788 news documents as test and train divided into 90 topics
from nltk.corpus import reuters
reuters.fileids()
reuters.categories()

['test/14826',
 'test/14828',
 'test/14829',
 'test/14832',
 'test/14833',
 'test/14839',
 'test/14840',
 'test/14841',
 'test/14842',
 'test/14843',
 'test/14844',
 'test/14849',
 'test/14852',
 'test/14854',
 'test/14858',
 'test/14859',
 'test/14860',
 'test/14861',
 'test/14862',
 'test/14863',
 'test/14865',
 'test/14867',
 'test/14872',
 'test/14873',
 'test/14875',
 'test/14876',
 'test/14877',
 'test/14881',
 'test/14882',
 'test/14885',
 'test/14886',
 'test/14888',
 'test/14890',
 'test/14891',
 'test/14892',
 'test/14899',
 'test/14900',
 'test/14903',
 'test/14904',
 'test/14907',
 'test/14909',
 'test/14911',
 'test/14912',
 'test/14913',
 'test/14918',
 'test/14919',
 'test/14921',
 'test/14922',
 'test/14923',
 'test/14926',
 'test/14928',
 'test/14930',
 'test/14931',
 'test/14932',
 'test/14933',
 'test/14934',
 'test/14941',
 'test/14943',
 'test/14949',
 'test/14951',
 'test/14954',
 'test/14957',
 'test/14958',
 'test/14959',
 'test/14960',
 'test/14962',
 'test/149

['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee',
 'copper',
 'copra-cake',
 'corn',
 'cotton',
 'cotton-oil',
 'cpi',
 'cpu',
 'crude',
 'dfl',
 'dlr',
 'dmk',
 'earn',
 'fuel',
 'gas',
 'gnp',
 'gold',
 'grain',
 'groundnut',
 'groundnut-oil',
 'heat',
 'hog',
 'housing',
 'income',
 'instal-debt',
 'interest',
 'ipi',
 'iron-steel',
 'jet',
 'jobs',
 'l-cattle',
 'lead',
 'lei',
 'lin-oil',
 'livestock',
 'lumber',
 'meal-feed',
 'money-fx',
 'money-supply',
 'naphtha',
 'nat-gas',
 'nickel',
 'nkr',
 'nzdlr',
 'oat',
 'oilseed',
 'orange',
 'palladium',
 'palm-oil',
 'palmkernel',
 'pet-chem',
 'platinum',
 'potato',
 'propane',
 'rand',
 'rape-oil',
 'rapeseed',
 'reserves',
 'retail',
 'rice',
 'rubber',
 'rye',
 'ship',
 'silver',
 'sorghum',
 'soy-meal',
 'soy-oil',
 'soybean',
 'strategic-metal',
 'sugar',
 'sun-meal',
 'sun-oil',
 'sunseed',
 'tea',
 'tin',
 'trade',
 'veg-oil',
 'wheat',
 'wpi',
 'yen',
 'zinc']

In [15]:
reuters.categories("training/9865")
reuters.categories(["training/9865", "training/9880"])
reuters.fileids("barley")
reuters.fileids(["barley", "corn"])

['barley', 'corn', 'grain', 'wheat']

['barley', 'corn', 'grain', 'money-fx', 'wheat']

['test/15618',
 'test/15649',
 'test/15676',
 'test/15728',
 'test/15871',
 'test/15875',
 'test/15952',
 'test/17767',
 'test/17769',
 'test/18024',
 'test/18263',
 'test/18908',
 'test/19275',
 'test/19668',
 'training/10175',
 'training/1067',
 'training/11208',
 'training/11316',
 'training/11885',
 'training/12428',
 'training/13099',
 'training/13744',
 'training/13795',
 'training/13852',
 'training/13856',
 'training/1652',
 'training/1970',
 'training/2044',
 'training/2171',
 'training/2172',
 'training/2191',
 'training/2217',
 'training/2232',
 'training/3132',
 'training/3324',
 'training/395',
 'training/4280',
 'training/4296',
 'training/5',
 'training/501',
 'training/5467',
 'training/5610',
 'training/5640',
 'training/6626',
 'training/7205',
 'training/7579',
 'training/8213',
 'training/8257',
 'training/8759',
 'training/9865',
 'training/9958']

['test/14832',
 'test/14858',
 'test/15033',
 'test/15043',
 'test/15106',
 'test/15287',
 'test/15341',
 'test/15618',
 'test/15648',
 'test/15649',
 'test/15676',
 'test/15686',
 'test/15720',
 'test/15728',
 'test/15845',
 'test/15856',
 'test/15860',
 'test/15863',
 'test/15871',
 'test/15875',
 'test/15877',
 'test/15890',
 'test/15904',
 'test/15906',
 'test/15910',
 'test/15911',
 'test/15917',
 'test/15952',
 'test/15999',
 'test/16012',
 'test/16071',
 'test/16099',
 'test/16147',
 'test/16525',
 'test/16624',
 'test/16751',
 'test/16765',
 'test/17503',
 'test/17509',
 'test/17722',
 'test/17767',
 'test/17769',
 'test/18024',
 'test/18035',
 'test/18263',
 'test/18482',
 'test/18614',
 'test/18908',
 'test/18954',
 'test/18973',
 'test/19165',
 'test/19275',
 'test/19668',
 'test/19721',
 'test/19821',
 'test/20018',
 'test/20366',
 'test/20637',
 'test/20645',
 'test/20649',
 'test/20723',
 'test/20763',
 'test/21091',
 'test/21243',
 'test/21493',
 'training/10120',
 'trai

In [16]:
reuters.words("training/9865")[:14]
reuters.words(['training/9865', 'training/9880'])
reuters.words(categories='barley')
reuters.words(categories=['barley', 'corn'])

['FRENCH',
 'FREE',
 'MARKET',
 'CEREAL',
 'EXPORT',
 'BIDS',
 'DETAILED',
 'French',
 'operators',
 'have',
 'requested',
 'licences',
 'to',
 'export']

['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]

['FRENCH', 'FREE', 'MARKET', 'CEREAL', 'EXPORT', ...]

['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', ...]

In [17]:
from nltk.corpus import inaugural
inaugural.fileids()

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

In [18]:
[fileid[:4] for fileid in inaugural.fileids()]

['1789',
 '1793',
 '1797',
 '1801',
 '1805',
 '1809',
 '1813',
 '1817',
 '1821',
 '1825',
 '1829',
 '1833',
 '1837',
 '1841',
 '1845',
 '1849',
 '1853',
 '1857',
 '1861',
 '1865',
 '1869',
 '1873',
 '1877',
 '1881',
 '1885',
 '1889',
 '1893',
 '1897',
 '1901',
 '1905',
 '1909',
 '1913',
 '1917',
 '1921',
 '1925',
 '1929',
 '1933',
 '1937',
 '1941',
 '1945',
 '1949',
 '1953',
 '1957',
 '1961',
 '1965',
 '1969',
 '1973',
 '1977',
 '1981',
 '1985',
 '1989',
 '1993',
 '1997',
 '2001',
 '2005',
 '2009']

In [19]:
inaugural.fileids()[:4] # this is different from the cell above! this slices the list and gives the first 4 fileids, while the
# comprehension above slices each string and gives the first 4 characters (the year) of every individual fileid

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt']

In [20]:
# Track how the words "America" and "citizen" are used over time.
# Condition = target prefix, event = the first four characters of the file id (the year).
cfd = nltk.ConditionalFreqDist(
    (prefix, address[:4])
    for address in inaugural.fileids()
    for token in inaugural.words(address)
    for prefix in ("america", "citizen")
    if token.lower().startswith(prefix)  # prefix match, so "American's" and "citizens" are counted too
)

In [21]:
cfd.plot()

In [24]:
# Annotated text corpora
# Corpora in other languages
# Loading your own corpus
# we can load our own corpus that can be accessed using the above methods using
from nltk.corpus import PlaintextCorpusReader
# we can have fileid's, categories, words, sents

In [25]:
# Conditions and events
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...] # the form of a pair is (condition, event)

# we have already seen conditional frequency distribution where the condition was the selection of the brown corpus and
# for each condition, we counted words
# FreqDist() takes a list of words as input, ConditionalFreqDist() takes a list of pairs
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))

# for each genre, we loop over every word in the genre producing pairs consisting of the genre and the word

In [26]:
cfd

ConditionalFreqDist(nltk.probability.FreqDist,
                    {'adventure': FreqDist({'severed': 1,
                               'released': 4,
                               'maneuverability': 1,
                               "day's": 2,
                               'contemptible': 1,
                               'shapes': 2,
                               'pinnacles': 1,
                               'feels': 1,
                               "truck's": 1,
                               'job': 16,
                               'past': 19,
                               'duller': 1,
                               'spare': 2,
                               'friendly': 2,
                               "sheriff's": 1,
                               'sighed': 3,
                               'clothing': 2,
                               'derisively': 1,
                               'sort': 15,
                               'hoodlum': 2,
                               'R

In [28]:
cfd.conditions()

['belles_lettres',
 'fiction',
 'hobbies',
 'editorial',
 'religion',
 'romance',
 'mystery',
 'news',
 'learned',
 'humor',
 'government',
 'lore',
 'adventure',
 'reviews',
 'science_fiction']

In [29]:
cfd["news"] # this is just a simple frequency distribution

FreqDist({'suburbs': 1,
          'automatically': 3,
          'released': 5,
          'satisfactory': 3,
          'Cleaner': 1,
          'Harris': 8,
          "day's": 1,
          'contributors': 1,
          'illusion': 2,
          'B.': 31,
          'preoccupied': 2,
          'feels': 6,
          'job': 16,
          'Jaross': 1,
          'past': 35,
          'Lady': 5,
          'fancy': 1,
          'spare': 1,
          'friendly': 3,
          'Ricci': 1,
          'concerts': 6,
          'Stewart': 2,
          'clothing': 3,
          'cleaner': 1,
          'trampled': 1,
          'sort': 4,
          'hoodlum': 1,
          'managing': 2,
          'Hooked': 1,
          'German': 8,
          'buy': 10,
          'Thruston': 1,
          '$15,000,000': 2,
          'action': 20,
          'Duffy': 1,
          'far-out': 1,
          'knees': 1,
          'Rte.': 1,
          'rumor': 1,
          'Armisteads': 1,
          "Giants'": 2,
          'Ordinarily'

In [32]:
# Plotting and tabulating distributions
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ["america", "citizen"]
    if w.lower().startswith(target))
# a conditional frequency distribution object has plot and tabulate methods and we can specify which conditions to display with 
# conditions=. we get all conditions if we don't give this parameter

In [35]:
# Generating random text with bigrams
# We can use a conditional frequency distribution to create a table of bigrams(word pairs)
sent = ["In", "the", "beginning", "God", "created", "the", "heaven", "and", "the", "earth", "."]
list(nltk.bigrams(sent))

[('In', 'the'),
 ('the', 'beginning'),
 ('beginning', 'God'),
 ('God', 'created'),
 ('created', 'the'),
 ('the', 'heaven'),
 ('heaven', 'and'),
 ('and', 'the'),
 ('the', 'earth'),
 ('earth', '.')]

In [37]:
# We can even treat each word as a condition and, for each one, build a frequency
# distribution over the words that follow it. generate_model walks such a distribution.
def generate_model(cfdist, word, num=15):
    """Print a chain of at most `num` words, starting with `word`.

    Every printed word is followed by a single space. The next word is always the
    most frequent follower of the current word according to `cfdist`, so the
    output is deterministic for a given distribution.

    Parameters:
        cfdist: mapping from a word to a frequency distribution of its followers;
            each distribution must provide ``max()``.
        word: the word the chain starts from.
        num: maximum number of words to print (default 15).

    Returns:
        None. The words are written to standard output.

    The chain stops early, instead of raising, when the current word has no
    recorded followers.
    """
    for _ in range(num):
        print(word, end=" ")
        # Test membership before indexing so that a mapping which creates entries on
        # lookup does not grow an empty distribution for an unseen word.
        if word not in cfdist or not cfdist[word]:
            break
        word = cfdist[word].max()
        
text = nltk.corpus.genesis.words("english-kjv.txt")
bigrams = nltk.bigrams(text)
cfd = nltk.ConditionalFreqDist(bigrams)

In [38]:
cfd["living"] # words used just after living

FreqDist({',': 1,
          '.': 1,
          'creature': 7,
          'soul': 1,
          'substance': 2,
          'thing': 4})

In [40]:
def plural(word):
    """Return a rule-based English plural of `word`.

    Rules, applied in order:
      * empty string            -> returned unchanged
      * vowel + "y"             -> add "s"          (boy -> boys)
      * any other final "y"     -> "y" becomes "ies" (fairy -> fairies)
      * final s, x, sh or ch    -> add "es"         (fox -> foxes)
      * final "an"              -> "an" becomes "en" (woman -> women)
      * anything else           -> add "s"

    The "an" rule is a heuristic kept from the original version; it is wrong for
    regular nouns such as "fan".
    """
    if not word:
        return word
    if word.endswith("y"):
        # Only a consonant before the "y" triggers the "ies" spelling.
        if len(word) > 1 and word[-2].lower() in "aeiou":
            return word + "s"
        return word[:-1] + "ies"
    if word.endswith(("s", "x", "sh", "ch")):
        return word + "es"
    if word.endswith("an"):
        return word[:-2] + "en"
    return word + "s"

In [50]:
plural("elephant")
# the slices that plural() relies on, shown on a sample word
"elephant"[:-2]   # everything except the last two characters
"elephant"[-1]    # last character
"elephant"[:-1]   # everything except the last character
"elephant"[-2:]   # last two characters

'elephants'

'elepha'

't'

'elephan'

'nt'

In [52]:
# Stopwords
from nltk.corpus import stopwords
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [55]:
# define a function to compute what fraction of words in a text are not in the stopwords list
def content_fraction(text):
    """Return the fraction of tokens in `text` that are not English stopwords.

    Parameters:
        text: a sized iterable of string tokens (it must support ``len()``).

    Returns:
        A float: the number of tokens whose lower-cased form is not in NLTK's
        English stopword list, divided by the number of tokens. Empty input
        returns 0.0 instead of dividing by zero.
    """
    total = len(text)
    if total == 0:
        return 0.0
    # A set makes each membership test constant-time; the stopword list would be
    # scanned once per token otherwise.
    stopword_set = set(nltk.corpus.stopwords.words("english"))
    content_count = sum(1 for w in text if w.lower() not in stopword_set)
    return content_count / total

content_fraction(nltk.corpus.reuters.words())

0.735240435097661

In [56]:
# Word puzzle
puzzle_letters = nltk.FreqDist("egivrvonl")
obligatory = "r"
wordlist = nltk.corpus.words.words()

In [60]:
[w for w in wordlist if len(w) > 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters]

['involver', 'lovering', 'overling', 'revolving']

In [61]:
# Names corpus. Let us find the names that appear in both the male and the female lists
names = nltk.corpus.names
names.fileids()

['female.txt', 'male.txt']

In [62]:
male_names = names.words("male.txt")
female_names = names.words("female.txt")

In [63]:
# Build the set once so that each membership test is a hash lookup instead of a scan of
# the whole female list; the result keeps the order of male_names.
female_name_set = set(female_names)
[w for w in male_names if w in female_name_set]

['Abbey',
 'Abbie',
 'Abby',
 'Addie',
 'Adrian',
 'Adrien',
 'Ajay',
 'Alex',
 'Alexis',
 'Alfie',
 'Ali',
 'Alix',
 'Allie',
 'Allyn',
 'Andie',
 'Andrea',
 'Andy',
 'Angel',
 'Angie',
 'Ariel',
 'Ashley',
 'Aubrey',
 'Augustine',
 'Austin',
 'Averil',
 'Barrie',
 'Barry',
 'Beau',
 'Bennie',
 'Benny',
 'Bernie',
 'Bert',
 'Bertie',
 'Bill',
 'Billie',
 'Billy',
 'Blair',
 'Blake',
 'Bo',
 'Bobbie',
 'Bobby',
 'Brandy',
 'Brett',
 'Britt',
 'Brook',
 'Brooke',
 'Brooks',
 'Bryn',
 'Cal',
 'Cam',
 'Cammy',
 'Carey',
 'Carlie',
 'Carlin',
 'Carmine',
 'Carroll',
 'Cary',
 'Caryl',
 'Casey',
 'Cass',
 'Cat',
 'Cecil',
 'Chad',
 'Chris',
 'Chrissy',
 'Christian',
 'Christie',
 'Christy',
 'Clair',
 'Claire',
 'Clare',
 'Claude',
 'Clem',
 'Clemmie',
 'Cody',
 'Connie',
 'Constantine',
 'Corey',
 'Corrie',
 'Cory',
 'Courtney',
 'Cris',
 'Daffy',
 'Dale',
 'Dallas',
 'Dana',
 'Dani',
 'Daniel',
 'Dannie',
 'Danny',
 'Darby',
 'Darcy',
 'Darryl',
 'Daryl',
 'Deane',
 'Del',
 'Dell',
 'Deme

In [64]:
# Names that end with "a" are almost always female. To check this, count the final letter
# of every name separately per file (condition = file id, event = last letter) and plot it.
cfd = nltk.ConditionalFreqDist(
    (gender_file, person[-1])
    for gender_file in names.fileids()
    for person in names.words(gender_file)
)
cfd.plot()

In [66]:
# A pronouncing dictionary
entries = nltk.corpus.cmudict.entries()
len(entries)
for entry in entries[42371:42379]:
    print(entry)

133737

('fir', ['F', 'ER1'])
('fire', ['F', 'AY1', 'ER0'])
('fire', ['F', 'AY1', 'R'])
('firearm', ['F', 'AY1', 'ER0', 'AA2', 'R', 'M'])
('firearm', ['F', 'AY1', 'R', 'AA2', 'R', 'M'])
('firearms', ['F', 'AY1', 'ER0', 'AA2', 'R', 'M', 'Z'])
('firearms', ['F', 'AY1', 'R', 'AA2', 'R', 'M', 'Z'])
('fireball', ['F', 'AY1', 'ER0', 'B', 'AO2', 'L'])


In [67]:
# Each entry is a (word, pronunciation) pair, so the loop unpacks it into two variables
# instead of using a single `entry`. Print every three-phone word that starts with P and
# ends with T, together with its middle phone.
for word, pron in entries:
    if len(pron) != 3:
        continue
    ph1, ph2, ph3 = pron
    if (ph1, ph3) == ("P", "T"):
        print(word, ph2, end=" ")

pait EY1 pat AE1 pate EY1 patt AE1 peart ER1 peat IY1 peet IY1 peete IY1 pert ER1 pet EH1 pete IY1 pett EH1 piet IY1 piette IY1 pit IH1 pitt IH1 pot AA1 pote OW1 pott AA1 pout AW1 puett UW1 purt ER1 put UH1 putt AH1 

In [71]:
# A for clause inside a list comprehension can unpack pairs as well. Here it finds every
# word whose pronunciation ends with a syllable sounding like "nicks"; the same approach
# can be used to find rhyming words.
syllable = ["N", "IH0", "K", "S"]
[spelling for spelling, phones in entries if phones[-4:] == syllable]

["atlantic's",
 'audiotronics',
 'avionics',
 'beatniks',
 'calisthenics',
 'centronics',
 'chamonix',
 'chetniks',
 "clinic's",
 'clinics',
 'conics',
 'conics',
 'cryogenics',
 'cynics',
 'diasonics',
 "dominic's",
 'ebonics',
 'electronics',
 "electronics'",
 "endotronics'",
 'endotronics',
 'enix',
 'environics',
 'ethnics',
 'eugenics',
 'fibronics',
 'flextronics',
 'harmonics',
 'hispanics',
 'histrionics',
 'identics',
 'ionics',
 'kibbutzniks',
 'lasersonics',
 'lumonics',
 'mannix',
 'mechanics',
 "mechanics'",
 'microelectronics',
 'minix',
 'minnix',
 'mnemonics',
 'mnemonics',
 'molonicks',
 'mullenix',
 'mullenix',
 'mullinix',
 'mulnix',
 "munich's",
 'nucleonics',
 'onyx',
 'organics',
 "panic's",
 'panics',
 'penix',
 'pennix',
 'personics',
 'phenix',
 "philharmonic's",
 'phoenix',
 'phonics',
 'photronics',
 'pinnix',
 'plantronics',
 'pyrotechnics',
 'refuseniks',
 "resnick's",
 'respironics',
 'sconnix',
 'siliconix',
 'skolniks',
 'sonics',
 'sputniks',
 'technics

In [72]:
# A conditional frequency distribution can group minimally contrasting sets of words.
# Take every P-word made of exactly three sounds and key it by its first and last sound,
# e.g. "P-T"; the word itself is the event.
p3 = [
    (f"{phones[0]}-{phones[2]}", spelling)
    for spelling, phones in entries
    if phones[0] == "P" and len(phones) == 3
]
cfd = nltk.ConditionalFreqDist(p3)

In [73]:
# Show only the templates with more than 10 distinct words, truncating each word list
# to 70 characters.
for template in sorted(cfd.conditions()):
    if len(cfd[template]) <= 10:
        continue
    words = sorted(cfd[template])
    wordstring = " ".join(words)
    print(template, f"{wordstring[:70]}...")

P-CH patch pautsch peach perch petsch petsche piche piech pietsch pitch pit...
P-K pac pack paek paik pak pake paque peak peake pech peck peek perc perk ...
P-L pahl pail paille pal pale pall paul paule paull peal peale pearl pearl...
P-N paign pain paine pan pane pawn payne peine pen penh penn pin pine pinn...
P-P paap paape pap pape papp paup peep pep pip pipe pipp poop pop pope pop...
P-R paar pair par pare parr pear peer pier poor poore por pore porr pour...
P-S pace pass pasts peace pearse pease perce pers perse pesce piece piss p...
P-T pait pat pate patt peart peat peet peete pert pet pete pett piet piett...
P-UW1 peru peugh pew plew plue prew pru prue prugh pshew pugh...
P-Z p's p.'s p.s pais paiz pao's pas pause paws pays paz peas pease pei's ...


In [76]:
# Rather than iterating over the whole list of entries, we can look up the pronunciations of a particular word directly
# by loading the pronouncing dictionary as a Python dict keyed by word
prondict = nltk.corpus.cmudict.dict()

In [80]:
prondict["fire"]

[['F', 'AY1', 'ER0'], ['F', 'AY1', 'R']]

In [83]:
# Comparative wordlists is another example of a tabular lexicon
from nltk.corpus import swadesh # a list of most common 200 words in different languages
swadesh.fileids()
swadesh.words("en")

['be',
 'bg',
 'bs',
 'ca',
 'cs',
 'cu',
 'de',
 'en',
 'es',
 'fr',
 'hr',
 'it',
 'la',
 'mk',
 'nl',
 'pl',
 'pt',
 'ro',
 'ru',
 'sk',
 'sl',
 'sr',
 'sw',
 'uk']

['I',
 'you (singular), thou',
 'he',
 'we',
 'you (plural)',
 'they',
 'this',
 'that',
 'here',
 'there',
 'who',
 'what',
 'where',
 'when',
 'how',
 'not',
 'all',
 'many',
 'some',
 'few',
 'other',
 'one',
 'two',
 'three',
 'four',
 'five',
 'big',
 'long',
 'wide',
 'thick',
 'heavy',
 'small',
 'short',
 'narrow',
 'thin',
 'woman',
 'man (adult male)',
 'man (human being)',
 'child',
 'wife',
 'husband',
 'mother',
 'father',
 'animal',
 'fish',
 'bird',
 'dog',
 'louse',
 'snake',
 'worm',
 'tree',
 'forest',
 'stick',
 'fruit',
 'seed',
 'leaf',
 'root',
 'bark (from tree)',
 'flower',
 'grass',
 'rope',
 'skin',
 'meat',
 'blood',
 'bone',
 'fat (noun)',
 'egg',
 'horn',
 'tail',
 'feather',
 'hair',
 'head',
 'ear',
 'eye',
 'nose',
 'mouth',
 'tooth',
 'tongue',
 'fingernail',
 'foot',
 'leg',
 'knee',
 'hand',
 'wing',
 'belly',
 'guts',
 'neck',
 'back',
 'breast',
 'heart',
 'liver',
 'drink',
 'eat',
 'bite',
 'suck',
 'spit',
 'vomit',
 'blow',
 'breathe',
 'laugh',

In [85]:
# we can access words from multiple languages using the entries method
fr2en = swadesh.entries(["fr", "en"])
fr2en

[('je', 'I'),
 ('tu, vous', 'you (singular), thou'),
 ('il', 'he'),
 ('nous', 'we'),
 ('vous', 'you (plural)'),
 ('ils, elles', 'they'),
 ('ceci', 'this'),
 ('cela', 'that'),
 ('ici', 'here'),
 ('là', 'there'),
 ('qui', 'who'),
 ('quoi', 'what'),
 ('où', 'where'),
 ('quand', 'when'),
 ('comment', 'how'),
 ('ne...pas', 'not'),
 ('tout', 'all'),
 ('plusieurs', 'many'),
 ('quelques', 'some'),
 ('peu', 'few'),
 ('autre', 'other'),
 ('un', 'one'),
 ('deux', 'two'),
 ('trois', 'three'),
 ('quatre', 'four'),
 ('cinq', 'five'),
 ('grand', 'big'),
 ('long', 'long'),
 ('large', 'wide'),
 ('épais', 'thick'),
 ('lourd', 'heavy'),
 ('petit', 'small'),
 ('court', 'short'),
 ('étroit', 'narrow'),
 ('mince', 'thin'),
 ('femme', 'woman'),
 ('homme', 'man (adult male)'),
 ('homme', 'man (human being)'),
 ('enfant', 'child'),
 ('femme, épouse', 'wife'),
 ('mari, époux', 'husband'),
 ('mère', 'mother'),
 ('père', 'father'),
 ('animal', 'animal'),
 ('poisson', 'fish'),
 ('oiseau', 'bird'),
 ('chien', 'dog'

In [87]:
translate = dict(fr2en)
translate["chien"]

'dog'

In [90]:
# we can also compare multiple languages
languages = ['en', 'de', 'nl', 'es', 'fr', 'pt', 'la']
# Build the comparative table once; it does not change between iterations.
swadesh_rows = swadesh.entries(languages)
for i in [139, 140, 141, 142]:
    print(swadesh_rows[i])

('say', 'sagen', 'zeggen', 'decir', 'dire', 'dizer', 'dicere')
('sing', 'singen', 'zingen', 'cantar', 'chanter', 'cantar', 'canere')
('play', 'spielen', 'spelen', 'jugar', 'jouer', 'jogar, brincar', 'ludere')
('float', 'schweben', 'zweven', 'flotar', 'flotter', 'flutuar, boiar', 'fluctuare')


In [93]:
# Wordnet: a semantically oriented dictionary of english
from nltk.corpus import wordnet as wn
wn.synsets("motorcar") # motorcar has only one meaning and is identified as car.n.01 (synset = synonym set)

[Synset('car.n.01')]

In [94]:
wn.synset("car.n.01").lemma_names() # a synset is a collection of synonymous words (or lemmas)

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [98]:
wn.synset("car.n.01").definition() # synsets also come with definition and examples
wn.synset("car.n.01").examples() 

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

['he needs a car to get to work']

In [99]:
# the pairing of a synset with a word is called a lemma
wn.synset("car.n.01").lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [100]:
wn.lemma('car.n.01.automobile')
wn.lemma('car.n.01.automobile').synset()
wn.lemma('car.n.01.automobile').name()

Lemma('car.n.01.automobile')

Synset('car.n.01')

'automobile'

In [104]:
# motorcar has one synset but car has 5
wn.synsets("car")

for synset in wn.synsets("car"):
    print(synset.lemma_names())
    
# we can also access all lemma's involving the word car as follows
wn.lemmas("car")

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

['car', 'auto', 'automobile', 'machine', 'motorcar']
['car', 'railcar', 'railway_car', 'railroad_car']
['car', 'gondola']
['car', 'elevator_car']
['cable_car', 'car']


[Lemma('car.n.01.car'),
 Lemma('car.n.02.car'),
 Lemma('car.n.03.car'),
 Lemma('car.n.04.car'),
 Lemma('cable_car.n.01.car')]

In [107]:
# wordnet makes it easy to navigate between concepts, e.g. we can look at the hyponyms
motorcar = wn.synset("car.n.01")
types_of_motorcar = motorcar.hyponyms()
types_of_motorcar[0]

sorted(lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas())

Synset('ambulance.n.01')

['Model_T',
 'S.U.V.',
 'SUV',
 'Stanley_Steamer',
 'ambulance',
 'beach_waggon',
 'beach_wagon',
 'bus',
 'cab',
 'compact',
 'compact_car',
 'convertible',
 'coupe',
 'cruiser',
 'electric',
 'electric_automobile',
 'electric_car',
 'estate_car',
 'gas_guzzler',
 'hack',
 'hardtop',
 'hatchback',
 'heap',
 'horseless_carriage',
 'hot-rod',
 'hot_rod',
 'jalopy',
 'jeep',
 'landrover',
 'limo',
 'limousine',
 'loaner',
 'minicar',
 'minivan',
 'pace_car',
 'patrol_car',
 'phaeton',
 'police_car',
 'police_cruiser',
 'prowl_car',
 'race_car',
 'racer',
 'racing_car',
 'roadster',
 'runabout',
 'saloon',
 'secondhand_car',
 'sedan',
 'sport_car',
 'sport_utility',
 'sport_utility_vehicle',
 'sports_car',
 'squad_car',
 'station_waggon',
 'station_wagon',
 'stock_car',
 'subcompact',
 'subcompact_car',
 'taxi',
 'taxicab',
 'tourer',
 'touring_car',
 'two-seater',
 'used-car',
 'waggon',
 'wagon']

In [110]:
# we can also navigate up the hierarchy by visiting hypernyms. Some words have multiple paths as they can be classified in more
# than one way
motorcar.hypernyms()
paths = motorcar.hypernym_paths()
len(paths)
[synset.name() for synset in paths[0]]
[synset.name() for synset in paths[1]]

[Synset('motor_vehicle.n.01')]

2

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'artifact.n.01',
 'instrumentality.n.03',
 'container.n.01',
 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01',
 'motor_vehicle.n.01',
 'car.n.01']

['entity.n.01',
 'physical_entity.n.01',
 'object.n.01',
 'whole.n.02',
 'artifact.n.01',
 'instrumentality.n.03',
 'conveyance.n.03',
 'vehicle.n.01',
 'wheeled_vehicle.n.01',
 'self-propelled_vehicle.n.01',
 'motor_vehicle.n.01',
 'car.n.01']

In [113]:
# Hypernyms and hyponyms relate one synset to another and are hence called lexical relations
# Another way to navigate the wordnet network is from items to their components (meronyms) or the things they are contained in(holonyms)
# e.g. parts of a tree are its trunk, crown - part_meronyms and a tree is made up of heartwood and sapwood - substance_meronyms
# A collection of trees forms a forest - member_holonyms
wn.synset("tree.n.01").part_meronyms()
wn.synset("tree.n.01").substance_meronyms()
wn.synset("tree.n.01").member_holonyms()

[Synset('burl.n.02'),
 Synset('crown.n.07'),
 Synset('limb.n.02'),
 Synset('stump.n.01'),
 Synset('trunk.n.01')]

[Synset('heartwood.n.01'), Synset('sapwood.n.01')]

[Synset('forest.n.01')]

In [116]:
# To see how intricate things can get, consider the noun senses of the word "mint".
for synset in wn.synsets("mint", wn.NOUN):
    print(f"{synset.name()}:{synset.definition()}")
# mint.n.04 (the leaves) is a part of mint.n.02 (the plant) and the substance from which mint.n.05 (the candy) is made

batch.n.02:(often followed by `of') a large number or amount or extent
mint.n.02:any north temperate plant of the genus Mentha with aromatic leaves and small mauve flowers
mint.n.03:any member of the mint family of plants
mint.n.04:the leaves of a mint plant used fresh or candied
mint.n.05:a candy that is flavored with a mint oil
mint.n.06:a plant where money is coined by authority of the government


In [118]:
wn.synset("mint.n.04").part_holonyms()
wn.synset("mint.n.04").substance_holonyms()

[Synset('mint.n.02')]

[Synset('mint.n.05')]

In [119]:
# some lexical relations, such as antonymy, hold between lemmas rather than synsets; the lookups below
# query a noun (n), a verb (v), an adjective (a) and an adverb (r) lemma
wn.lemma('supply.n.02.supply').antonyms()
wn.lemma('rush.v.01.rush').antonyms()
wn.lemma('horizontal.a.01.horizontal').antonyms()
wn.lemma('staccato.r.01.staccato').antonyms()

[Lemma('demand.n.02.demand')]

[Lemma('linger.v.04.linger')]

[Lemma('inclined.a.02.inclined'), Lemma('vertical.a.01.vertical')]

[Lemma('legato.r.01.legato')]

In [121]:
# Semantic Similarity 
# Synsets are linked by a network of lexical relations. given a particular synset, we can traverse the wordnet network to find 
# synsets with related meanings. each synset has one or more hypernym paths that link it to a root hypernym such as entity.n.01
# Two synsets linked to the same root may have several hypernyms in common and if two share a very specific hypernym, they
# must be closely related
right = wn.synset("right_whale.n.01")
orca = wn.synset("orca.n.01")
minke = wn.synset("minke_whale.n.01")
tortoise = wn.synset("tortoise.n.01")
novel = wn.synset("novel.n.01")
right.lowest_common_hypernyms(minke)
right.lowest_common_hypernyms(orca)
right.lowest_common_hypernyms(tortoise)
right.lowest_common_hypernyms(novel)

[Synset('baleen_whale.n.01')]

[Synset('whale.n.02')]

[Synset('vertebrate.n.01')]

[Synset('entity.n.01')]

In [123]:
# we can quantify the generality of the lowest common hypernym by looking at the depth of each synset
wn.synset("baleen_whale.n.01").min_depth()
wn.synset("whale.n.02").min_depth()
wn.synset("vertebrate.n.01").min_depth()
wn.synset("entity.n.01").min_depth()

14

13

8

0

In [125]:
# Similarity measures have been defined over the collection of WordNet synsets which incorporate the above insight (range = [0, 1])
right.path_similarity(minke)
right.path_similarity(orca)
right.path_similarity(tortoise)
right.path_similarity(novel)

0.25

0.16666666666666666

0.07692307692307693

0.043478260869565216