## Gutenberg

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.corpus import gutenberg

# Installing the nltk package does not install any corpus data. Fetch the
# Gutenberg corpus when it is not available locally so this cell also works on
# a freshly provisioned environment (Restart Kernel -> Run All).
try:
    nltk.data.find("corpora/gutenberg")
except LookupError:
    nltk.download("gutenberg", quiet=True)

# File ids of the texts bundled in the corpus.
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# Modal verbs whose usage is compared across the Gutenberg texts.
modals = ['can', 'could', 'may', 'might', 'will', 'would', 'should']

# Conditional frequency distribution keyed by modal (condition) and counting the
# file ids (samples) in which it occurs. Matching is case-sensitive.
cfdist = nltk.ConditionalFreqDist(
    (token, fileid)
    for fileid in gutenberg.fileids()
    for token in gutenberg.words(fileid)
    if token in modals
)

In [4]:
import pandas as pd

# Tabulate the distribution: one row per text, one column per modal.
# A modal that never occurs in a text has no entry in the distribution and would
# surface as NaN (turning the whole column into floats), so record it as an
# explicit zero and keep the counts as integers.
cfdist = pd.DataFrame(cfdist).fillna(0).astype(int)
cfdist

Unnamed: 0,would,could,might,will,may,should,can
austen-emma.txt,815,825,322,559,213,366,270
austen-persuasion.txt,351,444,166,162,87,185,100
austen-sense.txt,507,568,215,354,169,228,206
bible-kjv.txt,443,165,475,3807,1024,768,213
blake-poems.txt,3,3,2,3,5,6,20
bryant-stories.txt,110,154,23,144,18,38,75
burgess-busterbrown.txt,46,56,17,19,3,13,23
carroll-alice.txt,70,73,28,24,11,27,57
chesterton-ball.txt,139,117,69,198,90,75,131
chesterton-brown.txt,132,170,71,111,47,56,126


In [5]:
# Total occurrences of each modal over all texts, ordered from most to least used.
modal_count = (
    cfdist
    .sum()
    .sort_values(ascending=False)
)
modal_count

will      7130
would     3932
could     3528
should    2496
may       2435
can       2163
might     1938
dtype: int64

In [6]:
# modal_count is sorted in descending order, so its first label is the most used modal.
most_frequent_modal = modal_count.index[0]
print(f"Most Frequent Modal : {most_frequent_modal}")
print("\n")
# idxmax()/idxmin() return the label of the first row holding the extreme value
# and skip missing values, replacing the boolean mask built from builtin max()/min().
print(f"Text with maximum count of '{most_frequent_modal}': {cfdist[most_frequent_modal].idxmax()}")
print("\n")
print(f"Text with minimum count of '{most_frequent_modal}': {cfdist[most_frequent_modal].idxmin()}")

Most Frequent Modal : will


Text with maximum count of 'will': bible-kjv.txt


Text with minimum count of 'will': blake-poems.txt


In [7]:
# modal_count is sorted in descending order, so its last label is the least used modal.
least_frequent_modal = modal_count.index[-1]
print(f"Least Frequent Modal : {least_frequent_modal}")
print("\n")
# idxmax()/idxmin() return the label of the first row holding the extreme value
# and skip missing values, replacing the boolean mask built from builtin max()/min().
print(f"Text with maximum count of '{least_frequent_modal}': {cfdist[least_frequent_modal].idxmax()}")
print("\n")
print(f"Text with minimum count of '{least_frequent_modal}': {cfdist[least_frequent_modal].idxmin()}")

Least Frequent Modal : might


Text with maximum count of 'might': bible-kjv.txt


Text with minimum count of 'might': blake-poems.txt


In [8]:
# Wrap the token streams of the two texts reported above as the extremes
# (bible-kjv.txt: maximum counts, blake-poems.txt: minimum counts) in nltk.Text
# objects so they can be explored with concordance() and similar().
text1 = nltk.Text(gutenberg.words('bible-kjv.txt'))
text2 = nltk.Text(gutenberg.words('blake-poems.txt'))

In [9]:
# Text.concordance() prints its matches itself and returns None, so wrapping the
# call in print() only appends a stray "None" line to the output.
text1.concordance(most_frequent_modal)
print("\n")
text2.concordance(most_frequent_modal)

Displaying 25 of 3836 matches:
ood that the man should be alone ; I will make him an help meet for him . 2 : 
 the days of thy life : 3 : 15 And I will put enmity between thee and the woma
 . 3 : 16 Unto the woman he said , I will greatly multiply thy sorrow and thy 
 heart . 6 : 7 And the LORD said , I will destroy man whom I have created from
ence through them ; and , behold , I will destroy them with the earth . 6 : 14
rth shall die . 6 : 18 But with thee will I establish my covenant ; and thou s
h . 7 : 4 For yet seven days , and I will cause it to rain upon the earth fort
ry living substance that I have made will I destroy from off the face of the e
; and the LORD said in his heart , I will not again curse the ground any more 
art is evil from his youth ; neither will I again smite any more every thing l
 And surely your blood of your lives will I require ; at the hand of every bea
require ; at the hand of every beast will I require it , and at the hand of ma
at the hand of every 

In [10]:
# Text.concordance() prints its matches itself and returns None, so wrapping the
# call in print() only appends a stray "None" line to the output.
text1.concordance(least_frequent_modal)
print("\n")
text2.concordance(least_frequent_modal)

Displaying 25 of 475 matches:
aidst thou , She is my sister ? so I might have taken her to me to wife : now t
as not able to bear them , that they might dwell together : for their substance
raham said unto God , O that Ishmael might live before thee ! 17 : 19 And God s
ast done unto us ? one of the people might lightly have lien with thy wife , an
And Laban said , Behold , I would it might be according to thy word . 30 : 35 A
he cattle in the gutters , that they might conceive among the rods . 30 : 42 Bu
 me ; and didst not tell me , that I might have sent thee away with mirth , and
heir riches were more than that they might dwell together ; and the land wherei
, and lay no hand upon him ; that he might rid him out of their hands , to deli
y themselves : because the Egyptians might not eat bread with the Hebrews ; for
 Reuben , thou art my firstborn , my might , and the beginning of my strength ,
d the heart of his servants , that I might shew these my signs before him : 10 
e urgent u

In [11]:
# Text.similar() prints the distributionally similar words itself and returns
# None, so wrapping the call in print() only appends a stray "None" line.
text1.similar(most_frequent_modal)
print("\n")
text2.similar(most_frequent_modal)

shall and may that to should said have for would do might not but did
is when was as hath
None


have
None


In [12]:
# Text.similar() prints the distributionally similar words itself and returns
# None, so wrapping the call in print() only appends a stray "None" line.
text1.similar(least_frequent_modal)
print("\n")
text2.similar(least_frequent_modal)

shall may will should would and to not that said god cannot did do
must life people name mouth strength
None



None


## Inaugural 

In [24]:
import nltk
from nltk.corpus import inaugural

# Installing the nltk package does not install any corpus data. Fetch the
# inaugural-address corpus when it is not available locally so this cell also
# works on a freshly provisioned environment.
try:
    nltk.data.find("corpora/inaugural")
except LookupError:
    nltk.download("inaugural", quiet=True)

# File ids of the speeches bundled in the corpus.
print(inaugural.fileids())

['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reaga

In [14]:
# Upper-case every token of the inaugural corpus so that later counting ignores case.
inaugural_upper_case = list(map(str.upper, inaugural.words()))

In [15]:
# Frequency distribution restricted to tokens longer than 7 characters.
fdist = nltk.FreqDist(
    token
    for token in inaugural_upper_case
    if len(token) >= 8
)

In [16]:
# Re-express the frequency distribution as a pandas Series (word -> count) and
# order it from the most to the least frequent word.
# NOTE: from here on `fdist` is a Series, no longer an nltk.FreqDist.
fdist = pd.Series(fdist)
fdist = fdist.sort_values(ascending=False)
fdist

GOVERNMENT      600
CITIZENS        247
CONSTITUTION    206
AMERICAN        163
NATIONAL        157
               ... 
PETITIONS         1
DISCLOSED         1
MANUFACTURER      1
SHIPMASTER        1
ATTAINING         1
Length: 4799, dtype: int64

In [17]:
# fdist is sorted in descending order, so its first ten labels are the ten most used words.
top10_used_words = fdist.index[:10].tolist()
top10_used_words

['GOVERNMENT',
 'CITIZENS',
 'CONSTITUTION',
 'AMERICAN',
 'NATIONAL',
 'CONGRESS',
 'INTERESTS',
 'POLITICAL',
 'EXECUTIVE',
 'PRINCIPLES']

In [18]:
from nltk.corpus import wordnet as wn  # WordNet: a semantically oriented dictionary of English

# Installing the nltk package does not install WordNet. Fetch it when it is not
# available locally so this cell also works on a freshly provisioned environment.
try:
    nltk.data.find("corpora/wordnet")
except LookupError:
    nltk.download("wordnet", quiet=True)

# For each of the top-10 words, collect the upper-cased lemma names of every
# synset (a collection of synonymous entities) the word belongs to.
# key: word, value: set of synonyms (the set removes duplicates across synsets).
synonym_dict_top10 = {
    word: {
        lemma.upper()
        for synset in wn.synsets(word)
        for lemma in synset.lemma_names()
    }
    for word in top10_used_words
}

In [26]:
# Show every top-10 word next to its synonym set, one blank line between entries.
for word, synonyms in synonym_dict_top10.items():
    print(f"{word} : {synonyms}\n")

GOVERNMENT : {'GOVERNMENT', 'POLITICS', 'ADMINISTRATION', 'GOVERNANCE', 'GOVERNMENT_ACTIVITY', 'REGIME', 'AUTHORITIES', 'POLITICAL_SCIENCE', 'GOVERNING'}

CITIZENS : {'CITIZEN'}

CONSTITUTION : {'MAKEUP', 'US_CONSTITUTION', 'CONSTITUTION_OF_THE_UNITED_STATES', 'CONSTITUTION', 'COMPOSITION', 'ORGANIC_LAW', 'FORMATION', 'ORGANIZATION', 'FUNDAMENTAL_LAW', 'MAKE-UP', 'PHYSICAL_COMPOSITION', 'ORGANISATION', 'U.S._CONSTITUTION', 'UNITED_STATES_CONSTITUTION', 'OLD_IRONSIDES', 'ESTABLISHMENT'}

AMERICAN : {'AMERICAN', 'AMERICAN_LANGUAGE', 'AMERICAN_ENGLISH'}

NATIONAL : {'SUBJECT', 'INTERNAL', 'NATIONAL', 'INTERIOR', 'HOME'}

CONGRESS : {'CONGRESS', 'COPULATION', 'US_CONGRESS', 'COITION', 'CARNAL_KNOWLEDGE', 'SEXUAL_CONGRESS', 'SEXUAL_INTERCOURSE', 'INTERCOURSE', 'U.S._CONGRESS', 'SEX_ACT', 'COITUS', 'RELATION', 'SEXUAL_RELATION', 'UNITED_STATES_CONGRESS'}

INTERESTS : {'SAKE', 'CONCERN', 'STAKE', 'PASTIME', 'INVOLVEMENT', 'INTEREST_GROUP', 'MATTER_TO', 'PURSUIT', 'OCCUPY', 'INTEREST', 'INTERE

In [20]:
# Number of synonyms found for each word (key: word, value: synonym count).
len_synonym_dict_top10 = {
    word: len(synonyms)
    for word, synonyms in synonym_dict_top10.items()
}

print(len_synonym_dict_top10)  # word -> synonym count

# Word with the largest number of synonyms.
max(len_synonym_dict_top10, key=lambda word: len_synonym_dict_top10[word])

{'GOVERNMENT': 9, 'CITIZENS': 1, 'CONSTITUTION': 16, 'AMERICAN': 3, 'NATIONAL': 5, 'CONGRESS': 14, 'INTERESTS': 12, 'POLITICAL': 1, 'EXECUTIVE': 3, 'PRINCIPLES': 4}


'CONSTITUTION'

In [21]:
# For each of the top-10 words, walk every synset of the word, then every
# hyponym of that synset (immediate concepts that are more specific), and collect
# the upper-cased lemma names of those hyponyms.
# key: word, value: set of hyponym lemmas (the set removes duplicates).
hyponym_dict_top10 = {
    word: {
        lemma.upper()
        for synset in wn.synsets(word)
        for hyponym in synset.hyponyms()
        for lemma in hyponym.lemma_names()
    }
    for word in top10_used_words
}

In [22]:
# Show every top-10 word next to its hyponym set, one blank line between entries.
for word, hyponyms in hyponym_dict_top10.items():
    print(f"{word} : {hyponyms}\n")

GOVERNMENT : {'STATE_GOVERNMENT', 'PAPACY', 'TOTALITARIAN_STATE', 'REALPOLITIK', 'PUPPET_STATE', 'COURT', 'DOWNING_STREET', 'PRACTICAL_POLITICS', 'AUTHORITARIAN_REGIME', 'BUREAUCRACY', 'FEDERAL_GOVERNMENT', 'EMPIRE', 'STRATOCRACY', 'ANCIEN_REGIME', 'TRUST_BUSTING', 'GEOPOLITICS', 'PONTIFICATE', 'MISRULE', 'LEGISLATING', 'ROYAL_COURT', 'LOCAL_GOVERNMENT', 'LEGISLATION', 'AUTHORITARIAN_STATE', 'PALACE', 'MILITARY_GOVERNMENT', 'PUPPET_GOVERNMENT', 'TOTALITATION_REGIME', 'PUPET_REGIME', 'GOVERNMENT-IN-EXILE', 'STATE', 'LAWMAKING', 'MISGOVERNMENT'}

CITIZENS : {'PRIVATE_CITIZEN', 'FREEWOMAN', 'THANE', 'CIVILIAN', 'ACTIVE_CITIZEN', 'REPATRIATE', 'VOTER', 'FREEMAN', 'ELECTOR'}

CONSTITUTION : {'COLONISATION', 'UNIONIZATION', 'GENETIC_CONSTITUTION', 'GENOTYPE', 'KARYOTYPE', 'FEDERATION', 'PHENOTYPE', 'UNIONISATION', 'STRUCTURE', 'COLLECTIVISATION', 'COMMUNISATION', 'TEXTURE', 'COLONIZATION', 'GRAIN', 'COMMUNIZATION', 'SETTLEMENT', 'COLLECTIVIZATION'}

AMERICAN : {'SOUTH_DAKOTAN', 'LOUISIANIAN'

In [23]:
# Number of hyponyms found for each word (key: word, value: hyponym count).
len_hyponym_dict_top10 = {
    word: len(hyponyms)
    for word, hyponyms in hyponym_dict_top10.items()
}

print(len_hyponym_dict_top10)  # word -> hyponym count

# Word with the largest number of hyponyms.
max(len_hyponym_dict_top10, key=lambda word: len_hyponym_dict_top10[word])

{'GOVERNMENT': 32, 'CITIZENS': 9, 'CONSTITUTION': 17, 'AMERICAN': 109, 'NATIONAL': 4, 'CONGRESS': 18, 'INTERESTS': 43, 'POLITICAL': 0, 'EXECUTIVE': 18, 'PRINCIPLES': 62}


'AMERICAN'