# Basics of Natural Language Processing using NLTK

### Python Imports

In [1]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.util import ngrams

In [2]:
# Load (or reload, if it is already loaded) the `watermark` IPython extension.
%reload_ext watermark

In [3]:
# Record the environment of this run; the output below lists the
# Python/IPython versions and the versions of the imported packages.
%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.7.0

nltk: 3.8.1



### Set path to nltk data

In [4]:
# Make the notebook-local NLTK data directory discoverable.
# Guarded so that re-running this cell does not append duplicate entries
# to the (process-wide) search path.
if "./nltk_data" not in nltk.data.path:
    nltk.data.path.append("./nltk_data")

### Subset of Texts from 'Project Gutenberg' packaged in NLTK

In [5]:
# File identifiers of the Project Gutenberg sample texts bundled with NLTK.
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

### Texts from 'Presidential Inaugural Speeches' packaged in NLTK

In [6]:
# File identifiers of the inaugural-address corpus bundled with NLTK
# (named "<year>-<president>.txt", as the output below shows).
nltk.corpus.inaugural.fileids()

['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

### Load and tokenize Charles Dickens' 'A Christmas Carol'

Alternatively, to load a text that ships with the NLTK Gutenberg corpus (for example 'Alice's Adventures in Wonderland') instead of a local file, use the following code:

md_words = nltk.corpus.gutenberg.words('carroll-alice.txt')

#### Read the entire file, convert to lowercase, and tokenize

In [7]:
# Read the whole book in one go, lowercase it, and split it into tokens.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open('txt_data/charles-christmas_carol.txt', encoding='utf-8') as fcc:
    all_tokens = word_tokenize(fcc.read().lower())

#### Keep only alphabetic tokens (removes punctuation and numbers)

In [8]:
# Retain only tokens consisting entirely of alphabetic characters;
# punctuation marks and numeric tokens are filtered out.
cc_tokens = list(filter(str.isalpha, all_tokens))

#### Use the WordPunctTokenizer from NLTK

In [9]:
# WordPunctTokenizer separates runs of word characters from runs of
# punctuation, so "marley's" becomes 'marley', "'", 's'.
tokenizer = WordPunctTokenizer()
# Explicit encoding keeps the result independent of the platform default.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open('txt_data/charles-christmas_carol.txt', encoding='utf-8') as fcc:
    all_tokens2 = tokenizer.tokenize(fcc.read().lower())
# Show a preview only; the full token list is far too long to display.
all_tokens2[:100]

['title',
 ':',
 'a',
 'christmas',
 'carol',
 'author',
 ':',
 'charles',
 'dickens',
 'illustrator',
 ':',
 'john',
 'leech',
 'release',
 'date',
 ':',
 'october',
 '30',
 ',',
 '2009',
 'language',
 ':',
 'english',
 'a',
 'christmas',
 'carol',
 'by',
 'charles',
 'dickens',
 'being',
 'a',
 'ghost',
 'story',
 'of',
 'christmas',
 'preface',
 'i',
 'have',
 'endeavoured',
 'in',
 'this',
 'ghostly',
 'little',
 'book',
 ',',
 'to',
 'raise',
 'the',
 'ghost',
 'of',
 'an',
 'idea',
 ',',
 'which',
 'shall',
 'not',
 'put',
 'my',
 'readers',
 'out',
 'of',
 'humour',
 'with',
 'themselves',
 ',',
 'with',
 'each',
 'other',
 ',',
 'with',
 'the',
 'season',
 ',',
 'or',
 'with',
 'me',
 '.',
 'may',
 'it',
 'haunt',
 'their',
 'houses',
 'pleasantly',
 ',',
 'and',
 'no',
 'one',
 'wish',
 'to',
 'lay',
 'it',
 '.',
 'their',
 'faithful',
 'friend',
 'and',
 'servant',
 ',',
 'c',
 '.',
 'd',
 '.',
 'december',
 ',',
 '1843',
 '.',
 'stave',
 'i',
 '.',
 'marley',
 "'",
 's',
 'g

#### Remove all punctuation tokens

In [10]:
# Drop every token that is made up solely of punctuation characters.
# A plain `word in string.punctuation` is a *substring* test against the
# punctuation string, so multi-character tokens such as '."' or '--' would
# slip through; checking character by character removes them as well.
punctuation_chars = set(string.punctuation)
cc_tokens2 = [
    word for word in all_tokens2
    if not all(ch in punctuation_chars for ch in word)
]
# Show a preview only; the full token list is far too long to display.
cc_tokens2[:100]

['title',
 'a',
 'christmas',
 'carol',
 'author',
 'charles',
 'dickens',
 'illustrator',
 'john',
 'leech',
 'release',
 'date',
 'october',
 '30',
 '2009',
 'language',
 'english',
 'a',
 'christmas',
 'carol',
 'by',
 'charles',
 'dickens',
 'being',
 'a',
 'ghost',
 'story',
 'of',
 'christmas',
 'preface',
 'i',
 'have',
 'endeavoured',
 'in',
 'this',
 'ghostly',
 'little',
 'book',
 'to',
 'raise',
 'the',
 'ghost',
 'of',
 'an',
 'idea',
 'which',
 'shall',
 'not',
 'put',
 'my',
 'readers',
 'out',
 'of',
 'humour',
 'with',
 'themselves',
 'with',
 'each',
 'other',
 'with',
 'the',
 'season',
 'or',
 'with',
 'me',
 'may',
 'it',
 'haunt',
 'their',
 'houses',
 'pleasantly',
 'and',
 'no',
 'one',
 'wish',
 'to',
 'lay',
 'it',
 'their',
 'faithful',
 'friend',
 'and',
 'servant',
 'c',
 'd',
 'december',
 '1843',
 'stave',
 'i',
 'marley',
 's',
 'ghost',
 'marley',
 'was',
 'dead',
 'to',
 'begin',
 'with',
 'there',
 'is',
 'no',
 'doubt',
 'whatever',
 'about',
 'that',

#### Display all the stop words

In [11]:
# NLTK's built-in list of English stop words (all lowercase, as the
# output below shows, which matches the lowercased tokens used here).
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

#### Remove all the stop words

In [12]:
# `stop_words` is a list, so each `in` test would scan it linearly for
# every token; a set gives constant-time membership with the same result.
stop_word_set = set(stop_words)
all_words = [word for word in cc_tokens if word not in stop_word_set]

#### Find the frequency of all of the words

In [13]:
# Count how many times each remaining word occurs (word -> count).
all_fd = nltk.FreqDist(all_words)
all_fd

FreqDist({'scrooge': 357, 'said': 221, 'upon': 120, 'one': 111, 'ghost': 94, 'would': 94, 'spirit': 88, 'christmas': 86, 'man': 76, 'old': 69, ...})

#### Find the top 20 commonly used words

In [14]:
# The 20 most frequent words as (word, count) pairs, highest count first.
all_fd.most_common(20)

[('scrooge', 357),
 ('said', 221),
 ('upon', 120),
 ('one', 111),
 ('ghost', 94),
 ('would', 94),
 ('spirit', 88),
 ('christmas', 86),
 ('man', 76),
 ('old', 69),
 ('could', 69),
 ('time', 66),
 ('little', 64),
 ('good', 63),
 ('like', 60),
 ('know', 59),
 ('bob', 53),
 ('cried', 52),
 ('see', 47),
 ('came', 46)]

#### Words used only once are called hapaxes

In [15]:
# Words that occur exactly once in the frequency distribution.
single_words = all_fd.hapaxes()
# Show a preview only; the full hapax list is very long.
single_words[:100]

['title',
 'author',
 'illustrator',
 'john',
 'leech',
 'date',
 'october',
 'english',
 'preface',
 'readers',
 'faithful',
 'register',
 'burial',
 'clergyman',
 'chief',
 'particularly',
 'regard',
 'deadest',
 'ironmongery',
 'wisdom',
 'ancestors',
 'simile',
 'unhallowed',
 'disturb',
 'permit',
 'repeat',
 'emphatically',
 'executor',
 'administrator',
 'assign',
 'residuary',
 'legatee',
 'dreadfully',
 'sad',
 'excellent',
 'solemnised',
 'undoubted',
 'brings',
 'distinctly',
 'relate',
 'hamlet',
 'stroll',
 'easterly',
 'ramparts',
 'rashly',
 'breezy',
 'spot',
 'paul',
 'instance',
 'literally',
 'astonish',
 'painted',
 'names',
 'grindstone',
 'squeezing',
 'wrenching',
 'covetous',
 'sinner',
 'flint',
 'oyster',
 'froze',
 'stiffened',
 'gait',
 'thin',
 'blue',
 'shrewdly',
 'grating',
 'frosty',
 'rime',
 'wiry',
 'temperature',
 'iced',
 'thaw',
 'external',
 'heat',
 'chill',
 'bitterer',
 'pelting',
 'heaviest',
 'hail',
 'sleet',
 'boast',
 'advantage',
 'respe

#### Remove all single words (hapaxes)

In [16]:
# `single_words` is a long list; testing membership against it for every
# token is quadratic. A set yields the identical result in linear time.
hapax_set = set(single_words)
cc_words = [word for word in all_words if word not in hapax_set]

#### Number of words in The Christmas Carol

In [17]:
# Total token count after removing stop words and hapaxes.
len(cc_words)

11044

#### Number of unique words in The Christmas Carol

In [18]:
# The frequency distribution has one entry per distinct word, so its
# length is the vocabulary size of `cc_words`.
cc_fd = nltk.FreqDist(cc_words)
len(cc_fd)

1697

#### Use Porter Stemmer to create the root words

In [19]:
# Reduce each word to its Porter stem (e.g. 'christmas' -> 'christma').
# Note: `cc_stems` is rebound by the Lancaster and WordNet cells further
# down, so the cells must be run in order.
porter = nltk.PorterStemmer()
cc_stems = [porter.stem(word) for word in cc_words]
# Show a preview only; the full list has one entry per token.
cc_stems[:100]

['christma',
 'carol',
 'charl',
 'dicken',
 'releas',
 'languag',
 'christma',
 'carol',
 'charl',
 'dicken',
 'ghost',
 'stori',
 'christma',
 'endeavour',
 'ghostli',
 'littl',
 'book',
 'rais',
 'ghost',
 'idea',
 'shall',
 'put',
 'humour',
 'season',
 'may',
 'haunt',
 'hous',
 'pleasantli',
 'one',
 'wish',
 'lay',
 'friend',
 'servant',
 'decemb',
 'stave',
 'marley',
 'ghost',
 'marley',
 'dead',
 'begin',
 'doubt',
 'whatev',
 'sign',
 'clerk',
 'undertak',
 'mourner',
 'scroog',
 'sign',
 'name',
 'good',
 'upon',
 'anyth',
 'chose',
 'put',
 'hand',
 'old',
 'marley',
 'dead',
 'mind',
 'mean',
 'say',
 'know',
 'knowledg',
 'dead',
 'might',
 'inclin',
 'piec',
 'trade',
 'hand',
 'shall',
 'countri',
 'done',
 'therefor',
 'marley',
 'dead',
 'scroog',
 'knew',
 'dead',
 'cours',
 'could',
 'otherwis',
 'scroog',
 'partner',
 'know',
 'mani',
 'year',
 'scroog',
 'sole',
 'sole',
 'sole',
 'sole',
 'sole',
 'friend',
 'sole',
 'mourner',
 'even',
 'scroog',
 'cut',
 'even

#### Unique words in The Christmas Carol (after Porter Stemming)

In [20]:
# Frequency of each Porter stem; at this point `cc_stems` holds the
# output of the Porter cell directly above.
cc_stem_fd = nltk.FreqDist(cc_stems)
cc_stem_fd

FreqDist({'scroog': 357, 'said': 221, 'upon': 120, 'one': 111, 'spirit': 103, 'ghost': 96, 'would': 94, 'christma': 86, 'hand': 78, 'man': 76, ...})

#### Use Lancaster Stemmer to create the root words

In [21]:
# Reduce each word to its Lancaster stem; the output shows it truncates
# more aggressively than Porter (e.g. 'language' -> 'langu').
# Note: this rebinds `cc_stems`, replacing the Porter stems.
lancaster = nltk.LancasterStemmer()
cc_stems = [lancaster.stem(word) for word in cc_words]
# Show a preview only; the full list has one entry per token.
cc_stems[:100]

['christmas',
 'carol',
 'charl',
 'dick',
 'releas',
 'langu',
 'christmas',
 'carol',
 'charl',
 'dick',
 'ghost',
 'story',
 'christmas',
 'endeavo',
 'ghost',
 'littl',
 'book',
 'rais',
 'ghost',
 'ide',
 'shal',
 'put',
 'humo',
 'season',
 'may',
 'haunt',
 'hous',
 'pleas',
 'on',
 'wish',
 'lay',
 'friend',
 'serv',
 'decemb',
 'stav',
 'marley',
 'ghost',
 'marley',
 'dead',
 'begin',
 'doubt',
 'whatev',
 'sign',
 'clerk',
 'undertak',
 'mourn',
 'scrooge',
 'sign',
 'nam',
 'good',
 'upon',
 'anyth',
 'chos',
 'put',
 'hand',
 'old',
 'marley',
 'dead',
 'mind',
 'mean',
 'say',
 'know',
 'knowledg',
 'dead',
 'might',
 'inclin',
 'piec',
 'trad',
 'hand',
 'shal',
 'country',
 'don',
 'theref',
 'marley',
 'dead',
 'scrooge',
 'knew',
 'dead',
 'cours',
 'could',
 'otherw',
 'scrooge',
 'partn',
 'know',
 'many',
 'year',
 'scrooge',
 'sol',
 'sol',
 'sol',
 'sol',
 'sol',
 'friend',
 'sol',
 'mourn',
 'ev',
 'scrooge',
 'cut',
 'ev',
 'man',
 'busy',
 'day',
 'fun',
 'bar

#### Unique words in The Christmas Carol (after Lancaster Stemming)

In [22]:
# Frequency of each Lancaster stem; at this point `cc_stems` holds the
# output of the Lancaster cell directly above.
cc_stem_fd = nltk.FreqDist(cc_stems)
cc_stem_fd

FreqDist({'scrooge': 357, 'said': 221, 'upon': 120, 'on': 111, 'ghost': 103, 'spirit': 103, 'tim': 97, 'would': 94, 'christmas': 86, 'man': 79, ...})

#### Use Word Net Lemmatizer to create the root words

In [23]:
# Map each word to its WordNet lemma. `lemmatize` is called without a
# part-of-speech argument, so every word is treated as a noun; that is why
# plural nouns are reduced ('houses' -> 'house') while verb forms such as
# 'signed' are left unchanged.
# Note: this rebinds `cc_stems`, which now holds lemmas rather than stems.
wordnet = nltk.WordNetLemmatizer()
cc_stems = [wordnet.lemmatize(word) for word in cc_words]
# Show a preview only; the full list has one entry per token.
cc_stems[:100]

['christmas',
 'carol',
 'charles',
 'dickens',
 'release',
 'language',
 'christmas',
 'carol',
 'charles',
 'dickens',
 'ghost',
 'story',
 'christmas',
 'endeavoured',
 'ghostly',
 'little',
 'book',
 'raise',
 'ghost',
 'idea',
 'shall',
 'put',
 'humour',
 'season',
 'may',
 'haunt',
 'house',
 'pleasantly',
 'one',
 'wish',
 'lay',
 'friend',
 'servant',
 'december',
 'stave',
 'marley',
 'ghost',
 'marley',
 'dead',
 'begin',
 'doubt',
 'whatever',
 'signed',
 'clerk',
 'undertaker',
 'mourner',
 'scrooge',
 'signed',
 'name',
 'good',
 'upon',
 'anything',
 'chose',
 'put',
 'hand',
 'old',
 'marley',
 'dead',
 'mind',
 'mean',
 'say',
 'know',
 'knowledge',
 'dead',
 'might',
 'inclined',
 'piece',
 'trade',
 'hand',
 'shall',
 'country',
 'done',
 'therefore',
 'marley',
 'dead',
 'scrooge',
 'knew',
 'dead',
 'course',
 'could',
 'otherwise',
 'scrooge',
 'partner',
 'know',
 'many',
 'year',
 'scrooge',
 'sole',
 'sole',
 'sole',
 'sole',
 'sole',
 'friend',
 'sole',
 'mour

#### Unique words in The Christmas Carol (after Word Net Lemmatization)

In [24]:
# Frequency of each WordNet lemma; at this point `cc_stems` holds the
# output of the lemmatizer cell directly above.
cc_stem_fd = nltk.FreqDist(cc_stems)
cc_stem_fd

FreqDist({'scrooge': 357, 'said': 221, 'upon': 120, 'one': 111, 'spirit': 103, 'ghost': 96, 'would': 94, 'christmas': 86, 'hand': 78, 'man': 76, ...})

#### Display the abbreviations (along with their meaning) for the various Parts of Speech tags

In [25]:
# Print every Penn Treebank tag with its description and example words.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

#### Determine the Parts of Speech tag for the words in The Christmas Carol

In [26]:
# Assign a Penn Treebank part-of-speech tag to every word.
# NOTE(review): `cc_words` is lowercased and has stop words and hapaxes
# removed, so the tagger sees no real sentence context; tags such as
# ('humour', 'CD') in the output are a consequence. Tagging the original
# tokenized sentences would give more reliable tags — confirm intent.
cc_pos = nltk.pos_tag(cc_words)
# Show a preview only; the full list has one entry per token.
cc_pos[:100]

[('christmas', 'NNS'),
 ('carol', 'VBP'),
 ('charles', 'NNS'),
 ('dickens', 'NNS'),
 ('release', 'VBP'),
 ('language', 'NN'),
 ('christmas', 'NN'),
 ('carol', 'NN'),
 ('charles', 'VBZ'),
 ('dickens', 'VBZ'),
 ('ghost', 'NN'),
 ('story', 'NN'),
 ('christmas', 'NN'),
 ('endeavoured', 'VBD'),
 ('ghostly', 'RB'),
 ('little', 'JJ'),
 ('book', 'NN'),
 ('raise', 'NN'),
 ('ghost', 'NN'),
 ('idea', 'NN'),
 ('shall', 'MD'),
 ('put', 'VB'),
 ('humour', 'CD'),
 ('season', 'NN'),
 ('may', 'MD'),
 ('haunt', 'VB'),
 ('houses', 'NNS'),
 ('pleasantly', 'RB'),
 ('one', 'CD'),
 ('wish', 'JJ'),
 ('lay', 'NN'),
 ('friend', 'NN'),
 ('servant', 'JJ'),
 ('december', 'NN'),
 ('stave', 'VBP'),
 ('marley', 'NN'),
 ('ghost', 'NN'),
 ('marley', 'NN'),
 ('dead', 'JJ'),
 ('begin', 'NN'),
 ('doubt', 'NN'),
 ('whatever', 'WDT'),
 ('signed', 'VBD'),
 ('clerk', 'NN'),
 ('undertaker', 'NN'),
 ('mourner', 'NN'),
 ('scrooge', 'NN'),
 ('signed', 'VBD'),
 ('name', 'RB'),
 ('good', 'JJ'),
 ('upon', 'IN'),
 ('anything', 'NN'),

#### Tag the Parts of Speech without granularity (like past tense, present, etc)

In [27]:
# Same tagging, but mapped to the coarse universal tagset
# (NOUN, VERB, ADJ, ADV, ...), which drops tense/number distinctions.
# NOTE(review): as with the fine-grained tags, the input is a filtered,
# lowercased token stream, so some tags are unreliable — confirm intent.
cc_pos2 = nltk.pos_tag(cc_words, tagset='universal')
# Show a preview only; the full list has one entry per token.
cc_pos2[:100]

[('christmas', 'NOUN'),
 ('carol', 'VERB'),
 ('charles', 'NOUN'),
 ('dickens', 'NOUN'),
 ('release', 'VERB'),
 ('language', 'NOUN'),
 ('christmas', 'NOUN'),
 ('carol', 'NOUN'),
 ('charles', 'VERB'),
 ('dickens', 'VERB'),
 ('ghost', 'NOUN'),
 ('story', 'NOUN'),
 ('christmas', 'NOUN'),
 ('endeavoured', 'VERB'),
 ('ghostly', 'ADV'),
 ('little', 'ADJ'),
 ('book', 'NOUN'),
 ('raise', 'NOUN'),
 ('ghost', 'NOUN'),
 ('idea', 'NOUN'),
 ('shall', 'VERB'),
 ('put', 'VERB'),
 ('humour', 'NUM'),
 ('season', 'NOUN'),
 ('may', 'VERB'),
 ('haunt', 'VERB'),
 ('houses', 'NOUN'),
 ('pleasantly', 'ADV'),
 ('one', 'NUM'),
 ('wish', 'ADJ'),
 ('lay', 'NOUN'),
 ('friend', 'NOUN'),
 ('servant', 'ADJ'),
 ('december', 'NOUN'),
 ('stave', 'VERB'),
 ('marley', 'NOUN'),
 ('ghost', 'NOUN'),
 ('marley', 'NOUN'),
 ('dead', 'ADJ'),
 ('begin', 'NOUN'),
 ('doubt', 'NOUN'),
 ('whatever', 'DET'),
 ('signed', 'VERB'),
 ('clerk', 'NOUN'),
 ('undertaker', 'NOUN'),
 ('mourner', 'NOUN'),
 ('scrooge', 'NOUN'),
 ('signed', 'VERB'

#### Use Chunking to find Noun Phrases

In [28]:
# Chunk grammar: a NOUN_PHRASE is zero or more ADJ tags followed by one or
# more NOUN tags (universal tagset labels).
parser = nltk.RegexpParser(r'NOUN_PHRASE: {<ADJ>*<NOUN>+}')
# Parse the universally-tagged words into a tree whose NOUN_PHRASE
# subtrees are the matched chunks.
cc_chunks = parser.parse(cc_pos2)
print(cc_chunks)

(S
  (NOUN_PHRASE christmas/NOUN)
  carol/VERB
  (NOUN_PHRASE charles/NOUN dickens/NOUN)
  release/VERB
  (NOUN_PHRASE language/NOUN christmas/NOUN carol/NOUN)
  charles/VERB
  dickens/VERB
  (NOUN_PHRASE ghost/NOUN story/NOUN christmas/NOUN)
  endeavoured/VERB
  ghostly/ADV
  (NOUN_PHRASE little/ADJ book/NOUN raise/NOUN ghost/NOUN idea/NOUN)
  shall/VERB
  put/VERB
  humour/NUM
  (NOUN_PHRASE season/NOUN)
  may/VERB
  haunt/VERB
  (NOUN_PHRASE houses/NOUN)
  pleasantly/ADV
  one/NUM
  (NOUN_PHRASE wish/ADJ lay/NOUN friend/NOUN)
  (NOUN_PHRASE servant/ADJ december/NOUN)
  stave/VERB
  (NOUN_PHRASE marley/NOUN ghost/NOUN marley/NOUN)
  (NOUN_PHRASE dead/ADJ begin/NOUN doubt/NOUN)
  whatever/DET
  signed/VERB
  (NOUN_PHRASE clerk/NOUN undertaker/NOUN mourner/NOUN scrooge/NOUN)
  signed/VERB
  name/ADV
  good/ADJ
  upon/ADP
  (NOUN_PHRASE anything/NOUN)
  chose/ADJ
  put/VERB
  (NOUN_PHRASE hand/NOUN)
  (NOUN_PHRASE old/ADJ marley/NOUN)
  (NOUN_PHRASE dead/ADJ mind/NOUN)
  mean/ADJ
  say/

#### Simple example of Chunking

In [29]:
# Pipeline on a single sentence: tokenize, tag with the universal tagset,
# then chunk with the noun-phrase `parser` built in the previous cell.
text = 'Alice likes to travel to New York City'
words = word_tokenize(text)
tags = nltk.pos_tag(words, tagset='universal')
chunks = parser.parse(tags)
print(chunks)

(S
  (NOUN_PHRASE Alice/NOUN)
  likes/VERB
  to/PRT
  travel/VERB
  to/PRT
  (NOUN_PHRASE New/NOUN York/NOUN City/NOUN))


#### Example of Named Entity Recognition

In [30]:
# Named-entity recognition: tokenize, tag with Penn Treebank tags, then
# let NLTK's default chunker label entity spans. The labels are only
# approximate, as the output shows ('Alice' is labelled GPE).
text = 'Alice is the CEO of Acme International and has been tasked with building a Space Ship for Mars'
words = word_tokenize(text)
tags = nltk.pos_tag(words)
ner = nltk.ne_chunk(tags)
print(ner)

(S
  (GPE Alice/NNP)
  is/VBZ
  the/DT
  (ORGANIZATION CEO/NN of/IN Acme/NNP International/NNP)
  and/CC
  has/VBZ
  been/VBN
  tasked/VBN
  with/IN
  building/VBG
  a/DT
  Space/NNP
  Ship/NNP
  for/IN
  (PERSON Mars/NNP))


#### NLTK has built-in functions for bigrams and trigrams - nltk.bigrams() and nltk.trigrams(); the general nltk.util.ngrams() is used here

In [31]:
# Build every pair of adjacent words. `ngrams` returns a lazy iterator,
# so it is materialized into a list for reuse.
all_bigrams = list(ngrams(all_words, 2))
# Show a preview only; there is one bigram per adjacent word pair.
all_bigrams[:100]

[('title', 'christmas'),
 ('christmas', 'carol'),
 ('carol', 'author'),
 ('author', 'charles'),
 ('charles', 'dickens'),
 ('dickens', 'illustrator'),
 ('illustrator', 'john'),
 ('john', 'leech'),
 ('leech', 'release'),
 ('release', 'date'),
 ('date', 'october'),
 ('october', 'language'),
 ('language', 'english'),
 ('english', 'christmas'),
 ('christmas', 'carol'),
 ('carol', 'charles'),
 ('charles', 'dickens'),
 ('dickens', 'ghost'),
 ('ghost', 'story'),
 ('story', 'christmas'),
 ('christmas', 'preface'),
 ('preface', 'endeavoured'),
 ('endeavoured', 'ghostly'),
 ('ghostly', 'little'),
 ('little', 'book'),
 ('book', 'raise'),
 ('raise', 'ghost'),
 ('ghost', 'idea'),
 ('idea', 'shall'),
 ('shall', 'put'),
 ('put', 'readers'),
 ('readers', 'humour'),
 ('humour', 'season'),
 ('season', 'may'),
 ('may', 'haunt'),
 ('haunt', 'houses'),
 ('houses', 'pleasantly'),
 ('pleasantly', 'one'),
 ('one', 'wish'),
 ('wish', 'lay'),
 ('lay', 'faithful'),
 ('faithful', 'friend'),
 ('friend', 'servant'),