In [1]:
import os
import nltk
# Download only the NLTK resources this notebook uses. Calling nltk.download()
# with no arguments opens the interactive downloader, which blocks an unattended
# "Restart & Run All" until someone operates it by hand.
NLTK_RESOURCES = ("punkt", "punkt_tab", "wordnet", "omw-1.4", "brown", "stopwords")
# A list (not a generator) is used so every resource is attempted even if one
# fails; the cell displays True only when all downloads succeeded.
all([nltk.download(resource, quiet=True) for resource in NLTK_RESOURCES])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Sample text (five sentences on a single line) that the tokenization cells below operate on.
# Adjacent string literals are concatenated, so each sentence sits on its own source line.
paragraph = (
    'Artificial Intelligence (AI) is a branch of computer science focused on creating systems that can perform tasks that typically require human intelligence. '
    'These tasks include reasoning, learning, problem-solving, perception, and natural language understanding. '
    'AI algorithms and models are designed to analyze large datasets, extract patterns, and make predictions or decisions based on the data. '
    'AI technologies, such as machine learning and deep learning, have revolutionized various industries, including healthcare, finance, transportation, and entertainment. '
    'As AI continues to advance, it holds the potential to automate repetitive tasks, enhance productivity, and address complex challenges facing society.'
)

## Word tokenization: Break text into a list of word and punctuation tokens

In [3]:
from nltk.tokenize import word_tokenize

# Not a plain whitespace split: punctuation is emitted as separate tokens
# (in the output below "(" and ")" are split away from "AI").
words = word_tokenize(paragraph)

# Preview the first ten tokens.
words[:10]

['Artificial',
 'Intelligence',
 '(',
 'AI',
 ')',
 'is',
 'a',
 'branch',
 'of',
 'computer']

## Sentence tokenization: Break a paragraph into individual sentences

In [4]:
from nltk.tokenize import sent_tokenize

# Split the paragraph into sentences, then print each one followed by a
# 120-character dashed rule so the sentence boundaries are easy to see.
sent = sent_tokenize(paragraph)
for each_sent in sent:
    print(each_sent, '-' * 120, sep='\n')

Artificial Intelligence (AI) is a branch of computer science focused on creating systems that can perform tasks that typically require human intelligence.
------------------------------------------------------------------------------------------------------------------------
These tasks include reasoning, learning, problem-solving, perception, and natural language understanding.
------------------------------------------------------------------------------------------------------------------------
AI algorithms and models are designed to analyze large datasets, extract patterns, and make predictions or decisions based on the data.
------------------------------------------------------------------------------------------------------------------------
AI technologies, such as machine learning and deep learning, have revolutionized various industries, including healthcare, finance, transportation, and entertainment.
-------------------------------------------------------------------------

## Blankline tokenization: Split text into paragraphs at blank lines

In [5]:
from nltk.tokenize import blankline_tokenize
# `paragraph` contains no blank lines, so the result shown below is a
# single-element list holding the whole text.
blankline_tokenize(paragraph)

['Artificial Intelligence (AI) is a branch of computer science focused on creating systems that can perform tasks that typically require human intelligence. These tasks include reasoning, learning, problem-solving, perception, and natural language understanding. AI algorithms and models are designed to analyze large datasets, extract patterns, and make predictions or decisions based on the data. AI technologies, such as machine learning and deep learning, have revolutionized various industries, including healthcare, finance, transportation, and entertainment. As AI continues to advance, it holds the potential to automate repetitive tasks, enhance productivity, and address complex challenges facing society.']

## Whitespace tokenization: Split on whitespace only (punctuation stays attached to words)

In [6]:
from nltk.tokenize import WhitespaceTokenizer

# Short sample containing a comma, an apostrophe, an exclamation mark and a
# hashtag, so the different tokenizers can be compared on punctuation handling.
data = "The quick brown fox jumps over the lazy dog, but the dog doesn't seem to care! #nlp_rocks"

# Tokens are separated purely by whitespace; 'dog,' and 'care!' keep their punctuation.
WhitespaceTokenizer().tokenize(data)

['The',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'the',
 'lazy',
 'dog,',
 'but',
 'the',
 'dog',
 "doesn't",
 'seem',
 'to',
 'care!',
 '#nlp_rocks']

## WordPunct tokenization: Split on whitespace and punctuation

In [7]:
from nltk.tokenize import wordpunct_tokenize
# Punctuation becomes its own token: in the output below "doesn't" is split
# into 'doesn', "'", 't' and '#nlp_rocks' into '#', 'nlp_rocks'.
wordpunct_tokenize(data)

['The',
 'quick',
 'brown',
 'fox',
 'jumps',
 'over',
 'the',
 'lazy',
 'dog',
 ',',
 'but',
 'the',
 'dog',
 'doesn',
 "'",
 't',
 'seem',
 'to',
 'care',
 '!',
 '#',
 'nlp_rocks']

## Bigrams: Pairs of adjacent tokens

In [8]:
from nltk.util import bigrams, trigrams, ngrams
# `trigrams` and `ngrams` are imported here for the two cells that follow.
# Materialize the bigrams as a list and show the first 10 pairs.
list(bigrams(words))[:10] # `words` is the token list produced by word_tokenize above

[('Artificial', 'Intelligence'),
 ('Intelligence', '('),
 ('(', 'AI'),
 ('AI', ')'),
 (')', 'is'),
 ('is', 'a'),
 ('a', 'branch'),
 ('branch', 'of'),
 ('of', 'computer'),
 ('computer', 'science')]

## Trigrams

In [9]:
# First 10 windows of three consecutive tokens from `words`.
list(trigrams(words))[:10]

[('Artificial', 'Intelligence', '('),
 ('Intelligence', '(', 'AI'),
 ('(', 'AI', ')'),
 ('AI', ')', 'is'),
 (')', 'is', 'a'),
 ('is', 'a', 'branch'),
 ('a', 'branch', 'of'),
 ('branch', 'of', 'computer'),
 ('of', 'computer', 'science'),
 ('computer', 'science', 'focused')]

## Ngrams

In [10]:
# Same idea with an explicit window size: the second argument (5) is the
# number of consecutive tokens per tuple. Show the first 10.
list(ngrams(words, 5))[:10]

[('Artificial', 'Intelligence', '(', 'AI', ')'),
 ('Intelligence', '(', 'AI', ')', 'is'),
 ('(', 'AI', ')', 'is', 'a'),
 ('AI', ')', 'is', 'a', 'branch'),
 (')', 'is', 'a', 'branch', 'of'),
 ('is', 'a', 'branch', 'of', 'computer'),
 ('a', 'branch', 'of', 'computer', 'science'),
 ('branch', 'of', 'computer', 'science', 'focused'),
 ('of', 'computer', 'science', 'focused', 'on'),
 ('computer', 'science', 'focused', 'on', 'creating')]

## Stemming: Reduce words to a stem by stripping suffixes (the stem is not always a real word, e.g. "happili"). Three stemmers:
### 1. Porter 2. Lancaster 3. Snowball

In [11]:
from nltk.stem import PorterStemmer

# NOTE: this rebinds `words`, replacing the token list built in the
# word-tokenization cell. Re-running the n-gram cells after this one will
# operate on these five words instead of the paragraph tokens.
words = ["running", "jumps", "happily", "running", "happily"]

ps = PorterStemmer()
# Print each word next to its Porter stem as "word : stem".
for word in words:
    print(word, ':', ps.stem(word))

running : run
jumps : jump
happily : happili
running : run
happily : happili


In [12]:
from nltk.stem import LancasterStemmer

ls = LancasterStemmer()
# Same word list as the Porter cell, printed as "word : stem" for comparison.
for word in words:
    print(word, ':', ls.stem(word))

running : run
jumps : jump
happily : happy
running : run
happily : happy


In [13]:
from nltk.stem import SnowballStemmer

# Snowball needs the language named explicitly.
sb = SnowballStemmer(language='english')
# Same word list again, printed as "word : stem".
for word in words:
    print(word, ':', sb.stem(word))

running : run
jumps : jump
happily : happili
running : run
happily : happili


## Lemmatization: Reduce a word to its dictionary form (lemma)

In [14]:
from nltk.stem import wordnet

In [15]:
from nltk.stem import WordNetLemmatizer

lm = WordNetLemmatizer()
# NOTE(review): in the output below 'running' and 'happily' come back unchanged.
# Presumably lemmatize() treats each word as a noun unless a part-of-speech
# argument is passed -- confirm against the NLTK documentation.
for word in words:
    print(word, ':', lm.lemmatize(word))

running : running
jumps : jump
happily : happily
running : running
happily : happily


## NLTK corpora

In [16]:
import nltk.corpus

# Locate the installed 'corpora' data directory, then print its entries.
# The listing contains both unpacked folders and their .zip archives.
corpora_dir = nltk.data.find('corpora')
print(os.listdir(corpora_dir))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'bcp47.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'extended_omw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_tag

In [17]:
# The result is displayed abbreviated (first few tokens followed by "...").
nltk.corpus.brown.words() # tokenized words of the Brown corpus

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [18]:
# The full list is long; see the output below ('ca01', 'ca02', ...).
nltk.corpus.brown.fileids() # IDs of the files that make up the Brown corpus

['ca01',
 'ca02',
 'ca03',
 'ca04',
 'ca05',
 'ca06',
 'ca07',
 'ca08',
 'ca09',
 'ca10',
 'ca11',
 'ca12',
 'ca13',
 'ca14',
 'ca15',
 'ca16',
 'ca17',
 'ca18',
 'ca19',
 'ca20',
 'ca21',
 'ca22',
 'ca23',
 'ca24',
 'ca25',
 'ca26',
 'ca27',
 'ca28',
 'ca29',
 'ca30',
 'ca31',
 'ca32',
 'ca33',
 'ca34',
 'ca35',
 'ca36',
 'ca37',
 'ca38',
 'ca39',
 'ca40',
 'ca41',
 'ca42',
 'ca43',
 'ca44',
 'cb01',
 'cb02',
 'cb03',
 'cb04',
 'cb05',
 'cb06',
 'cb07',
 'cb08',
 'cb09',
 'cb10',
 'cb11',
 'cb12',
 'cb13',
 'cb14',
 'cb15',
 'cb16',
 'cb17',
 'cb18',
 'cb19',
 'cb20',
 'cb21',
 'cb22',
 'cb23',
 'cb24',
 'cb25',
 'cb26',
 'cb27',
 'cc01',
 'cc02',
 'cc03',
 'cc04',
 'cc05',
 'cc06',
 'cc07',
 'cc08',
 'cc09',
 'cc10',
 'cc11',
 'cc12',
 'cc13',
 'cc14',
 'cc15',
 'cc16',
 'cc17',
 'cd01',
 'cd02',
 'cd03',
 'cd04',
 'cd05',
 'cd06',
 'cd07',
 'cd08',
 'cd09',
 'cd10',
 'cd11',
 'cd12',
 'cd13',
 'cd14',
 'cd15',
 'cd16',
 'cd17',
 'ce01',
 'ce02',
 'ce03',
 'ce04',
 'ce05',
 'ce06',
 

## Stopwords

In [19]:
from nltk.corpus import stopwords
# Show the English word list from the 'stopwords' corpus.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each