## Text Analysis and Natural Language Processing Tasks

##### Tokenization

In [52]:
from nltk.tokenize import word_tokenize
# Sample sentence used to demonstrate word-level tokenization.
text = "This is a sample sentence."
# Split the sentence into tokens; the recorded output shows the trailing
# period is emitted as its own token.
tokens = word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', '.']


##### Stemming and Lemmatization

In [54]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Stemming: reduce the word with the Porter stemmer.
stemmer = PorterStemmer()
stemmed_word = stemmer.stem("running")
print(stemmed_word)  # Output: run

# Lemmatization: look the word up as a verb (pos='v').
lemmatizer = WordNetLemmatizer()
lemmatized_word = lemmatizer.lemmatize("running", pos='v')
print(lemmatized_word)  # Output: run

run
run


##### POS Tagging

In [56]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Tokenize the sample sentence, then tag each token with its part of speech.
# `pos_tag` yields (token, tag) pairs, as the printed result shows.
tokens = word_tokenize("This is a sample sentence.")
tagged = pos_tag(tokens)
print(tagged)


[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NN'), ('.', '.')]


## Corpora for Various Language Resources

##### Gutenberg corpus

In [64]:
from nltk.corpus import gutenberg

# List the file identifiers of the texts bundled in the Gutenberg corpus.
gutenberg_file_ids = gutenberg.fileids()
print(gutenberg_file_ids)

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


##### Brown corpus

In [72]:
from nltk.corpus import brown

# List the category labels under which the Brown corpus texts are grouped.
brown_categories = brown.categories()
print(brown_categories)


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


##### WordNet

In [78]:
from nltk.corpus import wordnet
# Fetch all synsets for "program", then print the name of the first lemma of
# the first synset. That lemma is not the query word itself: the recorded
# output is "plan".
syns = wordnet.synsets("program")
print(syns[0].lemmas()[0].name())  # Output: plan

plan


## Training and Testing NLP Models

##### Text Classification

In [93]:
from nltk.corpus import movie_reviews
import random

# Fixed seed so the shuffled document order is identical on every run
# (Restart Kernel -> Run All reproduces the same ordering).
SHUFFLE_SEED = 42

# One (word list, category) pair per review file, gathered category by category.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle in place with a dedicated, seeded generator: the order is
# reproducible and the global `random` state is left untouched.
random.Random(SHUFFLE_SEED).shuffle(documents)

##### Named Entity Recognition

In [108]:
from nltk.corpus import conll2002

# File id of the CoNLL-2002 data to load
# (presumably the Spanish training split -- confirm against the corpus docs).
CONLL_TRAIN_FILE_ID = 'esp.train'

# Sentences from that file, as returned by the corpus reader's IOB view.
train_sents = conll2002.iob_sents(CONLL_TRAIN_FILE_ID)


## Lexical and Semantic Analysis

##### Collocations

In [112]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize

# Tokenize inside this cell instead of relying on a `tokens` variable left
# behind by an earlier cell, so the cell gives the same result regardless of
# which cells ran before it.
tokens = word_tokenize("This is a sample sentence.")

# Rank the adjacent word pairs by likelihood ratio and keep at most 10.
# The sample sentence has fewer bigrams than that, so all of them are returned.
bigram_finder = BigramCollocationFinder.from_words(tokens)
bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)
print(bigrams)


[('This', 'is'), ('a', 'sample'), ('is', 'a'), ('sample', 'sentence'), ('sentence', '.')]


##### Concordance

In [115]:
from nltk.text import Text

# Tokenize the two-sentence sample first, then wrap the tokens in a Text
# object so concordance lookups are available.
concordance_tokens = word_tokenize(
    "This is a sample sentence. This is another example sentence."
)
text = Text(concordance_tokens)

# Print every occurrence of "sentence" together with its surrounding context.
text.concordance("sentence")


Displaying 2 of 2 matches:
This is a sample sentence . This is another example sentence
sentence . This is another example sentence .


## Language Model Development

##### N-gram Models

In [119]:
from nltk.util import ngrams

# Build the bigrams (n = 2) of the tokenized sample sentence.
text = "This is a sample sentence."
sentence_tokens = word_tokenize(text)
n_grams = ngrams(sentence_tokens, 2)

# Print one n-gram tuple per line.
for grams in n_grams:
    print(grams)

('This', 'is')
('is', 'a')
('a', 'sample')
('sample', 'sentence')
('sentence', '.')
