# NLTK

In [None]:
from urllib import request

# Location of the text file used as the working corpus for the whole notebook.
url_pinocchio = "http://esuli.it/demo/data/pinocchio.txt"
# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open until garbage collection.
with request.urlopen(url_pinocchio) as response:
    # bytes.decode() defaults to UTF-8; it is spelled out to make the
    # assumption about the file's encoding visible.
    testo = response.read().decode("utf-8")
# Last expression of the cell: preview of the first 1000 characters.
testo[:1000]

## Tokenization

Splitting text into sentences

In [None]:
import nltk

# Download the Punkt sentence-tokenizer models used by the tokenize functions.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

# The corpus is Italian (the notebook later searches for 'Gatto', 'Volpe',
# 'Fata'), so select the Italian Punkt model instead of the default English one.
nltk.sent_tokenize(testo, language='italian')

Tokenization of the whole text

In [None]:
# word_tokenize splits the text into sentences with Punkt before tokenizing
# words; `language` selects the Punkt model, so use the Italian one for this
# Italian corpus rather than the English default.
tokenized_text = nltk.word_tokenize(testo, language='italian')
# Last expression of the cell: display the flat list of tokens.
tokenized_text

Tokenization of each sentence

In [None]:
# One list of word tokens per sentence (a list of lists). Both the sentence
# split and the word tokenization use the Italian Punkt model, because the
# corpus is Italian and the default model is English.
tokenized_sentences = [
    nltk.word_tokenize(sent, language='italian')
    for sent in nltk.sent_tokenize(testo, language='italian')
]
# Last expression of the cell: display the nested token lists.
tokenized_sentences

## Counting frequencies

In [None]:
# Count how many times each token occurs in the whole text.
freq_dist = nltk.FreqDist(tokenized_text)
# Last expression of the cell: display the frequency distribution.
freq_dist

In [None]:
# The 20 most frequent tokens, as (token, count) pairs.
freq_dist.most_common(20)

In [None]:
# Plot the counts of the 50 most frequent tokens.
# NOTE(review): plotting presumably requires matplotlib to be installed.
freq_dist.plot(50)

## Inspecting text with the `nltk.Text` object

In [None]:
# Wrap the token list in an nltk.Text object; the following cells query it
# (concordance, dispersion plot, collocations).
nltk_text = nltk.Text(tokenized_text)

In [None]:
# Print the occurrences of 'Gatto' together with their surrounding context.
nltk_text.concordance('Gatto')

In [None]:
# Plot where in the text each of the listed character names occurs.
# NOTE(review): plotting presumably requires matplotlib to be installed.
nltk_text.dispersion_plot(['Pinocchio','Geppetto','Gatto', 'Volpe','Fata','Mangiafoco'])

In [None]:
# Download the stopword lists before asking for collocations.
# NOTE(review): Text.collocations() appears to filter candidates with NLTK's
# English stopword list, which would be a poor fit for this Italian text —
# confirm against the installed NLTK version.
nltk.download('stopwords')

# Print the word pairs that occur together unusually often in the text.
nltk_text.collocations()

## PoS tagging

In [None]:
# Download the model used by nltk.pos_tag in the cells below.
# NOTE(review): newer NLTK releases may expect 'averaged_perceptron_tagger_eng'
# instead — confirm against the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Two English sentences that both contain the word "saw"; they are tokenized
# and PoS-tagged in the following cells.
text1, text2 = 'I saw a bird.', 'Can you lend me a saw?'

In [None]:
# Tokenize both example sentences; the generator is unpacked straight into the
# two names used by the following cells.
token1, token2 = (nltk.word_tokenize(sentence) for sentence in (text1, text2))

In [None]:
# Display both token lists side by side (as a tuple).
token1, token2

In [None]:
# Part-of-speech tag each token list and display both results, so the tags
# assigned to "saw" in the two sentences can be compared.
nltk.pos_tag(token1),nltk.pos_tag(token2)

## Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
# A single Porter stemmer instance, reused by the cells below.
stemmer = PorterStemmer()

In [None]:
# Stem a regular plural noun.
stemmer.stem('cars')

In [None]:
# Stem an irregular verb form; compare the result with the lemmatizer output
# for the same word in the Lemmatization section.
stemmer.stem('was')

## Lemmatization

In [None]:
# Download the WordNet corpus that backs the lemmatizer.
# NOTE(review): some NLTK versions may additionally need 'omw-1.4' — confirm.
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
# A single lemmatizer instance, reused by the cells below.
lmtzr = WordNetLemmatizer()

In [None]:
# Lemmatize a plural noun, with no part of speech given.
lmtzr.lemmatize('cars')

In [None]:
# Lemmatize a verb form WITHOUT saying it is a verb.
# NOTE(review): the default `pos` is documented as noun — confirm in the NLTK
# docs; compare with the next cell, which passes pos='v'.
lmtzr.lemmatize('was')

In [None]:
# Same word, now explicitly lemmatized as a verb.
lmtzr.lemmatize('was', pos='v')

## Bag of Words

In [None]:
# Two sentences built from the same words in a different order, tokenized into
# the two lists compared by the following cells.
t1, t2 = (
    nltk.word_tokenize(sentence)
    for sentence in ('I won, and thus you lose.', 'I lose, and thus you won.')
)

In [None]:
# Display both token lists and whether they are equal as ordered sequences.
t1,t2,t1==t2 

Bag-of-words representation: the variable-length sequence of terms becomes a set.

In [None]:
# Turn each token list into a set: order and repetition are discarded.
b1, b2 = set(t1), set(t2)

In [None]:
# Display both token sets and whether they are equal once order is ignored.
b1,b2,b1==b2 

We lost information. Why?

## Word N-grams

In [None]:
# Represent each sentence as the set of its word bigrams (pairs of adjacent
# tokens), which keeps some local word-order information.
b1, b2 = (
    set(nltk.ngrams(nltk.word_tokenize(sentence), 2))
    for sentence in ('I won, and thus, you lose.', 'I lose, and thus, you won.')
)

In [None]:
# Display both bigram sets and whether they are equal.
b1, b2, b1==b2

## Character level n-grams

In [None]:
# Character trigrams of a word and of a misspelled variant; iterating a string
# yields its characters, so ngrams() works on it directly.
b1, b2 = (set(nltk.ngrams(word, 3)) for word in ('rainbow', 'rainbaw'))

In [None]:
# Display the two character-trigram sets.
b1,b2

In [None]:
# Trigrams shared by the two spellings.
b1 & b2

## Stopword removal

In [None]:
from nltk.corpus import stopwords
# Display NLTK's Italian stopword list (relies on the 'stopwords' corpus
# downloaded in an earlier cell).
stopwords.words('italian')

In [None]:
# Distinct tokens of the example phrase.
features = set(nltk.word_tokenize('the president of the united states of america'))
# Drop every token that appears in NLTK's English stopword list.
less_features = features - frozenset(stopwords.words('english'))
# Last expression of the cell: display the remaining features.
less_features