# NLTK (Natural Language Toolkit)

## Introduction

In [1]:
# Load the Brown corpus reader and list the categories it provides.
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

The corpus contains about 1.16 million tokens:

In [2]:
# Number of items returned by the corpus-wide word view.
len(brown.words())

1161192

In [19]:
# (word, tag) pairs of the 'news' category, requested with the 'universal' tagset.
brown.tagged_words(categories='news', tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [15]:
# Corpus sentences; the displayed output shows each one as a list of token strings.
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [3]:
import nltk
from nltk.corpus import brown
from nltk.tag import hmm
from nltk.metrics.scores import accuracy as nltk_accuracy # Import the accuracy function
from sklearn.model_selection import train_test_split

# 1) Fetch the corpus resources and load the tagged sentences
#    (the train/test split itself is done in the following cell).
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Sentences as sequences of (word, tag) pairs, requested with the 'universal' tagset.
tagged = brown.tagged_sents(tagset='universal')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [4]:
# Hold out 20% of the tagged sentences for evaluation; the fixed random_state
# keeps the split identical between runs.
train_sents, test_sents = train_test_split(
    tagged,
    test_size=0.2,
    random_state=42,
)

In [6]:
# 2) Collect the tag set and the vocabulary from the training split.
# Word forms are kept exactly as they occur in the corpus (no lower-casing):
# the tagger is trained and evaluated on the original, case-sensitive tokens,
# so a lower-cased vocabulary would not correspond to the symbols it sees.
# sorted() gives a stable order; the iteration order of a set of strings can
# differ from one interpreter run to the next (string hash randomization).
tags = sorted({tag for sent in train_sents for (_, tag) in sent})
words = sorted({word for sent in train_sents for (word, _) in sent})

In [7]:
# 3) Train the HMM tagger on the labelled training sentences.
# NOTE(review): no estimator is passed, so the library default is used; the
# warnings printed during evaluation suggest zero probabilities for unseen
# words — consider a smoothed estimator. TODO confirm.
trainer = hmm.HiddenMarkovModelTrainer(states=tags, symbols=words)
hmm_tagger = trainer.train_supervised(train_sents)


Tag the held-out test sentences with the trained HMM (Viterbi decoding) and measure token-level accuracy:

In [27]:
# Strip the gold tags so the tagger only receives the bare tokens of each test sentence.
word_lists = [[token for token, _ in sent] for sent in test_sents]

# Tag every test sentence with the trained HMM.
predicted_sents = hmm_tagger.tag_sents(word_lists)

# Concatenate the sentences into two parallel token-level lists of (word, tag) pairs.
gold_flat = []
pred_flat = []
for gold_sent, pred_sent in zip(test_sents, predicted_sents):
    gold_flat.extend(gold_sent)
    pred_flat.extend(pred_sent)

# Token-level accuracy of the predicted pairs against the gold pairs.
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])


Accuracy: 0.7348217327003755


### Smaller dataset

In [76]:
# Number of tagged sentences in the whole corpus (before any train/test split).
len(brown.tagged_sents(tagset='universal'))

57340

In [83]:
import random

# Draw 30,000 training sentences without replacement. A dedicated, seeded
# generator keeps the subsample (and therefore the accuracy reported below)
# reproducible across kernel restarts and independent of the global RNG state.
train_sents_short = random.Random(42).sample(train_sents, 30000)

# Tag set and vocabulary of the subsample only; sorted() gives a stable order,
# whereas the iteration order of a set of strings can differ between runs.
tags_short = sorted({tag for sent in train_sents_short for (_, tag) in sent})
words_short = sorted({word for sent in train_sents_short for (word, _) in sent})

In [84]:
# Re-train the supervised HMM tagger, this time on the reduced training sample.
# Note: this rebinds `trainer` and `hmm_tagger`, replacing the objects created earlier.
trainer = hmm.HiddenMarkovModelTrainer(states=tags_short, symbols=words_short)
hmm_tagger = trainer.train_supervised(train_sents_short)

In [85]:
# Drop the gold tags: the tagger is given only the token strings of each test sentence.
word_lists = [[token for token, _ in sent] for sent in test_sents]

# Tag the test sentences with the model trained on the reduced sample.
predicted_sents = hmm_tagger.tag_sents(word_lists)

# Build two aligned, token-level lists of (word, tag) pairs.
gold_flat = []
pred_flat = []
for gold_sent, pred_sent in zip(test_sents, predicted_sents):
    gold_flat.extend(gold_sent)
    pred_flat.extend(pred_sent)

# Token-level accuracy of the predicted pairs against the gold pairs.
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

Accuracy: 0.6769543865095483


## Baum-Welch algorithm

In [8]:
# Supervised initialisation: fit an HMM on the first 1000 labelled training sentences.
labeled_sents = train_sents[:1000]
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
supervised_tagger = trainer.train_supervised(labeled_sents)

# Refine that model with Baum-Welch (EM) on the remaining training sentences.
# NOTE(review): the sentences are passed as (word, tag) pairs; the unsupervised
# trainer is expected to read only the word of each pair — confirm against the
# NLTK source. The vocabulary handed to the trainer presumably has to contain
# every word form that occurs in these sentences.
#
# The convergence tolerance is passed as `convergence_logprob`, the keyword the
# trainer documents; the previous `threshold=0.1` is not a documented option,
# so it had no effect on training.
hmm_tagger = trainer.train_unsupervised(
    train_sents[1000:],
    model=supervised_tagger,
    convergence_logprob=0.1,
    max_iterations=10,
)




iteration 0 logprob -1.7625099999996204e+305
iteration 1 logprob -9.839079349580974e+290
iteration 2 logprob -10234143.195433395
iteration 3 logprob -9099610.570115816
iteration 4 logprob -8981320.608538745
iteration 5 logprob -8900639.266782697
iteration 6 logprob -8854538.824658303
iteration 7 logprob -8827572.699781725
iteration 8 logprob -8810634.346843222
iteration 9 logprob -8799270.107523344


In [9]:
# Remove the gold tags so that only the token strings are handed to the tagger.
word_lists = [[token for token, _ in sent] for sent in test_sents]

# Tag the test sentences with the Baum-Welch-refined model.
predicted_sents = hmm_tagger.tag_sents(word_lists)

# Flatten gold and predicted sentences into aligned lists of (word, tag) pairs.
gold_flat = []
pred_flat = []
for gold_sent, pred_sent in zip(test_sents, predicted_sents):
    gold_flat.extend(gold_sent)
    pred_flat.extend(pred_sent)

# Token-level accuracy of the predicted pairs against the gold pairs.
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  P[i] = self._priors.logprob(si)


Accuracy: 0.6696029354063995


# 