# NLTK (Natural Language Toolkit)

## Introduction

In [13]:
from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

1.1 million words:

In [18]:
len(brown.words())

1161192

In [19]:
brown.tagged_words(categories='news', tagset='universal')

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [15]:
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [47]:
import nltk
from nltk.corpus import brown
from nltk.tag import hmm
from nltk.metrics.scores import accuracy as nltk_accuracy # Import the accuracy function
from sklearn.model_selection import train_test_split

# 1) Загрузим корпус и разобьём на train/test
nltk.download('brown')
nltk.download('universal_tagset')
tagged = brown.tagged_sents(tagset='universal')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [48]:
train_sents, test_sents = train_test_split(tagged, test_size=0.2, random_state=42)

In [49]:
# 2) Соберём список всех тегов и слов из train
tags = list({t for sent in train_sents for (_, t) in sent})
words = list({w.lower() for sent in train_sents for (w, _) in sent})

# 3) Обучим HMM-теггер
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
hmm_tagger = trainer.train_supervised(train_sents)


Viterbi to predict sents:

In [27]:
# 1. Build the raw word-lists from your gold-standard sentences
word_lists = [[w for (w, t) in sent] for sent in test_sents]

# 2. Tag them
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. Flatten both the gold and the predicted into one big list of (word, tag)
gold_flat = [pair for sent in test_sents         for pair in sent]
pred_flat = [pair for sent in predicted_sents    for pair in sent]

# 4. Compute accuracy
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])


Accuracy: 0.7348217327003755


## Baum-Welch algorithm

In [52]:
len(brown.tagged_sents(tagset='universal'))

57340

In [54]:
import random

indices = random.sample(range(len(tagged)), 1000)
tagged_short = [ tagged[i] for i in indices ]

In [20]:
hmm_tagger_unsupervised = trainer.train_unsupervised(train_sents)
print("Accuracy:", hmm_tagger_unsupervised.evaluate(test_sents))

iteration 0 logprob -14814579.960571732
iteration 1 logprob -9571880.273423187


KeyboardInterrupt: 

In [45]:
# 1. Split off 50 tagged sentences and 950 raw sentences
labeled_sents = train_sents[:100]
# raw_sents     = train_sents[50:1000]

raw_sents = random.sample(train_sents[100:], 950)

# 2. Build the full tag set (states) and symbol set (vocab)
states  = list({ tag for sent in labeled_sents for (word, tag) in sent })
symbols = list({ word 
                 for sent in raw_sents + labeled_sents   # include both labeled + unlabeled 
                 for (word, *_) in sent })         # unpack to ignore tags if present

# 3. Convert the raw ones to the required “unlabeled” format
unlabeled_sents = [
    [ (word, None) for (word, *_) in sent ]
    for sent in raw_sents
]

# 4. Instantiate the trainer with _both_ your states and symbols
trainer = hmm.HiddenMarkovModelTrainer(states, symbols)

# 5. Get a little supervised head‐start
supervised_tagger = trainer.train_supervised(labeled_sents)

# 6. Run Baum–Welch on the rest
hmm_tagger = trainer.train_unsupervised(
    unlabeled_sents,
    estimator=supervised_tagger,
    threshold=1.0,
    max_iterations=10
)

iteration 0 logprob -245447.04101194412
iteration 1 logprob -188015.57286495733
iteration 2 logprob -187670.30757980066
iteration 3 logprob -187251.14900020816
iteration 4 logprob -186649.38035198193
iteration 5 logprob -185743.10368546678
iteration 6 logprob -184419.30920252914
iteration 7 logprob -182656.8376481745
iteration 8 logprob -180611.9704021879
iteration 9 logprob -178539.30891697144


In [46]:
# 1. Build the raw word-lists from your gold-standard sentences
word_lists = [[w for (w, t) in sent] for sent in test_sents]

# 2. Tag them
predicted_sents = hmm_tagger_unsupervised.tag_sents(word_lists)

# 3. Flatten both the gold and the predicted into one big list of (word, tag)
gold_flat = [pair for sent in test_sents         for pair in sent]
pred_flat = [pair for sent in predicted_sents    for pair in sent]

# 4. Compute accuracy
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

Accuracy: 0.04677764986396582


# 