# NLTK (Natural Language Toolkit)

## Supervised

In [1]:
import nltk
from nltk.corpus import brown

# Fetch the corpus and the universal tag mapping before their first use so the
# notebook also runs on a machine with no cached NLTK data. quiet=True keeps
# the download log out of the cell output.
nltk.download('brown', quiet=True)
nltk.download('universal_tagset', quiet=True)

# Categories (genres) the Brown corpus is divided into.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

The Brown corpus contains roughly 1.16 million words (tokens):

In [2]:
# Number of tokens across all categories of the corpus.
n_tokens = len(brown.words())
n_tokens

1161192

In [19]:
# Peek at the (word, tag) pairs of the "news" category, using the universal tagset.
brown.tagged_words(
    tagset="universal",
    categories="news",
)

[('The', 'DET'), ('Fulton', 'NOUN'), ...]

In [15]:
# Untagged view of the corpus: one list of tokens per sentence.
untagged_sents = brown.sents()
untagged_sents

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [2]:
import nltk
from nltk.corpus import brown
from nltk.tag import hmm
from nltk.metrics.scores import accuracy as nltk_accuracy # Import the accuracy function
from sklearn.model_selection import train_test_split

# 1) Download the corpus resources and load the tagged sentences
#    (the train/test split itself happens in the next cell).
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Every sentence is a list of (word, tag) pairs with universal POS tags.
tagged = brown.tagged_sents(tagset='universal')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [3]:
# Hold out 20% of the tagged sentences for evaluation; the fixed random_state
# keeps the split identical across reruns.
train_sents, test_sents = train_test_split(
    tagged,
    test_size=0.2,
    random_state=42,
)

In [12]:
# 2) Collect every tag and every word that occurs in the training split.
# The iteration order of a set of strings can change between interpreter
# sessions, so sort the results to keep the HMM's state/symbol order stable
# across kernel restarts.
tags = sorted({t for sent in train_sents for (_, t) in sent})
words = sorted({w for sent in train_sents for (w, _) in sent})

In [13]:
# 3) Train the HMM tagger on the labelled training sentences.
# States are the POS tags, symbols are the observed words.
trainer = hmm.HiddenMarkovModelTrainer(states=tags, symbols=words)
hmm_tagger = trainer.train_supervised(train_sents)


Use Viterbi decoding to predict tags for the held-out sentences:

In [14]:
# Evaluate on the held-out split: drop the gold tags, re-tag, compare per token.

# 1. Keep only the tokens of each gold-standard test sentence
word_lists = [
    [token for token, _ in gold_sent]
    for gold_sent in test_sents
]

# 2. Let the tagger assign a tag to every token
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. Flatten gold and predicted sentences into two parallel lists of (word, tag) pairs
gold_flat = [gold_pair for gold_sent in test_sents for gold_pair in gold_sent]
pred_flat = [pred_pair for pred_sent in predicted_sents for pred_pair in pred_sent]

# 4. Fraction of positions at which the two lists hold the same pair
token_accuracy = nltk_accuracy(gold_flat, pred_flat)
print("Accuracy:", token_accuracy)

  O[i, k] = self._output_logprob(si, self._symbols[k])
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])


Accuracy: 0.7316612554812506


### Smaller dataset

In [76]:
len(brown.tagged_sents(tagset='universal'))

57340

In [83]:
import random

# Draw a random subset of 30000 training sentences. A dedicated, seeded
# generator makes the subset (and therefore the accuracy reported below)
# reproducible across reruns without touching the global `random` state.
train_sents_short = random.Random(42).sample(train_sents, 30000)

# Tags and vocabulary seen in the subset only. Sorted because the iteration
# order of a set of strings can change between interpreter sessions.
tags_short = sorted({t for sent in train_sents_short for (_, t) in sent})
words_short = sorted({w for sent in train_sents_short for (w, _) in sent})

In [84]:
# Fit a supervised HMM on the reduced training set, restricted to the tags and
# vocabulary that actually occur in it.
trainer = hmm.HiddenMarkovModelTrainer(tags_short, words_short)
hmm_tagger = trainer.train_supervised(train_sents_short)

In [85]:
# Score the tagger trained on the smaller set against the same held-out sentences.

# 1. Untagged copies of the test sentences
word_lists = []
for gold_sent in test_sents:
    word_lists.append([w for (w, _) in gold_sent])

# 2. Predict a tag for every word
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. Concatenate the sentences into flat lists of (word, tag) pairs
gold_flat = []
for gold_sent in test_sents:
    gold_flat.extend(gold_sent)

pred_flat = []
for pred_sent in predicted_sents:
    pred_flat.extend(pred_sent)

# 4. Report how often the predicted pair equals the gold pair
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

Accuracy: 0.6769543865095483


## Baum-Welch algorithm

In [10]:
# Semi-supervised training: bootstrap from a small labelled set, then refine
# the model with Baum-Welch (EM) over the whole training split.

# Supervised initialisation on the first 1000 training sentences
labeled_sents = train_sents[:1000]
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
supervised_tagger = trainer.train_supervised(labeled_sents)

# Baum-Welch refinement starting from the supervised model.
# The stopping tolerance has to be passed as `convergence_logprob`; an unknown
# keyword such as `threshold` is accepted by **kwargs but never read, which
# leaves the default tolerance in effect.
# NOTE(review): train_sents still carries the gold tags; Baum-Welch is expected
# to read only the words of each pair - confirm against the NLTK source.
hmm_tagger = trainer.train_unsupervised(
    train_sents,
    model=supervised_tagger,
    convergence_logprob=0.1,
    max_iterations=10,
)




iteration 0 logprob -1.7625099999996204e+305
iteration 1 logprob -1.0079714373778483e+291
iteration 2 logprob -10459570.625052601
iteration 3 logprob -9303013.47956818
iteration 4 logprob -9183131.69229908
iteration 5 logprob -9102243.048878297
iteration 6 logprob -9056322.578358807
iteration 7 logprob -9029640.782758556
iteration 8 logprob -9012946.461595282
iteration 9 logprob -9001744.229187535


In [11]:
# Measure the Baum-Welch-refined tagger on the held-out sentences.

# 1. Strip the tags from the gold-standard sentences
word_lists = [[word for (word, _tag) in sentence] for sentence in test_sents]

# 2. Run the tagger over the bare word lists
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. One long list of (word, tag) pairs for the gold data and one for the predictions
gold_flat = [
    pair
    for sentence in test_sents
    for pair in sentence
]
pred_flat = [
    pair
    for sentence in predicted_sents
    for pair in sentence
]

# 4. Share of pairs that match position by position
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

Accuracy: 0.6714224734506978


# 

## Advanced techniques

### Supervised

In [5]:
# Tagset and vocabulary of the full training split. Sorted because the
# iteration order of a set of strings can change between interpreter sessions,
# which would otherwise change the HMM's state/symbol order from run to run.
tags = sorted({t for sent in train_sents for (_, t) in sent})
words = sorted({w for sent in train_sents for (w, _) in sent})

In [15]:
from nltk.tag import RegexpTagger, HiddenMarkovModelTagger
from nltk.tag.hmm import HiddenMarkovModelTrainer
from nltk.probability import LidstoneProbDist

# 1) Data: reuse train_sents / test_sents from the earlier split. Words keep
#    their original casing (no lowercasing).

# 2) Collect tagset and vocabulary. Sorted because the iteration order of a set
#    of strings can change between interpreter sessions.
tags = sorted({t for sent in train_sents for (_, t) in sent})
words = sorted({w for sent in train_sents for (w, _) in sent})


# 3) Additive (Lidstone) smoothing for every distribution of the HMM
def estimator(freqdist, bins):
    """Build a Lidstone-smoothed distribution (gamma = 0.1) from a frequency distribution.

    Args:
        freqdist: frequency distribution holding the observed counts.
        bins: number of possible outcomes, passed through to LidstoneProbDist.

    Returns:
        A LidstoneProbDist over `freqdist`.
    """
    return LidstoneProbDist(freqdist, 0.1, bins)


# 4) Train the HMM with the smoothed estimator. train_supervised is given only
#    the labelled sentences and the estimator.
trainer = HiddenMarkovModelTrainer(tags, words)
hmm_tagger: HiddenMarkovModelTagger = trainer.train_supervised(
    train_sents,
    estimator=estimator,
)

In [16]:
# Accuracy of the Lidstone-smoothed tagger on the held-out sentences.

# 1. Test sentences with the tag of every pair removed
word_lists = [[pair[0] for pair in sent] for sent in test_sents]

# 2. Tag the bare sentences
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. Flatten the nested sentence lists into flat lists of (word, tag) pairs
gold_flat = [item for sent in test_sents for item in sent]
pred_flat = [item for sent in predicted_sents for item in sent]

# 4. Compare the two lists element by element
smoothed_accuracy = nltk_accuracy(gold_flat, pred_flat)
print("Accuracy:", smoothed_accuracy)

Accuracy: 0.9512519025382987


### Unsupervised

In [20]:
# Semi-supervised training with smoothing: the supervised starting model uses
# the Lidstone estimator, then Baum-Welch (EM) refines it on the training split.

# Supervised initialisation on the first 1000 training sentences
labeled_sents = train_sents[:1000]
trainer = hmm.HiddenMarkovModelTrainer(tags, words)
supervised_tagger = trainer.train_supervised(labeled_sents, estimator=estimator)

# Baum-Welch refinement starting from the smoothed supervised model.
# train_unsupervised documents only `model`, `max_iterations` and
# `convergence_logprob` as keywords; other names (`threshold`, `estimator`) are
# accepted by **kwargs but never read, so they are not passed here.
hmm_tagger = trainer.train_unsupervised(
    train_sents,
    model=supervised_tagger,
    convergence_logprob=0.1,
    max_iterations=10,
)




iteration 0 logprob -10323439.847916776
iteration 1 logprob -9096593.341661517
iteration 2 logprob -8988874.992472248
iteration 3 logprob -8910284.645916255
iteration 4 logprob -8854316.305425018
iteration 5 logprob -8813000.566213284
iteration 6 logprob -8782337.353903037
iteration 7 logprob -8759631.020962598
iteration 8 logprob -8742290.362671502
iteration 9 logprob -8728995.441892426


In [21]:
# 1. Build the raw word-lists from the gold-standard sentences here, so this
#    cell does not depend on a variable left behind by an earlier evaluation cell
word_lists = [[w for (w, t) in sent] for sent in test_sents]

# 2. Tag them with the Baum-Welch-refined model
predicted_sents = hmm_tagger.tag_sents(word_lists)

# 3. Flatten both the gold and the predicted into one big list of (word, tag)
gold_flat = [pair for sent in test_sents for pair in sent]
pred_flat = [pair for sent in predicted_sents for pair in sent]

# 4. Compute accuracy
print("Accuracy:", nltk_accuracy(gold_flat, pred_flat))

Accuracy: 0.5697094344341107
