In [1]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp312-cp312-win_amd64.whl.metadata (4.4 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-win_amd64.whl (301 kB)
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import nltk
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [2]:
# Make sure the NLTK 'treebank' package is available locally, then load its
# tagged sentences: each sentence is a list of (word, POS tag) pairs.
nltk.download('treebank')
corpus = nltk.corpus.treebank.tagged_sents()
# Printing the corpus object shows only a truncated preview of the sentences.
print(corpus)

[[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\AHINA\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [3]:
# Define a function to extract features for each word in a sentence
def word_features(sentence, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sentence: Sequence of ``(word, tag)`` pairs. Only the word (element 0)
            of each pair is read.
        i: Index of the token to describe.

    Returns:
        dict: Feature name -> value (strings and booleans) covering the word
        itself, its position in the sentence, casing, 1-3 character prefixes
        and suffixes, the neighbouring words, and a few shape flags.

    Note:
        The three casing checks compare the text with its upper/lower-cased
        form, so tokens without cased characters (digits, punctuation, the
        empty string) satisfy all of them.
    """
    word = sentence[i][0]
    last_index = len(sentence) - 1
    # Slices instead of word[0] / word[-1]: same value for non-empty tokens,
    # but an empty token yields '' rather than raising IndexError.
    first_char = word[:1]
    features = {
        'word': word,
        'is_first': i == 0,  # token opens the sentence
        'is_last': i == last_index,  # token closes the sentence
        'is_capitalized': first_char.upper() == first_char,
        'is_all_caps': word.upper() == word,  # unchanged by upper-casing
        'is_all_lower': word.lower() == word,  # unchanged by lower-casing
        # leading characters of the word
        'prefix-1': first_char,
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        # trailing characters of the word
        'suffix-1': word[-1:],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        # neighbouring words; '' marks the sentence boundary
        'prev_word': '' if i == 0 else sentence[i - 1][0],
        'next_word': '' if i == last_index else sentence[i + 1][0],
        'has_hyphen': '-' in word,  # word contains a hyphen
        'is_numeric': word.isdigit(),  # digits only; '3.5' or '1,000' is False
        'capitals_inside': word[1:].lower() != word[1:],  # uppercase after the first character
    }
    return features

In [5]:
# Turn every tagged sentence into two parallel sequences:
# one feature dict per token (X) and the matching tag per token (y).
X, y = [], []
for tagged_sentence in corpus:
    token_positions = range(len(tagged_sentence))
    X.append([word_features(tagged_sentence, pos) for pos in token_positions])
    y.append([tagged_sentence[pos][1] for pos in token_positions])


# Hold out the final 20% of sentences for testing (ordered split, no shuffling)
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [6]:
# Fit a CRF on the training split using L-BFGS; c1 and c2 are the L1 and L2
# regularisation coefficients.
crf_settings = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

# Tag the held-out sentences and report accuracy over the flattened tag sequences
y_pred = crf.predict(X_test)
accuracy = metrics.flat_accuracy_score(y_test, y_pred)

print(accuracy)

0.9632716203403363


In [5]:
import pycrfsuite

# 1. Feature extraction function for each word
def word2features(sent, i):
    """Describe the token ``sent[i]`` as a feature dict for CRFsuite.

    ``sent`` is a sequence of plain token strings. The result holds features
    of the token itself plus lower-cased / casing features of its left and
    right neighbours; a missing neighbour is flagged with 'BOS' or 'EOS'.
    """
    token = sent[i]
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context
    if i <= 0:
        feats['BOS'] = True  # Beginning of sentence
    else:
        left = sent[i - 1]
        feats['-1:word.lower()'] = left.lower()
        feats['-1:word.istitle()'] = left.istitle()
        feats['-1:word.isupper()'] = left.isupper()

    # Right context
    if i >= len(sent) - 1:
        feats['EOS'] = True  # End of sentence
    else:
        right = sent[i + 1]
        feats['+1:word.lower()'] = right.lower()
        feats['+1:word.istitle()'] = right.istitle()
        feats['+1:word.isupper()'] = right.isupper()

    return feats

# Convert sentence to features
def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    positions = range(len(sent))
    return list(map(lambda pos: word2features(sent, pos), positions))
# Small hand-written training corpus: each entry is a (tokens, tags) pair of
# equal-length lists, with one tag per token.
# NOTE(review): the tags look like Penn Treebank POS labels -- confirm the
# intended tag set.
train_sents = [
    (["Ahina", "is", "trying", "out", "CRF", "algorithm", "for", "Natural", "Language", "Processing", "POS", "Tagging", "."],
     ["NNP", "VBZ", "VBG", "RP", "NNP", "NN", "IN", "NNP", "NNP", "NNP", "NNP", "NN", "."]),

    (["The", "CRF", "model", "performs", "well", "on", "sequence", "labeling", "tasks", "."],
     ["DT", "NNP", "NN", "VBZ", "RB", "IN", "NN", "NN", "NNS", "."]),

    (["She", "loves", "to", "learn", "about", "machine", "learning", "and", "natural", "language", "processing", "."],
     ["PRP", "VBZ", "TO", "VB", "IN", "NN", "NN", "CC", "JJ", "NN", "NN", "."]),

    (["Python", "is", "a", "popular", "programming", "language", "for", "data", "science", "."],
     ["NNP", "VBZ", "DT", "JJ", "NN", "NN", "IN", "NN", "NN", "."]),

    (["We", "can", "build", "custom", "POS", "taggers", "using", "CRF", "models", "."],
     ["PRP", "MD", "VB", "JJ", "NNP", "NNS", "VBG", "NNP", "NNS", "."]),

    (["Natural", "language", "processing", "involves", "many", "complex", "tasks", "."],
     ["JJ", "NN", "NN", "VBZ", "DT", "JJ", "NNS", "."]),

    (["Machine", "learning", "algorithms", "can", "improve", "text", "classification", "."],
     ["NNP", "NN", "NNS", "MD", "VB", "NN", "NN", "."]),

    (["The", "researcher", "analyzed", "the", "results", "carefully", "."],
     ["DT", "NN", "VBD", "DT", "NNS", "RB", "."])
]



# Prepare the toy training data in CRF input format.
# The toy_* names keep this experiment from overwriting the treebank
# X_train / y_train / X_test built by the earlier split cell, so that cell's
# fit/evaluate step can still be re-run afterwards.
toy_X_train = [sent2features(tokens) for tokens, _ in train_sents]
toy_y_train = [tags for _, tags in train_sents]

# Single source of truth for the model file that is written, then re-opened.
MODEL_PATH = 'pos_model.crfsuite'

# 2. Train the CRF model
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(toy_X_train, toy_y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # L1 regularization
    'c2': 1e-3, # L2 regularization
    'max_iterations': 100,
    'feature.possible_transitions': True
})

# Training writes the fitted model to MODEL_PATH in the working directory.
trainer.train(MODEL_PATH)

# 3. Load the trained model to tag new sentences
tagger = pycrfsuite.Tagger()
tagger.open(MODEL_PATH)

# The sentence is pre-tokenised by whitespace, so punctuation must be spaced out.
test_sentence = "Ahina is testing CRF model for POS tagging .".split()
toy_X_test = sent2features(test_sentence)
predicted_tags = tagger.tag(toy_X_test)

print(list(zip(test_sentence, predicted_tags)))


[('Ahina', 'NNP'), ('is', 'VBZ'), ('testing', 'NN'), ('CRF', 'NNP'), ('model', 'NN'), ('for', 'IN'), ('POS', 'NNP'), ('tagging', 'NN'), ('.', '.')]
