In [None]:
import os
import joblib
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy, nltk
from datasets import load_dataset
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics

# Fetch the NLTK resources this notebook uses, in a fixed order.
for resource_name in ("punkt", "punkt_tab", "stopwords", "treebank"):
    nltk.download(resource_name)

# English stop-word list, held as a set.
stop_words = set(stopwords.words("english"))

# Tagged sentences of the NLTK treebank corpus; input for the CRF cells below.
corpus = nltk.corpus.treebank.tagged_sents()

In [None]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    NOTE: a later cell in this notebook defines ``word2features`` again
    (adding next-word features); when cells run top to bottom that later
    definition supersedes this one.

    Parameters
    ----------
    sent : sequence
        Sentence whose items are indexable with the word string at index 0
        (e.g. ``(word, tag)`` pairs).
    i : int
        Position of the token to describe.

    Returns
    -------
    dict
        Features of the current word (lower-cased form, 3- and 2-character
        suffixes, case/alpha flags) plus either features of the previous
        word or a ``'BOS'`` marker when there is no previous word.
    """
    word = sent[i][0]
    features = {
        'bias': 1,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.isalpha()': word.isalpha(),
        'word.istitle()': word.istitle(),
    }
    if i > 0:
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper(),
        })
    else:
        # No previous token: mark beginning of sentence.
        features['BOS'] = True

    # The original fell off the end here and implicitly returned None.
    return features

In [None]:
def word2features(sent, i):
    """Return the CRF feature dict for the token at index ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the word string at index 0.
    The result describes the current word (lower-cased form, last three and
    last two characters, upper/title/digit flags) and, where they exist, the
    neighbouring words. A missing left neighbour is flagged with ``'BOS'``
    and a missing right neighbour with ``'EOS'``.
    """
    token = sent[i][0]
    last_index = len(sent) - 1

    features = {
        'bias': 1,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context: previous word, or beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        previous = sent[i - 1][0]
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    # Right context: next word, or end-of-sentence marker.
    if i >= last_index:
        features['EOS'] = True
    else:
        following = sent[i + 1][0]
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features

In [None]:
def sent2features(sent):
    """Return one feature dict per position of ``sent``, in order.

    Each entry is the result of ``word2features(sent, position)``.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

In [None]:
# One list of feature dicts and one list of labels per sentence of the corpus.
X = [sent2features(sent) for sent in corpus]
y = [sent2labels(sent) for sent in corpus]

# Fix the shuffle seed so the split (and every score computed from it) is
# identical on each Restart & Run All; without it the split changes per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# CRF tagger using the 'lbfgs' training algorithm; all other settings are
# left at the library defaults.
crf = CRF(algorithm='lbfgs')

# Train on the training split.
crf.fit(X_train, y_train)

In [None]:
# Predict a label sequence for every sentence in the test split.
y_pred = crf.predict(X_test)

In [None]:
# Macro-averaged F1 over the flattened label sequences.
# flat_f1_score takes (y_true, y_pred); the original call had the two swapped.
metrics.flat_f1_score(y_test, y_pred, average='macro')