# SVM-HMM One Go - F1 score without the O tag (note: the model trained below is a CRF from sklearn_crfsuite)

In [3]:
import nltk
import joblib
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split

# Load the data
def load_data(filepath):
    """Read a blank-line-delimited, two-column tagged corpus.

    Every non-blank line must contain exactly two whitespace-separated
    fields, ``word tag``. A blank or whitespace-only line closes the
    sentence that is currently being collected; consecutive blank lines
    do not produce empty sentences.

    Args:
        filepath: Path to the corpus file, decoded as UTF-8.

    Returns:
        A ``(sentences, tags)`` tuple of parallel lists: ``sentences[k]``
        is the list of words of the k-th sentence and ``tags[k]`` is the
        list of tags for those words, in the same order.

    Raises:
        ValueError: If a non-blank line does not hold exactly two fields.
            The message names the file, the 1-based line number and the
            offending text so the corpus can be repaired.
    """
    sentences = []
    tags = []
    sentence = []
    tag_seq = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            fields = line.split()
            if not fields:
                # Sentence boundary: flush only if something was collected.
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_seq)
                    sentence = []
                    tag_seq = []
                continue
            if len(fields) != 2:
                # Fail with a locatable message instead of a bare
                # tuple-unpacking error.
                raise ValueError(
                    f"{filepath}, line {line_no}: expected 'word tag' "
                    f"(2 fields), got {len(fields)}: {line.strip()!r}"
                )
            word, tag = fields
            sentence.append(word)
            tag_seq.append(tag)
    if sentence:  # Add the last sentence if there's no trailing blank line
        sentences.append(sentence)
        tags.append(tag_seq)
    return sentences, tags

# Load the tagged corpus; change 'wlina_bd.txt' to the path of your own dataset file
sentences, tags = load_data('wlina_bd.txt')

# Split whole sentences into train (80%) and test (20%) sets; random_state is fixed at 42
X_train, X_test, y_train, y_test = train_test_split(sentences, tags, test_size=0.2, random_state=42)

# Feature extraction functions
def word2features(sent, i):
    """Describe the token at position ``i`` of ``sent`` as a feature dict.

    The dict holds the token itself, its position flags, three case flags,
    its leading and trailing 1-3 characters, and the neighbouring tokens
    (the empty string stands in where there is no neighbour).

    Note: the case flags compare the text with its upper/lower-cased form,
    so they are also True for characters that have no case distinction
    (digits, punctuation, caseless scripts).

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to their values for that token.
    """
    token = sent[i]
    last_position = len(sent) - 1
    initial = token[0]

    # Neighbouring tokens; '' marks the sentence edge.
    if i == 0:
        before = ''
    else:
        before = sent[i - 1]
    if i == last_position:
        after = ''
    else:
        after = sent[i + 1]

    features = {}
    features['word'] = token
    features['is_first'] = i == 0
    features['is_last'] = i == last_position
    features['is_capitalized'] = initial == initial.upper()
    features['is_all_caps'] = token == token.upper()
    features['is_all_lower'] = token == token.lower()
    features['prefix-1'] = initial
    features['prefix-2'] = token[:2]
    features['prefix-3'] = token[:3]
    features['suffix-1'] = token[-1]
    features['suffix-2'] = token[-2:]
    features['suffix-3'] = token[-3:]
    features['prev_word'] = before
    features['next_word'] = after
    return features

def sent2features(sent):
    """Return the feature dicts of every token in ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# Convert every sentence of both splits into its list of per-token feature dicts
X_train_feats = list(map(sent2features, X_train))
X_test_feats = list(map(sent2features, X_test))

# Configure and train the CRF model: L-BFGS optimisation with the c1 and c2
# regularisation coefficients both at 0.1, at most 100 iterations
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    c2=0.1,
    c1=0.1,
    algorithm='lbfgs',
)
crf.fit(X_train_feats, y_train)

# Tag the held-out test sentences with the trained model
y_pred = crf.predict(X_test_feats)

# Filter out the 'O' tag when calculating metrics.
# A filtering comprehension is used instead of list.remove('O') so the run
# does not abort with ValueError when the training split contains no 'O'
# token; the order of the remaining labels is unchanged.
labels = [label for label in crf.classes_ if label != 'O']

# Evaluate the performance excluding the 'O' tag (support-weighted F1 over the kept labels)
f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print(f'F1 Score (excluding O): {f1_score:.4f}')

# Display the per-label classification report, again excluding the 'O' tag
report = metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3)
print(report)

# Save the trained CRF model to 'crf_ner_model_noO.pkl' in the working directory
joblib.dump(crf, 'crf_ner_model_noO.pkl')

# Prediction on new text sentence
def predict_entities(sentence):
    """Tag a raw sentence with the module-level trained ``crf`` model.

    The text is tokenised with ``nltk.word_tokenize``, featurised with
    ``sent2features`` and passed to ``crf.predict`` as a one-sentence batch.

    Args:
        sentence: The raw text to tag.

    Returns:
        A list of ``(token, predicted_tag)`` tuples in token order.
    """
    words = nltk.word_tokenize(sentence)
    # predict() takes a batch of sentences; wrap ours and unwrap the result.
    predicted_tags = crf.predict([sent2features(words)])[0]
    return [(token, label) for token, label in zip(words, predicted_tags)]

# Example usage: tag one sample sentence with the trained model
new_sentence = "مەهدی ئۆزدەمیر لە بارۆی ئامەد رایگەیاند"
entities = predict_entities(new_sentence)

# Print one "token -> predicted tag" line per token
for word, tag in entities:
    print(f"{word} -> {tag}")

F1 Score (excluding O): 0.9446
              precision    recall  f1-score   support

       B-LOC      0.969     0.960     0.965      2431
      B-MISC      0.966     0.918     0.941      1294
       B-ORG      0.933     0.927     0.930      1062
       B-PER      0.974     0.908     0.940       651
       I-PER      0.978     0.927     0.952       614
       I-ORG      0.944     0.944     0.944      1259
      I-MISC      0.933     0.949     0.941       547
       I-LOC      0.932     0.944     0.938       591
      B-DATE      0.910     0.910     0.910       670
      I-DATE      0.937     0.951     0.944       824

   micro avg      0.951     0.938     0.945      9943
   macro avg      0.948     0.934     0.940      9943
weighted avg      0.952     0.938     0.945      9943

مەهدی -> B-PER
ئۆزدەمیر -> I-PER
لە -> O
بارۆی -> B-ORG
ئامەد -> I-ORG
رایگەیاند -> O
