# Step 1: Prepare Your Data

In [1]:
from sklearn.model_selection import train_test_split

# Load the data
def load_data(filepath):
    """Read a two-column token/tag file into parallel sentence and tag lists.

    Every non-blank line must hold exactly two whitespace-separated fields,
    ``word tag``. Blank (or whitespace-only) lines separate sentences;
    consecutive blank lines do not create empty sentences.

    Args:
        filepath: Path to a UTF-8 encoded text file (a leading byte-order
            mark, if present, is ignored).

    Returns:
        A tuple ``(sentences, tags)``. ``sentences[k]`` is the list of words of
        the k-th sentence and ``tags[k]`` is the list of its tags, in the same
        order and of the same length.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            fields. The message names the file and the 1-based line number.
    """
    sentences = []
    tags = []
    sentence = []
    tag_seq = []
    # 'utf-8-sig' decodes plain UTF-8 as well, but drops a leading BOM that
    # would otherwise be glued to the first word of the file.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        for line_no, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if line == "":
                # Sentence boundary: flush the sentence collected so far.
                if sentence:
                    sentences.append(sentence)
                    tags.append(tag_seq)
                    sentence = []
                    tag_seq = []
                continue
            fields = line.split()
            if len(fields) != 2:
                raise ValueError(
                    f"{filepath}: line {line_no}: expected 'word tag' "
                    f"(2 fields), got {len(fields)}: {line!r}"
                )
            word, tag = fields
            sentence.append(word)
            tag_seq.append(tag)
    if sentence:  # Add the last sentence if there's no trailing blank line
        sentences.append(sentence)
        tags.append(tag_seq)
    return sentences, tags

# Load the token/tag dataset; change 'wlina_bd.txt' to the path of your own dataset file
sentences, tags = load_data('wlina_bd.txt')

# Hold out 20% of the sentences as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(sentences, tags, test_size=0.2, random_state=42)

# Step 2: Feature Extraction

In [2]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token strings forming one sentence.
        i: Index of the token to describe.

    Returns:
        A dict with the token itself, sentence-position flags, three case
        flags, prefixes and suffixes of length 1-3, and the neighbouring
        tokens ('' at the sentence edges).

    Note:
        The case flags compare the text with its ``upper()``/``lower()`` form,
        so characters without case (digits, caseless scripts) satisfy all
        three of them.
    """
    token = sent[i]
    final_index = len(sent) - 1
    starts_sentence = i == 0
    ends_sentence = i == final_index
    head_char = token[0]

    # Neighbouring tokens, with '' standing in beyond the sentence edges.
    if starts_sentence:
        left_neighbour = ''
    else:
        left_neighbour = sent[i - 1]
    if ends_sentence:
        right_neighbour = ''
    else:
        right_neighbour = sent[i + 1]

    return {
        'word': token,
        'is_first': starts_sentence,
        'is_last': ends_sentence,
        'is_capitalized': head_char.upper() == head_char,
        'is_all_caps': token.upper() == token,
        'is_all_lower': token.lower() == token,
        'prefix-1': head_char,
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': left_neighbour,
        'next_word': right_neighbour,
    }

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

# Turn every sentence of both splits into its list of per-token feature dicts
X_train_feats = list(map(sent2features, X_train))
X_test_feats = list(map(sent2features, X_test))

# Step 3: Train the Model

In [3]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Configure the CRF tagger (L-BFGS training, capped at 100 iterations).
# NOTE(review): per the sklearn-crfsuite docs, c1 and c2 are the L1 and L2
# regularization coefficients — confirm 0.1/0.1 were tuned for this dataset.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

# Fit on the per-token feature dicts and the tag sequences of the training split
crf.fit(X_train_feats, y_train)

# Step 4: Evaluate the Model

In [4]:
# Predict the tags for the test set
y_pred = crf.predict(X_test_feats)

# Weighted F1 over all tags of the test set.
# NOTE(review): in the recorded run the 'O' tag accounts for 32377 of the
# 42320 test tokens, so this weighted score is dominated by 'O'; the macro
# average in the report is a better guide to entity-tag quality.
f1_score = metrics.flat_f1_score(y_test, y_pred, average='weighted')
print(f'F1 Score: {f1_score:.4f}')

# Per-tag precision / recall / F1 table, printed with 3 decimal places
report = metrics.flat_classification_report(y_test, y_pred, digits=3)
print(report)

F1 Score: 0.9795
              precision    recall  f1-score   support

      B-DATE      0.910     0.910     0.910       670
       B-LOC      0.969     0.960     0.965      2431
      B-MISC      0.966     0.918     0.941      1294
       B-ORG      0.933     0.927     0.930      1062
       B-PER      0.974     0.908     0.940       651
      I-DATE      0.937     0.951     0.944       824
       I-LOC      0.932     0.944     0.938       591
      I-MISC      0.933     0.949     0.941       547
       I-ORG      0.944     0.944     0.944      1259
       I-PER      0.978     0.927     0.952       614
           O      0.988     0.992     0.990     32377

    accuracy                          0.980     42320
   macro avg      0.951     0.939     0.945     42320
weighted avg      0.979     0.980     0.979     42320



# Step 5: Save the Model

In [5]:
import joblib

# Persist the trained CRF to 'crf_ner_model.pkl' so it can be reloaded later without retraining
joblib.dump(crf, 'crf_ner_model.pkl')

['crf_ner_model.pkl']

# Step 6: Load and Use the Model

In [13]:
import nltk
import joblib

# Load the CRF model saved in Step 5 (this rebinds the name `crf`).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files that come from a trusted source.
crf = joblib.load('crf_ner_model.pkl')

# Example raw sentence
new_sentence = "مەهدی ئۆزدەمیر لە بارۆی ئامەد رایگەیاند کە هەرێمی کوردستان و بەڕازیل و بەریتانیا و ئەحمەد و محەممەد لەگەڵ وەزارەتی تەندروستی هەرێمی کوردستان"

# Tokenize the sentence
# NOTE(review): the training tokens come from whitespace-split lines of the
# dataset file; confirm nltk.word_tokenize yields matching tokens for this
# language and that its tokenizer data is installed in this environment.
tokens = nltk.word_tokenize(new_sentence)

# Convert the tokenized sentence into features. sent2features is defined in
# the Step 2 cell, which must have been run in this kernel; the result is
# wrapped in a list because predict is given a batch of sentences.
new_feats = [sent2features(tokens)]

# Predict the tags for the new sentence
new_pred = crf.predict(new_feats)

# Print each token next to its predicted tag (new_pred[0] belongs to the only sentence in the batch)
for word, tag in zip(tokens, new_pred[0]):
    print(f"{word} -> {tag}")

مەهدی -> B-PER
ئۆزدەمیر -> I-PER
لە -> O
بارۆی -> B-ORG
ئامەد -> I-ORG
رایگەیاند -> O
کە -> O
هەرێمی -> B-LOC
کوردستان -> I-LOC
و -> O
بەڕازیل -> B-LOC
و -> O
بەریتانیا -> B-LOC
و -> O
ئەحمەد -> B-PER
و -> O
محەممەد -> B-PER
لەگەڵ -> O
وەزارەتی -> B-ORG
تەندروستی -> I-ORG
هەرێمی -> I-ORG
کوردستان -> I-ORG
