In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.model_selection import train_test_split
import joblib

# Function to load data
def load_data(filepath):
    """Read a whitespace-separated, CoNLL-style file into tagged sentences.

    Every non-blank line describes one token: the first column is the token
    and the last column is its tag (any columns in between are ignored).
    Blank lines separate sentences.

    Args:
        filepath: Path to a UTF-8 encoded text file (a leading BOM is skipped).

    Returns:
        A list of sentences, each one a list of ``(token, tag)`` tuples.

    Raises:
        ValueError: If a non-blank line has fewer than two columns; the
            message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []

    # 'utf-8-sig' decodes plain UTF-8 too, but drops a byte-order mark that
    # would otherwise stay glued to the first token of the file.
    with open(filepath, 'r', encoding='utf-8-sig') as file:
        for line_number, line in enumerate(file, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the sentence collected so far, if any.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            if len(columns) < 2:
                raise ValueError(
                    f"{filepath}: line {line_number}: expected 'token tag', "
                    f"got {line.strip()!r}"
                )

            sentence.append((columns[0], columns[-1]))

    if sentence:  # Add last sentence if file doesn't end with a blank line
        sentences.append(sentence)

    return sentences

# Function to extract features from a token
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only the first element of each item in ``sent`` (the token text) is read.
    The result holds features of the token itself, plus features of the
    previous and next tokens when they exist; ``'BOS'`` / ``'EOS'`` flags are
    set instead when the token has no previous / next neighbour.
    """
    token = sent[i][0]

    # Features of the token itself.
    feats = {'bias': 1.0}
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2:]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()

    # Left context, or a beginning-of-sentence flag.
    if i <= 0:
        feats['BOS'] = True
    else:
        previous = sent[i - 1][0]
        feats['-1:word.lower()'] = previous.lower()
        feats['-1:word.istitle()'] = previous.istitle()
        feats['-1:word.isupper()'] = previous.isupper()

    # Right context, or an end-of-sentence flag.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        following = sent[i + 1][0]
        feats['+1:word.lower()'] = following.lower()
        feats['+1:word.istitle()'] = following.istitle()
        feats['+1:word.isupper()'] = following.isupper()

    return feats

# Function to prepare features for all sentences
def sent2features(sent):
    """Return a list with one ``word2features`` dict per position in ``sent``."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Return the second element (the label) of every 2-item pair in ``sent``."""
    collected = []
    for _token, tag in sent:
        collected.append(tag)
    return collected

def sent2tokens(sent):
    """Return the first element (the token) of every 2-item pair in ``sent``."""
    collected = []
    for word, _tag in sent:
        collected.append(word)
    return collected

# Read the tagged sentences from disk
data = load_data('wlina_bd.txt')

# Turn every sentence into the CRF inputs (per-token feature dicts)
# and targets (per-token labels)
X = list(map(sent2features, data))
y = list(map(sent2labels, data))

# Hold out 20% of the sentences for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the CRF Model

In [2]:
# Define the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # Coefficient for L1 penalty
    c2=0.1,  # Coefficient for L2 penalty
    max_iterations=100,
    all_possible_transitions=True
)

# Train the CRF model
crf.fit(X_train, y_train)

# Predict on the test set
y_pred = crf.predict(X_test)

# Evaluate the model on every tag except 'O'.
# Filtering (instead of list.remove) keeps this cell from raising ValueError
# when the training split happens to contain no 'O' tag at all.
labels = [label for label in crf.classes_ if label != 'O']

print("F1 score:")
print(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))

print("\nClassification Report:")
print(metrics.flat_classification_report(y_test, y_pred, labels=labels))

F1 score:
0.943574619876435

Classification Report:
              precision    recall  f1-score   support

       B-LOC       0.97      0.96      0.96      2431
      B-MISC       0.97      0.92      0.94      1294
       B-ORG       0.94      0.93      0.93      1062
       B-PER       0.97      0.88      0.92       651
       I-PER       0.97      0.91      0.94       614
       I-ORG       0.95      0.95      0.95      1259
      I-MISC       0.93      0.95      0.94       547
       I-LOC       0.93      0.95      0.94       591
      B-DATE       0.92      0.91      0.92       670
      I-DATE       0.94      0.95      0.95       824

   micro avg       0.95      0.94      0.94      9943
   macro avg       0.95      0.93      0.94      9943
weighted avg       0.95      0.94      0.94      9943



# Save the Model

In [4]:
# Persist the CRF object to disk with joblib
model_filename = 'crf_ner_model.pkl'
joblib.dump(value=crf, filename=model_filename)

print(f"Model saved to {model_filename}")

Model saved to crf_ner_model.pkl


# Loading and Using the Saved Model

In [6]:
# Restore the CRF from the pickle file.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
crf = joblib.load(filename='crf_ner_model.pkl')

# Wrap each new token in a 1-tuple (no label is needed for prediction)
new_sentence = [
    (token,)
    for token in (
        'هەرێمی',
        'کوردستان',
        'تورکیا',
    )
]
# predict() takes a list of sentences, so the single sentence is wrapped in a list
new_features = [sent2features(new_sentence)]
predicted_labels = crf.predict(new_features)

print(predicted_labels)

[['B-LOC' 'I-LOC' 'B-LOC']]
