<a href="https://colab.research.google.com/github/arqavan94/Persian_NLP_Task/blob/main/crf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install hazm

In [None]:
!pip install python-crfsuite

In [None]:
!pip install sklearn_crfsuite

In [None]:
from hazm import *
import os, codecs
import numpy as np
from hazm import PeykareReader
from sklearn.model_selection import train_test_split
from hazm.PeykareReader import coarse_pos_e as peykare_coarse_pos_e
from nltk.tag.util import untag
from sklearn_crfsuite import metrics
from sklearn_crfsuite.utils import flatten
from sklearn.metrics import classification_report

# Load Data

In [None]:
# Open the Peykare corpus reader on the Drive path below and print the first
# sentence it yields as a quick sanity check.
# NOTE(review): no visible cell mounts Google Drive; this assumes /content/drive
# is already mounted — confirm before a fresh Restart & Run All.
peykare= PeykareReader(root='/content/drive/MyDrive/Peykare-full.zip/TextLabelData')
print(next(peykare.sents()))

In [None]:
# Load the pre-extracted tagged sentences, stored under the key 'arr_0'.
# The archive is opened in a `with` block so its file handle is closed as soon
# as the array has been read.
# SECURITY NOTE: allow_pickle=True unpickles arbitrary Python objects; only load
# this file if it comes from a trusted source.
with np.load('/content/drive/MyDrive/text_labels.npz', allow_pickle=True) as archive:
    text_labels = archive['arr_0']

# python-crfsuite

In [None]:
def is_punc(value):
  """Return True when `value` is exactly one of the known punctuation tokens, else False."""
  punctuation_marks = (
      '"', '#', '(', ')', '*', ',', '-', '.', '/', ':',
      '[', ']', '«', '»', '،', ';', '?', '!',
  )
  return value in punctuation_marks

In [None]:
def features(sentence, index):
    """Build the CRF feature dictionary for the token at `index` in `sentence`.

    Args:
        sentence: Sequence of token strings (one untagged sentence).
        index: Position of the token to describe; expected to satisfy
            ``0 <= index < len(sentence)``.

    Returns:
        dict mapping feature names to values: the token itself, first/last
        position flags, prefixes and suffixes of up to three characters, the
        neighbouring tokens within a window of two, and digit/punctuation
        indicators for the token and its immediate neighbours. Any feature
        that refers to a neighbour outside the sentence is the empty string.
    """
    token = sentence[index]
    size = len(sentence)

    # Explicit bounds checks: a negative index such as sentence[index - 2] with
    # index == 1 would silently wrap around to the END of the sentence.
    has_prev = index >= 1
    has_prev2 = index >= 2
    has_next = index + 1 < size
    has_next2 = index + 2 < size

    prev_token = sentence[index - 1] if has_prev else ''
    next_token = sentence[index + 1] if has_next else ''

    return {
        'word': token,
        'is_first': index == 0,
        'is_last': index == size - 1,

        # Slices (not single-character indexing) so an empty token yields ''
        # instead of raising IndexError.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],

        'prev_word': prev_token,
        'two_prev_word': sentence[index - 2] if has_prev2 else '',
        'next_word': next_token,
        'two_next_word': sentence[index + 2] if has_next2 else '',
        # digit
        'is_numeric': token.isdigit(),
        'prev_is_numeric': prev_token.isdigit() if has_prev else '',
        'next_is_numeric': next_token.isdigit() if has_next else '',
        # punc
        'is_punc': is_punc(token),
        'prev_is_punc': is_punc(prev_token) if has_prev else '',
        'next_is_punc': is_punc(next_token) if has_next else '',
    }

In [None]:
# Display the shape of the full label array (one entry per tagged sentence).
text_labels.shape

(344736,)

In [None]:
# Work with the first 30% of the entries; everything after the cut-off index
# is kept aside in remain_data.
point = int(.3 * len(text_labels))
main_data, remain_data = text_labels[:point], text_labels[point:]

In [None]:
# Display the shape of the 30% subset selected above.
main_data.shape

(103420,)

# Train-Test Split

In [None]:
def make_data(main_data):
    """Convert tagged sentences into CRF inputs.

    Args:
        main_data: Iterable of tagged sentences, each a sequence of
            ``(word, tag)`` pairs.

    Returns:
        Tuple ``(X, y)`` where ``X[i]`` is the list of per-token feature
        dicts for sentence ``i`` and ``y[i]`` is the list of its tags.
    """
    X, y = [], []
    for tagged in main_data:
        # Strip the tags once per sentence rather than once per token, which
        # would make feature extraction quadratic in sentence length.
        words = untag(tagged)
        X.append([features(words, index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
    return X, y
    

# Build the feature sequences (X) and tag sequences (y) for the selected subset.
X, y= make_data(main_data)

In [None]:
# Hold out 20% of the sentences for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train CRF Model

In [None]:
import pycrfsuite

# Trainer that prints CRFsuite's progress log while fitting.
trainer = pycrfsuite.Trainer(verbose=True)

# Register every training sentence as a (feature sequence, tag sequence) pair.
for feature_seq, tag_seq in zip(X_train, y_train):
    trainer.append(feature_seq, tag_seq)

# Parameter meanings follow the CRFsuite parameter reference.
trainer.set_params({
    'c1': 0.1,                             # coefficient for L1 regularisation
    'c2': 0.01,                            # coefficient for L2 regularisation
    'max_iterations': 100,                 # upper bound on optimiser iterations
    'feature.possible_transitions': True,  # also generate transitions not seen in training
})

In [None]:
# Fit the CRF and write the trained model to this path; the tagger cells in the
# evaluation section open the same file.
trainer.train('/content/drive/MyDrive/crf.model')

# Evaluation pycrfsuite

In [None]:
# Load the saved model and tag every training sentence (feeds the train-set report).
tagger = pycrfsuite.Tagger()
tagger.open('/content/drive/MyDrive/crf.model')
y_pred_train = list(map(tagger.tag, X_train))

In [None]:
# Load the saved model and tag every held-out sentence (feeds the test-set report).
tagger2 = pycrfsuite.Tagger()
tagger2.open('/content/drive/MyDrive/crf.model')
y_pred_test = list(map(tagger2.tag, X_test))

**Flatten data**

In [None]:
# Collapse the per-sentence tag lists into flat token-level lists, as needed by
# classification_report.
y_train_flat, y_pred_train_flat = map(flatten, (y_train, y_pred_train))

In [None]:
# Collapse the per-sentence tag lists into flat token-level lists, as needed by
# classification_report.
y_test_flat, y_pred_test_flat = map(flatten, (y_test, y_pred_test))

# Accuracy and Report pycrfsuite

In [None]:
# Per-tag report on the flattened tokens, followed by overall accuracy.
print('Train set classification report: \n\n{}'.format(classification_report(y_train_flat, y_pred_train_flat)))
print('Train set accuracy score:{}'.format(metrics.flat_accuracy_score(y_train, y_pred_train)))

# The weighted F1 / precision / recall calls share one shape, so loop over them.
for template, scorer in (
    ('Train set f1-score:{}', metrics.flat_f1_score),
    ('Train set precision-score:{}', metrics.flat_precision_score),
    ('Train set recall-score:{}', metrics.flat_recall_score),
):
    print(template.format(scorer(y_train, y_pred_train, average='weighted')))

Train set classification report: 

              precision    recall  f1-score   support

         ADV       0.99      0.99      0.99     45138
        ADVe       0.99      1.00      1.00      2553
          AJ       0.98      0.98      0.98    136020
         AJe       0.98      0.98      0.98     41203
          CL       0.98      0.98      0.98      4124
        CONJ       1.00      1.00      1.00    180975
       CONJe       1.00      1.00      1.00       156
         DET       0.99      0.99      0.99     49712
        DETe       0.99      1.00      1.00      4724
         INT       1.00      1.00      1.00       255
           N       0.99      0.98      0.98    450739
         NUM       1.00      1.00      1.00     55294
        NUMe       0.98      0.99      0.98      4843
          Ne       0.98      0.99      0.98    350549
           P       1.00      1.00      1.00    206434
       POSTP       1.00      1.00      1.00     34933
         PRO       0.99      1.00      0.99   

In [None]:
# Per-tag report on the flattened tokens, followed by overall accuracy.
print('Test set classification report: \n\n{}'.format(classification_report(y_test_flat, y_pred_test_flat)))
print('Test set accuracy score:{}'.format(metrics.flat_accuracy_score(y_test, y_pred_test)))

# The weighted F1 / precision / recall calls share one shape, so loop over them.
for template, scorer in (
    ('Test set f1-score: {}', metrics.flat_f1_score),
    ('Test set precision-score: {}', metrics.flat_precision_score),
    ('Test set recall-score:{}', metrics.flat_recall_score),
):
    print(template.format(scorer(y_test, y_pred_test, average='weighted')))

Test set classification report: 

              precision    recall  f1-score   support

         ADV       0.93      0.91      0.92     11549
        ADVe       0.97      0.92      0.94       667
          AJ       0.91      0.91      0.91     34252
         AJe       0.86      0.84      0.85     10365
          CL       0.86      0.76      0.81      1050
        CONJ       0.99      0.99      0.99     45488
       CONJe       1.00      0.97      0.99        34
         DET       0.96      0.97      0.96     12493
        DETe       0.91      0.91      0.91      1094
         INT       0.93      0.97      0.95        68
           N       0.95      0.94      0.94    112410
         NUM       0.98      0.98      0.98     13305
        NUMe       0.87      0.84      0.85      1154
          Ne       0.94      0.95      0.95     87940
           P       0.99      1.00      1.00     51463
       POSTP       1.00      1.00      1.00      8813
         PRO       0.96      0.96      0.96    

# sklearn-crfsuite

In [None]:
# Hold out 20% of the sentences for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across kernel restarts.
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn_crfsuite import CRF

# Regularisation is left at the library defaults; only the iteration cap is set.
model = CRF(verbose=True, max_iterations=100)
try:
    model.fit(X_train2, y_train2)
except AttributeError as err:
    # NOTE(review): ignoring AttributeError here looks like a workaround for a
    # version incompatibility between sklearn_crfsuite and scikit-learn —
    # confirm. The error is still tolerated so the notebook keeps running, but
    # it is reported so a genuine failure inside fit() is not hidden.
    print('AttributeError raised by CRF.fit was ignored: {!r}'.format(err))

loading training data to CRFsuite: 100%|██████████| 82736/82736 [00:49<00:00, 1657.81it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 941227
Seconds required: 15.530

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=25.09 loss=3799420.07 active=941227 feature_norm=5.00
Iter 2   time=9.27  loss=2528737.43 active=941227 feature_norm=12.88
Iter 3   time=10.85 loss=2014463.98 active=941227 feature_norm=14.30
Iter 4   time=7.79  loss=1816760.23 active=941227 feature_norm=15.52
Iter 5   time=9.13  loss=1489920.36 active=941227 feature_norm=22.36
Iter 6   time=8.22  loss=1307848.19 active=941227 feature_norm=24.93
Iter 7   time=10.47 loss=1222389.59 active=941227 feature_norm=26.24
Iter 8   time=9.57  loss=1147497.63 active=941227 feature_norm=29.22
Iter 9   time=7.90  loss=1079104.44 active=941

AttributeError: ignored

AttributeError: ignored

AttributeError: ignored

In [None]:
# Tag the held-out sentences, then collapse gold and predicted tags into flat
# token-level lists for classification_report.
y_pred_test2 = model.predict(X_test2)
y_test_flat2, y_pred_test_flat2 = map(flatten, (y_test2, y_pred_test2))

In [None]:
# Per-tag report on the flattened tokens, followed by overall accuracy.
print('Test set classification report: \n\n{}'.format(classification_report(y_test_flat2, y_pred_test_flat2)))
print('Test set accuracy score:{}'.format(metrics.flat_accuracy_score(y_test2, y_pred_test2)))

# The weighted F1 / precision / recall calls share one shape, so loop over them.
for template, scorer in (
    ('Test set f1-score: {}', metrics.flat_f1_score),
    ('Test set precision-score: {}', metrics.flat_precision_score),
    ('Test set recall-score:{}', metrics.flat_recall_score),
):
    print(template.format(scorer(y_test2, y_pred_test2, average='weighted')))

Test set classification report: 

              precision    recall  f1-score   support

         ADV       0.92      0.90      0.91     11397
        ADVe       0.95      0.91      0.93       630
          AJ       0.90      0.87      0.89     34091
         AJe       0.84      0.82      0.83     10420
          CL       0.86      0.80      0.83      1108
        CONJ       0.99      0.99      0.99     45104
       CONJe       0.97      0.85      0.91        41
         DET       0.96      0.97      0.96     12522
        DETe       0.90      0.91      0.90      1189
         INT       0.89      0.87      0.88        63
           N       0.94      0.93      0.93    112265
         NUM       0.98      0.98      0.98     13404
        NUMe       0.85      0.81      0.83      1147
          Ne       0.92      0.95      0.94     87583
           P       1.00      1.00      1.00     51640
       POSTP       1.00      1.00      1.00      8839
         PRO       0.96      0.96      0.96    