In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF1.ipynb
#
# (1) Token lemmatization using Zemberek TurkishMorphology  (DO NOT lemmatize tokens starting with uppercase letter)
# (2) Token features:
#     (a) token
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [23]:
from sklearn.model_selection import RandomizedSearchCV
from seqeval.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
import jpype as jp
import pickle
import scipy

In [3]:
ZEMBEREK_PATH = 'bin/zemberek-full.jar'

# Start the JVM
jp.startJVM(jp.getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))

In [4]:
TurkishMorphology = jp.JClass('zemberek.morphology.TurkishMorphology')
morphology        = TurkishMorphology.createWithDefaults()

In [5]:
def readFile(filepath):

    text     = []
    sentence = []

    with open(filepath, 'r') as infile:
        for line in infile:
            word, _, _, _ = line.strip().split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line.strip())

    return text

In [6]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt')

In [8]:
def getFeature(word, word_index, sentence):

    feature = {'word' : word}
    
    return feature

In [9]:
trainFeatures = []
trainLabels   = []

for sentence in trainText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    trainFeatures.append(features)
    trainLabels.append(labels)

In [10]:
validFeatures = []
validLabels   = []

for sentence in validText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    validFeatures.append(features)
    validLabels.append(labels)

In [11]:
testFeatures = []
testLabels   = []

for sentence in testText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    testFeatures.append(features)
    testLabels.append(labels)

In [12]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels + validLabels

In [13]:
# define fixed parameters and parameters to search
crf = CRF(  algorithm='lbfgs',
            max_iterations=100,
            all_possible_transitions=True,
            verbose=True)

params_space = {'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05)}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(trainvalFeatures, trainvalLabels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 61.1min finished
loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:01<00:00, 16860.61it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 34673
Seconds required: 0.404

L-BFGS optimization
c1: 0.008804
c2: 0.002801
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.73  loss=840729.12 active=34648 feature_norm=1.00
Iter 2   time=1.84  loss=477898.36 active=34654 feature_norm=5.94
Iter 3   time=0.33  loss=397499.61 active=33526 feature_norm=4.97
Iter 4   time=1.35  loss=304123.62 active=34388 feature_norm=3.51
Iter 5   time=0.32  loss=300509.23 active=34637 feature_norm=3.99
Iter 6   time=0.31  loss=289494.28 active=34636 feature_norm=4.17
Iter 7   time=0.32  loss=287070.89 active=34608 feature_norm=6.96
Iter 8   time=0.31  loss=258969.09 active=34648 feature_norm=6.18
Iter 9   time=0.29  loss=254119.65 active=34664 feature_norm=6.79
Iter 

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...ne,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b03e1b080>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b020b76a0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [14]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.008804228477113443, 'c2': 0.0028006600124066087}
best CV score: 0.9522287185032302
model size: 2.36M


In [16]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.008804228477113443,   # <-- Found by applying grid search
    c2=0.0028006600124066087,  # <-- Found by applying grid search
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True)

In [17]:
crf.fit(trainvalFeatures, trainvalLabels)

loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:01<00:00, 21579.56it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 34673
Seconds required: 0.303

L-BFGS optimization
c1: 0.008804
c2: 0.002801
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.61  loss=840729.12 active=34648 feature_norm=1.00
Iter 2   time=2.20  loss=477898.36 active=34654 feature_norm=5.94
Iter 3   time=0.47  loss=397499.61 active=33526 feature_norm=4.97
Iter 4   time=1.15  loss=304123.62 active=34388 feature_norm=3.51
Iter 5   time=0.35  loss=300509.23 active=34637 feature_norm=3.99
Iter 6   time=0.53  loss=289494.28 active=34636 feature_norm=4.17
Iter 7   time=0.50  loss=287070.89 active=34608 feature_norm=6.96
Iter 8   time=0.38  loss=258969.09 active=34648 feature_norm=6.18
Iter 9   time=0.28  loss=254119.65 active=34664 feature_norm=6.79
Iter 

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.008804228477113443, c2=0.0028006600124066087,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [24]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf1.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
    print("model is saved.")

model is saved.


In [27]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf1.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [28]:
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.98      0.98      0.98     11079
     PERCENT       0.98      1.00      0.99       521
    LOCATION       0.98      0.99      0.98      7762
ORGANIZATION       0.97      0.97      0.97      7073
       MONEY       0.96      0.94      0.95       484
        DATE       0.77      0.77      0.77      2553
        TIME       0.99      0.99      0.99       156

   micro avg       0.96      0.96      0.96     29628
   macro avg       0.96      0.96      0.96     29628



In [19]:
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabels))

### VAL CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.99      0.98      0.98      3397
    LOCATION       0.98      0.98      0.98      1647
ORGANIZATION       0.98      0.97      0.97      1961
        DATE       0.75      0.76      0.76       550
     PERCENT       0.97      1.00      0.98        96
       MONEY       1.00      0.97      0.99       110
        TIME       1.00      1.00      1.00        19

   micro avg       0.97      0.96      0.96      7780
   macro avg       0.97      0.96      0.96      7780



In [29]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.91      0.70      0.79      1594
    LOCATION       0.89      0.76      0.82      1091
ORGANIZATION       0.82      0.72      0.77       862
     PERCENT       0.86      0.91      0.88       107
        DATE       0.66      0.65      0.66       364
       MONEY       0.85      0.73      0.78       113
        TIME       0.90      0.39      0.55        23

   micro avg       0.86      0.72      0.78      4154
   macro avg       0.86      0.72      0.78      4154



In [21]:
# Shutting down the JVM
jp.shutdownJVM()