In [3]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF2.ipynb
#
# (1) Token lemmatization using Zemberek TurkishMorphology  (DO NOT lemmatize tokens starting with uppercase letter)
# (2) Token features:
#     (a) token
#     (b) is_first           : is token at the beginning of the sentence?
#     (c) is_last            : is token at the end of the sentence?
#     (d) is_capitalized     : does token start with a capital letter? 
#     (e) is_all_capitalized : is all letters of the token capitalized?
#     (f) is_capitals_inside : is there any capitalized letter inside the token?
#     (g) is_numeric         : is token numeric?
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from seqeval.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
import pickle
import jpype as jp
import scipy

In [5]:
ZEMBEREK_PATH = 'bin/zemberek-full.jar'

# Start the JVM
jp.startJVM(jp.getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))

In [6]:
TurkishMorphology = jp.JClass('zemberek.morphology.TurkishMorphology')
morphology        = TurkishMorphology.createWithDefaults()

In [7]:
def readFile(filepath):

    text     = []
    sentence = []

    with open(filepath, 'r') as infile:
        for line in infile:
            word, _, _, _ = line.strip().split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line.strip())

    return text

In [8]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt')

In [9]:
def getFeature(word, word_index, sentence):

    feature = {'word'              : word,
               'is_first'          : word_index == 0,
               'is_last'           : word_index == len(sentence) - 1,
               
               'is_capitalized'    : word[0].upper() == word[0],
               'is_all_capitalized': word.upper() == word,
               'is_capitals_inside': word[1:].lower() != word[1:],
               'is_numeric'        : word.isdigit()
              }
    
    return feature

In [10]:
trainFeatures = []
trainLabels   = []

for sentence in trainText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    trainFeatures.append(features)
    trainLabels.append(labels)

In [11]:
validFeatures = []
validLabels   = []

for sentence in validText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    validFeatures.append(features)
    validLabels.append(labels)

In [12]:
testFeatures = []
testLabels   = []

for sentence in testText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    testFeatures.append(features)
    testLabels.append(labels)

In [13]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels + validLabels

In [14]:
# define fixed parameters and parameters to search
crf = CRF(  algorithm='lbfgs',
            max_iterations=100,
            all_possible_transitions=True,
            verbose=True)

params_space = {'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05)}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

rs.fit(trainvalFeatures, trainvalLabels)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 25.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 109.4min finished
loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:02<00:00, 13515.39it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 34763
Seconds required: 0.512

L-BFGS optimization
c1: 0.014335
c2: 0.005420
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.76  loss=822999.79 active=34738 feature_norm=1.00
Iter 2   time=1.54  loss=691955.70 active=34689 feature_norm=8.47
Iter 3   time=0.38  loss=556603.24 active=33593 feature_norm=7.12
Iter 4   time=2.64  loss=461626.12 active=34365 feature_norm=3.39
Iter 5   time=0.38  loss=410845.70 active=34653 feature_norm=6.39
Iter 6   time=0.38  loss=328087.35 active=34737 feature_norm=5.92
Iter 7   time=1.55  loss=234488.13 active=34665 feature_norm=5.44
Iter 8   time=0.39  loss=222218.11 active=34665 feature_norm=5.82
Iter 9   time=0.40  loss=202895.66 active=34701 feature_norm=6.49
Iter 

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...ne,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1af88af4a8>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1af88af5c0>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [15]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.01433488503417938, 'c2': 0.005420250818904982}
best CV score: 0.9604030884996774
model size: 1.76M


In [16]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01433488503417938,  # <-- Found by applying grid search
    c2=0.005420250818904982,  # <-- Found by applying grid search
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True)

In [17]:
crf.fit(trainvalFeatures, trainvalLabels)

loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:02<00:00, 12407.35it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 34763
Seconds required: 0.508

L-BFGS optimization
c1: 0.014335
c2: 0.005420
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.88  loss=822999.79 active=34738 feature_norm=1.00
Iter 2   time=1.59  loss=691955.70 active=34689 feature_norm=8.47
Iter 3   time=0.38  loss=556603.24 active=33593 feature_norm=7.12
Iter 4   time=3.15  loss=461626.12 active=34365 feature_norm=3.39
Iter 5   time=0.40  loss=410845.70 active=34653 feature_norm=6.39
Iter 6   time=0.43  loss=328087.35 active=34737 feature_norm=5.92
Iter 7   time=1.85  loss=234488.13 active=34665 feature_norm=5.44
Iter 8   time=0.48  loss=222218.11 active=34665 feature_norm=5.82
Iter 9   time=0.50  loss=202895.66 active=34701 feature_norm=6.49
Iter 

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.01433488503417938, c2=0.005420250818904982,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [22]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf2.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
    print("model is saved.")

model is saved.


In [23]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf2.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [24]:
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

    LOCATION       0.98      0.99      0.98      7762
ORGANIZATION       0.97      0.97      0.97      7073
      PERSON       0.98      0.98      0.98     11079
        DATE       0.78      0.77      0.78      2553
     PERCENT       1.00      1.00      1.00       521
       MONEY       0.97      0.94      0.95       484
        TIME       0.99      1.00      1.00       156

   micro avg       0.96      0.96      0.96     29628
   macro avg       0.96      0.96      0.96     29628



In [25]:
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabels))

### VAL CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

       MONEY       0.97      0.98      0.98       110
ORGANIZATION       0.98      0.97      0.98      1961
    LOCATION       0.98      0.98      0.98      1647
      PERSON       0.99      0.98      0.98      3397
     PERCENT       1.00      1.00      1.00        96
        DATE       0.77      0.77      0.77       550
        TIME       1.00      1.00      1.00        19

   micro avg       0.97      0.97      0.97      7780
   macro avg       0.97      0.97      0.97      7780



In [26]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

ORGANIZATION       0.79      0.76      0.77       862
    LOCATION       0.90      0.76      0.82      1091
      PERSON       0.89      0.76      0.82      1594
     PERCENT       0.98      0.93      0.96       107
        DATE       0.68      0.67      0.67       364
       MONEY       0.87      0.72      0.79       113
        TIME       0.78      0.30      0.44        23

   micro avg       0.85      0.75      0.80      4154
   macro avg       0.86      0.75      0.80      4154



In [None]:
# Shutting down the JVM
jp.shutdownJVM()