In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF9.ipynb
#
# Token features:
#     (a) token (surface form)
#     (b) is_first           : is token at the beginning of the sentence?
#     (c) is_last            : is token at the end of the sentence?
#     (d) is_capitalized     : does token start with a capital letter? 
#     (e) is_all_capitalized : is all letters of the token capitalized?
#     (f) is_capitals_inside : is there any capitalized letter inside the token?
#     (g) is_numeric         : is token numeric?
#     (h) is_numeric_inside  : is numeric characters inside the token?
#     (i) is_alphanumeric    : is token alphanumeric?
#     (j) prefix-1           : first letter of the token
#     (k) suffix-1           : last letter of the token
#     (l) prefix-2           : first two letters of the token
#     (m) suffix-2           : last two letters of the token
#     (n) prefix-3           : first three letters of the token
#     (o) suffix-3           : last three letters of the token
#     (p) prefix-4           : first four letters of the token
#     (q) suffix-4           : last four letters of the token
#     (r) next-token         : following token
#     (s) prev-token         : preceding token
#     (t) 2-next-token       : second following token
#     (u) 2-prev-token       : second preceding token
#     (v) pos                : part-of-speech tag
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [2]:
from sklearn.model_selection import RandomizedSearchCV
from seqeval.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
from tqdm import tqdm
import jpype as jp
import pickle
import scipy

In [3]:
ZEMBEREK_PATH = 'bin/zemberek-full.jar'

# Start the JVM
jp.startJVM(jp.getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))

In [4]:
TurkishMorphology = jp.JClass('zemberek.morphology.TurkishMorphology')
morphology        = TurkishMorphology.createWithDefaults()

In [5]:
def readFile(filepath):

    text     = []
    sentence = []

    with open(filepath, 'r') as infile:
        for line in infile:
            word, _, _, _ = line.strip().split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line.strip())

    return text

In [6]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt')

In [7]:
def getFeature(token, token_index, sentence, pos):

    feature = {'token'             : token,
               'is_first'          : token_index == 0,
               'is_last'           : token_index == len(sentence) - 1,
               
               'is_capitalized'    : token[0].upper() == token[0],
               'is_all_capitalized': token.upper() == token,
               'is_capitals_inside': token[1:].lower() != token[1:],
               'is_numeric'        : token.isdigit(),
               'is_numeric_inside' : any([c.isdigit() for c in token]),
               'is_alphanumeric'   : token.isalnum(),
               
               'prefix-1'          : token[0],
               'suffix-1'          : token[-1],
               
               'prefix-2'          : '' if len(token) < 2  else token[:2],
               'suffix-2'          : '' if len(token) < 2  else token[-2:],

               'prefix-3'          : '' if len(token) < 3  else token[:3],
               'suffix-3'          : '' if len(token) < 3  else token[-3:],
                
               'prefix-4'          : '' if len(token) < 4  else token[:4],
               'suffix-4'          : '' if len(token) < 4  else token[-4:],
               
               'prev-token'        : '' if token_index == 0 else sentence[token_index - 1],
               'next-token'        : '' if token_index == len(sentence) - 1 else sentence[token_index + 1],
               
               '2-prev-token'      : '' if token_index <= 1 else sentence[token_index - 2],
               '2-next-token'      : '' if token_index >= len(sentence) - 2 else sentence[token_index + 2],
               
               'pos'               : pos, 
              }
    
    return feature

In [8]:
trainFeatures = []
trainLabels   = []

for sentence_ in tqdm(trainText):

    sentence = [token.split('\t')[0] for token in sentence_]
    labels   = [token.split('\t')[-1] for token in sentence_]
    features = []
    
    sentence_str = ' '.join(sentence)
    analysis = morphology.analyzeAndDisambiguate(sentence_str).bestAnalysis()
    
    for i, word in enumerate(sentence):
        features.append(getFeature(word, i, sentence, analysis[i].getPos().shortForm))
    
    trainFeatures.append(features)
    trainLabels.append(labels)

100%|██████████| 25736/25736 [10:37<00:00, 40.37it/s]  


In [9]:
validFeatures = []
validLabels   = []

for sentence_ in tqdm(validText):
    
    sentence = [token.split('\t')[0] for token in sentence_]
    labels   = [token.split('\t')[-1] for token in sentence_]
    features = []
    
    sentence_str = ' '.join(sentence)
    analysis = morphology.analyzeAndDisambiguate(sentence_str).bestAnalysis()
    
    for i, word in enumerate(sentence):

        features.append(getFeature(word, i, sentence, analysis[i].getPos().shortForm))
    
    validFeatures.append(features)
    validLabels.append(labels)

100%|██████████| 6435/6435 [04:10<00:00, 25.67it/s]


In [10]:
testFeatures = []
testLabels   = []

for sentence_ in tqdm(testText):
    
    sentence = [token.split('\t')[0] for token in sentence_]
    labels   = [token.split('\t')[-1] for token in sentence_]
    features = []
    
    sentence_str = ' '.join(sentence)
    analysis = morphology.analyzeAndDisambiguate(sentence_str).bestAnalysis()
    
    for i, word in enumerate(sentence):
        features.append(getFeature(word, i, sentence, analysis[i].getPos().shortForm))
    
    testFeatures.append(features)
    testLabels.append(labels)

100%|██████████| 3328/3328 [01:27<00:00, 37.99it/s] 


In [11]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels   + validLabels

In [14]:
# define fixed parameters and parameters to search
crf = CRF(  algorithm='lbfgs',
            max_iterations=100,
            all_possible_transitions=True,
            verbose=True)

params_space = {'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05)}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=30,
                        random_state=123,
                        scoring=f1_scorer)

rs.fit(trainvalFeatures, trainvalLabels)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 158.8min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 361.9min finished
loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:37<00:00, 846.61it/s] 



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 450815
Seconds required: 7.291

L-BFGS optimization
c1: 0.048313
c2: 0.028432
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=3.12  loss=639643.01 active=449993 feature_norm=1.00
Iter 2   time=3.11  loss=443201.51 active=446235 feature_norm=4.42
Iter 3   time=1.56  loss=363049.67 active=429690 feature_norm=3.87
Iter 4   time=5.93  loss=321010.83 active=429390 feature_norm=2.61
Iter 5   time=1.45  loss=271637.21 active=440989 feature_norm=3.52
Iter 6   time=1.46  loss=229306.66 active=447038 feature_norm=3.35
Iter 7   time=1.48  loss=220139.69 active=443867 feature_norm=3.53
Iter 8   time=1.48  loss=198955.43 active=448788 feature_norm=3.65
Iter 9   time=1.49  loss=191322.64 active=447972 feature_norm=

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...ne,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True),
          fit_params=None, iid='warn', n_iter=30, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b09834518>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b098347f0>},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [15]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.04831323984330459, 'c2': 0.028431668831944812}
best CV score: 0.9728854782825247
model size: 4.36M


In [16]:
crf = rs.best_estimator_
crf

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.04831323984330459, c2=0.028431668831944812,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [17]:
#crf.fit(trainvalFeatures, trainvalLabels)

In [18]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf09.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
    print("model is saved.")

model is saved.


In [19]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf09.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [20]:
trainvalPredLabels = crf.predict(trainvalFeatures)

print("### TRAINVAL CLASSIFICATION REPORT ###\n")
print(classification_report(trainvalLabels, trainvalPredLabels))

### TRAINVAL CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

       MONEY       1.00      1.00      1.00       594
ORGANIZATION       1.00      1.00      1.00      9034
      PERSON       1.00      1.00      1.00     14476
    LOCATION       1.00      1.00      1.00      9409
        DATE       1.00      1.00      1.00      3103
     PERCENT       1.00      1.00      1.00       617
        TIME       1.00      1.00      1.00       175

   micro avg       1.00      1.00      1.00     37408
   macro avg       1.00      1.00      1.00     37408



In [21]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

ORGANIZATION       0.86      0.79      0.83       862
       MONEY       0.93      0.76      0.84       113
      PERSON       0.91      0.87      0.89      1594
    LOCATION       0.90      0.88      0.89      1091
     PERCENT       0.99      0.93      0.96       107
        DATE       0.91      0.89      0.90       364
        TIME       0.90      0.83      0.86        23

   micro avg       0.90      0.85      0.88      4154
   macro avg       0.90      0.85      0.88      4154



In [22]:
# Shutting down the JVM
jp.shutdownJVM()