In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF1.ipynb
#
# Token features:
#     (a) token (surface form)
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from seqeval.metrics import classification_report
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
from sklearn_crfsuite import CRF
from tqdm import tqdm
import pickle
import scipy

In [5]:
def readFile(filepath):

    text     = []
    sentence = []

    with open(filepath, 'r') as infile:
        for line in infile:
            word, _, _, _ = line.strip().split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line.strip())

    return text

In [6]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt')

In [7]:
def getFeature(token, token_index, sentence):

    feature = {'token' : token}
    
    return feature

In [10]:
trainFeatures = []
trainLabels   = []

for sentence in tqdm(trainText):

    features = []
    labels   = []
    for i, token in enumerate(sentence):
        word, _, _, label = token.split('\t')
        
        features.append(getFeature(word, i, sentence))
        labels.append(label)
    
    trainFeatures.append(features)
    trainLabels.append(labels)

100%|██████████| 25736/25736 [00:00<00:00, 44647.07it/s]


In [13]:
validFeatures = []
validLabels   = []

for sentence in tqdm(validText):
    
    features = []
    labels   = []
    for i, token in enumerate(sentence):
        word, _, _, label = token.split('\t')
        
        features.append(getFeature(word, i, sentence))
        labels.append(label)
    
    validFeatures.append(features)
    validLabels.append(labels)

100%|██████████| 6435/6435 [00:00<00:00, 25127.49it/s]


In [14]:
testFeatures = []
testLabels   = []

for sentence in tqdm(testText):
    
    features = []
    labels   = []
    for i, token in enumerate(sentence):
        word, _, _, label = token.split('\t')
        
        features.append(getFeature(word, i, sentence))
        labels.append(label)
    
    testFeatures.append(features)
    testLabels.append(labels)

100%|██████████| 3328/3328 [00:00<00:00, 49880.98it/s]


In [16]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels   + validLabels

In [18]:
# define fixed parameters and parameters to search
crf = CRF(  algorithm='lbfgs',
            max_iterations=100,
            all_possible_transitions=True,
            verbose=True)

params_space = {'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05)}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=35,
                        scoring=f1_scorer,
                        random_state=123)

rs.fit(trainvalFeatures, trainvalLabels)

Fitting 3 folds for each of 35 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.4min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed: 48.3min finished
loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:02<00:00, 11730.62it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 75320
Seconds required: 0.672

L-BFGS optimization
c1: 0.100747
c2: 0.009646
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.70  loss=840897.89 active=75260 feature_norm=1.00
Iter 2   time=2.00  loss=478493.44 active=75245 feature_norm=5.95
Iter 3   time=0.38  loss=397970.44 active=74095 feature_norm=4.97
Iter 4   time=1.72  loss=304593.99 active=75041 feature_norm=3.50
Iter 5   time=0.39  loss=301300.81 active=75240 feature_norm=4.00
Iter 6   time=0.40  loss=290595.76 active=75268 feature_norm=4.15
Iter 7   time=0.36  loss=282419.68 active=58849 feature_norm=6.47
Iter 8   time=0.35  loss=261314.14 active=75287 feature_norm=6.01
Iter 9   time=0.33  loss=255918.85 active=74837 feature_norm=6.60
Iter 

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...ne,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True),
          fit_params=None, iid='warn', n_iter=35, n_jobs=-1,
          param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b03605cc0>, 'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1b0253e1d0>},
          pre_dispatch='2*n_jobs', random_state=123, refit=True,
          return_train_score='warn',
          scoring=make_scorer(flat_f1_score, average=weighted), verbose=1)

In [19]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.10074713030434022, 'c2': 0.009645981291748817}
best CV score: 0.9462335278963963
model size: 2.42M


In [20]:
crf = rs.best_estimator_
crf

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None,
  c1=0.10074713030434022, c2=0.009645981291748817,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [None]:
#crf.fit(trainvalFeatures, trainvalLabels)

In [23]:
# SAVE CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf01.pickle', 'wb') as outfile:
    pickle.dump(crf, outfile, pickle.HIGHEST_PROTOCOL)
    print("model is saved.")

model is saved.


In [24]:
# LOAD CONDITIONAL RANDOM FIELDS MODEL
with open('model/crf01.pickle', 'rb') as infile:
    crf = pickle.load(infile)

In [25]:
trainvalPredLabels = crf.predict(trainvalFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainvalLabels, trainvalPredLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

ORGANIZATION       0.96      0.96      0.96      9034
      PERSON       0.98      0.98      0.98     14476
     PERCENT       1.00      1.00      1.00       617
        DATE       0.95      0.96      0.95      3103
    LOCATION       0.98      0.99      0.98      9409
       MONEY       0.98      0.98      0.98       594
        TIME       0.99      1.00      1.00       175

   micro avg       0.97      0.98      0.97     37408
   macro avg       0.97      0.98      0.97     37408



In [26]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

    LOCATION       0.88      0.74      0.80      1091
ORGANIZATION       0.84      0.68      0.75       862
      PERSON       0.78      0.58      0.66      1594
       MONEY       0.88      0.76      0.82       113
        DATE       0.85      0.80      0.82       364
        TIME       1.00      0.43      0.61        23
     PERCENT       0.99      0.94      0.97       107

   micro avg       0.84      0.67      0.75      4154
   macro avg       0.83      0.67      0.74      4154

