In [40]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF1.ipynb
#
# (1) Token lemmatization using Zemberek TurkishMorphology  (DO NOT lemmatize tokens starting with uppercase letter)
# (2) Token features:
#     (a) token
#     (b) is_first           : is token at the beginning of the sentence?
#     (c) is_last            : is token at the end of the sentence?
#     (d) is_capitalized     : does token start with a capital letter? 
#     (e) is_all_capitalized : is all letters of the token capitalized?
#     (f) is_capitals_inside : is there any capitalized letter inside the token?
#     (g) is_numeric         : is token numeric?
#     (h) prefix-1           : prefix of the token with length 1
#     (i) prefix-2           : prefix of the token with length 2
#     (j) suffix-1           : suffix of the token with length 1
#     (k) suffix-2           : suffix of the token with length 2
#     (l) prev-token         : token coming before the token
#     (m) 2-prev-token       : token coming 2 before the token 
#     (n) next-token         : token coming after the token
#     (o) 2-next-token       : token coming 2 after the token
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [38]:
from seqeval.metrics import classification_report
from sklearn_crfsuite import CRF
import jpype as jp

In [2]:
ZEMBEREK_PATH = 'bin/zemberek-full.jar'

# Start the JVM
jp.startJVM(jp.getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))

In [30]:
TurkishMorphology   = jp.JClass('zemberek.morphology.TurkishMorphology')
morphology = TurkishMorphology.createWithDefaults()

In [31]:
def readFile(filepath):

    text     = []
    sentence = []

    with open(filepath, 'r') as infile:
        for line in infile:
            word, _, _, _ = line.strip().split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line.strip())

    return text

In [32]:
trainText = readFile('train.txt')
validText = readFile('valid.txt')
testText  = readFile('test.txt')

In [33]:
def getFeature(word, word_index, sentence):

    feature = {'word'              : word,
               'is_first'          : word_index == 0,
               'is_last'           : word_index == len(sentence) - 1,
               
               'is_capitalized'    : word[0].upper() == word[0],
               'is_all_capitalized': word.upper() == word,
               'is_capitals_inside': word[1:].lower() != word[1:],
               'is_numeric'        : word.isdigit(),

               'prefix-1'          : word[0],
               'prefix-2'          : '' if len(word) < 2  else word[:2],

               'suffix-1'          : word[-1],
               'suffix-2'          : '' if len(word) < 2  else word[-2:],

               'prev-token'        : '' if word_index == 0     else sentence[word_index - 1][0],
               '2-prev-token'      : '' if word_index <= 1     else sentence[word_index - 2][0],

               'next-token'        : '' if word_index == len(sentence) - 1     else sentence[word_index + 1][0],
               '2-next-token'      : '' if word_index >= len(sentence) - 2     else sentence[word_index + 2][0]
              }
    
    return feature

In [34]:
trainFeatures = []
trainLabels   = []

for sentence in trainText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    trainFeatures.append(features)
    trainLabels.append(labels)

In [35]:
validFeatures = []
validLabels   = []

for sentence in validText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    validFeatures.append(features)
    validLabels.append(labels)

In [36]:
testFeatures = []
testLabels   = []

for sentence in testText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    testFeatures.append(features)
    testLabels.append(labels)

In [24]:
trainvalFeatures = trainFeatures + validFeatures
trainvalLabels   = trainLabels + validLabels

In [25]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True)

In [26]:
crf.fit(trainvalFeatures, trainvalLabels)

loading training data to CRFsuite: 100%|██████████| 32171/32171 [00:03<00:00, 8150.50it/s]



Feature generation
type: CRF1d
feature.minfreq: 0,000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=True)

In [27]:
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

    LOCATION       0.89      0.85      0.87      7762
        DATE       0.86      0.87      0.86      2553
       MONEY       0.94      0.91      0.92       484
      PERSON       0.89      0.88      0.89     11079
ORGANIZATION       0.90      0.84      0.87      7073
        TIME       0.89      0.87      0.88       156
     PERCENT       0.97      0.98      0.98       521

   micro avg       0.89      0.86      0.88     29628
   macro avg       0.89      0.86      0.88     29628



In [28]:
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabels))

### VAL CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

    LOCATION       0.87      0.85      0.86      1647
        DATE       0.85      0.88      0.86       550
        TIME       0.74      0.89      0.81        19
      PERSON       0.91      0.89      0.90      3397
ORGANIZATION       0.89      0.81      0.85      1961
       MONEY       0.90      0.95      0.92       110
     PERCENT       1.00      1.00      1.00        96

   micro avg       0.89      0.86      0.88      7780
   macro avg       0.89      0.86      0.88      7780



In [29]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.83      0.78      0.80      1594
        DATE       0.75      0.74      0.75       364
        TIME       0.71      0.74      0.72        23
    LOCATION       0.81      0.76      0.78      1091
ORGANIZATION       0.78      0.70      0.74       862
       MONEY       0.91      0.75      0.83       113
     PERCENT       0.98      0.95      0.97       107

   micro avg       0.81      0.76      0.78      4154
   macro avg       0.81      0.76      0.78      4154



In [10]:
# Shutting down the JVM
jp.shutdownJVM()