In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# CRF1.ipynb
#
# (1) Token lemmatization using Zemberek TurkishMorphology  (DO NOT lemmatize tokens starting with uppercase letter)
# (2) Token features:
#     (a) token
#     (b) is_first           : is token at the beginning of the sentence?
#     (c) is_last            : is token at the end of the sentence?
#     (d) is_capitalized     : does token start with a capital letter? 
#     (e) is_all_capitalized : are all letters of the token capitalized?
#     (f) is_capitals_inside : is there any capitalized letter inside the token?
#     (g) is_numeric         : is token numeric?
#     (h) prefix-1           : prefix of the token with length 1
#     (i) prefix-2           : prefix of the token with length 2
#     (j) suffix-1           : suffix of the token with length 1
#     (k) suffix-2           : suffix of the token with length 2
#     (l) prev-token         : token coming before the token
#     (m) 2-prev-token       : token coming 2 before the token 
#     (n) next-token         : token coming after the token
#     (o) 2-next-token       : token coming 2 after the token
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# May, 2019
# --------------------------------------------------

In [2]:
from seqeval.metrics import classification_report
from sklearn_crfsuite import CRF
import jpype as jp

In [3]:
# Location of the Zemberek jar, relative to the notebook's working directory.
ZEMBEREK_PATH = 'bin/zemberek-full.jar'

# Start the JVM with the Zemberek jar on its class path.
# Guarded so that re-running this cell in a live kernel does not try to start
# a second JVM in the same process.
if not jp.isJVMStarted():
    jp.startJVM(jp.getDefaultJVMPath(), '-ea', '-Djava.class.path=%s' % (ZEMBEREK_PATH))

In [4]:
# Look up the Zemberek morphology class through JPype (requires the JVM to be
# running with the Zemberek jar on its class path).
TurkishMorphology   = jp.JClass('zemberek.morphology.TurkishMorphology')
# Shared analyzer instance used by the feature-extraction cells below.
# NOTE(review): createWithDefaults() presumably loads Zemberek's bundled
# lexicon and may take a while - confirm.
morphology = TurkishMorphology.createWithDefaults()

In [5]:
def readFile(filepath, encoding='utf-8'):
    """Read a token-per-line corpus and group its lines into sentences.

    Every non-blank line must consist of exactly four tab-separated fields.
    Only the first field (the token) is inspected here: a line whose first
    field is ``<S>`` is a sentence boundary. At a boundary the lines collected
    so far are emitted as one sentence and the marker line is discarded.

    Args:
        filepath: Path of the corpus file.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that Turkish characters are read the same way on every platform.
            NOTE(review): assumes the data files are UTF-8 - confirm.

    Returns:
        A list of sentences; each sentence is a list of the stripped raw lines
        (all four fields, still tab-joined) belonging to it.

    Raises:
        ValueError: If a non-blank line does not split into exactly four
            tab-separated fields.
    """
    text     = []
    sentence = []

    with open(filepath, 'r', encoding=encoding) as infile:
        for line in infile:
            line = line.strip()

            # Blank lines carry no token; skip them instead of failing the
            # four-way unpack below.
            if not line:
                continue

            word, _, _, _ = line.split('\t')

            if word == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(line)

    # Keep a final sentence that is not followed by a closing '<S>' marker;
    # otherwise it would be silently lost.
    if sentence:
        text.append(sentence)

    return text

In [10]:
# Load the three corpus splits; each becomes a list of sentences, where a
# sentence is a list of raw tab-separated token lines.
trainText, validText, testText = [
    readFile(path)
    for path in ('data/train.txt', 'data/valid.txt', 'data/test.txt')
]

In [11]:
def getFeature(word, word_index, sentence):
    """Build the CRF feature dictionary for one token of a sentence.

    Args:
        word: The (possibly lemmatized) token to describe.
        word_index: Position of ``word`` inside ``sentence``.
        sentence: Sequence holding every token of the sentence. Items are
            normally plain strings; tuple/list items are also accepted, in
            which case their first element is taken as the token.

    Returns:
        dict: Feature name -> value, with the keys
        ``word``, ``is_first``, ``is_last``, ``is_capitalized``,
        ``is_all_capitalized``, ``is_capitals_inside``, ``is_numeric``,
        ``prefix-1``, ``prefix-2``, ``suffix-1``, ``suffix-2``,
        ``prev-token``, ``2-prev-token``, ``next-token``, ``2-next-token``.
        Neighbour features are ``''`` when the neighbour does not exist.
    """

    def tokenAt(position):
        # Indexing a plain string with [0] would yield only its first
        # character, so only unpack composite (tuple/list) items.
        item = sentence[position]
        return item[0] if isinstance(item, (tuple, list)) else item

    last_index = len(sentence) - 1

    feature = {'word'              : word,
               'is_first'          : word_index == 0,
               'is_last'           : word_index == last_index,

               # str.isupper() is False for digits and punctuation, so only
               # real capital letters count. Slicing keeps '' from raising.
               'is_capitalized'    : word[:1].isupper(),
               'is_all_capitalized': word.isupper(),
               'is_capitals_inside': word[1:].lower() != word[1:],
               'is_numeric'        : word.isdigit(),

               'prefix-1'          : word[:1],
               'prefix-2'          : '' if len(word) < 2  else word[:2],

               'suffix-1'          : word[-1:],
               'suffix-2'          : '' if len(word) < 2  else word[-2:],

               'prev-token'        : '' if word_index == 0     else tokenAt(word_index - 1),
               '2-prev-token'      : '' if word_index <= 1     else tokenAt(word_index - 2),

               'next-token'        : '' if word_index == last_index         else tokenAt(word_index + 1),
               '2-next-token'      : '' if word_index >= last_index - 1     else tokenAt(word_index + 2)
              }

    return feature

In [None]:
trainFeatures = []
trainLabels   = []

for sentence in trainText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    trainFeatures.append(features)
    trainLabels.append(labels)

In [None]:
validFeatures = []
validLabels   = []

for sentence in validText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    validFeatures.append(features)
    validLabels.append(labels)

In [None]:
testFeatures = []
testLabels   = []

for sentence in testText:
    SENT     = []
    features = []
    labels   = []
    for token in sentence:
        word, _, _, label = token.split('\t')
        
        if word[0] == word[0].upper():
            root = word
        else:
            results = morphology.analyze(word).analysisResults
            root    = results[0].getLemmas()[0] if results else word
        SENT.append(root)
        labels.append(label)
        
    for i, word in enumerate(SENT):
        features.append(getFeature(word, i, SENT))
    
    testFeatures.append(features)
    testLabels.append(labels)

In [None]:
# Merge the training and validation splits (training sentences first) into the
# set the final model is fitted on.
trainvalFeatures = [*trainFeatures, *validFeatures]
trainvalLabels   = [*trainLabels, *validLabels]

In [None]:
# Hyper-parameters of the CRF tagger, gathered in one place for easy tuning.
# Per sklearn-crfsuite's documentation, c1 and c2 are the L1 and L2
# regularization coefficients of the L-BFGS trainer.
crfParams = dict(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True,
)

crf = CRF(**crfParams)

In [None]:
# Train on the merged train+validation sentences.
crf.fit(X=trainvalFeatures, y=trainvalLabels)

In [None]:
trainPredLabels = crf.predict(trainFeatures)

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabels))

In [None]:
validPredLabels = crf.predict(validFeatures)

print("### VAL CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabels))

In [None]:
testPredLabels  = crf.predict(testFeatures)

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabels))

In [None]:
# Shutting down the JVM.
# Only attempted while a JVM is actually running, so this cell can be re-run
# (or run on a kernel where the JVM was never started) without side effects.
if jp.isJVMStarted():
    jp.shutdownJVM()