In [1]:
import os
import pymorphy2
morph = pymorphy2.MorphAnalyzer(lang="uk")
PATH = "/Users/bmarchenko/projects/ner-uk-master/"
def read_tokens(filename):
    tokens = []
    pos = 0
    with open(filename, "r") as f:
        text = f.read().split("\n")
        for line in text:
            if len(line) == 0:
                pos += 1
            else:
                for token in line.split(" "):
                    tokens.append((token, pos, pos + len(token)))
                    pos += len(token) + 1
    return tokens

# Read annotations and positions of annotations from a file

def read_annotations(filename):
    anno = []
    with open(filename, "r") as f:
        for line in f.readlines():
            annotations = line.split()
            anno.append((int(annotations[2]), int(annotations[3]), annotations[1]))
    return anno

def extract_labels(anno, tokens):
    labels = []
    ann_id = 0
    for token in tokens:
        if ann_id < len(anno):
            beg, end, label = anno[ann_id]
            if token[1] < beg:
                labels.append("--")
            # if token[1] == beg or (token[1] > beg and token[1] < end)
            else:
                labels.append(label)
                if token[2] == end:
                    ann_id += 1
        else:
            labels.append("--")    
    return labels

# tokens = read_tokens(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.txt")
# anno = read_annotations(PATH + "data/A_alumni.krok.edu.ua_Prokopenko_Vidrodzhennia_velotreku(5).tok.ann")
# labels = extract_labels(anno, tokens)

# for i, j in zip(tokens, labels):
#     print(i[0], j)

# Extract list of files for training and testing

dev_test = {"dev": [], "test": []}
category = ""
with open(PATH + "doc/dev-test-split.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        if line in ["DEV", "TEST"]:
            category = line.lower()
        elif len(line) == 0:
            continue
        else:
            dev_test[category].append(line)

print(len(dev_test["dev"]), len(dev_test["test"]))



156 73


In [2]:
# Get train and test data and labels

train_tokens, test_tokens, train_labels, test_labels = [], [], [], []

for filename in dev_test["dev"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        train_tokens += tokens
        train_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

for filename in dev_test["test"]:
    try:
        tokens = read_tokens(PATH + "data/" + filename + ".txt")
        test_tokens += tokens
        test_labels += extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), tokens)
    except:
        pass

In [3]:
train_tokens


[('На', 0, 2),
 ('довірливих', 3, 13),
 ('кіровоградців', 14, 27),
 ('полюють', 28, 35),
 ('шахраї', 36, 42),
 ('та', 43, 45),
 ('фірми-посередники', 46, 63),
 (',', 64, 65),
 ('які', 66, 69),
 ('за', 70, 72),
 ('1000', 73, 77),
 ('грн', 78, 81),
 ('.', 82, 83),
 ('готові', 84, 90),
 ('«', 91, 92),
 ('виготовити', 93, 103),
 ('»', 104, 105),
 ('біометричний', 106, 118),
 ('паспорт', 119, 126),
 (',', 127, 128),
 ('який', 129, 133),
 ('коштує', 134, 140),
 ('518', 141, 144),
 ('грн', 145, 148),
 ('.', 149, 150),
 ('Із', 152, 154),
 ('запровадженням', 155, 169),
 ('біометричних', 170, 182),
 ('паспортів', 183, 192),
 ('активізувалися', 193, 207),
 ('шахраї', 208, 214),
 ('та', 215, 217),
 ('фірми-посередники', 218, 235),
 (',', 236, 237),
 ('які', 238, 241),
 ('пропонують', 242, 252),
 ('«', 253, 254),
 ('прискорити', 255, 265),
 ('»', 266, 267),
 ('оформлення', 268, 278),
 ('біометричного', 279, 292),
 ('паспорта', 293, 301),
 ('або', 302, 305),
 ('просто', 306, 312),
 ('оформити', 313,

Let's extract features for each word

In [4]:
def word2features(train_tokens, i):
    word = train_tokens[i][0]
    features = {}
    features["word"] = word
    features["is_capitalized"] = word.istitle()
    features["is_upper"] = word.isupper()
    features["is_digit"] = word.isdigit()
    features["length"] = len(word)
    if i > 0:
        word1 = train_tokens[i-1][0]
        features["word-1"] = word
        features["is_capitalized-1"] = word1.istitle()
        features["is_upper-1"] = word1.isupper()
        features["is_digit-1"] = word1.isdigit()
        features["length-1"] = len(word1)
    else:
        features['BOS'] = True
    try:
        word1 = train_tokens[i+1][0]
        features["word+1"] = word
        features["is_capitalized+1"] = word1.istitle()
        features["is_upper+1"] = word1.isupper()
        features["is_digit+1"] = word1.isdigit()
        features["length+1"] = len(word1)
    except:
        features['EOS'] = True
    return features
train_features = [word2features(train_tokens, i) for i in range(len(train_tokens))]


In [5]:
train_features[:10]

[{'BOS': True,
  'is_capitalized': True,
  'is_capitalized+1': False,
  'is_digit': False,
  'is_digit+1': False,
  'is_upper': False,
  'is_upper+1': False,
  'length': 2,
  'length+1': 10,
  'word': 'На',
  'word+1': 'На'},
 {'is_capitalized': False,
  'is_capitalized+1': False,
  'is_capitalized-1': True,
  'is_digit': False,
  'is_digit+1': False,
  'is_digit-1': False,
  'is_upper': False,
  'is_upper+1': False,
  'is_upper-1': False,
  'length': 10,
  'length+1': 13,
  'length-1': 2,
  'word': 'довірливих',
  'word+1': 'довірливих',
  'word-1': 'довірливих'},
 {'is_capitalized': False,
  'is_capitalized+1': False,
  'is_capitalized-1': False,
  'is_digit': False,
  'is_digit+1': False,
  'is_digit-1': False,
  'is_upper': False,
  'is_upper+1': False,
  'is_upper-1': False,
  'length': 13,
  'length+1': 7,
  'length-1': 10,
  'word': 'кіровоградців',
  'word+1': 'кіровоградців',
  'word-1': 'кіровоградців'},
 {'is_capitalized': False,
  'is_capitalized+1': False,
  'is_capitalize

In [6]:
from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()
test_features = [word2features(test_tokens, i) for i in range(len(test_tokens))]
# x_train = vec.fit_transform(train_features+test_features).toarray()
x_train = vec.fit_transform(train_features).toarray()
print(x_train)

[[1. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [None]:
from sklearn.linear_model import LogisticRegression
ls = LogisticRegression()
ls.fit(x_train, train_labels)

In [None]:
test_features = [word2features(test_tokens, i) for i in range(len(test_tokens))]
x_test = vec.transform(test_features).toarray()
y_pred = ls.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_labels, y_pred)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(test_labels, y_pred))