In [1]:
import re
import itertools
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd

In [2]:
def parse_data():
    sentences = []
    with open('./NE.train.txt', encoding='UTF-8') as file, \
        open('./NE.train.tokens.final.txt', encoding='UTF-8') as token_file:
        lines = file.readlines()
        token_lines = token_file.readlines()
        
        lst = ["__".join(value) for key, value in itertools.groupby(token_lines, lambda x: x.split()[0])]
        
        grouped_token_lines = []
        for element in lst:
            sp_element = element.split("__")
            g_element = [sp_e.split() for sp_e in sp_element]
            grouped_token_lines.append(g_element)
        
        for line, g_line in zip(lines, grouped_token_lines):
            sentence, ne, words, tokens = list(), list(), list(), list()
            if " '" in line: line = line.replace(" '", "'")
            extracted_ne = re.findall(r'(<b_enamex\s+TYPE=\"(?P<ne>ORGANIZATION|PERSON|LOCATION)\">(?P<words>[\w\s]*)<e_enamex>|[\w.?\-",\']*\s*)', line)
            for e_ne in extracted_ne:
                if e_ne[1] == '': ne.append('OTHER')
                else: ne.append(e_ne[1])
                if e_ne[2] != '': words.append(e_ne[2])
                else: words.append(e_ne[0])
            
            for w, n in zip(words, ne):
                if len(w.strip().split()) == 1:
                    is_appended = False
                    for g in g_line:
                        if g[1] == w.strip() and w.strip() != '':
                            sentence.append((w.strip(), g[2], n))
                            is_appended = True
                    if not is_appended:
                        sentence.append((w.strip(), '', n))
                else:
                    new_w, new_g = '', ''
                    for w_s in w.strip().split():
                        is_appended = False
                        for g in g_line:
                            if w_s == g[1] and w_s != '':
                                new_w += ' ' + w_s
                                new_g += ' ' + g[2]
                    sentence.append((new_w.strip(), new_g.strip(), n))
                    is_appended = True
                    if not is_appended:
                        sentence.append((w.strip(), '', n))   
            
            sentences.append(sentence)
    
    for sentence in sentences:
        for s in sentence:
            if s == ('', '', 'OTHER'):
                sentence.remove(s)
    return sentences

In [91]:
def split_fold(all_features, all_labels, n):
    train_feature, test_feature, train_label, test_label = list(), list(), list(), list()
    for idx, (features, labels) in enumerate(zip(all_features, all_labels)):
        if idx % 5 == n-1:
            test_feature.append(features)
            test_label.append(labels)
        else:
            train_feature.append(features)
            train_label.append(labels)
            
    return train_feature, test_feature, train_label, test_label

In [None]:
# features: root - part of speech tag - proper noun - noun case - orthographic case - all inflectionals - start of sentence
def extract_features(sentences, k=0):   
    all_features, all_labels = list(), list()
    for sentence in sentences:
        sentence_features, sentence_labels = list(), list()

        for idx, s in enumerate(sentence):
            splitted = s[1].split('+')
            root = splitted[0]
            pos = splitted[1] if len(splitted) > 1 else ''
            prop = 1 if 'Prop' in splitted else 0
            ncs = 1 if 'Nom' or 'Acc' or 'Dat' or 'Abl' or 'Loc' or 'Gen' or 'Ins' or 'Equ' in splitted else 0 # ??
            ogs = 1 if 'Punct' in splitted else 0
            infls = splitted[2:]
            start = 1 if idx == 0 else 0
            
            if k == 0:
            # all features
                features = {'Morphed': splitted, 'Root:': root, 'POS': pos, 'Prop-case': prop, 'Noun-case': ncs, 'Orth-case': ogs, 'Inflectionals': infls, 'SS': start}
            elif k == 1: features = {'Morphed': splitted} # morphed
            elif k == 2: features = {'Root': root} # root
            elif k == 3: features = {'Morphed': splitted, 'Root': root} # morphed + root
            elif k == 4: features = {'Morphed': splitted, 'Root': root, 'POS': pos} # morphed + root + pos
            elif k == 5:
                # morphed + root + pos + prop
                features = {'Morphed': splitted, 'Root': root, 'POS': pos, 'Prop-case': prop}
            elif k == 6:
                # morphed + root + pos + prop + ncs
                features = {'Morphed': splitted, 'Root': root, 'POS': pos, 'Prop-case': prop, 'Noun-case': ncs}
            elif k == 7:
                # morphed + root + pos + prop + ncs + ogs
                features = {'Morphed': splitted, 'Root': root, 'POS': pos, 'Prop-case': prop, 'Noun-case': ncs, 'Orth-case': ogs}
            elif k == 8:
                # morphed + root + pos + prop + ncs + ogs + inflections
                features = {'Morphed': splitted, 'Root': root, 'POS': pos, 'Prop-case': prop, 'Noun-case': ncs, 'Orth-case': ogs, 'Inflectionals': infls}
            
            sentence_features.append(features)
            sentence_labels.append(s[2])

        all_features.append(sentence_features)
        all_labels.append(sentence_labels)
        
    return all_features, all_labels

In [63]:
def use_crf(train_f, test_f, train_l, test_l):
    %%time
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                c1=0.1,
                                c2=0.2,
                                max_iterations=100,
                                all_possible_transitions=True)
    crf.fit(train_f, train_l)
    y_pred = crf.predict(test_f)
    
    RE_WORDS = re.compile(r"[\w\d\.-]+")
    HEADER_REPORT = ['Label', 'Precision', 'Recall', 'F1_score', 'Support']

    details = metrics.flat_classification_report(test_l, y_pred, digits=3)
    print(details)
    details = [i for i in [re.findall(RE_WORDS, i) for i in details.split('\n')] if i != []][1:-1]  
    details = pd.DataFrame(details, columns=HEADER_REPORT)
    details.set_index('Label')
    details[['Precision', 'Recall', 'F1_score', 'Support']] = details[['Precision', 'Recall', 'F1_score', 'Support']].apply(pd.to_numeric)
    
    return y_pred, details #, f1s, precisions, recalls

In [6]:
sentences = parse_data()

In [7]:
print(sentences[0])

[('Müzik', 'müzik+Noun+A3sg+Pnon+Nom', 'OTHER'), ('Müzik', 'müzik+Noun+A3sg+Pnon+Nom', 'OTHER'), ("Şenliği'ne", 'Şenlik+Noun+Prop+A3sg+P3sg+Dat', 'OTHER'), ('hazırlanın', 'hazırla+Verb^DB+Verb+Reflex+Pos+Imp+A2pl', 'OTHER'), ('POZİTİF ve Açık Radyo', 'pozitif+Adj ve+Conj açık+Adj radyo+Noun+A3sg+Pnon+Nom', 'ORGANIZATION'), ('işbirliğiyle', 'işbirliği+Noun+A3sg+Pnon+Ins', 'OTHER'), ('düzenlenecek', 'düzenle+Verb^DB+Verb+Pass+Pos^DB+Adj+FutPart+Pnon', 'OTHER'), ('olan', 'ol+Verb+Pos^DB+Adj+PresPart', 'OTHER'), ('İstanbul', 'İstanbul+Noun+Prop+A3sg+Pnon+Nom', 'LOCATION'), ('Müzik', 'müzik+Noun+A3sg+Pnon+Nom', 'OTHER'), ('Müzik', 'müzik+Noun+A3sg+Pnon+Nom', 'OTHER'), ('Şenliği', 'şenlik+Noun+A3sg+P3sg+Nom', 'OTHER'), ('2', '2+Num+Card', 'OTHER'), (',', ',+Punct', 'OTHER'), ('müzikseverlere', 'müziksever+Noun+A3pl+Pnon+Dat', 'OTHER'), ('Aralık', 'aralık+Noun+A3sg+Pnon+Nom', 'OTHER'), ('ayında', 'ay+Noun+A3sg+P3sg+Loc', 'OTHER'), ('merhaba', 'merhaba+Noun+A3sg+Pnon+Nom', 'OTHER'), ('demeye',

In [90]:
all_features, all_labels = extract_features(sentences)
features_1, labels_1 = extract_features(sentences, k=1)
features_2, labels_2 = extract_features(sentences, k=2)
features_3, labels_3 = extract_features(sentences, k=3)
features_4, labels_4 = extract_features(sentences, k=4)
features_5, labels_5 = extract_features(sentences, k=5)
features_6, labels_6 = extract_features(sentences, k=6)
features_7, labels_7 = extract_features(sentences, k=7)
features_8, labels_8 = extract_features(sentences, k=8)

In [152]:
all_features[0][3]

{'Inflectionals': ['Verb', 'Reflex', 'Pos', 'Imp', 'A2pl'],
 'Morphed': ['hazırla', 'Verb^DB', 'Verb', 'Reflex', 'Pos', 'Imp', 'A2pl'],
 'Noun-case': 1,
 'Orth-case': 0,
 'POS': 'Verb^DB',
 'Prop-case': 0,
 'Root:': 'hazırla',
 'SS': 0}

In [9]:
print(len(all_features))
print(len(all_labels))

27563
27563


In [92]:
train_feature_f1, test_feature_f1, train_label_f1, test_label_f1 = split_fold(all_features, all_labels, n=1)
train_feature_f2, test_feature_f2, train_label_f2, test_label_f2 = split_fold(all_features, all_labels, n=2)
train_feature_f3, test_feature_f3, train_label_f3, test_label_f3 = split_fold(all_features, all_labels, n=3)
train_feature_f4, test_feature_f4, train_label_f4, test_label_f4 = split_fold(all_features, all_labels, n=4)
train_feature_f5, test_feature_f5, train_label_f5, test_label_f5 = split_fold(all_features, all_labels, n=5)

In [93]:
train_feature_1_f1, test_feature_1_f1, train_label_1_f1, test_label_1_f1 = split_fold(features_1, labels_1, n=1)
train_feature_1_f2, test_feature_1_f2, train_label_1_f2, test_label_1_f2 = split_fold(features_1, labels_1, n=2)
train_feature_1_f3, test_feature_1_f3, train_label_1_f3, test_label_1_f3 = split_fold(features_1, labels_1, n=3)
train_feature_1_f4, test_feature_1_f4, train_label_1_f4, test_label_1_f4 = split_fold(features_1, labels_1, n=4)
train_feature_1_f5, test_feature_1_f5, train_label_1_f5, test_label_1_f5 = split_fold(features_1, labels_1, n=5)

In [94]:
train_feature_2_f1, test_feature_2_f1, train_label_2_f1, test_label_2_f1 = split_fold(features_2, labels_2, n=1)
train_feature_2_f2, test_feature_2_f2, train_label_2_f2, test_label_2_f2 = split_fold(features_2, labels_2, n=2)
train_feature_2_f3, test_feature_2_f3, train_label_2_f3, test_label_2_f3 = split_fold(features_2, labels_2, n=3)
train_feature_2_f4, test_feature_2_f4, train_label_2_f4, test_label_2_f4 = split_fold(features_2, labels_2, n=4)
train_feature_2_f5, test_feature_2_f5, train_label_2_f5, test_label_2_f5 = split_fold(features_2, labels_2, n=5)

In [95]:
train_feature_3_f1, test_feature_3_f1, train_label_3_f1, test_label_3_f1 = split_fold(features_3, labels_3, n=1)
train_feature_3_f2, test_feature_3_f2, train_label_3_f2, test_label_3_f2 = split_fold(features_3, labels_3, n=2)
train_feature_3_f3, test_feature_3_f3, train_label_3_f3, test_label_3_f3 = split_fold(features_3, labels_3, n=3)
train_feature_3_f4, test_feature_3_f4, train_label_3_f4, test_label_3_f4 = split_fold(features_3, labels_3, n=4)
train_feature_3_f5, test_feature_3_f5, train_label_3_f5, test_label_3_f5 = split_fold(features_3, labels_3, n=5)

In [96]:
train_feature_4_f1, test_feature_4_f1, train_label_4_f1, test_label_4_f1 = split_fold(features_4, labels_4, n=1)
train_feature_4_f2, test_feature_4_f2, train_label_4_f2, test_label_4_f2 = split_fold(features_4, labels_4, n=2)
train_feature_4_f3, test_feature_4_f3, train_label_4_f3, test_label_4_f3 = split_fold(features_4, labels_4, n=3)
train_feature_4_f4, test_feature_4_f4, train_label_4_f4, test_label_4_f4 = split_fold(features_4, labels_4, n=4)
train_feature_4_f5, test_feature_4_f5, train_label_4_f5, test_label_4_f5 = split_fold(features_4, labels_4, n=5)

In [97]:
train_feature_5_f1, test_feature_5_f1, train_label_5_f1, test_label_5_f1 = split_fold(features_5, labels_5, n=1)
train_feature_5_f2, test_feature_5_f2, train_label_5_f2, test_label_5_f2 = split_fold(features_5, labels_5, n=2)
train_feature_5_f3, test_feature_5_f3, train_label_5_f3, test_label_5_f3 = split_fold(features_5, labels_5, n=3)
train_feature_5_f4, test_feature_5_f4, train_label_5_f4, test_label_5_f4 = split_fold(features_5, labels_5, n=4)
train_feature_5_f5, test_feature_5_f5, train_label_5_f5, test_label_5_f5 = split_fold(features_5, labels_5, n=5)

In [98]:
train_feature_6_f1, test_feature_6_f1, train_label_6_f1, test_label_6_f1 = split_fold(features_6, labels_6, n=1)
train_feature_6_f2, test_feature_6_f2, train_label_6_f2, test_label_6_f2 = split_fold(features_6, labels_6, n=2)
train_feature_6_f3, test_feature_6_f3, train_label_6_f3, test_label_6_f3 = split_fold(features_6, labels_6, n=3)
train_feature_6_f4, test_feature_6_f4, train_label_6_f4, test_label_6_f4 = split_fold(features_6, labels_6, n=4)
train_feature_6_f5, test_feature_6_f5, train_label_6_f5, test_label_6_f5 = split_fold(features_6, labels_6, n=5)

In [99]:
train_feature_7_f1, test_feature_7_f1, train_label_7_f1, test_label_7_f1 = split_fold(features_7, labels_7, n=1)
train_feature_7_f2, test_feature_7_f2, train_label_7_f2, test_label_7_f2 = split_fold(features_7, labels_7, n=2)
train_feature_7_f3, test_feature_7_f3, train_label_7_f3, test_label_7_f3 = split_fold(features_7, labels_7, n=3)
train_feature_7_f4, test_feature_7_f4, train_label_7_f4, test_label_7_f4 = split_fold(features_7, labels_7, n=4)
train_feature_7_f5, test_feature_7_f5, train_label_7_f5, test_label_7_f5 = split_fold(features_7, labels_7, n=5)

In [100]:
train_feature_8_f1, test_feature_8_f1, train_label_8_f1, test_label_8_f1 = split_fold(features_8, labels_8, n=1)
train_feature_8_f2, test_feature_8_f2, train_label_8_f2, test_label_8_f2 = split_fold(features_8, labels_8, n=2)
train_feature_8_f3, test_feature_8_f3, train_label_8_f3, test_label_8_f3 = split_fold(features_8, labels_8, n=3)
train_feature_8_f4, test_feature_8_f4, train_label_8_f4, test_label_8_f4 = split_fold(features_8, labels_8, n=4)
train_feature_8_f5, test_feature_8_f5, train_label_8_f5, test_label_8_f5 = split_fold(features_8, labels_8, n=5)

In [64]:
y_pred_f1, details_f1 = use_crf(train_feature_f1, test_feature_f1, train_label_f1, test_label_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.827     0.646     0.725      2515
ORGANIZATION      0.929     0.648     0.763      1949
       OTHER      0.985     0.998     0.992    135128
      PERSON      0.902     0.751     0.820      3232

 avg / total      0.980     0.981     0.980    142824



In [66]:
y_pred_f2, details_f2 = use_crf(train_feature_f2, test_feature_f2, train_label_f2, test_label_f2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.826     0.646     0.725      2287
ORGANIZATION      0.932     0.658     0.771      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.922     0.737     0.819      3251

 avg / total      0.981     0.982     0.980    137840



In [69]:
y_pred_f3, details_f3 = use_crf(train_feature_f3, test_feature_f3, train_label_f3, test_label_f3)

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 16.5 µs
              precision    recall  f1-score   support

    LOCATION      0.821     0.649     0.725      2430
ORGANIZATION      0.950     0.668     0.785      1846
       OTHER      0.988     0.998     0.993    170312
      PERSON      0.956     0.824     0.885      5745

 avg / total      0.984     0.985     0.984    180333



In [70]:
y_pred_f4, details_f4 = use_crf(train_feature_f4, test_feature_f4, train_label_f4, test_label_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.872     0.733     0.796      3597
ORGANIZATION      0.948     0.655     0.775      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.888     0.756     0.817      3389

 avg / total      0.983     0.984     0.983    175276



In [71]:
y_pred_f5, details_f5 = use_crf(train_feature_f5, test_feature_f5, train_label_f5, test_label_f5)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.822     0.635     0.717      2444
ORGANIZATION      0.946     0.649     0.770      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.915     0.758     0.829      3551

 avg / total      0.980     0.981     0.980    140901



In [80]:
averages = details_f1.iloc[:, 1:].add(details_f2.iloc[:, 1:]).add(details_f3.iloc[:, 1:]).add(details_f4.iloc[:, 1:]).add(details_f5.iloc[:, 1:]) / 5

print(averages)

   Precision  Recall  F1_score   Support
0     0.8336  0.6618    0.7376    2654.6
1     0.9410  0.6556    0.7728    1835.2
2     0.9862  0.9980    0.9922  147111.4
3     0.9166  0.7652    0.8340    3833.6


In [101]:
y_pred_1_f1, details_1_f1 = use_crf(train_feature_1_f1, test_feature_1_f1, train_label_1_f1, test_label_1_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.823     0.637     0.718      2515
ORGANIZATION      0.940     0.624     0.750      1949
       OTHER      0.985     0.998     0.991    135128
      PERSON      0.900     0.735     0.809      3232

 avg / total      0.979     0.980     0.979    142824



In [103]:
y_pred_1_f2, details_1_f2 = use_crf(train_feature_1_f2, test_feature_1_f2, train_label_1_f2, test_label_1_f2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.825     0.641     0.722      2287
ORGANIZATION      0.939     0.637     0.759      1784
       OTHER      0.985     0.998     0.991    130518
      PERSON      0.921     0.722     0.810      3251

 avg / total      0.980     0.981     0.980    137840



In [104]:
y_pred_1_f3, details_1_f3 = use_crf(train_feature_1_f3, test_feature_1_f3, train_label_1_f3, test_label_1_f3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.817     0.641     0.718      2430
ORGANIZATION      0.955     0.645     0.770      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.953     0.814     0.878      5745

 avg / total      0.983     0.984     0.983    180333



In [105]:
y_pred_1_f4, details_1_f4 = use_crf(train_feature_1_f4, test_feature_1_f4, train_label_1_f4, test_label_1_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.873     0.726     0.793      3597
ORGANIZATION      0.962     0.629     0.761      1851
       OTHER      0.987     0.998     0.993    166439
      PERSON      0.880     0.737     0.802      3389

 avg / total      0.983     0.983     0.982    175276



In [106]:
y_pred_1_f5, details_1_f5 = use_crf(train_feature_1_f5, test_feature_1_f5, train_label_1_f5, test_label_1_f5)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.818     0.627     0.710      2444
ORGANIZATION      0.953     0.616     0.748      1746
       OTHER      0.984     0.998     0.991    133160
      PERSON      0.914     0.741     0.818      3551

 avg / total      0.979     0.980     0.979    140901



In [107]:
averages_1 = details_1_f1.iloc[:, 1:].add(details_1_f2.iloc[:, 1:]).add(details_1_f3.iloc[:, 1:]).add(details_1_f4.iloc[:, 1:]).add(details_1_f5.iloc[:, 1:]) / 5

print(averages_1)

   Precision  Recall  F1_score   Support
0     0.8312  0.6544    0.7322    2654.6
1     0.9498  0.6302    0.7576    1835.2
2     0.9856  0.9980    0.9918  147111.4
3     0.9136  0.7498    0.8234    3833.6


In [108]:
y_pred_2_f1, details_2_f1 = use_crf(train_feature_2_f1, test_feature_2_f1, train_label_2_f1, test_label_2_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.756     0.599     0.668      2515
ORGANIZATION      0.870     0.429     0.575      1949
       OTHER      0.979     0.997     0.988    135128
      PERSON      0.943     0.638     0.761      3232

 avg / total      0.973     0.975     0.972    142824



In [109]:
y_pred_2_f2, details_2_f2 = use_crf(train_feature_2_f2, test_feature_2_f2, train_label_2_f2, test_label_2_f2)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.743     0.599     0.663      2287
ORGANIZATION      0.864     0.432     0.576      1784
       OTHER      0.979     0.998     0.988    130518
      PERSON      0.946     0.631     0.757      3251

 avg / total      0.973     0.975     0.972    137840



In [110]:
y_pred_2_f3, details_2_f3 = use_crf(train_feature_2_f3, test_feature_2_f3, train_label_2_f3, test_label_2_f3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.746     0.585     0.656      2430
ORGANIZATION      0.881     0.454     0.600      1846
       OTHER      0.983     0.998     0.990    170312
      PERSON      0.978     0.775     0.865      5745

 avg / total      0.979     0.980     0.978    180333



In [111]:
y_pred_2_f4, details_2_f4 = use_crf(train_feature_2_f4, test_feature_2_f4, train_label_2_f4, test_label_2_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
              precision    recall  f1-score   support

    LOCATION      0.793     0.702     0.745      3597
ORGANIZATION      0.883     0.439     0.587      1851
       OTHER      0.982     0.997     0.990    166439
      PERSON      0.937     0.640     0.760      3389

 avg / total      0.977     0.978     0.976    175276



In [112]:
y_pred_2_f5, details_2_f5 = use_crf(train_feature_2_f5, test_feature_2_f5, train_label_2_f5, test_label_2_f5)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
              precision    recall  f1-score   support

    LOCATION      0.745     0.593     0.660      2444
ORGANIZATION      0.860     0.423     0.567      1746
       OTHER      0.979     0.997     0.988    133160
      PERSON      0.948     0.645     0.768      3551

 avg / total      0.973     0.974     0.972    140901



In [114]:
averages_2 = details_2_f1.iloc[:, 1:].add(details_2_f2.iloc[:, 1:]).add(details_2_f3.iloc[:, 1:]).add(details_2_f4.iloc[:, 1:]).add(details_2_f5.iloc[:, 1:]) / 5

print(averages_2)

   Precision  Recall  F1_score   Support
0     0.7566  0.6156    0.6784    2654.6
1     0.8716  0.4354    0.5810    1835.2
2     0.9804  0.9974    0.9888  147111.4
3     0.9504  0.6658    0.7822    3833.6


In [115]:
y_pred_3_f1, details_3_f1 = use_crf(train_feature_3_f1, test_feature_3_f1, train_label_3_f1, test_label_3_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.821     0.645     0.723      2515
ORGANIZATION      0.932     0.641     0.760      1949
       OTHER      0.985     0.998     0.991    135128
      PERSON      0.903     0.746     0.817      3232

 avg / total      0.980     0.981     0.980    142824



In [116]:
y_pred_3_f2, details_3_f2 = use_crf(train_feature_3_f2, test_feature_3_f2, train_label_3_f2, test_label_3_f2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.823     0.645     0.723      2287
ORGANIZATION      0.940     0.645     0.765      1784
       OTHER      0.985     0.998     0.991    130518
      PERSON      0.922     0.733     0.816      3251

 avg / total      0.980     0.981     0.980    137840



In [117]:
y_pred_3_f3, details_3_f3 = use_crf(train_feature_3_f3, test_feature_3_f3, train_label_3_f3, test_label_3_f3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.818     0.644     0.721      2430
ORGANIZATION      0.951     0.657     0.777      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.956     0.821     0.883      5745

 avg / total      0.984     0.984     0.983    180333



In [118]:
y_pred_3_f4, details_3_f4 = use_crf(train_feature_3_f4, test_feature_3_f4, train_label_3_f4, test_label_3_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.874     0.734     0.798      3597
ORGANIZATION      0.953     0.649     0.772      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.886     0.753     0.814      3389

 avg / total      0.983     0.984     0.983    175276



In [119]:
y_pred_3_f5, details_3_f5 = use_crf(train_feature_3_f5, test_feature_3_f5, train_label_3_f5, test_label_3_f5)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.817     0.637     0.716      2444
ORGANIZATION      0.945     0.633     0.758      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.918     0.754     0.828      3551

 avg / total      0.980     0.981     0.980    140901



In [120]:
averages_3 = details_3_f1.iloc[:, 1:].add(details_3_f2.iloc[:, 1:]).add(details_3_f3.iloc[:, 1:]).add(details_3_f4.iloc[:, 1:]).add(details_3_f5.iloc[:, 1:]) / 5

print(averages_3)

   Precision  Recall  F1_score   Support
0     0.8306  0.6610    0.7362    2654.6
1     0.9442  0.6450    0.7664    1835.2
2     0.9860  0.9980    0.9918  147111.4
3     0.9170  0.7614    0.8316    3833.6


In [121]:
y_pred_4_f1, details_4_f1 = use_crf(train_feature_4_f1, test_feature_4_f1, train_label_4_f1, test_label_4_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.825     0.646     0.724      2515
ORGANIZATION      0.933     0.646     0.763      1949
       OTHER      0.985     0.998     0.992    135128
      PERSON      0.905     0.749     0.819      3232

 avg / total      0.980     0.981     0.980    142824



In [122]:
y_pred_4_f2, details_4_f2 = use_crf(train_feature_4_f2, test_feature_4_f2, train_label_4_f2, test_label_4_f2)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.824     0.645     0.724      2287
ORGANIZATION      0.928     0.655     0.768      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.920     0.736     0.818      3251

 avg / total      0.980     0.981     0.980    137840



In [123]:
y_pred_4_f3, details_4_f3 = use_crf(train_feature_4_f3, test_feature_4_f3, train_label_4_f3, test_label_4_f3)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.820     0.647     0.723      2430
ORGANIZATION      0.949     0.655     0.775      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.955     0.822     0.883      5745

 avg / total      0.984     0.984     0.983    180333



In [124]:
y_pred_4_f4, details_4_f4 = use_crf(train_feature_4_f4, test_feature_4_f4, train_label_4_f4, test_label_4_f4)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.873     0.732     0.796      3597
ORGANIZATION      0.950     0.653     0.774      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.886     0.754     0.815      3389

 avg / total      0.983     0.984     0.983    175276



In [125]:
y_pred_4_f5, details_4_f5 = use_crf(train_feature_4_f5, test_feature_4_f5, train_label_4_f5, test_label_4_f5)

CPU times: user 0 ns, sys: 17 µs, total: 17 µs
Wall time: 24.6 µs
              precision    recall  f1-score   support

    LOCATION      0.820     0.637     0.717      2444
ORGANIZATION      0.941     0.639     0.761      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.918     0.757     0.830      3551

 avg / total      0.980     0.981     0.980    140901



In [126]:
averages_4 = details_4_f1.iloc[:, 1:].add(details_4_f2.iloc[:, 1:]).add(details_4_f3.iloc[:, 1:]).add(details_4_f4.iloc[:, 1:]).add(details_4_f5.iloc[:, 1:]) / 5

print(averages_4)

   Precision  Recall  F1_score   Support
0     0.8324  0.6614    0.7368    2654.6
1     0.9402  0.6496    0.7682    1835.2
2     0.9860  0.9980    0.9922  147111.4
3     0.9168  0.7636    0.8330    3833.6


In [127]:
y_pred_5_f1, details_5_f1 = use_crf(train_feature_5_f1, test_feature_5_f1, train_label_5_f1, test_label_5_f1)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.82 µs
              precision    recall  f1-score   support

    LOCATION      0.821     0.646     0.723      2515
ORGANIZATION      0.926     0.643     0.759      1949
       OTHER      0.985     0.998     0.991    135128
      PERSON      0.908     0.746     0.819      3232

 avg / total      0.980     0.981     0.980    142824



In [128]:
y_pred_5_f2, details_5_f2 = use_crf(train_feature_5_f2, test_feature_5_f2, train_label_5_f2, test_label_5_f2)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.824     0.645     0.723      2287
ORGANIZATION      0.929     0.655     0.768      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.922     0.736     0.819      3251

 avg / total      0.980     0.982     0.980    137840



In [129]:
y_pred_5_f3, details_5_f3 = use_crf(train_feature_5_f3, test_feature_5_f3, train_label_5_f3, test_label_5_f3)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.818     0.644     0.721      2430
ORGANIZATION      0.950     0.662     0.780      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.956     0.823     0.885      5745

 avg / total      0.984     0.984     0.983    180333



In [130]:
y_pred_5_f4, details_5_f4 = use_crf(train_feature_5_f4, test_feature_5_f4, train_label_5_f4, test_label_5_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.872     0.732     0.796      3597
ORGANIZATION      0.949     0.652     0.773      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.888     0.753     0.815      3389

 avg / total      0.983     0.984     0.983    175276



In [131]:
y_pred_5_f5, details_5_f5 = use_crf(train_feature_5_f5, test_feature_5_f5, train_label_5_f5, test_label_5_f5)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.819     0.634     0.714      2444
ORGANIZATION      0.938     0.645     0.764      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.914     0.759     0.829      3551

 avg / total      0.980     0.981     0.980    140901



In [132]:
averages_5 = details_5_f1.iloc[:, 1:].add(details_5_f2.iloc[:, 1:]).add(details_5_f3.iloc[:, 1:]).add(details_5_f4.iloc[:, 1:]).add(details_5_f5.iloc[:, 1:]) / 5

print(averages_5)

   Precision  Recall  F1_score   Support
0     0.8308  0.6602    0.7354    2654.6
1     0.9384  0.6514    0.7688    1835.2
2     0.9860  0.9980    0.9920  147111.4
3     0.9176  0.7634    0.8334    3833.6


In [133]:
y_pred_6_f1, details_6_f1 = use_crf(train_feature_6_f1, test_feature_6_f1, train_label_6_f1, test_label_6_f1)

CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 17.9 µs
              precision    recall  f1-score   support

    LOCATION      0.824     0.645     0.723      2515
ORGANIZATION      0.927     0.647     0.762      1949
       OTHER      0.985     0.998     0.992    135128
      PERSON      0.901     0.748     0.818      3232

 avg / total      0.980     0.981     0.980    142824



In [134]:
y_pred_6_f2, details_6_f2 = use_crf(train_feature_6_f2, test_feature_6_f2, train_label_6_f2, test_label_6_f2)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.824     0.645     0.723      2287
ORGANIZATION      0.936     0.652     0.769      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.925     0.737     0.820      3251

 avg / total      0.981     0.982     0.980    137840



In [135]:
y_pred_6_f3, details_6_f3 = use_crf(train_feature_6_f3, test_feature_6_f3, train_label_6_f3, test_label_6_f3)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.818     0.648     0.723      2430
ORGANIZATION      0.953     0.664     0.783      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.956     0.823     0.885      5745

 avg / total      0.984     0.985     0.984    180333



In [136]:
y_pred_6_f4, details_6_f4 = use_crf(train_feature_6_f4, test_feature_6_f4, train_label_6_f4, test_label_6_f4)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.872     0.733     0.796      3597
ORGANIZATION      0.945     0.655     0.773      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.886     0.755     0.815      3389

 avg / total      0.983     0.984     0.983    175276



In [137]:
y_pred_6_f5, details_6_f5 = use_crf(train_feature_6_f5, test_feature_6_f5, train_label_6_f5, test_label_6_f5)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.819     0.637     0.717      2444
ORGANIZATION      0.944     0.645     0.766      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.916     0.758     0.830      3551

 avg / total      0.980     0.981     0.980    140901



In [138]:
averages_6 = details_6_f1.iloc[:, 1:].add(details_6_f2.iloc[:, 1:]).add(details_6_f3.iloc[:, 1:]).add(details_6_f4.iloc[:, 1:]).add(details_6_f5.iloc[:, 1:]) / 5

print(averages_6)

   Precision  Recall  F1_score   Support
0     0.8314  0.6616    0.7364    2654.6
1     0.9410  0.6526    0.7706    1835.2
2     0.9860  0.9980    0.9922  147111.4
3     0.9168  0.7642    0.8336    3833.6


In [139]:
y_pred_7_f1, details_7_f1 = use_crf(train_feature_7_f1, test_feature_7_f1, train_label_7_f1, test_label_7_f1)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.825     0.646     0.725      2515
ORGANIZATION      0.926     0.645     0.760      1949
       OTHER      0.985     0.998     0.992    135128
      PERSON      0.903     0.748     0.818      3232

 avg / total      0.980     0.981     0.980    142824



In [140]:
y_pred_7_f2, details_7_f2 = use_crf(train_feature_7_f2, test_feature_7_f2, train_label_7_f2, test_label_7_f2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.824     0.645     0.723      2287
ORGANIZATION      0.927     0.655     0.768      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.923     0.738     0.820      3251

 avg / total      0.980     0.982     0.980    137840



In [141]:
y_pred_7_f3, details_7_f3 = use_crf(train_feature_7_f3, test_feature_7_f3, train_label_7_f3, test_label_7_f3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.818     0.647     0.722      2430
ORGANIZATION      0.945     0.665     0.781      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.956     0.823     0.884      5745

 avg / total      0.984     0.984     0.984    180333



In [142]:
y_pred_7_f4, details_7_f4 = use_crf(train_feature_7_f4, test_feature_7_f4, train_label_7_f4, test_label_7_f4)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.872     0.733     0.796      3597
ORGANIZATION      0.947     0.655     0.774      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.888     0.752     0.814      3389

 avg / total      0.983     0.984     0.983    175276



In [143]:
y_pred_7_f5, details_7_f5 = use_crf(train_feature_7_f5, test_feature_7_f5, train_label_7_f5, test_label_7_f5)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs
              precision    recall  f1-score   support

    LOCATION      0.820     0.636     0.716      2444
ORGANIZATION      0.943     0.646     0.767      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.916     0.760     0.830      3551

 avg / total      0.980     0.981     0.980    140901



In [144]:
averages_7 = details_7_f1.iloc[:, 1:].add(details_7_f2.iloc[:, 1:]).add(details_7_f3.iloc[:, 1:]).add(details_7_f4.iloc[:, 1:]).add(details_7_f5.iloc[:, 1:]) / 5

print(averages_7)

   Precision  Recall  F1_score   Support
0     0.8318  0.6614    0.7364    2654.6
1     0.9376  0.6532    0.7700    1835.2
2     0.9860  0.9980    0.9922  147111.4
3     0.9172  0.7642    0.8332    3833.6


In [145]:
y_pred_8_f1, details_8_f1 = use_crf(train_feature_8_f1, test_feature_8_f1, train_label_8_f1, test_label_8_f1)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 16.9 µs
              precision    recall  f1-score   support

    LOCATION      0.825     0.645     0.724      2515
ORGANIZATION      0.930     0.648     0.763      1949
       OTHER      0.985     0.998     0.992    135128
      PERSON      0.902     0.751     0.820      3232

 avg / total      0.980     0.981     0.980    142824



In [146]:
y_pred_8_f2, details_8_f2 = use_crf(train_feature_8_f2, test_feature_8_f2, train_label_8_f2, test_label_8_f2)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.827     0.646     0.725      2287
ORGANIZATION      0.925     0.660     0.770      1784
       OTHER      0.985     0.998     0.992    130518
      PERSON      0.922     0.738     0.820      3251

 avg / total      0.981     0.982     0.980    137840



In [147]:
y_pred_8_f3, details_8_f3 = use_crf(train_feature_8_f3, test_feature_8_f3, train_label_8_f3, test_label_8_f3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs
              precision    recall  f1-score   support

    LOCATION      0.819     0.646     0.722      2430
ORGANIZATION      0.951     0.667     0.784      1846
       OTHER      0.987     0.998     0.993    170312
      PERSON      0.957     0.823     0.885      5745

 avg / total      0.984     0.985     0.984    180333



In [148]:
y_pred_8_f4, details_8_f4 = use_crf(train_feature_8_f4, test_feature_8_f4, train_label_8_f4, test_label_8_f4)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs
              precision    recall  f1-score   support

    LOCATION      0.873     0.731     0.796      3597
ORGANIZATION      0.949     0.656     0.776      1851
       OTHER      0.988     0.998     0.993    166439
      PERSON      0.885     0.755     0.815      3389

 avg / total      0.983     0.984     0.983    175276



In [149]:
y_pred_8_f5, details_8_f5 = use_crf(train_feature_8_f5, test_feature_8_f5, train_label_8_f5, test_label_8_f5)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.34 µs
              precision    recall  f1-score   support

    LOCATION      0.821     0.636     0.717      2444
ORGANIZATION      0.946     0.651     0.771      1746
       OTHER      0.985     0.998     0.991    133160
      PERSON      0.915     0.757     0.829      3551

 avg / total      0.980     0.981     0.980    140901



In [150]:
averages_8 = details_8_f1.iloc[:, 1:].add(details_8_f2.iloc[:, 1:]).add(details_8_f3.iloc[:, 1:]).add(details_8_f4.iloc[:, 1:]).add(details_8_f5.iloc[:, 1:]) / 5

print(averages_8)

   Precision  Recall  F1_score   Support
0     0.8330  0.6608    0.7368    2654.6
1     0.9402  0.6564    0.7728    1835.2
2     0.9860  0.9980    0.9922  147111.4
3     0.9162  0.7648    0.8338    3833.6
