In [1]:
#load and install required packages

In [1]:
#install required packages
!pip install sklearn_crfsuite
!pip install scikit-learn 
!pip install nltk 



In [2]:
# import required libraries
import pandas as pd 
import nltk
import sklearn
import scipy.stats
import numpy as np
import seaborn as sns
import sklearn_crfsuite
import os

#from matplotlib 
from matplotlib import pyplot as plt

#from itertools
from itertools import chain

#from sklearn
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
#load the data and ontologies

In [4]:
#time to load the ontologies 

#Material ontology
materials = pd.read_csv(r'ontologies\Materials.csv') #open the file
materials[materials.columns[2]]= materials[materials.columns[2]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
materials_list = materials[materials.columns[2]].values.tolist() #turns the items into a list
print(materials_list) # outputs the list of terms to see what sort of data it contains
print("\n""\n")

# #Taxon ontology
# taxon = pd.read_csv(r'ontologies\Taxon.tsv', sep="\t", error_bad_lines=False) #open the file
# taxon[taxon.columns[8]]= taxon[taxon.columns[8]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
# taxon_list = taxon[taxon.columns[8]].values.tolist() #turns the items into a list
# print(taxon_list[0:100]) # outputs the list of terms to see what sort of data it contains
# print("\n""\n")

#Animal ontology
animal = pd.read_csv(r'ontologies\VernacularName.tsv', sep="\t") #open the file
animal[animal.columns[2]]= animal[animal.columns[2]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
animal_list = animal[animal.columns[2]].values.tolist() #turns the items into a list
print(animal_list[0:100]) # outputs the list of terms to see what sort of data it contains
print("\n""\n")


#Periods ontology - for notes see above
periods = pd.read_csv(r'ontologies\Periods.csv')
periods[periods.columns[1]]= periods[periods.columns[1]].str.lower()
periods_list = periods[periods.columns[1]].values.tolist()
print(periods_list[0:100])
print("\n""\n")


#Context ontology - for notes see material 
context = pd.read_csv(r'ontologies\context.csv')
context[context.columns[0]]= context[context.columns[0]].str.lower().str.replace('*','').str.replace('<','').str.replace('>','') #replace all other symbols
context[context.columns[0]]= context[context.columns[0]].replace(to_replace =':.*',value='',regex=True) # replace the words after a colon
context_list = context[context.columns[0]].values.tolist()
print(context_list[0:100])
print("\n""\n")

['elm', 'felt', 'alabaster', 'spruce', 'tamarac', 'aluminum', 'variscite', 'argillite', 'totternhoe clunch', 'ash', 'aluminium', 'carnelian', 'cornelian', 'plaster', 'sapphire', 'paper', 'ebony', 'garnet', 'rubber', 'coal', 'emerald', 'hazel', 'puddingstone', 'hertfordshire puddingstone', 'charcoal', 'chalk', 'hydrocarbon', 'bakelite', 'amethyst', 'amphibolite', 'larch', 'siltstone', 'mudstone', 'utahlite', 'teak', 'shale', 'ivory', 'marble', 'limestone', 'leather', 'lead', 'lava', 'faience', 'jadeite', 'tooth', 'iron', 'pottery', 'greenstone', 'gold', 'glass', 'flint', 'jet', 'silver', 'pewter', 'obsidian', 'sandstone', 'object material', 'oak', 'mineral', 'wood', 'shell', 'quartz', 'slate', 'steel', 'stone', 'terracotta', 'tin', 'granite', 'quartzite', 'fir', 'antimony', 'schist', 'birch', 'lead alloy', 'zinc', 'dolerite', 'ceramic', 'pine', 'fibreglass', 'glass fibre', 'graphite', 'jade', 'onyx', 'fiberglass', 'beech', 'textile', 'metal', 'alloy', 'bronze', 'horn', 'brass', 'bone', 



In [5]:
#this function turns the file into a list. 
def file2list(fileLocation):
    outputList = []
    with open(fileLocation, 'r', encoding='utf8') as myfile:
        sentences = myfile.read().split('\n\n')
        if len(sentences):
            for sentence in sentences:
                    sentenceList = []
                    words = sentence.split('\n')
                    for word in words:
                        wordsList = []
                        attributes = word.split(' ')
                        for attribute in attributes:
                            wordsList.append(attribute)
                        sentenceList.append(wordsList)
                    outputList.append(sentenceList)
    
    return outputList

In [10]:
#this calls the function
def word2features(sent, i): 
    word = sent[i][0] #takes each token
    postag = sent[i][1] #this is the Part of Speach Tager
    
     #this tells if each token is in the ontology or not 
    if word in materials_list: 
        in_materials = True 
    else:
        in_materials = False
    
#     if word in taxon_list: 
#         in_taxon = True 
#     else:
#         in_taxon = False
    
    if word in animal_list: 
        in_animal = True 
    else:
        in_animal = False
  
    if word in periods_list: 
        in_periods = True 
    else:
        in_periods = False

    if word in context_list: 
        in_context = True 
    else:
        in_context = False
    
    #time to give each token some information     
    features = { # these are all default. 
        'bias': 1.0, # bias is just 1. 
        'word.lower()': word.lower(), # tells if the token is lower case 
        'word[-3:]': word[-3:], # takes the last four letters - the suffix
        'Word.in_materials': in_materials, #is the token in the material ontology
#         'Word.in_taxon': in_taxon, #is the token in the material ontology
        'Word.in_animal': in_animal, #is the token in the periods ontology 
        'Word.in_periods': in_periods, #is the token in the periods ontology
        'Word.in_context': in_context, #is the token in the evidence ontology
        'word.isupper()': word.isupper(), # tells if the whole token is uppercase 
        'word.istitle()': word.istitle(), # tells if the token is capital first letter
        'postag': postag,  # what is its label - Part-Of-Speech Tagger
        'postag[:2]': postag[:2],  #Takes the first three letters of the tag
        'word.isdigit()': word.isdigit(),
    }
    if i > 0: #if the token is not at the start of a sentence
        word1 = sent[i-1][0] # works out details of the token before - this is to understand the context 
        postag1 = sent[i-1][1] #what is the postag of the word before
        wordbefore = (sent[i-1][0]+ ' ' +sent[i][0]).lower #this is the token and the token before
        #if this word and word before is in the ontology then 
        if wordbefore in materials_list: 
            wordbefore_in_materials = True 
        else:
            wordbefore_in_materials = False

#         if wordbefore in taxon_list: 
#             wordbefore_in_taxon = True 
#         else:
#             wordbefore_in_taxon = False

        if wordbefore in animal_list: 
            wordbefore_in_animal = True 
            print(word)
        else:
            wordbefore_in_animal = False

        if wordbefore in periods_list: 
            wordbefore_in_periods = True 
        else:
            wordbefore_in_periods = False

        if wordbefore in context_list: 
            wordbefore_in_context = True 
        else:
            wordbefore_in_context = False
        features.update({
            '-1:word.lower()': word1.lower(), # tells if the token is lower case
            '-1:word.istitle()': word1.istitle(), # tells if the token is capital first letter
            '-1:word.isdigit()': word1.isdigit(), # tells if the toekn is only numbers
            '-1:word.isupper()': word1.isupper(),# tells if the whole token is uppercase
            'wordbefore_in_materials': wordbefore_in_materials,
#             'wordbefore_in_taxon': wordbefore_in_taxon,
            'wordbefore_in_animal': wordbefore_in_animal,
            'wordbefore_in_periods': wordbefore_in_periods,
            'wordbefore_in_context': wordbefore_in_context,
            '-1:postag': postag1, # what was its POS tag
            '-1:postag[:2]': postag1[:2], #what is the first three POS tag of the word before
        })
    
        if i > 1: #if the token is not at the start of a sentence
                word3 = sent[i-1][0] # works out details of the token before - this is to understand the context 
                postag3 = sent[i-1][1] #what is the postag of the word before
                two_words_before = (sent[i-1][0]+ ' ' +sent[i-1][0]+ ' ' +sent[i][0]).lower #this is the token and the token before
                #if this word and word before is in the ontology then 
                if two_words_before in materials_list: 
                    two_words_before_in_materials = True 
                else:
                    two_words_before_in_materials = False

#                 if two_words_before in taxon_list: 
#                     two_words_before_in_taxon = True 
#                 else:
#                     two_words_before_in_taxon = False

                if two_words_before in animal_list: 
                    two_words_before_in_animal = True 
                    print(word)
                else:
                    two_words_before_in_animal = False

                if two_words_before in periods_list: 
                    two_words_before_in_periods = True 
                else:
                    two_words_before_in_periods = False

                if two_words_before in context_list: 
                    two_words_before_in_context = True 
                else:
                    two_words_before_in_context = False
                features.update({
                    '-1:word.lower()': word1.lower(), # tells if the token is lower case
                    '-1:word.istitle()': word1.istitle(), # tells if the token is capital first letter
                    '-1:word.isdigit()': word1.isdigit(), # tells if the toekn is only numbers
                    '-1:word.isupper()': word1.isupper(),# tells if the whole token is uppercase
                    'two_words_before_in_materials': two_words_before_in_materials,
#                     'two_words_before_in_taxon': two_words_before_in_taxon,
                    'two_words_before_in_animal': two_words_before_in_animal,
                    'two_words_before_in_periods': two_words_before_in_periods,
                    'two_words_before_in_context': two_words_before_in_context,
                    '-2-postag': postag3, # what was its POS tag
                    '-2:postag[:2]': postag3[:2], #what is the first three POS tag of the word before
                })
        else:
            features['BOS2'] = True # if word is the beggining of sentence label it as so         
    else:
        features['BOS'] = True # if word is the beggining of sentence label it as so 
        
    if i < len(sent)-1: # is the word at the end of the sentence. sme as above after
        wordafter= (sent[i][0]+ ' ' +sent[i+1][0]).lower
        #this tells if the token AFTER and each token combined is in the ontology or not
        if wordafter in materials_list: 
            wordafter_in_materials = True 
        else:
            wordafter_in_materials = False

#         if wordafter in taxon_list: 
#             wordafter_in_taxon = True 
#         else:
#             wordafter_in_taxon = False

        if wordafter in animal_list: 
            wordafter_in_animal = True 
        else:
            wordafter_in_animal = False

        if wordafter in periods_list: 
            wordafter_in_periods = True 
        else:
            wordafter_in_periods = False

        if wordafter in context_list: 
            wordafter_in_context = True 
        else:
            wordafter_in_context = False
        word2 = sent[i+1][0]  
        postag2 = sent[i+1][1] 
        features.update({
            'wordafter_in_materials': wordafter_in_materials,
#             'wordafter_in_taxon': wordafter_in_taxon,
            'wordafter_in_animal': wordafter_in_animal,
            'wordafter_in_periods': wordafter_in_periods,
            'wordafter_in_context': wordafter_in_context,
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
        
        if i < len(sent)-2: #if the token is not at the start of a sentence
                word4 = sent[i-1][0] # works out details of the token before - this is to understand the context 
                postag4 = sent[i-1][1] #what is the postag of the word before
                two_words_after = (sent[i][0]+ ' ' +sent[i+1][0]+ ' ' +sent[i+1][0]).lower#this is the token and the token before
                #if this word and word before is in the ontology then 
                if two_words_after in materials_list: 
                    two_words_after_in_materials = True 
                else:
                    two_words_after_in_materials = False

#                 if two_words_after in taxon_list: 
#                     two_words_after_in_taxon = True 
#                 else:
#                     two_words_after_in_taxon = False

                if two_words_after in animal_list: 
                    two_words_after_in_animal = True 
                    print(word)
                else:
                    two_words_after_in_animal = False

                if two_words_after in periods_list: 
                    two_words_after_in_periods = True 
                else:
                   two_words_after_in_periods = False

                if two_words_after in context_list: 
                    two_words_after_in_context = True 
                else:
                    two_words_after_in_context = False
                features.update({
                    '-1:word.lower()': word4.lower(), # tells if the token is lower case
                    '-1:word.istitle()': word4.istitle(), # tells if the token is capital first letter
                    '-1:word.isdigit()': word4.isdigit(), # tells if the toekn is only numbers
                    '-1:word.isupper()': word4.isupper(),# tells if the whole token is uppercase
                    'two_words_after_in_materials': two_words_after_in_materials,
#                     'two_words_after_in_taxon': two_words_after_in_taxon,
                    'two_words_after_in_animal': two_words_after_in_animal,
                    'two_words_after_in_periods': two_words_after_in_periods,
                    'two_words_after_in_context': two_words_after_in_context,
                    '-2-postag': postag4, # what was its POS tag
                    '-2:postag[:2]': postag4[:2], #what is the first three POS tag of the word before
                })
        else:
            features['EOS2'] = True # if word is the beggining of sentence label it as so       
        
        
        
    else:
        features['EOS'] = True # if word is the end of sentence label it as so         
   

    if i < len(sent)-1 and i > 0: # is the word at the end of the sentence. sme as above after
            word3 = sent[i+1][0]  
            postag3 = sent[i+1][1] 
            wordsorround = (sent[i-1][0]+ ' ' +sent[i][0]+ ' ' +sent[i+1][0]).lower
            if wordsorround in materials_list: 
                wordsorround_in_materials = True 
            else:
                wordsorround_in_materials = False

#             if wordsorround in taxon_list: 
#                 wordsorround_in_taxon = True 
#             else:
#                 wordsorround_in_taxon = False

            if wordsorround in animal_list: 
                wordsorround_in_animal = True 
            else:
                wordsorround_in_animal = False

            if wordsorround in periods_list: 
                wordsorround_in_periods = True 
            else:
                wordsorround_in_periods = False

            if wordsorround in context_list: 
                wordsorround_in_context = True 
            else:
                wordsorround_in_context = False
            features.update({
                'wordsorround_in_materials': wordsorround_in_materials,
#                 'wordsorround_in_taxon': wordsorround_in_taxon,
                'wordsorround_in_animal': wordsorround_in_animal,
                'wordsorround_in_periods': wordsorround_in_periods,
                'wordsorround_in_context': wordsorround_in_context,
            })
            if i < len(sent)-2 and i > 1: # is the word at the end of the sentence. sme as above after
                word5 = sent[i+1][0]  
                postag5 = sent[i+1][1] 
                twowordsorround = (sent[i-2][0]+ ' ' +sent[i-1][0]+ ' ' +sent[i][0]+ ' ' +sent[i+1][0]+ ' ' +sent[i+2][0]).lower
                if twowordsorround in materials_list: 
                    twowordsorround_in_materials = True 
                else:
                    twowordsorround_in_materials = False

#                 if twowordsorround in taxon_list: 
#                     twowordsorround_in_taxon = True 
#                 else:
#                     twowordsorround_in_taxon = False

                if twowordsorround in animal_list: 
                    twowordsorround_in_animal = True 
                else:
                    twowordsorround_in_animal = False

                if twowordsorround in periods_list: 
                    twowordsorround_in_periods = True 
                else:
                    twowordsorround_in_periods = False

                if twowordsorround in context_list: 
                    twowordsorround_in_context = True 
                else:
                    twowordsorround_in_context = False
                features.update({
                    'twowordsorround_in_materials': twowordsorround_in_materials,
#                     'twowordsorround_in_taxon': twowordsorround_in_taxon,
                    'twowordsorround_in_animal': twowordsorround_in_animal,
                    'twowordsorround_in_periods': twowordsorround_in_periods,
                    'twowordsorround_in_context': twowordsorround_in_context,                   
                })
            else:
                    features['OWS'] = True # if word is the end of sentence label it as so 
    else:
            features['OWS'] = True # if word is the end of sentence label it as so 
            
    return features # output these details
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))] #output for each word

def sent2labels(sent):
    return [label for token, postag, label in sent] #output for each token

def sent2tokens(sent):
    return [token for token, postag, label in sent] #output for ???


In [11]:
for folder in os.listdir('no-tables\\txt'):    
    print(folder)
    #load the datasets, with training documents as train, and test documents and test
    train =  file2list('no-tables\\txt\\'+folder+"\\train.txt") 
    test =  file2list('no-tables\\txt\\'+folder+"\\test.txt") 
    #remove empty lines as this breaks the code
    test[0].pop() 
    train[0].pop()
#    
    # calculates the time to open the file
            #train the NER on the list. there is one set of test and one of training. often 20:80 split
    train_sent = train
    test_sent = test   # tests the sent (input) of the given list as defined above
    #train_sent[0] # displayes the first 10 rows in the bio. - each row hs the token (effectively word), followed by pos?, and the bio label
    # to identify whats below - token - label(specific label) - common derivitive of word (for posting would be post)  - then the bio label
    #sent2features(train_sent[0:1])[0]
    [sent2features(s) for s in train_sent[0:1]]
    X_train = [sent2features(s) for s in train_sent] # for the token train the ner on the learned set
    y_train = [sent2labels(s) for s in train_sent] # for the POS tag train the ner on the learned set

    X_test = [sent2features(s) for s in test_sent] # for the token train the ner on the test set
    y_test = [sent2labels(s) for s in test_sent] # for the POS tag train the ner on the test set
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs', 
        c1=0.0020339643465827964,  #was initially 0.1 each
        c2=0.028003487848126302, # 'c1': 0.2963053968677204, 'c2': 0.004195898642365605
        max_iterations=100, 
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train)

    labels = list(crf.classes_) # get the list of all labels
#     print(labels)
    labels.remove('O') # remove the ones where bio is o - not got a postag
    # labels # show what the labels are 
    #calculate the f1 score
    y_pred = crf.predict(X_test) # work out and predict what is the likely token
    metrics.flat_f1_score(y_test, y_pred, # work out what the likely postag will be, and give it a f1 score
                          average='weighted', labels=labels)

    #calculate the metrics table
    # group B and I results - this isnt needed, but orders the list
    sorted_labels = sorted(
        labels, 
        key=lambda name: (name[1:], name[0])
    )
    # print(metrics.classification_report(y_test, y_pred))
    print(metrics.flat_classification_report(
       y_test, y_pred, labels=sorted_labels, digits=3
    ))

split0


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.758     0.479     0.587       470
       I-ART      0.342     0.562     0.425        96
       B-CON      0.825     0.426     0.562       399
       I-CON      0.485     0.281     0.356        57
       B-LOC      0.865     0.557     0.677       115
       I-LOC      0.736     0.574     0.645        68
       B-MAT      0.438     0.111     0.177        63
       I-MAT      0.500     0.077     0.133        13
       B-PER      0.923     0.806     0.860       608
       I-PER      0.934     0.762     0.840       669
       B-SPE      1.000     0.193     0.323        83
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.824     0.603     0.696      2641
   macro avg      0.650     0.402     0.465      2641
weighted avg      0.832     0.603     0.684      2641

split1
              precision    recall  f1-score   support

       B-ART      0.688     0.181     0.286       293
       I-ART     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.721     0.150     0.249       293
       I-ART      0.192     0.098     0.130        51
       B-CON      0.811     0.128     0.221       235
       I-CON      0.100     0.026     0.041        39
       B-LOC      0.759     0.293     0.423        75
       I-LOC      0.826     0.514     0.633        37
       B-MAT      0.231     0.053     0.086        57
       I-MAT      0.000     0.000     0.000        13
       B-PER      0.789     0.421     0.549       356
       I-PER      0.837     0.505     0.630       438
       B-SPE      0.000     0.000     0.000        29

   micro avg      0.758     0.305     0.435      1623
   macro avg      0.479     0.199     0.269      1623
weighted avg      0.717     0.305     0.409      1623

split3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.694     0.232     0.348       293
       I-ART      0.226     0.275     0.248        51
       B-CON      0.789     0.128     0.220       235
       I-CON      0.125     0.026     0.043        39
       B-LOC      0.767     0.307     0.438        75
       I-LOC      0.826     0.514     0.633        37
       B-MAT      0.250     0.035     0.062        57
       I-MAT      0.500     0.077     0.133        13
       B-PER      0.787     0.478     0.594       356
       I-PER      0.840     0.527     0.648       438
       B-SPE      0.000     0.000     0.000        29

   micro avg      0.736     0.344     0.469      1623
   macro avg      0.528     0.236     0.306      1623
weighted avg      0.716     0.344     0.447      1623

split4


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.639     0.259     0.369       293
       I-ART      0.091     0.098     0.094        51
       B-CON      0.872     0.174     0.291       235
       I-CON      0.000     0.000     0.000        39
       B-LOC      0.769     0.400     0.526        75
       I-LOC      0.625     0.541     0.580        37
       B-MAT      0.333     0.035     0.063        57
       I-MAT      0.000     0.000     0.000        13
       B-PER      0.829     0.559     0.668       356
       I-PER      0.870     0.610     0.717       438
       B-SPE      0.000     0.000     0.000        29
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.751     0.394     0.517      1623
   macro avg      0.419     0.223     0.276      1623
weighted avg      0.723     0.394     0.491      1623

split5


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.645     0.266     0.377       293
       I-ART      0.157     0.157     0.157        51
       B-CON      0.776     0.162     0.268       235
       I-CON      0.000     0.000     0.000        39
       B-LOC      0.810     0.453     0.581        75
       I-LOC      0.564     0.595     0.579        37
       B-MAT      0.375     0.053     0.092        57
       I-MAT      1.000     0.077     0.143        13
       B-PER      0.779     0.576     0.662       356
       I-PER      0.898     0.562     0.691       438
       B-SPE      1.000     0.069     0.129        29
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.746     0.392     0.514      1623
   macro avg      0.584     0.247     0.307      1623
weighted avg      0.736     0.392     0.490      1623

split6


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.635     0.273     0.382       293
       I-ART      0.114     0.098     0.105        51
       B-CON      0.855     0.226     0.357       235
       I-CON      0.588     0.256     0.357        39
       B-LOC      0.810     0.453     0.581        75
       I-LOC      0.515     0.459     0.486        37
       B-MAT      0.267     0.070     0.111        57
       I-MAT      0.000     0.000     0.000        13
       B-PER      0.855     0.680     0.757       356
       I-PER      0.916     0.644     0.756       438
       B-SPE      1.000     0.069     0.129        29
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.777     0.449     0.569      1623
   macro avg      0.546     0.269     0.335      1623
weighted avg      0.767     0.449     0.547      1623

split7


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.691     0.328     0.444       293
       I-ART      0.238     0.392     0.296        51
       B-CON      0.826     0.243     0.375       235
       I-CON      0.474     0.231     0.310        39
       B-LOC      0.846     0.440     0.579        75
       I-LOC      0.583     0.568     0.575        37
       B-MAT      0.333     0.088     0.139        57
       I-MAT      0.143     0.077     0.100        13
       B-PER      0.881     0.685     0.771       356
       I-PER      0.856     0.667     0.750       438
       B-SPE      1.000     0.069     0.129        29
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.759     0.481     0.588      1623
   macro avg      0.573     0.316     0.372      1623
weighted avg      0.771     0.481     0.571      1623

split8


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-ART      0.778     0.382     0.513       293
       I-ART      0.268     0.431     0.331        51
       B-CON      0.744     0.260     0.385       235
       I-CON      0.526     0.256     0.345        39
       B-LOC      0.878     0.480     0.621        75
       I-LOC      0.656     0.568     0.609        37
       B-MAT      0.583     0.123     0.203        57
       I-MAT      0.500     0.077     0.133        13
       B-PER      0.899     0.775     0.833       356
       I-PER      0.916     0.651     0.761       438
       B-SPE      1.000     0.069     0.129        29
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.806     0.513     0.627      1623
   macro avg      0.646     0.339     0.405      1623
weighted avg      0.812     0.513     0.608      1623

split9
              precision    recall  f1-score   support

       B-ART      0.826     0.437     0.571       293
       I-ART     

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
#time to use POS

In [10]:
#print(sent2features[:2])

In [11]:
#time to train the NER

In [12]:
#now to evaluate its success

In [13]:
#work out which of the transitions are most likely in descending order
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

#what are the 20 most likely transitions
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

#what are the 20 least likely transitions
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
I-ART  -> I-ART   4.926236
I-PER  -> I-PER   4.788583
B-ART  -> I-ART   4.600642
B-CON  -> I-CON   4.204833
I-CON  -> I-CON   3.848280
B-SPE  -> I-SPE   3.827577
B-MAT  -> I-MAT   3.812444
I-LOC  -> I-LOC   3.396669
O      -> O       3.335598
B-PER  -> I-PER   2.781085
B-LOC  -> I-LOC   2.384541
I-MAT  -> I-MAT   2.154743
B-MAT  -> B-ART   1.883056
O      -> B-PER   1.763551
B-SPE  -> B-ART   1.433932
I-ART  -> B-CON   1.351225
B-MAT  -> B-CON   1.056310
I-MAT  -> B-ART   1.043014
I-SPE  -> I-SPE   0.870469
I-CON  -> B-CON   0.855547

Top unlikely transitions:
B-PER  -> B-SPE   -1.840747
B-ART  -> I-CON   -1.864372
B-LOC  -> B-CON   -1.870458
B-CON  -> I-ART   -2.168228
I-PER  -> I-ART   -2.185135
B-MAT  -> I-CON   -2.201417
B-MAT  -> I-ART   -2.239880
B-ART  -> B-CON   -2.247380
B-ART  -> B-PER   -2.261325
O      -> I-SPE   -2.262403
B-PER  -> I-CON   -2.506402
B-SPE  -> I-ART   -2.640231
B-LOC  -> I-PER   -2.748381
B-PER  -> I-ART   -2.900018
O      -> I-MAT  

In [14]:
#What aspects of the terms make it likely to be that tag
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

#what are the 30 most likely aspects
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

#what 30 aspects make it least likely to be that term
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
5.224631 B-PER    word[-3:]:xon
5.105987 B-ART    word.lower():coping
5.092587 B-CON    word.lower():postholes
5.063482 B-ART    word.lower():loomweight
4.936461 B-PER    word[-3:]:/05
4.771421 B-MAT    word.lower():ragstone
4.559046 B-LOC    word.lower():leicester
4.446704 O        Word_after:pits or
4.429710 B-ART    word.lower():pottery
4.398048 O        word_before:of Bedfordshire
4.393974 O        word_before:or flint
4.391809 B-CON    word.lower():post-holes
4.385493 O        Word_after:school had
4.369093 B-CON    word_before:( Building
4.366871 B-PER    word.lower():post-medieval
4.311591 B-CON    Word_after:school to
4.285267 B-PER    word[-3:]:val
4.283504 B-CON    Word_after:floor was
4.110761 B-ART    word_before:charcoal remains
4.104962 B-PER    word.lower():romano-british
4.094635 B-LOC    word[-3:]:ton
4.073056 B-PER    word.lower():roman
4.011830 I-PER    word.lower():centuries
3.937804 B-ART    word.lower():slates
3.925067 B-ART    word.lower():tegulae
3

In [15]:
## dont run this takes too long

In [16]:
%%time 
# this is to work out the best parameters for the testing. not needed yet but can increase the results by .1 
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: n_samples=1.

In [17]:
# crf = rs.best_estimator_  # shows that the best params are 
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))