In [5]:
#load and install required packages

In [6]:
#install required packages
!pip install sklearn_crfsuite
!pip install scikit-learn 
!pip install nltk 



In [7]:
# import required libraries
import pandas as pd 
import nltk
import sklearn
import scipy.stats
import numpy as np
import seaborn as sns
import sklearn_crfsuite

#from matplotlib 
from matplotlib import pyplot as plt

#from itertools
from itertools import chain

#from sklearn
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [8]:
#load the data and ontologies

In [9]:
#this function turns the file into a list. 
def file2list(fileLocation):
    outputList = []
    with open(fileLocation, 'r', encoding='utf8') as myfile:
        sentences = myfile.read().split('\n\n')
        for sentence in sentences:
                sentenceList = []
                words = sentence.split('\n')
                for word in words:
                    wordsList = []
                    attributes = word.split(' ')
                    for attribute in attributes:
                        wordsList.append(attribute)
                    sentenceList.append(wordsList)
                outputList.append(sentenceList)
    
    return outputList

In [10]:
#load the datasets, with training documents as train, and test documents and test
train =  file2list('txt/train.txt') 
test =  file2list('txt/test.txt') 
#remove empty lines as this breaks the code
test.pop() 
train.pop()

[['']]

In [11]:
%%time 
# calculates the time to open the file
        #train the NER on the list. there is one set of test and one of training. often 20:80 split
train_sent = train
test_sent = test   # tests the sent (input) of the given list as defined above
train_sent[0] # displayes the first 10 rows in the bio. - each row hs the token (effectively word), followed by pos?, and the bio label
# to identify whats below - token - label(specific label) - common derivitive of word (for posting would be post)  - then the bio label

Wall time: 0 ns


[['INTRODUCTION', 'NNP', 'O'],
 ['7', 'CD', 'O'],
 ['1.1', 'CD', 'O'],
 ['Project', 'NN', 'O'],
 ['Background', 'NNP', 'O'],
 ['7', 'CD', 'O'],
 ['1.2', 'CD', 'O'],
 ['Site', 'NNP', 'O'],
 ['Location', 'NNP', 'O'],
 ['and', 'CC', 'O'],
 ['Description', 'NNP', 'O'],
 ['7', 'CD', 'O'],
 ['1.3', 'CD', 'O'],
 ['Archaeological', 'NNP', 'O'],
 ['Background', 'NNP', 'O'],
 ['7', 'CD', 'O'],
 ['1.4', 'CD', 'O'],
 ['Methodologies', 'NNS', 'O'],
 ['9', 'CD', 'O'],
 ['1.5', 'CD', 'O'],
 ['Professional', 'JJ', 'O'],
 ['Standards', 'NNS', 'O'],
 ['9', 'CD', 'O'],
 ['2', 'CD', 'O'],
 ['.', '.', 'O']]

In [79]:
#time to load the ontologies 

#Material ontology
materials = pd.read_csv(r'ontologies\Materials.csv') #open the file
materials[materials.columns[2]]= materials[materials.columns[2]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
materials_list = materials[materials.columns[2]].values.tolist() #turns the items into a list
print(materials_list) # outputs the list of terms to see what sort of data it contains
print("\n""\n")

#Taxon ontology
taxon = pd.read_csv(r'ontologies\Taxon.tsv', sep="\t", error_bad_lines=False) #open the file
taxon[taxon.columns[8]]= taxon[taxon.columns[8]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
taxon_list = taxon[taxon.columns[8]].values.tolist() #turns the items into a list
print(taxon_list[0:100]) # outputs the list of terms to see what sort of data it contains
print("\n""\n")

#Animal ontology
animal = pd.read_csv(r'ontologies\VernacularName.tsv', sep="\t") #open the file
animal[animal.columns[2]]= animal[animal.columns[2]].str.lower() #takes the second column (the one with the entities) and makes it all lower case
animal_list = animal[animal.columns[2]].values.tolist() #turns the items into a list
print(animal_list[0:100]) # outputs the list of terms to see what sort of data it contains
print("\n""\n")


#Periods ontology - for notes see above
periods = pd.read_csv(r'ontologies\Periods.csv')
periods[periods.columns[1]]= periods[periods.columns[1]].str.lower()
periods_list = periods[periods.columns[1]].values.tolist()
print(periods_list[0:100])
print("\n""\n")


#Context ontology - for notes see material 
context = pd.read_csv(r'ontologies\context.csv')
context[context.columns[0]]= context[context.columns[0]].str.lower().str.replace('*','').str.replace('<','').str.replace('>','') #replace all other symbols
context[context.columns[0]]= context[context.columns[0]].replace(to_replace =':.*',value='',regex=True) # replace the words after a colon
context_list = context[context.columns[0]].values.tolist()
print(context_list[0:100])
print("\n""\n")

['elm', 'felt', 'alabaster', 'spruce', 'tamarac', 'aluminum', 'variscite', 'argillite', 'totternhoe clunch', 'ash', 'aluminium', 'carnelian', 'cornelian', 'plaster', 'sapphire', 'paper', 'ebony', 'garnet', 'rubber', 'coal', 'emerald', 'hazel', 'puddingstone', 'hertfordshire puddingstone', 'charcoal', 'chalk', 'hydrocarbon', 'bakelite', 'amethyst', 'amphibolite', 'larch', 'siltstone', 'mudstone', 'utahlite', 'teak', 'shale', 'ivory', 'marble', 'limestone', 'leather', 'lead', 'lava', 'faience', 'jadeite', 'tooth', 'iron', 'pottery', 'greenstone', 'gold', 'glass', 'flint', 'jet', 'silver', 'pewter', 'obsidian', 'sandstone', 'object material', 'oak', 'mineral', 'wood', 'shell', 'quartz', 'slate', 'steel', 'stone', 'terracotta', 'tin', 'granite', 'quartzite', 'fir', 'antimony', 'schist', 'birch', 'lead alloy', 'zinc', 'dolerite', 'ceramic', 'pine', 'fibreglass', 'glass fibre', 'graphite', 'jade', 'onyx', 'fiberglass', 'beech', 'textile', 'metal', 'alloy', 'bronze', 'horn', 'brass', 'bone', 



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 3090873: expected 22 fields, saw 25\n'
  exec(code_obj, self.user_global_ns, self.user_ns)


['urocystis beckwithiae vánky', 'urocystis murashkinskyi (cif.) zundel', 'urocystis pulsatillae-albae vánky & tóth', 'urocystis vulpiae vánky', 'urocystis bulbinellae (p.h.b. talbot) vánky, m. lutz, r. bauer & piątek', 'biota', 'velocipedoidea', 'urocystis simplex (liro) vánky', 'urocystis skirgielloae piątek', 'urocystis curculiginis a.r. patil, t.m. patil & m.s. patil', 'urocystis clintoniae (kom.) uljan. ex govorova', 'urocystis paridis (unger) thüm.', 'urocystis tranzscheliana (lavrov) zundel', 'urocystis lithophragmatis garrett', 'urocystis puccinelliae l. guo & h.c. zhang', 'urocystis chorizandrae cunningt., r.g. shivas & vánky', 'urocystis circaeasteri vánky', 'urocystis aurea vánky', 'urocystis yunnanensis l. guo', 'callichilella', 'urocystis callianthemi domashova', 'urocystis pseudoanemones denchev, kakish. & y. harada', 'urocystis macrospora (desm.) liro', 'urocystis beijingensis l. guo', 'urocystis filipendulae (tul. & c. tul.) j. schröt.', 'urocystis reinhardii m. piepenbr



In [80]:
#time to use POS

In [105]:
#this calls the function
def word2features(sent, i): 
    word = sent[i][0] #takes each token
    postag = sent[i][1] #this is the Part of Speach Tager
    
     #this tells if each token is in the ontology or not 
    if word in materials_list: 
        in_materials = True 
    else:
        in_materials = False
    
    if word in taxon_list: 
        in_taxon = True 
    else:
        in_taxon = False
    
    if word in animal_list: 
        in_animal = True 
    else:
        in_animal = False
  
    if word in periods_list: 
        in_periods = True 
    else:
        in_periods = False

    if word in context_list: 
        in_context = True 
    else:
        in_context = False
    
    #time to give each token some information     
    features = { # these are all default. 
        'bias': 1.0, # bias is just 1. 
        'word.lower()': word.lower(), # tells if the token is lower case 
        'word[-3:]': word[-3:], # takes the last four letters - the suffix
        'Word.in_materials': in_materials, #is the token in the material ontology
#         'Word.in_taxon': in_taxon, #is the token in the material ontology
#         'Word.in_animal': in_animal, #is the token in the periods ontology 
        'Word.in_periods': in_periods, #is the token in the periods ontology
        'Word.in_context': in_context, #is the token in the evidence ontology
        'word.isupper()': word.isupper(), # tells if the whole token is uppercase 
        'word.istitle()': word.istitle(), # tells if the token is capital first letter
        'postag': postag,  # what is its label - Part-Of-Speech Tagger
        'postag[:2]': postag[:2],  #Takes the first three letters of the tag
        'word.isdigit()': word.isdigit(),
    }
    if i > 0: #if the token is not at the start of a sentence
        word1 = sent[i-1][0] # works out details of the token before - this is to understand the context 
        postag1 = sent[i-1][1] #what is the postag of the word before
        wordbefore = (sent[i-1][0]+ ' ' +sent[i][0]).lower #this is the token and the token before
        #if this word and word before is in the ontology then 
        if wordbefore in materials_list: 
            wordbefore_in_materials = True 
        else:
            wordbefore_in_materials = False

#         if wordbefore in taxon_list: 
#             wordbefore_in_taxon = True 
#         else:
#             wordbefore_in_taxon = False

        if wordbefore in animal_list: 
            wordbefore_in_animal = True 
            print(word)
        else:
            wordbefore_in_animal = False

        if wordbefore in periods_list: 
            wordbefore_in_periods = True 
        else:
            wordbefore_in_periods = False

        if wordbefore in context_list: 
            wordbefore_in_context = True 
        else:
            wordbefore_in_context = False
        features.update({
            '-1:word.lower()': word1.lower(), # tells if the token is lower case
            '-1:word.istitle()': word1.istitle(), # tells if the token is capital first letter
            '-1:word.isdigit()': word1.isdigit(), # tells if the toekn is only numbers
            '-1:word.isupper()': word1.isupper(),# tells if the whole token is uppercase
            'wordbefore_in_materials': wordbefore_in_materials,
#             'wordbefore_in_taxon': wordbefore_in_taxon,
#             'wordbefore_in_animal': wordbefore_in_animal,
            'wordbefore_in_periods': wordbefore_in_periods,
            'wordbefore_in_context': wordbefore_in_context,
            '-1:postag': postag1, # what was its POS tag
            '-1:postag[:2]': postag1[:2], #what is the first three POS tag of the word before
        })
    
        if i > 1: #if the token is not at the start of a sentence
                word3 = sent[i-1][0] # works out details of the token before - this is to understand the context 
                postag3 = sent[i-1][1] #what is the postag of the word before
                two_words_before = (sent[i-1][0]+ ' ' +sent[i-1][0]+ ' ' +sent[i][0]).lower #this is the token and the token before
                #if this word and word before is in the ontology then 
                if two_words_before in materials_list: 
                    two_words_before_in_materials = True 
                else:
                    two_words_before_in_materials = False

#                 if two_words_before in taxon_list: 
#                     two_words_before_in_taxon = True 
#                 else:
#                     two_words_before_in_taxon = False

                if two_words_before in animal_list: 
                    two_words_before_in_animal = True 
                    print(word)
                else:
                    two_words_before_in_animal = False

                if two_words_before in periods_list: 
                    two_words_before_in_periods = True 
                else:
                    two_words_before_in_periods = False

                if two_words_before in context_list: 
                    two_words_before_in_context = True 
                else:
                    two_words_before_in_context = False
                features.update({
                    '-1:word.lower()': word1.lower(), # tells if the token is lower case
                    '-1:word.istitle()': word1.istitle(), # tells if the token is capital first letter
                    '-1:word.isdigit()': word1.isdigit(), # tells if the toekn is only numbers
                    '-1:word.isupper()': word1.isupper(),# tells if the whole token is uppercase
                    'two_words_before_in_materials': two_words_before_in_materials,
#                     'two_words_before_in_taxon': two_words_before_in_taxon,
#                     'two_words_before_in_animal': two_words_before_in_animal,
                    'two_words_before_in_periods': two_words_before_in_periods,
                    'two_words_before_in_context': two_words_before_in_context,
                    '-2-postag': postag3, # what was its POS tag
                    '-2:postag[:2]': postag3[:2], #what is the first three POS tag of the word before
                })
        else:
            features['BOS2'] = True # if word is the beggining of sentence label it as so         
    else:
        features['BOS'] = True # if word is the beggining of sentence label it as so 
        
    if i < len(sent)-1: # is the word at the end of the sentence. sme as above after
        wordafter= (sent[i][0]+ ' ' +sent[i+1][0]).lower
        #this tells if the token AFTER and each token combined is in the ontology or not
        if wordafter in materials_list: 
            wordafter_in_materials = True 
        else:
            wordafter_in_materials = False

#         if wordafter in taxon_list: 
#             wordafter_in_taxon = True 
#         else:
#             wordafter_in_taxon = False

        if wordafter in animal_list: 
            wordafter_in_animal = True 
        else:
            wordafter_in_animal = False

        if wordafter in periods_list: 
            wordafter_in_periods = True 
        else:
            wordafter_in_periods = False

        if wordafter in context_list: 
            wordafter_in_context = True 
        else:
            wordafter_in_context = False
        word2 = sent[i+1][0]  
        postag2 = sent[i+1][1] 
        features.update({
            'wordafter_in_materials': wordafter_in_materials,
#             'wordafter_in_taxon': wordafter_in_taxon,
#             'wordafter_in_animal': wordafter_in_animal,
            'wordafter_in_periods': wordafter_in_periods,
            'wordafter_in_context': wordafter_in_context,
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
        })
        
        if i < len(sent)-2: #if the token is not at the start of a sentence
                word4 = sent[i-1][0] # works out details of the token before - this is to understand the context 
                postag4 = sent[i-1][1] #what is the postag of the word before
                two_words_after = (sent[i][0]+ ' ' +sent[i+1][0]+ ' ' +sent[i+1][0]).lower#this is the token and the token before
                #if this word and word before is in the ontology then 
                if two_words_after in materials_list: 
                    two_words_after_in_materials = True 
                else:
                    two_words_after_in_materials = False

#                 if two_words_after in taxon_list: 
#                     two_words_after_in_taxon = True 
#                 else:
#                     two_words_after_in_taxon = False

                if two_words_after in animal_list: 
                    two_words_after_in_animal = True 
                    print(word)
                else:
                    two_words_after_in_animal = False

                if two_words_after in periods_list: 
                    two_words_after_in_periods = True 
                else:
                   two_words_after_in_periods = False

                if two_words_after in context_list: 
                    two_words_after_in_context = True 
                else:
                    two_words_after_in_context = False
                features.update({
                    '-1:word.lower()': word4.lower(), # tells if the token is lower case
                    '-1:word.istitle()': word4.istitle(), # tells if the token is capital first letter
                    '-1:word.isdigit()': word4.isdigit(), # tells if the toekn is only numbers
                    '-1:word.isupper()': word4.isupper(),# tells if the whole token is uppercase
                    'two_words_after_in_materials': two_words_after_in_materials,
#                     'two_words_after_in_taxon': two_words_after_in_taxon,
#                     'two_words_after_in_animal': two_words_after_in_animal,
                    'two_words_after_in_periods': two_words_after_in_periods,
                    'two_words_after_in_context': two_words_after_in_context,
                    '-2-postag': postag4, # what was its POS tag
                    '-2:postag[:2]': postag4[:2], #what is the first three POS tag of the word before
                })
        else:
            features['EOS2'] = True # if word is the beggining of sentence label it as so       
        
        
        
    else:
        features['EOS'] = True # if word is the end of sentence label it as so         
   

    if i < len(sent)-1 and i > 0: # is the word at the end of the sentence. sme as above after
            word3 = sent[i+1][0]  
            postag3 = sent[i+1][1] 
            wordsorround = (sent[i-1][0]+ ' ' +sent[i][0]+ ' ' +sent[i+1][0]).lower
            if wordsorround in materials_list: 
                wordsorround_in_materials = True 
            else:
                wordsorround_in_materials = False

#             if wordsorround in taxon_list: 
#                 wordsorround_in_taxon = True 
#             else:
#                 wordsorround_in_taxon = False

            if wordsorround in animal_list: 
                wordsorround_in_animal = True 
            else:
                wordsorround_in_animal = False

            if wordsorround in periods_list: 
                wordsorround_in_periods = True 
            else:
                wordsorround_in_periods = False

            if wordsorround in context_list: 
                wordsorround_in_context = True 
            else:
                wordsorround_in_context = False
            features.update({
                'wordsorround_in_materials': wordsorround_in_materials,
#                 'wordsorround_in_taxon': wordsorround_in_taxon,
#                 'wordsorround_in_animal': wordsorround_in_animal,
                'wordsorround_in_periods': wordsorround_in_periods,
                'wordsorround_in_context': wordsorround_in_context,
            })
            if i < len(sent)-2 and i > 1: # is the word at the end of the sentence. sme as above after
                word5 = sent[i+1][0]  
                postag5 = sent[i+1][1] 
                twowordsorround = (sent[i-2][0]+ ' ' +sent[i-1][0]+ ' ' +sent[i][0]+ ' ' +sent[i+1][0]+ ' ' +sent[i+2][0]).lower
                if twowordsorround in materials_list: 
                    twowordsorround_in_materials = True 
                else:
                    twowordsorround_in_materials = False

#                 if twowordsorround in taxon_list: 
#                     twowordsorround_in_taxon = True 
#                 else:
#                     twowordsorround_in_taxon = False

                if twowordsorround in animal_list: 
                    twowordsorround_in_animal = True 
                else:
                    twowordsorround_in_animal = False

                if twowordsorround in periods_list: 
                    twowordsorround_in_periods = True 
                else:
                    twowordsorround_in_periods = False

                if twowordsorround in context_list: 
                    twowordsorround_in_context = True 
                else:
                    twowordsorround_in_context = False
                features.update({
                    'twowordsorround_in_materials': twowordsorround_in_materials,
#                     'twowordsorround_in_taxon': twowordsorround_in_taxon,
#                     'twowordsorround_in_animal': twowordsorround_in_animal,
                    'twowordsorround_in_periods': twowordsorround_in_periods,
                    'twowordsorround_in_context': twowordsorround_in_context,                   
                })
            else:
                    features['OWS'] = True # if word is the end of sentence label it as so 
    else:
            features['OWS'] = True # if word is the end of sentence label it as so 
            
    return features # output these details
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))] #output for each word

def sent2labels(sent):
    return [label for token, postag, label in sent] #output for each token

def sent2tokens(sent):
    return [token for token, postag, label in sent] #output for ???


In [106]:
#print(sent2features[:2])

In [107]:
#sent2features(train_sent[0:1])[0]
[sent2features(s) for s in train_sent[0:1]]

[[{'bias': 1.0,
   'word.lower()': 'introduction',
   'word[-3:]': 'ION',
   'Word.in_materials': False,
   'Word.in_periods': False,
   'Word.in_context': False,
   'word.isupper()': True,
   'word.istitle()': False,
   'postag': 'NNP',
   'postag[:2]': 'NN',
   'word.isdigit()': False,
   'BOS': True,
   'wordafter_in_materials': False,
   'wordafter_in_periods': False,
   'wordafter_in_context': False,
   '+2:word.lower()': '7',
   '+2:word.istitle()': False,
   '+2:word.isupper()': False,
   '+2:postag': 'CD',
   '+2:postag[:2]': 'CD',
   '-1:word.lower()': '.',
   '-1:word.istitle()': False,
   '-1:word.isdigit()': False,
   '-1:word.isupper()': False,
   'two_words_after_in_materials': False,
   'two_words_after_in_periods': False,
   'two_words_after_in_context': False,
   '-2-postag': '.',
   '-2:postag[:2]': '.',
   'OWS': True},
  {'bias': 1.0,
   'word.lower()': '7',
   'word[-3:]': '7',
   'Word.in_materials': False,
   'Word.in_periods': False,
   'Word.in_context': False,

In [108]:
#time to train the NER

In [109]:
%%time
X_train = [sent2features(s) for s in train_sent] # for the token train the ner on the learned set
y_train = [sent2labels(s) for s in train_sent] # for the POS tag train the ner on the learned set

X_test = [sent2features(s) for s in test_sent] # for the token train the ner on the test set
y_test = [sent2labels(s) for s in test_sent] # for the POS tag train the ner on the test set

Wall time: 2h 8min 29s


In [110]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.0020339643465827964,  #was initially 0.1 each
    c2=0.028003487848126302, # 'c1': 0.2963053968677204, 'c2': 0.004195898642365605
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Wall time: 14.9 s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.0020339643465827964,
    c2=0.028003487848126302, max_iterations=100)

In [111]:
labels = list(crf.classes_) # get the list of all labels
labels.remove('O') # remove the ones where bio is o - not got a postag
labels # show what the labels are 

['B-LOC',
 'I-LOC',
 'B-PER',
 'I-PER',
 'B-CON',
 'I-CON',
 'B-MAT',
 'B-ART',
 'I-ART',
 'I-MAT',
 'B-SPE',
 'I-SPE']

In [112]:
#now to evaluate its success

In [113]:
#calculate the f1 score
y_pred = crf.predict(X_test) # work out and predict what is the likely token
metrics.flat_f1_score(y_test, y_pred, # work out what the likely postag will be, and give it a f1 score
                      average='weighted', labels=labels)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


0.7132271244182367

In [114]:
#calculate the metrics table
# group B and I results - this isnt needed, but orders the list
sorted_labels = sorted(
    labels, 
    key=lambda name: (name[1:], name[0])
)
# print(metrics.classification_report(y_test, y_pred))
print(metrics.flat_classification_report(
   y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-ART      0.750     0.517     0.612       470
       I-ART      0.359     0.583     0.444        96
       B-CON      0.851     0.444     0.583       399
       I-CON      0.545     0.316     0.400        57
       B-LOC      0.915     0.565     0.699       115
       I-LOC      0.758     0.627     0.686        75
       B-MAT      0.400     0.095     0.154        63
       I-MAT      0.000     0.000     0.000        13
       B-PER      0.924     0.839     0.880       610
       I-PER      0.949     0.825     0.883       674
       B-SPE      1.000     0.265     0.419        83
       I-SPE      0.000     0.000     0.000         0

   micro avg      0.837     0.641     0.726      2655
   macro avg      0.621     0.423     0.480      2655
weighted avg      0.840     0.641     0.713      2655



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [115]:
#work out which of the transitions are most likely in descending order
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

#what are the 20 most likely transitions
print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

#what are the 20 least likely transitions
print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

Top likely transitions:
I-ART  -> I-ART   5.281411
B-ART  -> I-ART   5.068936
B-CON  -> I-CON   4.967742
I-CON  -> I-CON   4.730384
B-SPE  -> I-SPE   4.002221
B-MAT  -> I-MAT   3.829932
I-PER  -> I-PER   3.715105
I-LOC  -> I-LOC   3.635283
B-LOC  -> I-LOC   3.171184
O      -> O       3.105637
B-PER  -> I-PER   2.296342
I-MAT  -> I-MAT   2.183223
I-SPE  -> I-SPE   2.120000
O      -> B-PER   1.656916
I-MAT  -> B-ART   1.514074
B-MAT  -> B-ART   1.463999
I-ART  -> B-CON   0.993129
O      -> B-CON   0.979078
B-MAT  -> B-CON   0.785693
O      -> B-SPE   0.738113

Top unlikely transitions:
B-SPE  -> I-ART   -2.435650
B-CON  -> I-PER   -2.534125
B-CON  -> B-PER   -2.561088
B-PER  -> B-PER   -2.582452
I-PER  -> I-ART   -2.730049
B-PER  -> I-CON   -2.808052
B-MAT  -> I-ART   -2.975743
B-MAT  -> I-CON   -3.143090
B-CON  -> I-ART   -3.229804
B-LOC  -> I-PER   -3.252390
I-PER  -> B-PER   -3.374516
B-PER  -> I-LOC   -3.509090
I-ART  -> B-ART   -3.525804
B-ART  -> I-PER   -3.606911
O      -> I-MAT  

In [116]:
#What aspects of the terms make it likely to be that tag
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))    

#what are the 30 most likely aspects
print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

#what 30 aspects make it least likely to be that term
print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

Top positive:
8.026391 B-PER    word.lower():ad1900-present
7.790288 B-CON    word.lower():postholes
7.525848 B-ART    word.lower():coping
7.339437 B-MAT    word.lower():cemented
6.967308 B-CON    word.lower():post-holes
6.908390 B-ART    word.lower():loomweight
6.566720 B-CON    word.lower():granite
6.552146 I-ART    -1:word.lower():building
6.474151 B-ART    word.lower():mandible
6.236954 O        EOS
6.175265 B-ART    word.lower():coarsewares
6.146536 I-PER    word.lower():centuries
6.068211 B-PER    word[-3:]:/05
6.064262 B-CON    word.lower():barracks
6.001557 B-CON    word.lower():firepits
5.913849 B-PER    word.lower():post-medieval
5.904850 B-SPE    word.lower():barley
5.880798 B-CON    +2:word.lower():pits/
5.834896 B-ART    word.lower():debitage
5.823825 B-PER    word[-3:]:xon
5.765834 B-CON    word.lower():staircase
5.753883 B-ART    word.lower():pots
5.752541 I-PER    -1:word.lower():unspecific
5.733851 B-SPE    word.lower():cattle
5.716321 B-CON    word.lower():toilets
5.7

In [117]:
## dont run this takes too long

In [None]:
%%time 
# this is to work out the best parameters for the testing. not needed yet but can increase the results by .1 
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    max_iterations=100, 
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=50, 
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
# crf = rs.best_estimator_  # shows that the best params are 
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))