In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter
import re

In [2]:
#nltk.download('treebank')
#nltk.download('universal_tagset')

In [8]:
df=pd.read_csv('../Dataset/Augmented_NER_training_ml.csv')
df=df[['ID','FORM','NER']]

In [9]:
df.groupby('NER').count()

Unnamed: 0_level_0,ID,FORM
NER,Unnamed: 1_level_1,Unnamed: 2_level_1
AN,1,1
DN,909,909
EN,60,60
FN,1463,1463
GN,1351,1351
MN,79,79
O,38822,38822
ON,18,18
PN,17729,17729
RN,150,150


In [10]:
df

Unnamed: 0,ID,FORM,NER
0,1,2(disz),O
1,1,sag-x,PN
2,2,giri3,O
3,2,{d}nanna-ag2,PN
4,3,mu,O
...,...,...,...
61462,22727,zi3,O
61463,22727,u4,O
61464,22727,2(disz)-kam,O
61465,22728,lugal-{gesz}gigir-re,PN


In [11]:
def Preparing_tagged_data(df):
    tagged_sentence=[]
    c=1
    temp=[]
    for i in range(len(df)):
        if df['ID'][i]==c:
            temp.append((df['FORM'][i],df['NER'][i]))
        else:
            tagged_sentence.append(temp)
            temp=[]
            temp.append((df['FORM'][i],df['NER'][i]))
            c+=1
    tagged_sentence.append(temp)
    return tagged_sentence
tagged_sentence=Preparing_tagged_data(df)

In [12]:
#l=[tagged_sentences[i] for i in range(len(tagged_sentences)) if len(tagged_sentences[i])==1]
#print('Phrases with single words')
#print(len(l))

In [13]:
tagged_sentence[0]

[('2(disz)', 'O'), ('sag-x', 'PN')]

In [14]:
#s='1(disz)'
#re.search(r'\d+\(\w+\)',s)

In [15]:
print("Number of Tagged Sentences ",len(tagged_sentence))
tagged_words=[tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged words", len(tagged_words))
vocab=set([word for word,tag in tagged_words])
print("Vocabulary of the Corpus",len(vocab))
tags=set([tag for word,tag in tagged_words])
print("Number of Tags in the Corpus ",len(tags))

Number of Tagged Sentences  22728
Total Number of Tagged words 61467
Vocabulary of the Corpus 21556
Number of Tags in the Corpus  13


In [18]:
train_set, test_set = train_test_split(tagged_sentence,test_size=0.1,random_state=42)
print("Number of Sentences in Training Data ",len(train_set))
print("Number of Sentences in Testing Data ",len(test_set))

Number of Sentences in Training Data  20455
Number of Sentences in Testing Data  2273


In [19]:
test_set

[[('2(disz)', 'O'), ('da-dam', 'PN'), ('dumu', 'O'), ('...-in', 'PN')],
 [('1(barig)', 'O'),
  ('2(ban2)', 'O'),
  ('sze', 'O'),
  ('ur5-ra', 'O'),
  ('kiszib3', 'O'),
  ('a-bu-um-...', 'PN')],
 [('udu', 'O'), ('{d}utu-ka', 'DN')],
 [('kiszib3', 'O'), ('u4-ma-ni-gar', 'PN')],
 [('ugu2', 'O'),
  ('i-din-esz18-dar', 'PN'),
  ('dumu', 'O'),
  ('nin-anzu2{muszen}', 'PN'),
  ('ga2-ga2-dam', 'O')],
 [('ki', 'O'), ('e2-...', 'PN')],
 [('giri3', 'O'), ('ur-sza-nig2-sza3-ge', 'PN'), ('sa12-du5', 'O')],
 [('giri3', 'O'), ('du-uk-ra', 'PN')],
 [('geme2-{d}amar-{d}suen', 'PN')],
 [('ki', 'O'), ('ur-ab-ba-ta', 'PN')],
 [('ugula', 'O'), ('ab-ba-sa6-ga-a', 'PN')],
 [('1(disz)', 'O'), ('tug2', 'O'), ('{d}bala-a', 'PN'), ('iszib', 'O')],
 [('ama-na-si', 'PN'), ('szu', 'O'), ('ba-ti', 'O')],
 [('{d}nin-mar{ki}-i3-sa6', 'PN')],
 [('2(disz)', 'O'), ('udu', 'O'), ('niga', 'O'), ('{d}nin-szir', 'DN')],
 [('kab2-ku5', 'O'), ('tur-an-na', 'FN')],
 [('giri3', 'O'), ('SZIM', 'PN')],
 [('3(ban2)', 'O'),
  ('sze'

In [20]:
def features(sentence,index):
    ### sentence is of the form [w1,w2,w3,..], index is the position of the word in the sentence
    
    #Dictionary of features
    d={}
    
    word = sentence[index]
    left = ""
    right = ""
    if (index!=0):
        left=sentence[index-1]
    if (index!=len(sentence)-1):
        right=sentence[index+1]
        
    
    ###### ------- Rules/Features to Identify POS Tags  -------#######
    
    d['Is_first_word'] = 1 if index==0 else 0

    d['Is_last_word'] = 1 if index==len(sentence)-1 else 0

    d['previous_word'] = "" if left=="" else left

    d['next_word'] = "" if right=="" else right

    d['distorted_word_1'] = 1 if re.search(r'x',word) else 0

    d['distorted_word_2'] = 1 if re.search(r'\.\.\.',sentence[index]) else 0

    d['Is_number_form_1'] = 1 if re.search(r'\d+\(\.+\)',word) else 0
        
    d['Is_child_left'] = 1 if left=='dumu' else 0
    
    d['Is_child_right'] = 1 if right=='dumu' else 0
    
    d['Is_place'] = 1 if left=='ki' or right=='ki' else 0
    
    d['Is_witness_1'] = 1 if left=='igi' or right=='igi' else 0
    
    d['Is_witness_2'] = 1 if left=='igi' or right=='igi' and word.endswith('sze3') else 0
    
    d['Is_selling_tablet'] = 1 if left=='kiszib3' or right=='kiszib3' else 0
    
    d['Is_behalf_business'] = 1 if left=='giri3' or right=='giri3' else 0
    
    d['Is_selling_tablet'] = 1 if left=='kiszib3' or right=='kiszib3' else 0
    
    d['Is_selling_tablet'] = 1 if left=='kiszib3' or right=='kiszib3' else 0
    
    d['Is_PN_1'] = 1 if word.startswith('ur-') else 0
    
    d['Is_PN_2'] = 1 if word.startswith('lu2-') else 0
    
    d['Is_PN_3'] = 1 if word.endswith('-mu') else 0
    
    d['Is_PN_4'] = 1 if re.search(r'{d}',word) else 0
    
    d['Is_SN'] = 1 if re.search(r'{ki}',word) else 0
    
    d['Is_determinative'] = 1 if re.search(r'{',word) else 0
    
    d['Is_PN_5'] = 1 if re.search(r'lugal',word) else 0
    
    d['contains_numer'] = 1 if re.search(r'\d',word) else 0
    
    d['Is_PN_6'] = 1 if right=='sag' else 0
    
    d['Is_PN_7'] = 1 if right=='zarin' else 0
    
    d['Is_quantity'] = 1 if re.search(r'\d+\(\w+\)',left) else 0
    
    d['Is_month'] = 1 if left=='iti' else 0
    
    d['Is_mont_year'] = 1 if word=='iti' or word=='mu' else 0
    
    d['Is_PN_8'] = 1 if word[0].isupper() else 0
    
    d['Is_PN_9'] = 1 if word.startswith('{d}') else 0
    
    d['Is_hyphen'] = 1 if re.search(r'-',word) else 0
    
    d['prefix_1'] = word[0]
    
    d['prefix_2'] = word[:2]
    
    d['prefix_3'] = word[:3]
    
    d['prefix_4'] = word[:4]
    
    d['suffix_1'] = word[-1]
    
    d['suffix_2'] = word[-2:]
    
    d['suffix_3'] = word[-3:]
    
    d['suffix_4'] = word[-4:]
    
    return d

In [21]:
def word_list(sentence):
    list_of_words=[]
    for word,tag in sentence:
        list_of_words.append(word)
    return list_of_words

def prepareData(tagged_sentences):
    X,y=[],[]
    for sentence in tagged_sentences:
        single_sentence_feature=[]
        
        # Preparing features of all words of a single sentence/phrase
        for i in range(len(sentence)):
            #word list of sentence
            list_of_words=word_list(sentence)
            #feature of word at index i
            d=features(list_of_words,i)
            single_sentence_feature.append(d)
            
        X.append(single_sentence_feature)
        # append list of tags for the associated sentence
        y.append([tag for word,tag in sentence])
    return X,y

In [22]:
X_train,y_train=prepareData(train_set)
X_test,y_test=prepareData(test_set)

In [23]:
X_train[2]

[{'Is_first_word': 1,
  'Is_last_word': 0,
  'previous_word': '',
  'next_word': 'tir-kak-du6-da',
  'distorted_word_1': 0,
  'distorted_word_2': 0,
  'Is_number_form_1': 0,
  'Is_child_left': 0,
  'Is_child_right': 0,
  'Is_place': 0,
  'Is_witness_1': 0,
  'Is_witness_2': 0,
  'Is_selling_tablet': 0,
  'Is_behalf_business': 0,
  'Is_PN_1': 0,
  'Is_PN_2': 0,
  'Is_PN_3': 0,
  'Is_PN_4': 0,
  'Is_SN': 0,
  'Is_determinative': 0,
  'Is_PN_5': 0,
  'contains_numer': 1,
  'Is_PN_6': 0,
  'Is_PN_7': 0,
  'Is_quantity': 0,
  'Is_month': 0,
  'Is_mont_year': 0,
  'Is_PN_8': 0,
  'Is_PN_9': 0,
  'Is_hyphen': 0,
  'prefix_1': 'k',
  'prefix_2': 'ki',
  'prefix_3': 'kis',
  'prefix_4': 'kisz',
  'suffix_1': '3',
  'suffix_2': 'b3',
  'suffix_3': 'ib3',
  'suffix_4': 'zib3'},
 {'Is_first_word': 0,
  'Is_last_word': 1,
  'previous_word': 'kiszib3',
  'next_word': '',
  'distorted_word_1': 0,
  'distorted_word_2': 0,
  'Is_number_form_1': 0,
  'Is_child_left': 0,
  'Is_child_right': 0,
  'Is_plac

In [24]:
y_train

[['O', 'O', 'O', 'O'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'O', 'O', 'O', 'O', 'PN', 'O', 'O'],
 ['PN'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'O', 'PN'],
 ['O', 'O', 'PN'],
 ['O', 'O', 'PN'],
 ['O', 'O', 'PN'],
 ['O', 'PN'],
 ['PN'],
 ['O', 'O', 'O'],
 ['PN'],
 ['PN', 'O'],
 ['O', 'O'],
 ['O', 'FN', 'O'],
 ['O', 'PN'],
 ['O', 'SN'],
 ['PN'],
 ['PN', 'O'],
 ['PN', 'O'],
 ['DN'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['PN'],
 ['PN'],
 ['O', 'O', 'O', 'O', 'O', 'PN'],
 ['O', 'PN'],
 ['O', 'O', 'O', 'O', 'PN', 'EN'],
 ['PN'],
 ['O', 'O', 'O', 'PN'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'O', 'GN', 'O'],
 ['PN', 'O'],
 ['O', 'PN'],
 ['O', 'DN'],
 ['O', 'PN'],
 ['PN'],
 ['PN', 'GN'],
 ['PN', 'O', 'O'],
 ['O', 'O', 'O', 'O'],
 ['O', 'O', 'PN', 'O'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['O', 'PN', 'O'],
 ['O', 'PN'],
 ['O', 'O', 'DN'],
 ['PN', 'GN'],
 ['O', 'O', 'O', 'O', 'O', 'O'],
 ['PN'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'O', 'O', 'PN', 'O', 'O', 'O', 'O'],
 ['O', 'PN'],
 ['PN'],
 ['PN', 'O'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'PN'],

In [25]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.01, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

# Predicting Our 1.5M Data

In [38]:
import pickle

In [39]:
# Save the Modle to file in the current working directory
Pkl_Filename = "../Dataset/NER_CRF_Model.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(crf, file)

In [40]:
Pkl_Filename = "../Saved_Models/NER_CRF_Model.pkl" 
with open(Pkl_Filename, 'rb') as file:  
    crf = pickle.load(file)
crf



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.01, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [43]:
Monolingual_sumerian=[]
with open('../Dataset/sumerian_demo.txt') as f:
    for line in f:
        line=line.strip()
        Monolingual_sumerian.append(line)

In [44]:
Monolingual_sumerian

['sag-ki hu-mu-ra-ab-zalag-ge',
 'ki na-sa6-ta',
 'iti szu-numun-ta',
 '3(disz) gin2 i3 2(disz) gin2 naga',
 'ba-de6',
 'mu us2-sa ki-masz{ki} ba-hul',
 '1(asz) gu4 ab2',
 '1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni',
 '1(asz) 2(ban2) nam-nin-...',
 'dumu ur-{d}nansze',
 'gu4 udu niga u2-sa',
 'e2 kiszib3-ba-kam',
 '1(disz) du-du-am3 u3 lugal-ba-tuku',
 '{d}nin-sun2',
 'nig2-du7-e pa mu-na-e3',
 '2(ban2) ...',
 'dub-sar',
 '1(disz) {munus}asz2-gar3',
 '8(disz) sila3 4(disz) gin2 tab-be-li2 u3 dan-num2',
 'dumu lu2-ga',
 'ur-kalam na-gada i3-dab5',
 '1(disz) munu4 si-e3',
 '1(u) 2(disz) gaszam',
 '2(gesz2) 3(u) 7(disz) udu 1(disz) sila3',
 'dub-sar',
 '5(disz) 1/2(disz) ma-na 5(disz) gin2 siki tug2 guz-za 3(disz)-kam us2',
 '5(ban2) 1(u) ba-at-ri',
 '2(disz) sila3 nig2-ar3-ra saga',
 '4(u) ka-dug ku3-babbar',
 'e2-masz-sze3',
 'ur-ma-ma lu2 bu3',
 'iti pa4-u2-e',
 'szunigin 1(disz) la2-ia3',
 '4(ban2) nin-tesz2-mu',
 '1(disz) szul-lu5 nu-{gesz}kiri6',
 'n gu2

In [45]:
def test_word_list(sentence):
    list_of_words=sentence.split()
    return list_of_words

def prepare_test_Data(all_sentences):
    X=[]
    for sentence in all_sentences:
        single_sentence_feature=[]
        #word list of sentence
        print(sentence)
        list_of_words=test_word_list(sentence)
        # Preparing features of all words of a single sentence/phrase
        for i in range(len(sentence.split())):
            #feature of word at index i
            d=features(list_of_words,i)
            single_sentence_feature.append(d)
            
        X.append(single_sentence_feature)
        
    return X

In [46]:
Processed_sumerian_monolingual=prepare_test_Data(Monolingual_sumerian)

sag-ki hu-mu-ra-ab-zalag-ge
ki na-sa6-ta
iti szu-numun-ta
3(disz) gin2 i3 2(disz) gin2 naga
ba-de6
mu us2-sa ki-masz{ki} ba-hul
1(asz) gu4 ab2
1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni
1(asz) 2(ban2) nam-nin-...
dumu ur-{d}nansze
gu4 udu niga u2-sa
e2 kiszib3-ba-kam
1(disz) du-du-am3 u3 lugal-ba-tuku
{d}nin-sun2
nig2-du7-e pa mu-na-e3
2(ban2) ...
dub-sar
1(disz) {munus}asz2-gar3
8(disz) sila3 4(disz) gin2 tab-be-li2 u3 dan-num2
dumu lu2-ga
ur-kalam na-gada i3-dab5
1(disz) munu4 si-e3
1(u) 2(disz) gaszam
2(gesz2) 3(u) 7(disz) udu 1(disz) sila3
dub-sar
5(disz) 1/2(disz) ma-na 5(disz) gin2 siki tug2 guz-za 3(disz)-kam us2
5(ban2) 1(u) ba-at-ri
2(disz) sila3 nig2-ar3-ra saga
4(u) ka-dug ku3-babbar
e2-masz-sze3
ur-ma-ma lu2 bu3
iti pa4-u2-e
szunigin 1(disz) la2-ia3
4(ban2) nin-tesz2-mu
1(disz) szul-lu5 nu-{gesz}kiri6
n gu2 siki tug2 guz-za du
... nin-a-zu
szunigin 7(disz) gu4 mu 2(disz)
{d}inanna
dumu ur-e2-ninnu
1(disz) i3-li2-an-dul3
3(disz) gin2 i3 2(disz) gi

In [47]:
Prediction=crf.predict(Processed_sumerian_monolingual)

In [49]:
with open('../Output/Sumerian_NER_CRF.txt', 'w') as f:
    for i in range(len(my_list)):
        f.write("%s\n" %str(i+1))
        sentence="sentence: "+Monolingual_sumerian[i]
        f.write("%s\n" %sentence)
        f.write("POS:%s\n\n" % my_list[i])
        

NameError: name 'my_list' is not defined

In [50]:
my_list=[]
for i in range(len(Monolingual_sumerian)):
    print(i+1)
    print("sentence: "+Monolingual_sumerian[i])
    l=Monolingual_sumerian[i].split()
    POS=""
    for j in range(len(l)):
        POS=POS+"("+l[j]+","+Prediction[i][j]+")"+" "
    print('POS:'+POS)
    my_list.append(POS)
    print()

1
sentence: sag-ki hu-mu-ra-ab-zalag-ge
POS:(sag-ki,GN) (hu-mu-ra-ab-zalag-ge,GN) 

2
sentence: ki na-sa6-ta
POS:(ki,O) (na-sa6-ta,PN) 

3
sentence: iti szu-numun-ta
POS:(iti,PN) (szu-numun-ta,MN) 

4
sentence: 3(disz) gin2 i3 2(disz) gin2 naga
POS:(3(disz),O) (gin2,PN) (i3,PN) (2(disz),O) (gin2,PN) (naga,PN) 

5
sentence: ba-de6
POS:(ba-de6,PN) 

6
sentence: mu us2-sa ki-masz{ki} ba-hul
POS:(mu,GN) (us2-sa,GN) (ki-masz{ki},GN) (ba-hul,O) 

7
sentence: 1(asz) gu4 ab2
POS:(1(asz),O) (gu4,O) (ab2,PN) 

8
sentence: 1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni
POS:(1(barig),O) (1(ban2),O) (5(disz),O) (1(disz),O) (1/2(disz),O) (geme2-{d}szul-pa-e3,PN) (dumu-ni,DN) 

9
sentence: 1(asz) 2(ban2) nam-nin-...
POS:(1(asz),O) (2(ban2),O) (nam-nin-...,PN) 

10
sentence: dumu ur-{d}nansze
POS:(dumu,DN) (ur-{d}nansze,PN) 

11
sentence: gu4 udu niga u2-sa
POS:(gu4,GN) (udu,GN) (niga,GN) (u2-sa,GN) 

12
sentence: e2 kiszib3-ba-kam
POS:(e2,PN) (kiszib3-ba-kam,DN) 

13
sentence:

In [53]:
s='POS:(dumu,N) (ia-szu-ub-dingir,NE) '

'(dumu,N)'

# Testing the algo

In [26]:
y_pred=crf.predict(X_test)
X_test[0]

[{'Is_first_word': 1,
  'Is_last_word': 0,
  'previous_word': '',
  'next_word': 'da-dam',
  'distorted_word_1': 0,
  'distorted_word_2': 0,
  'Is_number_form_1': 0,
  'Is_child_left': 0,
  'Is_child_right': 0,
  'Is_place': 0,
  'Is_witness_1': 0,
  'Is_witness_2': 0,
  'Is_selling_tablet': 0,
  'Is_behalf_business': 0,
  'Is_PN_1': 0,
  'Is_PN_2': 0,
  'Is_PN_3': 0,
  'Is_PN_4': 0,
  'Is_SN': 0,
  'Is_determinative': 0,
  'Is_PN_5': 0,
  'contains_numer': 1,
  'Is_PN_6': 0,
  'Is_PN_7': 0,
  'Is_quantity': 0,
  'Is_month': 0,
  'Is_mont_year': 0,
  'Is_PN_8': 0,
  'Is_PN_9': 0,
  'Is_hyphen': 0,
  'prefix_1': '2',
  'prefix_2': '2(',
  'prefix_3': '2(d',
  'prefix_4': '2(di',
  'suffix_1': ')',
  'suffix_2': 'z)',
  'suffix_3': 'sz)',
  'suffix_4': 'isz)'},
 {'Is_first_word': 0,
  'Is_last_word': 0,
  'previous_word': '2(disz)',
  'next_word': 'dumu',
  'distorted_word_1': 0,
  'distorted_word_2': 0,
  'Is_number_form_1': 0,
  'Is_child_left': 0,
  'Is_child_right': 1,
  'Is_place': 

In [27]:
y_pred

[['O', 'PN', 'O', 'PN'],
 ['O', 'O', 'O', 'O', 'O', 'PN'],
 ['O', 'DN'],
 ['O', 'PN'],
 ['O', 'PN', 'O', 'PN', 'O'],
 ['O', 'PN'],
 ['O', 'PN', 'O'],
 ['O', 'PN'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['O', 'O', 'PN', 'O'],
 ['PN', 'O', 'O'],
 ['PN'],
 ['O', 'O', 'O', 'DN'],
 ['O', 'FN'],
 ['O', 'PN'],
 ['O', 'O', 'PN', 'O', 'O'],
 ['O', 'O', 'PN'],
 ['O', 'PN', 'DN', 'SN', 'O'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['O', 'O', 'DN', 'SN', 'O'],
 ['O', 'O', 'GN', 'O'],
 ['PN'],
 ['O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'PN'],
 ['O', 'PN', 'O'],
 ['O', 'PN'],
 ['O', 'O', 'O', 'O', 'O', 'PN'],
 ['O', 'O', 'FN'],
 ['O', 'O', 'O', 'O', 'O'],
 ['PN'],
 ['O', 'PN'],
 ['PN'],
 ['PN', 'O'],
 ['O', 'PN'],
 ['O', 'PN', 'O', 'O'],
 ['O', 'O', 'O', 'PN'],
 ['O', 'FN', 'O'],
 ['PN'],
 ['O', 'PN'],
 ['PN'],
 ['O', 'GN', 'GN'],
 ['O', 'PN', 'O'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'FN'],
 ['PN', 'SN'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['PN'],
 ['PN'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['O', 'PN'],
 ['PN'],
 [

In [28]:
print(metrics.flat_f1_score(y_test, y_pred,average='weighted',labels=crf.classes_))

0.9773412966745056


  _warn_prf(


In [29]:
y_pred_train=crf.predict(X_train)
metrics.flat_f1_score(y_train, y_pred_train,average='weighted',labels=crf.classes_)

0.9953594729274575

In [30]:
metrics.flat_accuracy_score(y_test,y_pred)

0.9777199544641405

In [31]:
metrics.flat_accuracy_score(y_train,y_pred_train)

0.995372211576702

In [32]:
print(metrics.flat_classification_report(
    y_test, y_pred, labels=crf.classes_, digits=3
))

              precision    recall  f1-score   support

           O      0.988     0.994     0.991      3887
          PN      0.972     0.971     0.972      1777
          FN      0.905     0.922     0.913       166
          SN      0.956     0.860     0.905        50
          DN      0.859     0.709     0.777        86
          EN      1.000     1.000     1.000         3
          GN      0.935     0.935     0.935       123
          RN      1.000     0.750     0.857        16
          MN      0.900     0.900     0.900        10
          WN      1.000     1.000     1.000        26
          ON      1.000     1.000     1.000         1
          TN      0.333     0.250     0.286         4
          AN      0.000     0.000     0.000         0

   micro avg      0.978     0.978     0.978      6149
   macro avg      0.834     0.792     0.810      6149
weighted avg      0.977     0.978     0.977      6149



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
print("Number of Transition Features ")
len(crf.transition_features_)

Number of Transition Features 


147

In [34]:
Counter(crf.transition_features_).most_common()[:20]

[(('O', 'O'), 4.866402),
 (('GN', 'GN'), 4.305474),
 (('PN', 'O'), 2.844378),
 (('SN', 'O'), 2.691818),
 (('O', 'WN'), 2.587741),
 (('O', 'PN'), 2.290635),
 (('O', 'DN'), 2.159187),
 (('RN', 'O'), 2.026138),
 (('WN', 'O'), 1.777777),
 (('ON', 'O'), 1.721653),
 (('GN', 'O'), 1.71974),
 (('DN', 'O'), 1.605265),
 (('O', 'MN'), 1.555794),
 (('FN', 'O'), 1.55328),
 (('DN', 'SN'), 1.493999),
 (('O', 'ON'), 1.351986),
 (('O', 'FN'), 1.334978),
 (('O', 'SN'), 1.167039),
 (('EN', 'ON'), 1.048175),
 (('TN', 'O'), 1.008371)]

In [35]:
print("Number of State Features ",len(crf.state_features_))

Number of State Features  13492


In [36]:
Counter(crf.state_features_).most_common(30)

[(('next_word:gub-ba', 'FN'), 8.866339),
 (('previous_word:guru7', 'FN'), 8.502178),
 (('previous_word:4(u)', 'FN'), 8.079066),
 (('previous_word:kab2-ku5', 'FN'), 7.993145),
 (('previous_word:a-sza3', 'FN'), 7.736178),
 (('previous_word:i7', 'WN'), 7.709769),
 (('next_word:ba-hul', 'GN'), 7.376593),
 (('previous_word:e2-duru5', 'GN'), 7.304868),
 (('next_word:tir-sze3', 'GN'), 7.300614),
 (('next_word:ba-na-...', 'PN'), 6.644704),
 (('previous_word:ka', 'WN'), 6.288906),
 (('next_word:nibru{ki}-sze3', 'FN'), 5.776503),
 (('next_word:lugal', 'RN'), 5.35052),
 (('previous_word:lugal', 'GN'), 5.295136),
 (('previous_word:ugu2', 'PN'), 5.273773),
 (('next_word:mu-kux(DU)', 'FN'), 5.21914),
 (('previous_word:gibil', 'WN'), 5.212556),
 (('previous_word:GAN2', 'FN'), 5.049837),
 (('next_word:lugal-e', 'RN'), 5.004249),
 (('previous_word:ki', 'PN'), 4.937904),
 (('next_word:ka(SAG)-guru7', 'PN'), 4.734107),
 (('previous_word:sze-ba', 'FN'), 4.676607),
 (('previous_word:ab-ba-gi-na', 'GN'), 4.

In [37]:
Counter(crf.state_features_).most_common()[-20:]

[(('previous_word:gibil', 'O'), -2.07774),
 (('next_word:i3-dab5', 'O'), -2.097007),
 (('prefix_4:ur-x', 'PN'), -2.108322),
 (('next_word:kuruszda', 'O'), -2.179476),
 (('suffix_4:a-na', 'DN'), -2.184469),
 (('previous_word:apin-la2', 'O'), -2.252605),
 (('previous_word:a-sza3', 'O'), -2.26109),
 (('previous_word:en', 'O'), -2.29316),
 (('next_word:u3', 'O'), -2.42417),
 (('next_word:gudu4', 'O'), -2.534748),
 (('suffix_4:8-um', 'PN'), -2.573472),
 (('previous_word:a-a-kal-la', 'O'), -2.580576),
 (('Is_SN', 'O'), -2.828562),
 (('suffix_4:i-zi', 'FN'), -2.879286),
 (('previous_word:e2', 'O'), -2.99784),
 (('previous_word:kab2-ku5', 'O'), -3.222128),
 (('next_word:ba-a-gar', 'O'), -3.289811),
 (('Is_PN_4', 'O'), -3.49454),
 (('Is_PN_5', 'O'), -3.80433),
 (('next_word:ba-hun', 'O'), -3.896669)]

# Shuffel Dataset

In [34]:
import random
lines=[]
with open("CDLI_Data/Sumerian_monolingual_processed.txt", "r") as f:
    for line in f:
        line=line.strip()
        lines.append(line)

In [35]:
lines

['su-mu-ra-pi2',
 'dumu ia-szu-ub-dingir',
 'ARAD2 sza {d}iszkur',
 'ia-am-s,i2-at-nu-u2',
 'dumu a-li2-wa-aq-rum',
 'ARAD2 sza {d}...',
 'ur-{d}nanna',
 'dumu {d}suen-i-qi2-sza-am',
 'i-ri {d}su-he2-zi',
 '{d}nin-sza3-tuk-x',
 'za-ri-a-nu-u2',
 'dumu sa-ma-nu',
 'ARAD2 sza {d}ku7',
 '4(iku) GAN2 DUR2-HAR sar',
 '1(u) uruda a-ru12-da ma-na',
 '4(asz) ku3 gin2',
 'sa10 GAN2-kam',
 '6(asz) sze gur',
 'nig2-diri',
 'nita-tur',
 'dam-ni',
 'i7-hi-li-su3',
 'lu2 sa10 gu7-me',
 '4(asz) siki ma-na',
 '1(asz) tug2 aktum',
 '1(asz) tug2 ib2-du3',
 '2(asz) i3 sila3',
 '1(asz) nig2-sag-kesz2',
 '2(asz) ninda',
 'nig2-ba',
 'ur-{d}en-lil2',
 'DI-{d}utu',
 'HAR-tu',
 'ba-da-da',
 'ur-{d}lamma',
 'lu2 ki-inim-ma-me',
 'il2',
 'ur-{d}nin-unu{ki}',
 'lu2 GAN2 sa10-me',
 'dingir-a-zu',
 'engar-bi',
 'ig-gi-nu-gi4',
 'dub-sar-bi',
 'ku3 2/3(|NINDA2x(SZE.2(ASZ))|)',
 '1(asz) i3 sila3',
 'nig2-ba-ni',
 'ensi2-bi',
 'inim-ma-ni',
 '6(asz) uruda ma-na',
 'sa10 GAN2',
 '4(iku) GAN2-bi',
 '4(asz) uruda ma-na'

In [36]:
random.seed(12)
random.shuffle(lines)

In [37]:
lines

['sag-ki hu-mu-ra-ab-zalag-ge',
 'ki na-sa6-ta',
 'iti szu-numun-ta',
 '3(disz) gin2 i3 2(disz) gin2 naga',
 'ba-de6',
 'mu us2-sa ki-masz{ki} ba-hul',
 '1(asz) gu4 ab2',
 '1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni',
 '1(asz) 2(ban2) nam-nin-...',
 'dumu ur-{d}nansze',
 'gu4 udu niga u2-sa',
 'e2 kiszib3-ba-kam',
 '1(disz) du-du-am3 u3 lugal-ba-tuku',
 '{d}nin-sun2',
 'nig2-du7-e pa mu-na-e3',
 '2(ban2) ...',
 'dub-sar',
 '1(disz) {munus}asz2-gar3',
 '8(disz) sila3 4(disz) gin2 tab-be-li2 u3 dan-num2',
 'dumu lu2-ga',
 'ur-kalam na-gada i3-dab5',
 '1(disz) munu4 si-e3',
 '1(u) 2(disz) gaszam',
 '2(gesz2) 3(u) 7(disz) udu 1(disz) sila3',
 'dub-sar',
 '5(disz) 1/2(disz) ma-na 5(disz) gin2 siki tug2 guz-za 3(disz)-kam us2',
 '5(ban2) 1(u) ba-at-ri',
 '2(disz) sila3 nig2-ar3-ra saga',
 '4(u) ka-dug ku3-babbar',
 'e2-masz-sze3',
 'ur-ma-ma lu2 bu3',
 'iti pa4-u2-e',
 'szunigin 1(disz) la2-ia3',
 '4(ban2) nin-tesz2-mu',
 '1(disz) szul-lu5 nu-{gesz}kiri6',
 'n gu2

In [38]:
lines

['sag-ki hu-mu-ra-ab-zalag-ge',
 'ki na-sa6-ta',
 'iti szu-numun-ta',
 '3(disz) gin2 i3 2(disz) gin2 naga',
 'ba-de6',
 'mu us2-sa ki-masz{ki} ba-hul',
 '1(asz) gu4 ab2',
 '1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni',
 '1(asz) 2(ban2) nam-nin-...',
 'dumu ur-{d}nansze',
 'gu4 udu niga u2-sa',
 'e2 kiszib3-ba-kam',
 '1(disz) du-du-am3 u3 lugal-ba-tuku',
 '{d}nin-sun2',
 'nig2-du7-e pa mu-na-e3',
 '2(ban2) ...',
 'dub-sar',
 '1(disz) {munus}asz2-gar3',
 '8(disz) sila3 4(disz) gin2 tab-be-li2 u3 dan-num2',
 'dumu lu2-ga',
 'ur-kalam na-gada i3-dab5',
 '1(disz) munu4 si-e3',
 '1(u) 2(disz) gaszam',
 '2(gesz2) 3(u) 7(disz) udu 1(disz) sila3',
 'dub-sar',
 '5(disz) 1/2(disz) ma-na 5(disz) gin2 siki tug2 guz-za 3(disz)-kam us2',
 '5(ban2) 1(u) ba-at-ri',
 '2(disz) sila3 nig2-ar3-ra saga',
 '4(u) ka-dug ku3-babbar',
 'e2-masz-sze3',
 'ur-ma-ma lu2 bu3',
 'iti pa4-u2-e',
 'szunigin 1(disz) la2-ia3',
 '4(ban2) nin-tesz2-mu',
 '1(disz) szul-lu5 nu-{gesz}kiri6',
 'n gu2

In [39]:
demo=lines[0:150]

In [40]:
demo

['sag-ki hu-mu-ra-ab-zalag-ge',
 'ki na-sa6-ta',
 'iti szu-numun-ta',
 '3(disz) gin2 i3 2(disz) gin2 naga',
 'ba-de6',
 'mu us2-sa ki-masz{ki} ba-hul',
 '1(asz) gu4 ab2',
 '1(barig) 1(ban2) 5(disz) 1(disz) 1/2(disz) geme2-{d}szul-pa-e3 dumu-ni',
 '1(asz) 2(ban2) nam-nin-...',
 'dumu ur-{d}nansze',
 'gu4 udu niga u2-sa',
 'e2 kiszib3-ba-kam',
 '1(disz) du-du-am3 u3 lugal-ba-tuku',
 '{d}nin-sun2',
 'nig2-du7-e pa mu-na-e3',
 '2(ban2) ...',
 'dub-sar',
 '1(disz) {munus}asz2-gar3',
 '8(disz) sila3 4(disz) gin2 tab-be-li2 u3 dan-num2',
 'dumu lu2-ga',
 'ur-kalam na-gada i3-dab5',
 '1(disz) munu4 si-e3',
 '1(u) 2(disz) gaszam',
 '2(gesz2) 3(u) 7(disz) udu 1(disz) sila3',
 'dub-sar',
 '5(disz) 1/2(disz) ma-na 5(disz) gin2 siki tug2 guz-za 3(disz)-kam us2',
 '5(ban2) 1(u) ba-at-ri',
 '2(disz) sila3 nig2-ar3-ra saga',
 '4(u) ka-dug ku3-babbar',
 'e2-masz-sze3',
 'ur-ma-ma lu2 bu3',
 'iti pa4-u2-e',
 'szunigin 1(disz) la2-ia3',
 '4(ban2) nin-tesz2-mu',
 '1(disz) szul-lu5 nu-{gesz}kiri6',
 'n gu2

In [42]:
def savefile(filename,LIST):
    with open(filename, 'w') as f:
        for line in LIST:
            f.write("%s\n" % line)
            
            
savefile("Dataset/sumerian_demo.txt",demo)