In [93]:
import json
import os
import random
import re
import xml.etree.cElementTree as ET
from xml.sax.saxutils import escape

import pandas as pd
random.seed(0)

import warnings
warnings.filterwarnings('ignore')

In [94]:
# tag_mapper = {'nu':'num', 'v':'vblex', 'cnj':'cnjcoo','3':'p3','2':'p2','1':'p1','':'None','s':'subj','p':'obj','a':'subj'}
tag_mapper = {'nu':'num', 'v':'vblex', 'cnj':'cnjcoo'}
NER_LIST = ['pn', 'on', 'fn', 'mn', 'sn', 'dn', 'gn', 'tn', 'wn', 'en', 'an', 'rn']

In [95]:
#for i in NER_LIST:
#    print(i,end = " ")

In [152]:
def txt_file_write(filename,list_data,mode="a",encoding="utf-8"):
    """Write every item of ``list_data`` to ``filename``, one item per line.

    Parameters
    ----------
    filename : path of the text file to write.
    list_data : iterable of items; each item is converted with ``str()`` and
        followed by a single newline.
    mode : file mode passed to ``open`` (default ``"a"``, i.e. append).
    encoding : text encoding of the file. Defaults to UTF-8 so the output does
        not depend on the platform locale (the data contains non-ASCII text).
    """
    with open(filename, mode, encoding=encoding) as f:
        # f-string formatting keeps tuples intact, unlike "%s\n" % line
        f.writelines(f"{line}\n" for line in list_data)

In [96]:
def txt_file_read(filename):
    """Return the lines of ``filename`` as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line; empty lines are kept as empty strings.
    """
    with open(filename, "r") as handle:
        return [raw_line.strip() for raw_line in handle]

In [97]:
def json_file_read(filename):
    """Parse the JSON document stored in ``filename`` and return the result."""
    with open(filename) as handle:
        return json.load(handle)

In [98]:
def save_json(filename,data):
    """Serialise ``data`` as JSON (2-space indent) into ``filename``, overwriting it."""
    with open(filename, 'w') as handle:
        json.dump(obj=data, fp=handle, indent=2)

In [99]:
def process_row(row):
    """Extract lemma, English gloss and tag lists from one annotated row.

    Parameters
    ----------
    row : sequence of str
        ``row[2]`` is the segmentation in the form ``lemma[gloss]...`` and
        ``row[3]`` is the tag string, with tags separated by ``.`` or ``|``.

    Returns
    -------
    tuple ``(lem, eng_translation, org_tag_spl, mp_tags_spl)``
        lem : text before the first ``[``; the whole segmentation (or ``"_"``
            if it is empty) when there is no ``[`` at all.
        eng_translation : text between the first ``[`` and the following
            ``]`` with ``.`` replaced by ``_``; ``"_"`` when there is no
            gloss; the lemma itself when the gloss is the placeholder ``1``.
        org_tag_spl : the tags exactly as they appear in ``row[3]``.
        mp_tags_spl : lower-cased tags after applying ``tag_mapper``, with
            ``'np'`` appended when any tag is listed in ``NER_LIST``.
    """
    segm = row[2]
    xpostag = row[3]
    lem = "_"
    eng_translation = "_"

    start_index = segm.find('[')
    if start_index == -1:
        # No gloss at all: slicing with find()'s -1 would silently drop the
        # last character, so keep the segmentation as the lemma instead.
        lem = segm or "_"
    else:
        end_index = segm.find(']', start_index)
        if end_index == -1:
            # unterminated gloss: take everything after the '['
            end_index = len(segm)
        lem = segm[:start_index]
        # for syntax corpus
        eng_translation = segm[start_index + 1:end_index].replace('.', '_')
        if eng_translation == '1':
            eng_translation = lem

    # process and save the tags
    org_tag_spl = re.split(r"[.|]", xpostag)
    lowered = [t.lower() for t in org_tag_spl]
    mp_tags_spl = [tag_mapper.get(t, t) for t in lowered]
    if any(t in NER_LIST for t in lowered):
        mp_tags_spl.append('np')

    return lem,eng_translation,org_tag_spl, mp_tags_spl

In [100]:
# s = 'N.DEM2.ABS'
# org_tag_spl = re.split("\.|\-",s)
# org_tag_spl

# Extracting data from Directory

In [101]:
# define dataframe
# Column layout shared by the per-file frames and the accumulated result.
column = [
    'filename', 'id', 'form', 'seg', 'xpostag',
    'lem', 'eng_translation',
    'org_tags_split', 'processed_tags_split',
]
# Start from an empty frame that already carries the final columns.
extracted_df = pd.DataFrame(columns=column)
extracted_df.head()

Unnamed: 0,filename,id,form,seg,xpostag,lem,eng_translation,org_tags_split,processed_tags_split


In [102]:
# Directory holding the annotated input files.
base_dir = 'data/consolidated/all_data/'
# os.listdir() returns entries in arbitrary order; sort them so the row order
# of the extracted table (and therefore the tie-breaking when the most
# frequent tag/translation is chosen later) is reproducible between runs.
conll_files = sorted(os.listdir(base_dir))

In [103]:
# Parse every file in `base_dir` and collect one record per usable row.
# All records are gathered in a plain list and turned into a DataFrame once:
# DataFrame.append() was removed in pandas 2.0 and was quadratic anyway, and
# rebuilding the frame from scratch makes this cell safe to re-run.
processed_ls = []
for conll_file in conll_files:
    file_lines = txt_file_read(os.path.join(base_dir, conll_file))

    # start processing file
    for line in file_lines:
        row = line.split("\t")
        if len(row) == 8:
            # for syntax corpus: drop the 4th column so the layout matches
            # the 7-column files
            row = row[:3] + row[4:]
        if len(row) != 7:
            continue
        # skip rows without a form, the 'SEGM' row (presumably the header)
        # and rows that carry no tag
        if row[1] == '' or row[2] == 'SEGM' or row[3] in ('_', ''):
            continue
        lem, eng_translation, org_tag_spl, mp_tags_spl = process_row(row)
        processed_ls.append([conll_file, row[0], row[1], row[2], row[3],
                             lem, eng_translation, org_tag_spl, mp_tags_spl])

# A frame built from a list already has a clean 0..n-1 index.
extracted_df = pd.DataFrame(processed_ls, columns=column)

In [104]:
# Display the extracted token table (the rendering below shows only the first and last rows).
extracted_df

Unnamed: 0,filename,id,form,seg,xpostag,lem,eng_translation,org_tags_split,processed_tags_split
0,P432211.conll,1,mu-na-gi-in,_[he.standardized.for.him],VEN.3-SG-H.DAT.3-SG-H-A.gin.3-SG-P,_,he_standardized_for_him,"[VEN, 3-SG-H, DAT, 3-SG-H-A, gin, 3-SG-P]","[ven, 3-sg-h, dat, 3-sg-h-a, gin, 3-sg-p]"
1,P432254.conll,2,nam-ti,namtil[life],N,namtil,life,[N],[n]
2,P432254.conll,4,{d}szul-gi-sze3,_[of.Szulgi],RN.GEN.TERM,_,of_Szulgi,"[RN, GEN, TERM]","[rn, gen, term, np]"
3,P432254.conll,6,x-sa6-ga,_[-saga],PN,_,-saga,[PN],"[pn, np]"
4,P432254.conll,8,nu-kiri6,_[the.orchardman],N.ERG,_,the_orchardman,"[N, ERG]","[n, erg]"
...,...,...,...,...,...,...,...,...,...
60198,P458715.conll,10,a-ba-{d}en-lil2-gin7,_[Aba-Enlilgin],PN,_,Aba-Enlilgin,[PN],"[pn, np]"
60199,P458715.conll,11,dumu,dumu[child],N,dumu,child,[N],[n]
60200,P458715.conll,12,id-da-a,Iddaya[1][-e],PN.ERG,Iddaya,Iddaya,"[PN, ERG]","[pn, erg, np]"
60201,P458715.conll,13,dam-gar3,damgar[merchant][-ak],N,damgar,merchant,[N],[n]


In [105]:
# Persist the whole token table as a space-separated text file, overwriting any previous export.
extracted_df.to_csv(r'data/sux_dict_process/sux_dict_vocab_all.txt', index=None, sep=' ', mode='w')

## Tag separation

In [106]:
# Collect every distinct processed tag that occurs anywhere in the table.
all_tags = {
    tag
    for tag_list in extracted_df.processed_tags_split
    for tag in tag_list
}

In [107]:
# Inspect the tag inventory gathered from the processed tag lists.
all_tags

{'',
 '1-sg-a',
 '1-sg-poss',
 '2-sg-a',
 '2-sg-poss',
 '3-nh',
 '3-nh-poss',
 '3-pl',
 '3-pl-dat',
 '3-pl-s',
 '3-sg-a',
 '3-sg-cop',
 '3-sg-dat',
 '3-sg-h',
 '3-sg-h-a',
 '3-sg-h-dat',
 '3-sg-h-p',
 '3-sg-h-poss',
 '3-sg-hn-p',
 '3-sg-nh',
 '3-sg-nh-a',
 '3-sg-nh-l3',
 '3-sg-nh-p',
 '3-sg-nh-poss',
 '3-sg-p',
 '3-sg-s',
 '_',
 'abl',
 'abs',
 'adv',
 'aj',
 'an',
 'ani',
 'ant',
 'av',
 'cnjcoo',
 'com',
 'cop',
 'cop-3-pl',
 'cop-3-sg',
 'dah-h',
 'dat',
 'dat-h',
 'dat-nh',
 'dem2',
 'det',
 'dn',
 'dub[tablet]-ak',
 'ed',
 'en',
 'equ',
 'erg',
 'except',
 'ešnunak',
 'f',
 'fin',
 'fn',
 'gen',
 'gen-abl',
 'gin',
 'gn',
 'h',
 'ip',
 'l1',
 'l1-syn',
 'l2',
 'l2-nh',
 'l2-syn',
 'l3',
 'l3-nh',
 'mid',
 'mid-v11=1-sg-a',
 'mn',
 'mod',
 'mod1',
 'mod4',
 'n',
 'n-rdp',
 'neg',
 'nf',
 'np',
 'num',
 'on',
 'pad',
 'pf',
 'pl',
 'pn',
 'pt',
 'rdp',
 'rn',
 'sabar',
 'sn',
 'stem',
 'sub',
 'taramurim',
 'term',
 'tn',
 'vblex',
 'ven',
 'wn',
 'x'}

## Creating the dict with most frequent tags and translation

In [108]:
# Reminder of the table layout before grouping by form and lemma.
extracted_df.head()

Unnamed: 0,filename,id,form,seg,xpostag,lem,eng_translation,org_tags_split,processed_tags_split
0,P432211.conll,1,mu-na-gi-in,_[he.standardized.for.him],VEN.3-SG-H.DAT.3-SG-H-A.gin.3-SG-P,_,he_standardized_for_him,"[VEN, 3-SG-H, DAT, 3-SG-H-A, gin, 3-SG-P]","[ven, 3-sg-h, dat, 3-sg-h-a, gin, 3-sg-p]"
1,P432254.conll,2,nam-ti,namtil[life],N,namtil,life,[N],[n]
2,P432254.conll,4,{d}szul-gi-sze3,_[of.Szulgi],RN.GEN.TERM,_,of_Szulgi,"[RN, GEN, TERM]","[rn, gen, term, np]"
3,P432254.conll,6,x-sa6-ga,_[-saga],PN,_,-saga,[PN],"[pn, np]"
4,P432254.conll,8,nu-kiri6,_[the.orchardman],N.ERG,_,the_orchardman,"[N, ERG]","[n, erg]"


In [109]:
# (number of extracted rows, number of columns)
extracted_df.shape

(60203, 9)

In [132]:
# Group every observed translation and tag string by "<form>_SEP_<lemma>".
# Both dicts map that key to the list of values seen for it, in row order
# (duplicates are kept so that frequencies can be counted afterwards).
unique_eng_translation = {}
unique_tag_list = {}
observations = zip(
    extracted_df['form'],
    extracted_df['lem'],
    extracted_df['eng_translation'],
    extracted_df['processed_tags_split'],
)
for form, lemma, eng_translation, p_tag in observations:
    dict_key = form + "_SEP_" + lemma
    # tags are stored as a single dot-joined string
    p_tag_str = ".".join(p_tag)

    # setdefault() creates the list on first sight of a key; unlike a bare
    # `except:` it cannot hide unrelated errors.
    unique_eng_translation.setdefault(dict_key, []).append(eng_translation)
    unique_tag_list.setdefault(dict_key, []).append(p_tag_str)


In [133]:
# saving json files
# Both files map each "<form>_SEP_<lemma>" key to the full list of observed values.
save_json('data/sux_dict_process/unique_eng_translation.json',unique_eng_translation)
save_json('data/sux_dict_process/unique_tag_list.json',unique_tag_list)

In [134]:
# vocab size: number of distinct "<form>_SEP_<lemma>" keys
len(unique_tag_list)

6323

In [135]:
def val_dict_formation(data_ls):
    """Return the most frequent item of ``data_ls``.

    When several items share the highest count, the one that appears first in
    ``data_ls`` wins.

    Raises
    ------
    IndexError
        If ``data_ls`` is empty.
    """
    counts = {}
    for item in data_ls:
        counts[item] = counts.get(item, 0) + 1
    if not counts:
        raise IndexError("val_dict_formation() needs at least one item")
    # dicts preserve insertion order and max() returns the first maximal key,
    # so ties resolve to the earliest item.
    return max(counts, key=counts.get)


In [138]:
# Reduce each key's list of observations to its single most frequent value.
most_freq_translation = {
    dict_key: val_dict_formation(translations)
    for dict_key, translations in unique_eng_translation.items()
}

most_freq_tags = {
    dict_key: val_dict_formation(tag_strings)
    for dict_key, tag_strings in unique_tag_list.items()
}


In [142]:
# Inspect the translation chosen for each "<form>_SEP_<lemma>" key (prints the whole dict).
most_freq_translation

{'mu-na-gi-in_SEP__': 'he_standardized_for_him',
 'nam-ti_SEP_namtil': 'life',
 '{d}szul-gi-sze3_SEP__': 'of_Szulgi',
 'x-sa6-ga_SEP__': '-saga',
 'nu-kiri6_SEP__': 'the_orchardman',
 'a_SEP_a': 'water',
 'mu-na-ru_SEP_mu-nn-a': '-n',
 'ur-ge6-par4_SEP_Ur-gepar': 'Ur-gepar',
 'u3-na-a-du11_SEP_u-nn-a-e-dug': 'speak',
 'gurusz_SEP_gurusz': 'male',
 '1(u)-am3_SEP_1(u)': 'ten',
 'ma2_SEP_ma': 'ship',
 'dug_SEP_dug': 'pot',
 'ba-al-e-de3_SEP__': 'the_pots_to_unload',
 'lu2-{d}szara2-ra_SEP__': 'to_Lu-Szara',
 'he2-na-szum2-mu_SEP__': 'may_he_give',
 'lu2_SEP_lu': 'person',
 'us2-gar_SEP__': 'May_with',
 'he2-eb-da-an-gi4-gi4_SEP__': 'it_return',
 'lu2-kal-la_SEP_Lukala': 'Lukala',
 'dub-sar_SEP_dubsar': 'scribe',
 'dumu_SEP_dumu': 'child',
 'ur-sa6-ga_SEP_Ursaga': 'Ursaga',
 '{d}szu-{d}suen_SEP_Szusuen': 'Szusuen',
 'lugal_SEP_lugal': 'king',
 'kal-ga_SEP_kalag': 'strong',
 'uri5{ki}-ma_SEP_Urim': 'Urim',
 'an_SEP_An': 'An',
 'ub-da_SEP_': '',
 'limmu2-ba_SEP_limmu': 'four',
 'a-ha-am-wa-q

In [143]:
# Inspect the dot-joined tag string chosen for each key (prints the whole dict).
most_freq_tags

{'mu-na-gi-in_SEP__': 'ven.3-sg-h.dat.3-sg-h-a.gin.3-sg-p',
 'nam-ti_SEP_namtil': 'n',
 '{d}szul-gi-sze3_SEP__': 'rn.gen.term.np',
 'x-sa6-ga_SEP__': 'pn.np',
 'nu-kiri6_SEP__': 'n.erg',
 'a_SEP_a': 'n.abs',
 'mu-na-ru_SEP_mu-nn-a': 'ven.3-sg-h.dat.3-sg-h-a.vblex.3-sg-p',
 'ur-ge6-par4_SEP_Ur-gepar': 'pn.np',
 'u3-na-a-du11_SEP_u-nn-a-e-dug': 'ant.3-sg-h.dat.2-sg-a.vblex.3-sg-p',
 'gurusz_SEP_gurusz': 'n',
 '1(u)-am3_SEP_1(u)': 'num.abs.cop-3-sg',
 'ma2_SEP_ma': 'n',
 'dug_SEP_dug': 'n',
 'ba-al-e-de3_SEP__': 'nf.vblex.pf.dat-nh',
 'lu2-{d}szara2-ra_SEP__': 'pn.np',
 'he2-na-szum2-mu_SEP__': 'mod.fin.3-sg-h.dat.3-sg-nh-p.vblex.3-sg-a',
 'lu2_SEP_lu': 'n',
 'us2-gar_SEP__': 'nf.vblex.3-sg-s',
 'he2-eb-da-an-gi4-gi4_SEP__': 'mod1.fin.3-sg-nh-p.vblex.3-sg-a',
 'lu2-kal-la_SEP_Lukala': 'pn.gen.np',
 'dub-sar_SEP_dubsar': 'n',
 'dumu_SEP_dumu': 'n',
 'ur-sa6-ga_SEP_Ursaga': 'pn.np',
 '{d}szu-{d}suen_SEP_Szusuen': 'rn.np',
 'lugal_SEP_lugal': 'n',
 'kal-ga_SEP_kalag': 'nf.vblex.sub',
 'uri5{

## Preparing skeleton of Sux vocab and the dictionary

### Sux vocab

In [441]:
# sumerian vocab
# Path of the lexd file that receives the Sumerian lexicon. Opening it with
# 'w' discards any previous content; the cells below append to it.
sux_morph_name = '../apertium-sux/apertium-sux.sux.lexd'
with open(sux_morph_name,'w') as f:
    f.write("# Morphological Transducer for Sumerian \n")

In [442]:
# Skeleton of the lexd file: one PATTERNS section per word class. Each section
# names a lexicon (VerbRoot, NounRoot, Named_Entities, Numbers, Conjugation)
# whose "LEXICON <name>" block is appended to the same file further down, so
# the names here and there must stay identical.
s = ''' \n\n
# =========================================================

# 1. Verb

PATTERNS 
VerbRoot


# =========================================================

# 2. Noun

PATTERNS 
NounRoot



# =========================================================

# 3. Named Entity PATTERNS

PATTERNS 
Named_Entities



# =========================================================

# 4. Numbers

PATTERNS 
Numbers


# =========================================================

# 5. Coordinative Conjugations

PATTERNS 
Conjugation








###############################################################################
###                          L E X I C O N                                  ###
###############################################################################

'''
# append (default mode of txt_file_write) the skeleton after the header line
txt_file_write(sux_morph_name,[s])


### Sux-Eng bi-dict

In [443]:
# sumerian english bilingual dictionary
# Opening with 'w' starts a fresh file; the cells below append to it.
sux_eng_dict_name = 'apertium-sux-eng.sux-eng.dix'
with open(sux_eng_dict_name, 'w', encoding='utf-8') as f:
    # Declare UTF-8 rather than us-ascii: the translations written later can
    # contain non-ASCII characters, which would make a file declared as
    # us-ascii ill-formed XML. Pure-ASCII content stays valid under UTF-8.
    f.write("<?xml version='1.0' encoding='UTF-8'?> \n")

In [444]:
# start dictionary: append the opening tag of the root <dictionary> element
l = ['<dictionary>']
txt_file_write(sux_eng_dict_name,l)

In [445]:
# put all the tags
s = ''' <sdefs>
    <sdef n="n" 	c="Noun"/>
    <sdef n="np" 	c="Proper name"/>
    <sdef n="pr" 	c="Preposition"/>
    <sdef n="vblex" 	c="Verb"/>
    <sdef n="vbmod" 	c="Modal verb"/>
    <sdef n="vaux" 	c="Auxiliary verb"/>
    <sdef n="vbser" 	c="Verb to be"/>
    <sdef n="vbhaver" 	c="Verb to have"/>
    <sdef n="vbdo" 	c="Verb to do"/>
    <sdef n="det" 	c="Determiner"/>
    <sdef n="predet" 	c="Predeterminer"/>
    <sdef n="prn" 	c="Pronoun"/>
    <sdef n="cnjcoo" 	c="Coordinating conjunction"/>
    <sdef n="cnjsub" 	c="Subordinating conjunction"/>
    <sdef n="p1" 	c="First person"/>
    <sdef n="p2" 	c="Second person"/>
    <sdef n="p3" 	c="Third person"/>
    <sdef n="attr" 	c="Attributive"/>
    <sdef n="sg" 	c="Singular"/>
    <sdef n="pl" 	c="Plural"/>
    <sdef n="sp" 	c="Singular / plural"/>
    <sdef n="m" 	c="Masculine"/>
    <sdef n="f" 	c="Feminine"/>
    <sdef n="mf" 	c="Masculine / feminine"/>
    <sdef n="nt" 	c="Neuter"/>
    <sdef n="mfn" 	c="Masculine / feminine / neuter"/>
    <sdef n="def" 	c="Definite"/>
    <sdef n="ind" 	c="Indefinite"/>
    <sdef n="ref" 	c="Reflexive"/>
    <sdef n="dem" 	c="Demonstrative"/>
    <sdef n="top" 	c="Toponym"/>
    <sdef n="al" 	c="Altres (other)"/>
    <sdef n="ant" 	c="Anthroponym (first name)"/>
    <sdef n="cog" 	c="Cognomen (family name)"/>
    <sdef n="prs" 	c="Present subjunctive"/>
    <sdef n="pres" 	c="Present (tense)"/>
    <sdef n="past" 	c="Past"/>
    <sdef n="pii" 	c="Past indicative"/>
    <sdef n="pis" 	c="Past subjunctive"/>
    <sdef n="imp" 	c="Imperative"/>
    <sdef n="inf" 	c="Infinitive"/>
    <sdef n="pp" 	c="Past participle"/>
    <sdef n="subs" 	c="Verbal noun"/>
    <sdef n="pprs" 	c="Present participle"/>
    <sdef n="ger" 	c="Gerund"/>
    <sdef n="cm" 	c="Comma"/>
    <sdef n="sent" 	c="End of sentence marker"/>
    <sdef n="quot" 	c="Quote mark"/>
    <sdef n="lquot" 	c="Quote mark left"/>
    <sdef n="rquot" 	c="Quote mark right"/>
    <sdef n="abbr" 	c="Abbreviation"/>
    <sdef n="adj" 	c="Adjective"/>
    <sdef n="adv" 	c="Adverb"/>
    <sdef n="preadv" 	c="Pre-adverb"/>
    <sdef n="rel" 	c="Relative"/>
    <sdef n="itg" 	c="Interrogative"/>
    <sdef n="cnjadv" 	c="Adverbial conjunction"/>
    <sdef n="comp" 	c="Comparative"/>
    <sdef n="sup" 	c="Superlative"/>
    <sdef n="pred" 	c="Predicative"/>
    <sdef n="ord" 	c="Ordinal"/>
    <sdef n="qnt" 	c="Quantifier"/>
    <sdef n="num" 	c="Numeral / Number"/>
    <sdef n="pron" 	c="Pronoun"/>
    <sdef n="ij" 	c="Interjection"/>
    <sdef n="lpar" 	c="Left parenthesis"/>
    <sdef n="rpar" 	c="Right parenthesis"/>
    <sdef n="apos" 	c="Apostrophe"/>
    <sdef n="percent" 	c="Percentage"/>
    <sdef n="guio" 	c="Hyphen"/>
    <sdef n="sep" 	c="Seperable verb"/>
    <sdef n="sint" 	c="Synthetic adjective"/>
    <sdef n="nom" 	c="Nominative"/>
    <sdef n="gen" 	c="Genitive"/>
    <sdef n="acr" 	c="Acronym"/>
    <sdef n="org" 	c="Organisation"/>
    <sdef n="tn" 	c=""/>
    <sdef n="nn" 	c=""/>
    <sdef n="aa" 	c="Animate"/>
    <sdef n="an" 	c="Inanimate"/>
    <sdef n="pos" 	c="Possessive"/>
    <sdef n="lquest" 	c="left question mark"/>
    <sdef n="obj" 	c="Object"/>
    <sdef n="subj" 	c="Subject"/>
    <sdef n="pers" 	c="Personal (pronoun)"/>
    <sdef n="file" 	c="Filename"/>
    <sdef n="l3" c="l3"/>
<sdef n="mn" c="mn"/>
<sdef n="fn" c="fn"/>
<sdef n="abl" c="abl"/>
<sdef n="nh" c="nh"/>
<sdef n="sn" c="sn"/>
<sdef n="en" c="en"/>
<sdef n="on" c="on"/>
<sdef n="pt" c="pt"/>
<sdef n="ven" c="ven"/>
<sdef n="gn" c="gn"/>
<sdef n="mod" c="mod"/>
<sdef n="ak" c="ak"/>
<sdef n="pn" c="pn"/>
<sdef n="dah" c="dah"/>
<sdef n="h" c="h"/>
<sdef n="wn" c="wn"/>
<sdef n="rn" c="rn"/>
<sdef n="hn" c="hn"/>
<sdef n="nf" c="nf"/>
<sdef n="pf" c="pf"/>
<sdef n="abs" c="abs"/>
<sdef n="dem2" c="dem2"/>
<sdef n="fin" c="fin"/>
<sdef n="poss" c="poss"/>
<sdef n="rdp" c="rdp"/>
<sdef n="neg" c="neg"/>
<sdef n="l2" c="l2"/>
<sdef n="dat" c="dat"/>
<sdef n="None" c="None"/>
<sdef n="l1" c="l1"/>
<sdef n="mid" c="mid"/>
<sdef n="com" c="com"/>
<sdef n="cop" c="cop"/>
<sdef n="erg" c="erg"/>
<sdef n="term" c="term"/>
<sdef n="sub" c="sub"/>
<sdef n="syn" c="syn"/>
<sdef n="dn" c="dn"/>
</sdefs> '''

# append the <sdefs> symbol-definition block held in `s` to the dictionary file
l = [s]
txt_file_write(sux_eng_dict_name,l)

In [446]:
# start main section: every <e> entry appended below belongs to this section,
# which is closed again in the last cell
l = ['\n <section id="main" type="standard">']
txt_file_write(sux_eng_dict_name,l)

# Working based on POS TAGS
### np(Named Entity), num(numbers), cnjcoo, n, vblex

In [447]:
# most_freq_tags

## Named Entity, Numbers, Cnjcoo, Noun, Verb

In [448]:
# Build, per part of speech, the lexd lexicon lines and the bilingual
# dictionary entries for every "<form>_SEP_<lemma>" key.

# Part-of-speech tags that get their own section, in the order they are
# checked. A key is emitted once for every listed tag it carries.
POS_SECTIONS = ('np', 'num', 'cnjcoo', 'n', 'vblex')

# Clean-up applied to a surface form: ':' becomes '-', brackets and '|' are
# backslash-escaped (to escape their special meaning), and '<', '>', '@', '.'
# are dropped.
FORM_TRANSLATION = str.maketrans({
    ':': '-',
    '{': '\\{', '}': '\\}',
    '[': '\\[', ']': '\\]',
    '(': '\\(', ')': '\\)',
    '|': '\\|',
    '<': None, '>': None, '@': None, '.': None,
})


def build_section_entries(tags_by_key, translation_by_key):
    """Create the lexicon and dictionary lines for every key.

    Parameters
    ----------
    tags_by_key : dict mapping "<form>_SEP_<lemma>" to a dot-joined tag string.
    translation_by_key : dict mapping the same keys to the English translation.

    Returns
    -------
    (morph_lines, dict_lines) : two dicts keyed by the tags in POS_SECTIONS.
        ``morph_lines[pos]`` holds lexd lines ``<id><pos><other tags>:<form>``;
        ``dict_lines[pos]`` holds the matching ``<e>`` elements.
    """
    morph_lines = {pos: [] for pos in POS_SECTIONS}
    dict_lines = {pos: [] for pos in POS_SECTIONS}

    for key, tag_string in tags_by_key.items():
        form = key.partition('_SEP_')[0].translate(FORM_TRANSLATION)

        # check if there is something in form or not
        if len(form) < 2:
            continue

        # identifier shared by both files: every run of non-alphanumeric
        # characters in the key collapses to a single underscore
        bi_dict_key = re.sub('[^a-zA-Z0-9]+', '_', key)

        # apostrophes are removed, then &, < and > are escaped so that the
        # translation cannot break the XML
        eng_translation = escape(
            translation_by_key[key].replace('’', '').replace('\'', '')
        )

        # De-duplicate while keeping the original tag order (a set difference
        # would emit the tags in a different order on every run) and skip
        # empty tags, which would otherwise be written as "<>".
        p_tag = list(dict.fromkeys(t for t in tag_string.split('.') if t))

        for pos_tag in POS_SECTIONS:
            if pos_tag not in p_tag:
                continue
            # part of speech first, then the remaining tags
            ordered_tags = [pos_tag] + [t for t in p_tag if t != pos_tag]
            tg_str = ''.join(f'<{t}>' for t in ordered_tags)

            # for morph dict
            morph_lines[pos_tag].append(f'{bi_dict_key}{tg_str}:{form}')

            # for translation dict
            dict_lines[pos_tag].append(
                f'<e><p><l>{bi_dict_key}<s n="{pos_tag}"/></l>'
                f'<r>{eng_translation}<s n="{pos_tag}"/></r></p></e>'
            )

    return morph_lines, dict_lines


def _as_text_block(lines):
    """Join lines into one string: a leading newline, then each line plus a newline."""
    return '\n' + ''.join(line + '\n' for line in lines)


_morph_lines, _dict_lines = build_section_entries(most_freq_tags, most_freq_translation)

# Named entities
NE_sux = _as_text_block(_morph_lines['np'])
NE_dict = _as_text_block(_dict_lines['np'])

# Numbers
Num_sux = _as_text_block(_morph_lines['num'])
Num_dict = _as_text_block(_dict_lines['num'])

# Coordinating conjunctions
Cnjcoo_sux = _as_text_block(_morph_lines['cnjcoo'])
Cnjcoo_dict = _as_text_block(_dict_lines['cnjcoo'])

# Nouns
N_sux = _as_text_block(_morph_lines['n'])
N_dict = _as_text_block(_dict_lines['n'])

# Verbs
V_sux = _as_text_block(_morph_lines['vblex'])
V_dict = _as_text_block(_dict_lines['vblex'])



## Writing data to sux morphdict

In [449]:
# Append the "LEXICON VerbRoot" header and then the verb lines collected in V_sux.
# The lexicon name matches the VerbRoot entry of the PATTERNS skeleton.
s = '''





#=============================================================
LEXICON VerbRoot
'''
l = [s, V_sux]
txt_file_write(sux_morph_name,l)

In [450]:
# Append the "LEXICON NounRoot" header and then the noun lines collected in N_sux.
s = '''





#=============================================================
LEXICON NounRoot
'''
l = [s, N_sux]
txt_file_write(sux_morph_name,l)

In [451]:
# Append the "LEXICON Named_Entities" header and then the named-entity lines collected in NE_sux.
s = '''





#=============================================================
LEXICON Named_Entities
'''
l = [s, NE_sux]
txt_file_write(sux_morph_name,l)

In [452]:
# Append the "LEXICON Numbers" header and then the number lines collected in Num_sux.
s = '''






#=============================================================
LEXICON Numbers
'''
l = [s, Num_sux]
txt_file_write(sux_morph_name,l)

In [453]:
# Append the "LEXICON Conjugation" header and then the lines collected in Cnjcoo_sux
# (entries tagged 'cnjcoo'). The lexicon name must stay identical to the one
# used in the PATTERNS skeleton.
s = '''






#=============================================================
LEXICON Conjugation
'''
l = [s, Cnjcoo_sux]
txt_file_write(sux_morph_name,l)

## Writing data to bilingual dict

In [454]:
# Append an XML comment marking the verb entries, followed by the entries in V_dict.
s = '''



<!-- Verbs -->
'''
l = [s, V_dict]
txt_file_write(sux_eng_dict_name,l)

In [455]:
# Append an XML comment marking the noun entries, followed by the entries in N_dict.
s = '''



<!-- Noun -->
'''
l = [s, N_dict]
txt_file_write(sux_eng_dict_name,l)

In [456]:
# Append an XML comment marking the named-entity entries, followed by the entries in NE_dict.
s = '''



<!-- Named_Entity -->
'''
l = [s, NE_dict]
txt_file_write(sux_eng_dict_name,l)

In [457]:
# Append an XML comment marking the number entries, followed by the entries in Num_dict.
s = '''



<!-- Numbers -->
'''
l = [s, Num_dict]
txt_file_write(sux_eng_dict_name,l)

In [458]:
# Append an XML comment marking the 'cnjcoo' entries, followed by the entries in Cnjcoo_dict.
s = '''



<!-- Conjugation -->
'''
l = [s, Cnjcoo_dict]
txt_file_write(sux_eng_dict_name,l)

In [459]:
# ending dict and main section: close <section> first, then the root <dictionary>
l = ['''\n </section>
</dictionary>''']
txt_file_write(sux_eng_dict_name,l)