In [23]:
import xml.etree.cElementTree as ET
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import subprocess
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sys

random.seed(0)

apertium_mt_path = '.'
import warnings
warnings.filterwarnings('ignore')
chencherry = SmoothingFunction()

In [24]:
# # loading t5 model
# model = T5ForConditionalGeneration.from_pretrained("Unbabel/gec-t5_small")
# tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer = AutoTokenizer.from_pretrained("SafiUllahShahid/EnGECmodel")
model = AutoModelForSeq2SeqLM.from_pretrained("SafiUllahShahid/EnGECmodel")

In [25]:
def txt_file_read(filename):
    lines=[]
    with open(filename, "r") as f:
        for line in f:
            line=line.strip()
            lines.append(line)
    return lines

In [26]:
def txt_file_save(filename, sentence_list):
    with open(filename, 'w') as filehandle:
        for listitem in sentence_list:
            filehandle.write('%s\n' % listitem)

In [27]:
def grammer_error_correction(eng_sentences):
    corrected_sentence = []
    for eng_sentence in tqdm(eng_sentences):
        tokenized_sentence = tokenizer(eng_sentence, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        gec_result = tokenizer.decode(
        model.generate(
            input_ids = tokenized_sentence.input_ids,
            attention_mask = tokenized_sentence.attention_mask, 
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )[0],
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True
        )
        corrected_sentence.append(gec_result)

    return corrected_sentence

In [28]:
def rule_machine_translation(sux_sentences):
    sux_RBMT = []
    for sux_sentence in sux_sentences:
        try:
            sux_sentence = sux_sentence.replace(':','-').replace('(','\(').replace(')','\)').replace("'","\\'").replace('|','\|')
            
            ## Calling apertium rule based Engine
            apertium_translation_command = f'''echo {sux_sentence} | apertium -d {apertium_mt_path} sux-eng'''
            output = subprocess.check_output(apertium_translation_command, shell=True)

            output = output.decode('ascii').strip().replace('#','').replace('-',' ').replace('*',' ').split()
            output = " ".join(output)
            sux_RBMT.append(output)
        except:
            sux_RBMT.append('')

    return sux_RBMT

In [29]:
def nn_machine_translation(sux_sentence):
    src_file_location = 'data/src.txt'
    tgt_file_location = 'data/tgt.txt'
    weight_location = '../Data/_step_10000.pt'
    txt_file_save(src_file_location, sux_sentence)

    os.system(f'''onmt_translate -model {weight_location}  -src {src_file_location} -output {tgt_file_location}''')    
    
    sux_NNMT = txt_file_read(tgt_file_location)
    
    return sux_NNMT

In [30]:
def process_conll_files(dir_path):
    eng_reference_sentneces = []
    sumerian_sentences = []
    file_names = []

    # extracting sumerian and english translation from the file
    file_name = os.listdir(dir_path)

    # reading data from conll files
    for file in tqdm(file_name):

        file_path = os.path.join(dir_path,file)
        file_data = txt_file_read(file_path)

        # extracting data from conll files 
        sux_sentence = ''
        eng_tranlation_reference = ''
        for row in file_data:
            if row.startswith('# tr.en'):
                eng_tranlation_reference = row.split('tr.en:')[1]
            row_line = row.split('\t')
            if row_line[0].isdigit() and 'XPOSTAG' not in row:
                sux_sentence+=row_line[1]+' '

        # basic cleaning of english reference sentence so we do not miss correct words match because of basic errors like (Su-zen same as Suzen and suzen same as suzen)
        eng_tranlation_reference = eng_tranlation_reference.replace('-',' ').lower()

        # basic cleaning of sumerian sentence before passing to rulebased translation
        sux_sentence = sux_sentence.replace('<','').replace('>','').lower()


        eng_reference_sentneces.append(eng_tranlation_reference)
        sumerian_sentences.append(sux_sentence)
        file_names.append(file)


    return file_names, eng_reference_sentneces, sumerian_sentences

## Translatiion for conll files

In [31]:
dir_path = 'data/consolidated/dev'

In [32]:
file_names, eng_tranlation_references, sux_sentences = process_conll_files(dir_path)

100%|██████████| 157/157 [00:00<00:00, 9238.56it/s]


In [33]:
sux_RBMT = rule_machine_translation(sux_sentences)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [34]:
# gec_sux_RBMT = grammer_error_correction(sux_RBMT)

In [35]:
sux_NNMT = nn_machine_translation(sux_sentences)
sux_NNMT = [s.lower().replace('basketoftablets','basket of tablets') for s in sux_NNMT]

[2022-09-01 09:40:22,640 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
  self._batch_index = self.topk_ids // vocab_size
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
[2022-09-01 09:40:49,109 INFO] PRED AVG SCORE: -0.3543, PRED PPL: 1.4252


In [36]:
#================================ BLEU SCORE ================================================ #
translated_df = []
rule_bleu_list = []
nn_bleu_list = []


for file, eng_tranlation_reference, sux_sentence,sux_RB,sux_NN in tqdm(zip(file_names,eng_tranlation_references,sux_sentences,sux_RBMT,sux_NNMT)):

    rule_bleu = sentence_bleu([eng_tranlation_reference.split()], sux_RB.split(),smoothing_function=chencherry.method1,weights = (0.75,0.25,0,0))*100
    rule_bleu_list.append(rule_bleu)


    nn_bleu = sentence_bleu([eng_tranlation_reference.split()], sux_NN.split(), smoothing_function=chencherry.method1, weights = (0.75,0.25,0,0))*100
    nn_bleu_list.append(nn_bleu)

    translated_df.append([file, sux_sentence, eng_tranlation_reference, sux_RB, sux_NN, rule_bleu, nn_bleu])

    # if count==33:
    #     break


print(f'''\n average bleu score for rule based is''', (np.mean(rule_bleu_list),np.median(rule_bleu_list)))
print(f'''\n average bleu score for neural network based is''',(np.mean(nn_bleu_list),np.median(nn_bleu_list)))

157it [00:00, 1405.82it/s]


 average bleu score for rule based is (15.548035875261364, 13.860675302522605)

 average bleu score for neural network based is (22.15353084859962, 13.287635348602514)





In [37]:
col_name = ['file', 'sux_sentence', 'eng_tranlation_reference', 'eng_rule_based_translation', 'eng_nn_based_translation', 'rule_bleu', 'nn_bleu']
trainslation_pd = pd.DataFrame(translated_df,columns = col_name)
trainslation_pd.to_csv('translation_results.csv')
trainslation_pd.head()

Unnamed: 0,file,sux_sentence,eng_tranlation_reference,eng_rule_based_translation,eng_nn_based_translation,rule_bleu,nn_bleu
0,P320163.conll,pisan-dub-ba mu 2(disz) sze-ba giri3-se3-ga ug...,basket of tablets years of rations personnel ...,filing_basket one year ration attendant xxx li...,basket of tablets 2 years of the barley ration...,4.277116,59.69492
1,P125272.conll,usz2 ur-sila-luh 1(asz@c) gan2 e2-ur2-bi-du10 ...,dead ur silalu a c field e urbidu a foreman ...,dead Ur silaluh 1(towards his <n>@ c) gan2 Eur...,the masarwoods took in charge 1 aworker luebga...,22.225439,2.361315e-21
2,P416458.conll,4(disz) ki szu-{d}idim-ta mu-kux(du) iti ezem-...,oxen cows male equids female equids old from ...,one day from Szu Idim year kux(went ) unit Eze...,4 dur3jacks 6 eme6jennies 2 bucks,13.816689,0.0
3,P107372.conll,pisan-dub-ba kiszib3 didli masz-x-x e2 lu2-gi-...,basket of tablets sealed documents varied fro...,filing_basket several unit sealed documents va...,basket of tablets sealed documents varied from...,34.534645,81.9747
4,P101172.conll,5(disz) sila3 kasz saga 5(disz) sila3 ninda 5(...,sila fine beer sila bread shekels onions shek...,day sila good beer day sila pole day unit one ...,for inanna 5 sila3 fine beer 5 sila3 bread 5 s...,19.415519,0.002364715


## Translation from  sentences

In [38]:
eng_tranlation_reference = " basket of tablets accounts barley of distribution plowmen young oxen turners of abbamu an ur lamma are here year en  of inanna by goat was found"

In [79]:
sux_sentence = "pisan-dub-ba nig2-ka9-ak a2 buru14 a2 en-te szabra-ne i3-gal2 mu en {d}nanna kar-zi-da ba-a-hun"

In [82]:
sux_sentence_updated = sux_sentence.replace('x','').replace('.','')
sux_sentence_updated = " ".join(sux_sentence_updated.strip().split())
sux_sentence_updated

'pisan-dub-ba nig2-ka9-ak a2 buru14 a2 en-te szabra-ne i3-gal2 mu en {d}nanna kar-zi-da ba-a-hun'

In [83]:
RBMT_result = rule_machine_translation([sux_sentence])
RBMT_result

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['basket of tablets account labor harvest labor xxx and managers life in year priest Nanna of Karzida was hired']

In [64]:
final_sentence = grammer_error_correction(RBMT_result)
final_sentence

100%|██████████| 1/1 [00:02<00:00,  2.47s/it]


['for Szulgir male strong king of Urim king An n> of its four kings.']

In [57]:
NNMT_results = nn_machine_translation([sux_sentence])
NNMT_results

[2022-09-01 11:07:14,329 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
  self._batch_index = self.topk_ids // vocab_size
[2022-09-01 11:07:14,468 INFO] PRED AVG SCORE: -0.1070, PRED PPL: 1.1129


['Basketoftablets accounts of Abbamu via Atu']

In [44]:
rule_bleu = sentence_bleu([eng_tranlation_reference.split()], final_sentence[0].split(),smoothing_function=chencherry.method1,weights = (0.75,0.25,0,0))*100
nn_bleu = sentence_bleu([eng_tranlation_reference.split()], NNMT_results[0].split(), smoothing_function=chencherry.method1, weights = (0.75,0.25,0,0))*100

print(f'''\n average bleu score for rule based is''', rule_bleu)
print(f'''\n average bleu score for neural network based is''',nn_bleu)


 average bleu score for rule based is 2.7303931712945935

 average bleu score for neural network based is 2.7303931712945935


# Exrta

In [17]:
# result = os.system(f'''echo "e2 lugal-la mu-un-du3" | apertium -d apertium-sux-eng/ sux-eng''')

In [18]:
# sux_sentence = "pisan-dub-ba dub gid2-da i3-dub giri3 ku5-da-mu i3-gal2 mu si-mu-ru-um{ki} lu-lu-bu{ki} <a>-ra2 1(u) la2 1(disz)-kam-asz ba-hul"
sux_sentence = "e2 lugal-la mu-un-du3"

In [19]:
sux_sentence = sux_sentence.replace('<','').replace('>','')
sux_sentence 

'e2 lugal-la mu-un-du3'

In [20]:
sux_sentence = sux_sentence.replace(':','-').replace('(','\(').replace(')','\)').replace("'","\\'").replace('|','\|')
apertium_translation_command = f'''echo {sux_sentence} | apertium -d . sux-eng'''
output = subprocess.check_output(apertium_translation_command, shell=True)
output = output.decode('ascii').strip().replace('#','').replace('-',' ').replace('*',' ').split()
" ".join(output)

'he built house of king'

### NLTK BlEU

In [None]:

reference = [['this','boy']]
candidate = ['this', 'is']
chencherry = SmoothingFunction()
score = sentence_bleu(reference, candidate, smoothing_function=chencherry.method1, weights=(1,0,0,0))
print(score)

0.5


In [None]:
sentence_bleu([eng_tranlation_reference.split()], eng_nn_based_translation.split(), smoothing_function=chencherry.method1)

### Huggingface evaluate

In [None]:
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
predictions = ["hello there"]
references = [
     ["hello bro"],
 ]
bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references, smooth_method= 'floor')
print(results)

{'score': 0.0, 'counts': [2, 1, 0, 0], 'totals': [2, 1, 0, 0], 'precisions': [100.0, 100.0, 0.0, 0.0], 'bp': 1.0, 'sys_len': 2, 'ref_len': 2}


In [None]:
predictions = ["hello"]
references = [["hello there"]]
sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=predictions, 
                             references=references, smooth_method = 'add-k')

In [None]:
results

{'score': 36.78794411714425,
 'counts': [1, 1, 1, 1],
 'totals': [1, 1, 1, 1],
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'bp': 0.36787944117144233,
 'sys_len': 1,
 'ref_len': 2}

## merging tags for lexd file

In [3]:
p = ['bi2-in-du8']
# p = ['ub-ta-e3','in-szi-sa10','i3-gal2','bi2-in-ak-bi','bi2-in-du11','bi2-in-ur3','bi2-in-ak-bi','in-szi-sa10','ib2-szi-ag2-ge26-a','in-szi-sa10','i3-gal2','ba-hul-a']

In [6]:
s = ['du8:<3-SG-NH><L3><3-SG-H-A><V><3-SG-P>']
# s= ['e3:<ANT><3-NH><ABL><3-SG-H-A><V><3-SG-P>','sa:<FIN><3-SG-H><TERM><3-SG-H-A><V><3-SG-P><SUB>','gal2:<FIN><L1><V><3-SG-S>','ak:<3-NH><L2><3-SG-NH-P><V><3-SG-A><SUB>','dug:<3-SG-NH><L3><3-SG-H-A><V><3-SG-P>','ur3:<3-NH><L2><3-SG-H-P><V><3-SG-A><SUB>','ak:<3-NH><L2><3-SG-NH-P><V><3-SG-A><SUB>','sa:<FIN><3-SG-H><TERM><3-SG-H-A><V><3-SG-P><SUB>','ag2:<FIN><3-NH><TERM><V><3-SG-A><SUB>','sa:<FIN><3-SG-H><TERM><3-SG-H-A><V><3-SG-P><SUB>','gal2:<FIN><L1><V><3-SG-S>','hul<MID><V><3-SG-S><SUB>']

In [7]:
# s = '<ANT><3-NH><ABL><3-SG-H-A><V><3-SG-P>'
for i,j in zip(p,s):
    j = j.replace('<V>','').replace(':','<VBLEX>').lower()
    print(j+':'+i)

du8<vblex><3-sg-nh><l3><3-sg-h-a><3-sg-p>:bi2-in-du8


## Torch check

In [2]:
import torch
import math

In [4]:
dtype = torch.float
device = torch.device("mps")

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

# Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')


99 3244.34814453125
199 2151.0498046875
299 1427.306884765625
399 948.1649780273438
499 630.9312744140625
599 420.8768310546875
699 281.77752685546875
799 189.65609741210938
899 128.6400909423828
999 88.22222900390625
1099 61.44561004638672
1199 43.70391082763672
1299 31.947072982788086
1399 24.154930114746094
1499 18.989810943603516
1599 15.565485000610352
1699 13.2947998046875
1799 11.788824081420898
1899 10.789817810058594
1999 10.126983642578125
Result: y = -0.009623829275369644 + 0.8226878643035889 x + 0.0016602700343355536 x^2 + -0.08848666399717331 x^3


## Extract tags

In [7]:
import re

In [30]:
s = '''
            (
            if ((1.v14 = "1-sg-p") or (1.v6 = "1-sg"))
                  [prpers@prn.obj.p1.mf.sg _ ]
            else-if (1.v6 = "1-pl")
                  [prpers@prn.obj.p1.mf.pl _ ]
            else-if ((1.v14 = "2-sg-p") or (1.v6 = "2-sg"))
                  [prpers@prn.obj.p2.mf.sg _ ]
            else-if ((1.v14 = "3-sg-p") or (1.v11 = "3-sg-h-p") or (1.v11 = "3-sg-h-l3") or (1.v6 = "3-sg-h"))
                  [prpers@prn.obj.p3.m.sg _ ]
            else-if ((1.v11 = "3-sg-nh-p") or (1.v11 = "3-sg-nh-l3") or (1.v5 = "3-nh"))
                  [prpers@prn.obj.p3.nt.sg _ ]
            else-if ((1.v14 = "3-pl-p") or (1.v11 = "3-pl-h-p") or (1.v6 = "3-pl"))
                  [prpers@prn.obj.p3.m.pl _ ]
            else
                  []
            );
'''

In [31]:
l = []
for i in re.findall('(\"[a-z1-9-]+\")',s):
    t = i.replace("\"","")
    l.append(t)
    print(t+', ')

1-sg-p, 
1-sg, 
1-pl, 
2-sg-p, 
2-sg, 
3-sg-p, 
3-sg-h-p, 
3-sg-h-l3, 
3-sg-h, 
3-sg-nh-p, 
3-sg-nh-l3, 
3-nh, 
3-pl-p, 
3-pl-h-p, 
3-pl, 
