In [95]:
# importing libraties
import xml.etree.cElementTree as ET
import os
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import subprocess
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import sys
import string

random.seed(0)

# base path where we have all the bin files for sux-eng
apertium_mt_path = '../'

# NMT paths
src_file_location = '../data/NMT_temp_data/src.txt'
tgt_file_location = '../data/NMT_temp_data/tgt.txt'
weight_location = '../../Data/_step_10000.pt'


import warnings
warnings.filterwarnings('ignore')
chencherry = SmoothingFunction()

In [96]:
# # loading t5 model for grammer error correction
# model = T5ForConditionalGeneration.from_pretrained("Unbabel/gec-t5_small")
# tokenizer = T5Tokenizer.from_pretrained('t5-small')

# tokenizer = AutoTokenizer.from_pretrained("SafiUllahShahid/EnGECmodel")
# model = AutoModelForSeq2SeqLM.from_pretrained("SafiUllahShahid/EnGECmodel")

## Supporting functions

In [97]:
def txt_file_read(filename):
    lines=[]
    with open(filename, "r") as f:
        for line in f:
            line=line.strip()
            lines.append(line)
    return lines

In [98]:
def txt_file_save(filename, sentence_list):
    with open(filename, 'w') as filehandle:
        for listitem in sentence_list:
            filehandle.write('%s\n' % listitem)

In [99]:
# function to run grammer error correction
def grammer_error_correction(eng_sentences):
    corrected_sentence = []
    for eng_sentence in tqdm(eng_sentences):
        tokenized_sentence = tokenizer(eng_sentence, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
        gec_result = tokenizer.decode(
        model.generate(
            input_ids = tokenized_sentence.input_ids,
            attention_mask = tokenized_sentence.attention_mask, 
            max_length=512,
            num_beams=5,
            early_stopping=True,
        )[0],
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True
        )
        corrected_sentence.append(gec_result)

    return corrected_sentence

In [120]:
# rule based machine translation system
# Note - there is no api call, here we are calling os.system('bash code') as there is no inbuilt python library for apertium 
def rule_machine_translation(sux_sentences):
    sux_RBMT = []
    for sux_sentence in sux_sentences:
        try:
            sux_sentence = sux_sentence.replace(':','-').replace('(','\(').replace(')','\)').replace("'","\\'").replace('|','\|').replace('x','').replace('.','').replace('<','').replace('>','')
            sux_sentence = " ".join(sux_sentence.strip().split())
            
            ## Calling apertium rule based Engine
            apertium_translation_command = f'''echo {sux_sentence} | apertium -d {apertium_mt_path} sux-eng'''
            output = subprocess.check_output(apertium_translation_command, shell=True)

            output = output.decode('ascii').strip().replace('x','').replace('#','').replace('-',' ').replace('*',' ').split()
            output = " ".join(output).lower()
            sux_RBMT.append(output)
        except:
            sux_RBMT.append('')

    return sux_RBMT

In [121]:
# Neural machine translation
# Here also there is no python api call, rather it is a bash call using os.system
def nn_machine_translation(sux_sentence):
    txt_file_save(src_file_location, sux_sentence)

    os.system(f'''onmt_translate -model {weight_location}  -src {src_file_location} -output {tgt_file_location}''')    
    
    sux_NNMT = txt_file_read(tgt_file_location)
    
    return sux_NNMT

In [122]:
def eng_reference_preprocessing(eng_tranlation_reference):
    eng_tranlation_reference = eng_tranlation_reference.replace('-',' ').lower()
    eng_tranlation_reference = ''.join(i for i in s if i not in string.punctuation)
    return eng_tranlation_reference

In [123]:
# main function to call and process conll files 
# translating sentences from a directory containg conll files
def process_conll_files(dir_path):
    eng_reference_sentneces = []
    sumerian_sentences = []
    file_names = []

    # extracting sumerian and english translation from the file
    file_name = os.listdir(dir_path)

    # reading data from conll files
    for file in tqdm(file_name):

        file_path = os.path.join(dir_path,file)
        file_data = txt_file_read(file_path)

        # extracting data from conll files 
        sux_sentence = ''
        eng_tranlation_reference = ''
        for row in file_data:
            if row.startswith('# tr.en'):
                eng_tranlation_reference = row.split('tr.en:')[1]
            row_line = row.split('\t')
            if row_line[0].isdigit() and 'XPOSTAG' not in row:
                sux_sentence+=row_line[1]+' '

        # basic cleaning of english reference sentence so we do not miss correct words match because of basic errors like (Su-zen same as Suzen and suzen same as suzen)
        eng_tranlation_reference = eng_reference_preprocessing(eng_tranlation_reference)

        # basic cleaning of sumerian sentence before passing to rulebased translation
        # sux_sentence = sux_sentence.replace('<','').replace('>','').lower()


        eng_reference_sentneces.append(eng_tranlation_reference)
        sumerian_sentences.append(sux_sentence)
        file_names.append(file)


    return file_names, eng_reference_sentneces, sumerian_sentences

# TRANSLATION

## 1. Translating conll files

In [124]:
dir_path = '../data/mtaac_syntax_corpus_consolidated/dev/'

In [125]:
# extract file names and english, sumerian sentences
file_names, eng_tranlation_references, sux_sentences = process_conll_files(dir_path)

100%|██████████| 157/157 [00:00<00:00, 11403.88it/s]


#### a. Rule based machine translation 

In [126]:
sux_RBMT = rule_machine_translation(sux_sentences)



In [127]:
# grammer error correction if needed 
# gec_sux_RBMT = grammer_error_correction(sux_RBMT)

#### b. Neural machine translation 

In [129]:
sux_NNMT = nn_machine_translation(sux_sentences)

[2022-09-12 15:51:44,522 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
  self._batch_index = self.topk_ids // vocab_size
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
[2022-09-12 15:52:09,891 INFO] PRED AVG SCORE: -0.3415, PRED PPL: 1.4070


#### c. Calculating Bleu score

In [130]:
#================================ BLEU SCORE ================================================ #
translated_df = []
rule_bleu_list = []
nn_bleu_list = []


for file, eng_tranlation_reference, sux_sentence,sux_RB,sux_NN in tqdm(zip(file_names,eng_tranlation_references,sux_sentences,sux_RBMT,sux_NNMT)):

    rule_bleu = sentence_bleu([eng_tranlation_reference.split()], sux_RB.split(),smoothing_function=chencherry.method1,weights = (0.75,0.25,0,0))*100
    rule_bleu_list.append(rule_bleu)


    nn_bleu = sentence_bleu([eng_tranlation_reference.split()], sux_NN.split(), smoothing_function=chencherry.method1, weights = (0.75,0.25,0,0))*100
    nn_bleu_list.append(nn_bleu)

    translated_df.append([file, sux_sentence, eng_tranlation_reference, sux_RB, sux_NN, rule_bleu, nn_bleu])

    # if count==33:
    #     break


print(f'''\n average bleu score for rule based is''', (np.mean(rule_bleu_list),np.median(rule_bleu_list)))
print(f'''\n average bleu score for neural network based is''',(np.mean(nn_bleu_list),np.median(nn_bleu_list)))

157it [00:00, 7115.37it/s]


 average bleu score for rule based is (2.24364436822736, 2.026704983791064)

 average bleu score for neural network based is (8.060353282516942, 4.714877157589494)





In [131]:
# for test and analysis
col_name = ['file', 'sux_sentence', 'eng_tranlation_reference', 'eng_rule_based_translation', 'eng_nn_based_translation', 'rule_bleu', 'nn_bleu']
trainslation_pd = pd.DataFrame(translated_df,columns = col_name)
trainslation_pd.to_csv('../results/translation_results.csv')
trainslation_pd.head()

Unnamed: 0,file,sux_sentence,eng_tranlation_reference,eng_rule_based_translation,eng_nn_based_translation,rule_bleu,nn_bleu
0,P320163.conll,pisan-dub-ba mu 2(disz) sze-ba giri3-se3-ga ug...,Szusuen strong king king of Urim king of the f...,filing_basket one year ration attendant ugnim{...,Basketoftablets 2 years of the barley rations ...,0.0,13.198351
1,P125272.conll,usz2 ur-sila-luh 1(asz@c) GAN2 e2-ur2-bi-du10 ...,Szusuen strong king king of Urim king of the f...,dead <n> unit luh 1(towards his <n>@ c) field ...,the arasifieldwoods took in charge 1 acworker ...,0.19077,3.814166
2,P416458.conll,4(disz) ki szu-{d}idim-ta mu-kux(DU) iti ezem-...,Szusuen strong king king of Urim king of the f...,one day at hand { d} idim unknown year <vble> ...,4 garments from uIdim delivery of Ninazu year ...,1.315486,5.045325
3,P107372.conll,pisan-dub-ba kiszib3 didli masz-x-x e2 lu2-gi-...,Szusuen strong king king of Urim king of the f...,filing_basket several unit interest house brou...,Basketoftablets sealed documents varied from t...,0.0,8.785822
4,P101172.conll,5(disz) sila3 kasz saga 5(disz) sila3 ninda 5(...,Szusuen strong king king of Urim king of the f...,day month good beer day month bread day unit o...,for Inanna 5 sila3 fine beer 5 sila3 bread 5 s...,0.366923,0.0


## 2. Translating sumerian txt files

In [132]:
# txt file location
txt_file_location = '../test/sux-eng-input.txt'

In [133]:
sux_sentences_list = txt_file_read(txt_file_location)
sux_sentences_list

['e2 lugal-la mu-un-du3', 'lugal', 'dummu']

In [134]:
# a. rule based engine
RBMT_result = rule_machine_translation(sux_sentences_list)
RBMT_result

['house intercalary_month <np> year un quality_designation',
 'intercalary_month',
 'dummu']

In [135]:
# b. NMT engine
NMT_results = nn_machine_translation(sux_sentences_list)
NMT_results

[2022-09-12 15:52:11,751 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
  self._batch_index = self.topk_ids // vocab_size
[2022-09-12 15:52:11,935 INFO] PRED AVG SCORE: -0.7102, PRED PPL: 2.0343


['built the temple of the king he built', 'king king', 'Tirmium and']

## 3. Translating single sumerian sentence

In [136]:
sux_sentence = "{d}szul-gi nita kal-ga lugal uri5{ki}-ma lugal an ub-da limmu2-ba"

In [137]:
# a. rule based engine
RBMT_result = rule_machine_translation([sux_sentence])
# final_sentence = grammer_error_correction(RBMT_result)
RBMT_result

['szulgir male month intercalary_month year intercalary_month an <n> king']

In [138]:
# b. NMT engine
NNMT_results = nn_machine_translation([sux_sentence])
NNMT_results

[2022-09-12 15:52:13,568 INFO] Translating shard 0.
  torch.mul(self.topk_scores, length_penalty, out=self.topk_log_probs)
  self._batch_index = self.topk_ids // vocab_size
[2022-09-12 15:52:13,792 INFO] PRED AVG SCORE: -0.0810, PRED PPL: 1.0844


['ulgi mighty man king of Ur and king of the four world quarters']

In [139]:
# c. calculate bleu score 

# pass the reference senetnce 
eng_tranlation_reference = "Szusuen, strong king, king of Urim, king of the four quarters"
eng_tranlation_reference = eng_reference_preprocessing(eng_tranlation_reference)

rule_bleu = sentence_bleu([eng_tranlation_reference.split()], RBMT_result[0].split(),smoothing_function=chencherry.method1,weights = (0.75,0.25,0,0))*100
nn_bleu = sentence_bleu([eng_tranlation_reference.split()], NNMT_results[0].split(), smoothing_function=chencherry.method1, weights = (0.75,0.25,0,0))*100

print(f'''\n average bleu score for rule based is''', rule_bleu)
print(f'''\n average bleu score for neural network based is''',nn_bleu)


 average bleu score for rule based is 5.152710423703126

 average bleu score for neural network based is 47.76235168541087


# Exrta

### NLTK BlEU

In [None]:

reference = [['this','boy']]
candidate = ['this', 'is']
chencherry = SmoothingFunction()
score = sentence_bleu(reference, candidate, smoothing_function=chencherry.method1, weights=(1,0,0,0))
print(score)

0.5


In [None]:
sentence_bleu([eng_tranlation_reference.split()], eng_nn_based_translation.split(), smoothing_function=chencherry.method1)

### Huggingface evaluate

In [None]:
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
predictions = ["hello there"]
references = [
     ["hello bro"],
 ]
bleu = evaluate.load("sacrebleu")
results = bleu.compute(predictions=predictions, references=references, smooth_method= 'floor')
print(results)

{'score': 0.0, 'counts': [2, 1, 0, 0], 'totals': [2, 1, 0, 0], 'precisions': [100.0, 100.0, 0.0, 0.0], 'bp': 1.0, 'sys_len': 2, 'ref_len': 2}


In [None]:
predictions = ["hello"]
references = [["hello there"]]
sacrebleu = evaluate.load("sacrebleu")
results = sacrebleu.compute(predictions=predictions, 
                             references=references, smooth_method = 'add-k')

In [None]:
results

{'score': 36.78794411714425,
 'counts': [1, 1, 1, 1],
 'totals': [1, 1, 1, 1],
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'bp': 0.36787944117144233,
 'sys_len': 1,
 'ref_len': 2}

## Torch check

In [2]:
import torch
import math

In [4]:
dtype = torch.float
device = torch.device("mps")

# Create random input and output data
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Randomly initialize weights
a = torch.randn((), device=device, dtype=dtype)
b = torch.randn((), device=device, dtype=dtype)
c = torch.randn((), device=device, dtype=dtype)
d = torch.randn((), device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(2000):
    # Forward pass: compute predicted y
    y_pred = a + b * x + c * x ** 2 + d * x ** 3

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    if t % 100 == 99:
        print(t, loss)

# Backprop to compute gradients of a, b, c, d with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Update weights using gradient descent
    a -= learning_rate * grad_a
    b -= learning_rate * grad_b
    c -= learning_rate * grad_c
    d -= learning_rate * grad_d


print(f'Result: y = {a.item()} + {b.item()} x + {c.item()} x^2 + {d.item()} x^3')


99 3244.34814453125
199 2151.0498046875
299 1427.306884765625
399 948.1649780273438
499 630.9312744140625
599 420.8768310546875
699 281.77752685546875
799 189.65609741210938
899 128.6400909423828
999 88.22222900390625
1099 61.44561004638672
1199 43.70391082763672
1299 31.947072982788086
1399 24.154930114746094
1499 18.989810943603516
1599 15.565485000610352
1699 13.2947998046875
1799 11.788824081420898
1899 10.789817810058594
1999 10.126983642578125
Result: y = -0.009623829275369644 + 0.8226878643035889 x + 0.0016602700343355536 x^2 + -0.08848666399717331 x^3
