In [104]:
import pandas as pd
import glob
import re

In [108]:
!pip3 freeze > requirements.txt

In [88]:
files = glob.glob("./data/*.txt")
lines = []
for i in range(len(files)):
	with open(files[i]) as f:
		lines += f.readlines()

In [89]:
def remove_numbers(text):
	# Define regex pattern to match numbers followed by a dot and space
	pattern = r'^\d+\.\s+'
	# Replace matched pattern with an empty string
	cleaned_text = re.sub(pattern, '', text)
	cleaned_text = re.sub('"', '', cleaned_text)
	cleaned_text = re.sub('\'', "'", cleaned_text)
	return cleaned_text.strip('\n')



In [101]:
lines = [remove_numbers(line) for line in lines if line != '\n']
test_data_file = './data/eng_test_data.txt'

for line in lines:
	with open(test_data_file, 'a') as file:
		file.write(line + '\n')


In [91]:
test_data = pd.read_csv(test_data_file, delimiter='*', header=None)
test_data = test_data.drop_duplicates().reset_index()

In [5]:
api_endpoint = 'http://10.10.45.1:1405/multilang-translate'


text_data = ['Citizens are demanding accountability from their leaders and an end to corruption.',
				'I\'m thinking of simplifying my life.']

In [6]:
import requests
import json

def batch_translate(text_data: list, from_lang: str='english', to_lang: str='pigin') -> list:
	"""
	Translates a list of text strings from one language to another using the multilang-translate API.

	Args:
		text_data (list): a list of text strings to be translated
		from_lang (str, optional): the language of the input text, defaults to 'english'
		to_lang (str, optional): the language to which the text should be translated, defaults to 'pigin'

	Returns:
		list: a list of translated text strings

	Raises:
		ValueError: if the input language or output language is not supported
	"""
	translations = []
	languages = ['english', 'pigin', 'hausa', 'igbo', 'yoruba']
	headers = {
		'content-type' : 'application/json',
	}
	assert from_lang in languages and to_lang in languages, \
		f'lang must be english, pigin, hausa, igbo or yoruba, got {from_lang, to_lang}'

	assert type(text_data) == list, f'text_data must be a list'

	for i in range(len(text_data)):
		line = text_data[i]
		payload = dict(text = line, from_ = from_lang, to_ = to_lang)
		response = requests.post(api_endpoint, json=payload, headers=headers)
		if response.status_code == 200:
			try:
				response = json.loads(response.text)
				translations.append(response['translated'])
			except Exception as e:
				print(f'An exception occured {e}')
		else:
			print(f'Cant reach API, error code-> {response.status_code}')
	print(f'Translation Completed')
	return translations

In [7]:
hausa_text = [
	"su ke ciyar da shi su ke shayar da shi har sai yaro ya zama mutum sosai",
	"mutanensa suna binsa, har ya iso kusa da sarki",
	"Asalin hausawa an ce, wadansu mutane ne, wa da kane, suka zo daga kasal larabawa da matan su biu. Su ka zamna wani jeji kusa da kasal barno, sunansa gabi, su ka yi bukoki, su ka yi shimge, su na halbin namun jeji, don su ma-halba ne. Yau, mutane kuwa su na zua daga barno, su na sayasayar nama, kuma su na zua daga wasu gurare, su na sayasaya, har gun nan ya zama gari-gari. Su na nan, har matar kanen nan ta haifi ya, su ka sa ma ta suna fetsima, amma su na yi ma ta lakabi dauratu da larabci, shi ne kewaya, kaman sun ce, su na yin kewayal duniya, har su ka zo gabi su ka haife ta.",
	"idan yaran suka shekara bakwai sai a sa su a makaranta",
	"kowa ya sani cewa duk duniyan nan babu sana'ar da ta fi noma wuya",
	"ina yin kuka saboda gajiya"
]
english_text = [
    "they give him food and drink until the boy is a grown man",
    "his people followed him, as long as he had not approached the emperor",
    "It is said that the origin of the Hausa was thus. Some people, an elder brother and a younger brother, came from Arab countries with their wives. They settled in a wild place not far from the country of Bornu. This place was called Gabi. Here they built huts and put up a fence, They hunted wild animals, because they were hunters. People came from Bornu and from other places and bought meat from them. In the end, the place was turned into a town. They lived there until the wife of the younger brother bore a girl. To this girl they gave the name Fatsima and as well as this, the nickname Daurata which means ‘circle’ in Arabic. They said that they had wandered about in the world until they came to Gabi and this girl was born.",
    "when boys have completed their seventh year, they are sent to school",
    "everyone knows that there is no heavier work on earth than agriculture",
	"I wept for tiredness""they give him food and drink until the boy is a grown man"
]

hausa_text_model = []


In [8]:
hausa_text_model = batch_translate(english_text, 'english', 'hausa')
print(hausa_text_model)

Translation Completed
['suna ba shi abinci da sha har sai yaron ya girma', 'Mutanensa sun bi shi, muddin bai kusanci sarki ba', 'An ce asalin Hausa ne. Wasu mutane, babban dan uwansu da dan uwansu, sun fito ne daga kasashen Larabawa tare da matansu. Sun zauna a wani wuri na daji da ke da nisa da kasar Bornu. Ana. Anan suka gina kwanton gida suka sanya shingen shingen, suna farautar dabbobin daji, domin masu farauta ne. Mutane daga Bornu da sauran wurare sun sayi nama daga gare su. A karshe sai aka y. Ga wannan yarinyar sun ba da sunan Fatsima kuma da wannan, sunan Daurata wanda ke nufin ‘zagaye’ a harshen Larabci. Sun ce sun yi yawo a duniya har sai sun zo Gabi kuma an haifi wannan yarin', 'idan yara sun gama shekaru bakwai, a tura su zuwa makaranta', 'kowa ya san cewa babu wani aiki mai nauyi a duniya fiye da noma', 'Na yi kuka domin gajiya suna ba shi abinci da sha har sai yaron ya girma']


In [49]:
def split_to_words(sentences: list):
	splitted = []
	for line in sentences:
		line = line.replace(',', '')
		line = line.replace('.', '')
		line = line.split(' ')
		splitted.append(line)
	return splitted


In [81]:
# https://pytorch.org/text/stable/data_metrics.html

from torchtext.data.metrics import bleu_score

candidate_corpus = split_to_words(hausa_text_model)
references_corpus = split_to_words(hausa_text)

bleu_score(candidate_corpus, references_corpus, 1, weights=[1.0])


0.041420117020606995