In [38]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import re
import string

In [39]:
# Specify the directory containing the files
directory = 'D:\\PUCP\\2024-1\\Temas Avanzados en Ciencias de la Computacion\\Proyecto\\tacc-pukyu-yachay\\Corpus-Aprendizaje'

In [40]:
all_files = os.listdir(directory)

In [41]:
es_files = [f for f in all_files if f.endswith('.es')]
quy_files = [f for f in all_files if f.endswith('.quy')]

In [42]:
es_files.sort()
quy_files.sort()

In [43]:
spanish_csv = 'spanish.csv'
quechua_csv = 'quechua.csv'

In [44]:
def read_file_content(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        return file.read().split('\n')

In [45]:
content = read_file_content(os.path.join(directory, es_files[0]))
print(len(content))

34831


In [46]:
spanish_corpus = []
for es_file in es_files:
    filepath = os.path.join(directory, es_file)
    content = read_file_content(filepath)
    spanish_corpus.extend(content)

In [47]:
quechua_corpus = []
for quy_file in quy_files:
    filepath = os.path.join(directory, quy_file)
    content = read_file_content(filepath)
    quechua_corpus.extend(content)

In [48]:
print(len(spanish_corpus))

181572


In [49]:
def preprocesamiento(lista_cadenas):
    resultado = []
    signos_a_eliminar = string.punctuation + '"' + '¿' + '¡'
    for cadena in lista_cadenas:
        cadena = cadena.translate(str.maketrans('', '', signos_a_eliminar))
        cadena = cadena.lower()
        cadena = cadena.split()
        cadena = ' '.join(cadena)
        resultado.append(cadena)
    return resultado

In [50]:
spanish_corpus = preprocesamiento(spanish_corpus)
quechua_corpus = preprocesamiento(quechua_corpus)

In [51]:
spanish_corpus[0]

'en el principio creó dios los cielos y la tierra'

In [52]:
print(len(quechua_corpus))

181572


In [53]:
quechua_corpus[0]

'tukuy imapa qallariyninpi cielokunata hinaspa kay pachata dios unanchaptinmi'

In [54]:
assert len(spanish_corpus) == len(quechua_corpus)

In [55]:
data = [{'es': es, 'quy': quy} for es, quy in zip(spanish_corpus, quechua_corpus)]

In [56]:
resultado = [{'translations': f"{quy} ###>{es}"} for es, quy in zip(spanish_corpus, quechua_corpus)]

In [57]:
resultado[:2]

[{'translation': 'tukuy imapa qallariyninpi cielokunata hinaspa kay pachata dios unanchaptinmi ###>en el principio creó dios los cielos y la tierra'},
 {'translation': 'kay pacha karqa mana formayoq hinaspa mana imayoq ukuuku tutayaq lamar qocham karqa diospa espiritunpas yakukunapa hawanpim muyurqa ###>y la tierra estaba sin orden y vacía y las tinieblas cubrían la superficie del abismo y el espíritu de dios se movía sobre la superficie de las aguas'}]

In [58]:
train_data, temp_data = train_test_split(resultado, test_size=0.2, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

In [59]:
train_dataset = Dataset.from_list(train_data)
validation_dataset = Dataset.from_list(validation_data)
test_dataset = Dataset.from_list(test_data)

In [60]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

In [61]:
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 145257
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 18157
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 18158
    })
})


In [62]:
train_df = pd.DataFrame(train_data)
train_df.columns = ['translations']

val_df = pd.DataFrame(validation_data)
val_df.columns = ['translations']

test_df = pd.DataFrame(test_data)
test_df.columns = ['translations']

In [63]:
train_df.head()

Unnamed: 0,translations
0,estados unidos nacionpi wawqenchikmi kay texto...
1,qamqa pasakunankamam wiñaypaqña runata venceru...
2,jarkay ###>obstruir
3,imay urataq ###>¿a qué hora
4,mana allin ruraqkunam ichaqa kay pachamanta ch...


In [64]:
val_df.head()

Unnamed: 0,translations
0,bibliapiqa ninmi “tupa tupaykachispa ” penqayp...
1,yachankichikmi imaynatam taytaqa churin rimapa...
2,ñoqanchikqa ¿imaynatam jesusta hinalla qatichw...
3,chaypaqqa manka ruwaq runaqa puntatam chay mit...
4,bibliam willawanchik hamuq tiempopiqa imawanpa...


In [65]:
test_df.head()

Unnamed: 0,translations
0,áfrica law ghana sutiyoq nacionpim wilsonpa hu...
1,¿imaynatam llakiyuqta yanapana 6 ###>cómo dar ...
2,¿imanasqataq hinalla mañakunanchik ###>¿por qu...
3,diosqa yachanmi ima munasqanchikta ###>para co...
4,roboamqa allin kamachiq kaytam munarqa chaymi ...


In [67]:
train_df.to_csv('dataset/train.csv', index=False)
val_df.to_csv('dataset/validation.csv', index=False)
test_df.to_csv('dataset/test.csv', index=False)

In [70]:
train_dataset1 = Dataset.from_csv('dataset/train.csv')

Generating train split: 145257 examples [00:00, 271540.44 examples/s]


In [80]:
train_dataset1['translations'][:5]

['estados unidos nacionpi wawqenchikmi kay textopa nisqanta kasukurqa ###>un hermano de estados unidos tomó a pecho este consejo',
 'qamqa pasakunankamam wiñaypaqña runata vencerunki rikchaynintapas tikraruspaykim paytaqa qarqorunki ###>prevaleces para siempre contra él y se va cambias su apariencia y lo despides',
 'jarkay ###>obstruir',
 'imay urataq ###>¿a qué hora',
 'mana allin ruraqkunam ichaqa kay pachamanta chinkachisqa kanqaku traicionaqkunam ichaqa qora pilarusqa hina kay pachamanta qechusqa kanqaku ###>pero los impíos serán cortados de la tierra y los pérfidos serán desarraigados de ella']