In [None]:
import pandas as pd
import re
from datasets import Dataset, Audio

In [None]:
# Relative path of the raw dictionary text file that the notebook parses.
dataset = "./ABBYY_clear.txt"

In [None]:
# Read the raw dictionary text, one list element per line (newlines kept).
# The encoding is pinned because the file holds Greek text and open()'s
# default codec is platform-dependent.
# NOTE(review): UTF-8 is assumed — confirm against the actual export.
with open(dataset, "r", encoding="utf-8") as f:
    data = f.readlines()

In [None]:
# One row per raw line; with a plain list the single column gets the label 0.
df = pd.DataFrame(data)

In [None]:
df.head()

In [None]:
# Split each raw line into a headword and an optional description: the word is
# everything before the first "(", "=", "[", "-" or ",", and whatever follows
# that delimiter becomes the description (NaN when there is no delimiter).
entry_pattern = r'^([^(=\[\-,]+?)(?:\s*[\(=\[\-,]\s*(.*))?$'
parsed = df[0].str.extract(entry_pattern, expand=True)
df[['word', 'description']] = parsed

# The raw-line column is no longer needed once it has been parsed.
df = df.drop(columns=[0])

In [None]:
# Cleaning
#########################
# Written as a single chain that yields a fresh frame: the previous version
# mixed inplace=True with a column assignment on a filtered slice, which
# triggers pandas' SettingWithCopyWarning and is not safe to re-run.
df = (
    df.drop_duplicates()
    # Drop entries whose headword contains a "-<digits>-" marker.
    .loc[lambda frame: ~frame["word"].str.contains(r'-\d+-', regex=True, na=False)]
    # Tag entries without a description with the 'N/A' placeholder.
    .assign(description=lambda frame: frame["description"].fillna('N/A'))
)
#########################

In [None]:
# Discard rows for which no headword could be extracted (NaN in 'word').
df = df.dropna(axis='index', how='any', subset=['word'])

In [None]:
# Rows still carrying the 'N/A' placeholder have no description. Author's note:
# 2044 were left, most of them just page initials, so they are dropped.
# The count is printed explicitly; as a bare expression in the middle of the
# cell its value was never displayed.
missing_description = df['description'] == 'N/A'
print(f"Rows without a description: {missing_description.sum()}")

# Boolean filtering returns a new frame instead of mutating df in place.
df = df[~missing_description]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Tokenizer and seq2seq weights come from the same Hub checkpoint.
checkpoint = "ZiartisNikolas/NMT-cypriot-dialect-to-greek"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

# When several GPUs are available, wrap the model in DataParallel.
multiple_gpus = torch.cuda.device_count() > 1
if multiple_gpus:
    model = torch.nn.DataParallel(model)

In [None]:
# !pip install sacremoses

In [None]:
df.info()

In [None]:
# Full dataset that serves as the base for the final frame.
really_full = pd.read_csv('complete_dataset.csv')
# Chunk file that a later cell writes back after translating.
# NOTE(review): it must already exist on disk for a fresh Run All — confirm.
complete = pd.read_csv('full8-12.csv')

In [None]:
def translate_to_greek(text):
    """Translate one string with the notebook-level seq2seq model.

    Uses the global ``tokenizer``, ``model`` and ``device`` objects.

    Args:
        text: Source string. NaN/None and the empty string are accepted.

    Returns:
        The decoded translation, or ``''`` when ``text`` is missing or empty.
    """
    if pd.isna(text) or text == '':
        return ''

    # Tokenize (truncating to at most 512 tokens) and move every tensor to the
    # device that holds the model.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # generate() is called on the underlying model. Decide whether to unwrap by
    # looking at the object itself instead of the GPU count, so the function
    # keeps working whether or not the model was actually wrapped.
    generator = model.module if isinstance(model, torch.nn.DataParallel) else model

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = generator.generate(**encoded, max_length=512, num_beams=4, early_stopping=True)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
tqdm.pandas(desc="Translating")

# df['greek_word'] = df['word'][15000:].progress_apply(translate_to_greek)
# Translate the descriptions at positions 8000-11999 only.
translated_chunk = complete['description'].iloc[8000:12000].progress_apply(translate_to_greek)

# Assigning the partial Series directly would reset every row outside the
# chunk to NaN, so translations already present in the column are kept.
if 'greek_description' in complete.columns:
    translated_chunk = translated_chunk.combine_first(complete['greek_description'])
complete['greek_description'] = translated_chunk

# print(df[['word', 'description', 'greek_translation']].head())

In [None]:
# index=False: this file is read back with pd.read_csv, and a written index
# would come back as an extra 'Unnamed: 0' column.
complete.to_csv('full8-12.csv', index=False)

In [None]:
# DataFrame.drop() without labels/index/columns raises ValueError and stops a
# full run of the notebook.
# NOTE(review): the intended target is assumed to be the stray CSV index
# column, the only column this notebook drops elsewhere — confirm.
complete = complete.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Spot-check fifty rows (positions 8050-8099) of the translated chunk.
complete.iloc[8050:8100]

In [None]:
# `ready` does not exist yet at this point of a top-to-bottom run, so inspect
# the frame it is about to be copied from.
really_full.info()

In [None]:
# Instead of concatenating frames, start from the full dataset and fill it in.
ready = really_full.copy()

# Write the translated chunk into the matching rows. .loc slices are label-based
# and inclusive, so 8000:11999 pairs with the positional slice [8000:12000].
# NOTE(review): assumes really_full carries a default 0..n-1 index — confirm.
chunk_values = complete['greek_description'][8000:12000].values
ready.loc[8000:11999, 'greek_description'] = chunk_values

missing_after = ready['greek_description'].isna().sum()
print("Пропусков после заполнения:", missing_after)

In [None]:
ready.info()

In [None]:
# drop_duplicates() returns a new frame; keep the result so that the following
# info() call actually reflects the deduplication.
ready = ready.drop_duplicates()

In [None]:
ready.info()

In [None]:
# Remove the leftover CSV index column when it is present, then drop rows that
# are exact duplicates.
ready = (
    ready
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .drop_duplicates()
)

In [None]:
# Keep only rows in which every column has a value.
ready = ready.dropna(axis='index', how='any')

In [None]:
ready.info()

In [None]:
# index=False keeps the row index out of the file, so reading it back does not
# produce an 'Unnamed: 0' column like the one this notebook has to drop.
ready.to_csv('DATASET.csv', index=False)

In [None]:
ready.head()

In [None]:
# Keep the four dataset columns, in this fixed order.
final_columns = ['word', 'description', 'greek_word', 'greek_description']
ready = ready[final_columns]

In [None]:
ready

In [None]:
#### Step 1
# Flag rows whose description is shorter (in characters) than its headword.
shorter_desc = ready['description'].str.len() < ready['word'].str.len()
n_shorter = shorter_desc.sum()

print(f"Строк для удаления (description короче word): {n_shorter}")

# Keep only the rows that were not flagged.
ready = ready.loc[~shorter_desc]

print(f"После удаления: {len(ready)}")
print(f"Удалено строк: {n_shorter}")

In [None]:
#############
# Step 2
print(f"Было строк: {len(ready)}")

# Recomputed on the current frame: headwords containing at least one digit.
has_digits = ready['word'].str.contains(r'\d', na=False)
n_with_digits = has_digits.sum()

print(f"Строк для удаления (word содержит цифры): {n_with_digits}")

# Keep only the rows that were not flagged.
ready = ready.loc[~has_digits]

print(f"После удаления: {len(ready)}")
print(f"Удалено строк: {n_with_digits}")

In [None]:
# Step 3
print(f"Было строк: {len(ready)}")

# The pattern matches any character repeated four or more times in a row.
run_pattern = r'(.)\1{3,}'
repeated_chars_word = ready['greek_word'].str.contains(run_pattern, na=False)
repeated_chars_desc = ready['greek_description'].str.contains(run_pattern, na=False)

print(f"Строк с повторениями в greek_word: {repeated_chars_word.sum()}")
print(f"Строк с повторениями в greek_description: {repeated_chars_desc.sum()}")

# A row is removed when either translated column contains such a run.
to_remove = repeated_chars_word | repeated_chars_desc
n_to_remove = to_remove.sum()
print(f"Строк для удаления (повторения в greek_word ИЛИ greek_description): {n_to_remove}")

ready = ready.loc[~to_remove]

print(f"После удаления: {len(ready)}")
print(f"Удалено строк: {n_to_remove}")

In [None]:
##########
# Step 4
print(f"Было строк: {len(ready)}")

# Headwords that consist of exactly one character.
single_char_words = ready['word'].str.len().eq(1)
n_single_char = single_char_words.sum()

print(f"Строк для удаления (односимвольные слова): {n_single_char}")

ready = ready.loc[~single_char_words]

print(f"После удаления: {len(ready)}")
print(f"Удалено строк: {n_single_char}")

In [None]:
#########
# Step 5
print(f"Было строк: {len(ready)}")

# Per description: number of digit characters versus all remaining characters
# (missing descriptions count as zero on both sides).
description_text = ready['description'].str
digit_count = description_text.count(r'\d').fillna(0)
total_count = description_text.len().fillna(0)
non_digit_count = total_count - digit_count

# Flag descriptions made up mostly of digits.
more_digits = digit_count.gt(non_digit_count)
n_more_digits = more_digits.sum()

print(f"Строк для удаления (в description цифр больше чем остальных): {n_more_digits}")

ready = ready.loc[~more_digits]

print(f"После удаления: {len(ready)}")
print(f"Удалено строк: {n_more_digits}")

In [None]:
# single_char_words = ready['word'].str.len() == 1
# count_single_char = single_char_words.sum()

# has_digits = ready['word'].str.contains(r'\d', na=False)
# count_digits = has_digits.sum()

# repeated_chars_word = ready['greek_word'].str.contains(r'(.)\1{3,}', na=False)
# repeated_chars_desc = ready['greek_description'].str.contains(r'(.)\1{3,}', na=False)
# count_repeated_word = repeated_chars_word.sum()
# count_repeated_desc = repeated_chars_desc.sum()
# both_repeated = repeated_chars_word & repeated_chars_desc

# digit_count = ready['description'].str.count(r'\d').fillna(0)
# total_count = ready['description'].str.len().fillna(0)
# non_digit_count = total_count - digit_count

# # Найти строки где цифр больше чем остальных символов
# more_digits = digit_count > non_digit_count
# count_more_digits = more_digits.sum()

In [None]:
ready.info()

In [None]:
# index=False: after the filtering steps the index is just a gappy row label,
# and writing it would add an 'Unnamed: 0' column when the file is read back.
ready.to_csv('DATASET_CLEARED.csv', index=False)

In [None]:
# `ready` is a DataFrame, so use the DataFrame constructor rather than
# from_dict (which is documented for a mapping of columns).
# preserve_index=False keeps the filtered frame's gappy index out of the
# dataset columns.
hf_df = Dataset.from_pandas(ready, preserve_index=False)

In [None]:
hf_df

In [None]:
# Authenticate against the Hugging Face Hub before pushing the dataset.
# login() is called without arguments — presumably it prompts for a token
# interactively (verify against the huggingface_hub docs).
from huggingface_hub import login
login()

In [None]:
# Shuffling data and make it work for HF
from datasets import Dataset, DatasetDict
import pandas as pd

# Shuffle all rows reproducibly (fixed seed) and renumber them from zero.
ready_shuffled = (
    ready
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Plain column-name -> list mapping built from the shuffled frame.
shuffled_dict = {
    column: ready_shuffled[column].tolist()
    for column in ('word', 'description', 'greek_word', 'greek_description')
}

In [None]:
# Is the number of rows unchanged?
print(f"Строк до: {len(ready)}")
print(f"Строк после: {len(ready_shuffled)}")

# Is the content the same, only in a different order? Both checks compare the
# sets themselves; the description check used to compare only the number of
# unique values, which would also pass for entirely different texts.
print(f"Одинаковые слова: {set(ready['word']) == set(ready_shuffled['word'])}")
print(f"Одинаковые описания: {set(ready['description']) == set(ready_shuffled['description'])}")

In [None]:
ready_shuffled.info()

In [None]:
# from_dict expects a column -> values mapping; pass the dict that was built
# from the shuffled frame for this purpose (it was otherwise unused) instead
# of the DataFrame itself.
hf_shuffled = Dataset.from_dict(shuffled_dict)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
hf_shuffled

In [None]:
# First carve off 20% of the rows as a hold-out ...
train_temp = hf_shuffled.train_test_split(
    test_size=0.2,
    seed=42,
)
# ... then cut that hold-out in half (validation / test), with the same seed.
val_test = train_temp['test'].train_test_split(
    test_size=0.5,
    seed=42,
)

In [None]:
# Assemble the three named splits; the row counts are the author's estimates.
split_map = {
    'train': train_temp['train'],      # 80% (~21,600)
    'validation': val_test['train'],   # 10% (~2,700)
    'test': val_test['test'],          # 10% (~2,700)
}
final_dataset = DatasetDict(split_map)

In [None]:
final_dataset

In [None]:
# Upload all splits to the Hub as a private dataset repository.
final_dataset.push_to_hub("Elormiden/cypriot-greek-dictionary", private=True)