In [1]:
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
import string
from underthesea import word_tokenize
from gensim.models import FastText
from deep_translator import GoogleTranslator
import random
import os
import re
from sklearn.model_selection import train_test_split

# Augment

In [2]:

from concurrent.futures import ThreadPoolExecutor
import time
import random

# Back-translation with Deep Translator (auto-detected source -> English -> Vietnamese).
# NOTE: this helper is currently disabled. It is wrapped in a triple-quoted string,
# so Python evaluates it as an unused string expression and the function is never
# defined. The only reference to it elsewhere in this cell is a commented-out call.
"""def back_translation_deep_translator(doc):
    translator = GoogleTranslator(source='auto', target='en')
    back_translator = GoogleTranslator(source='en', target='vi')

    def translate_text(text, max_retries=10, delay=1):
        for attempt in range(max_retries):
            try:
                translated = translator.translate(text)
                back_translated = back_translator.translate(translated)
                return back_translated
            except Exception as e:
                time.sleep(delay)
                
        print('Không thể dịch')
        return text

    chunk_size = 1000
    if len(doc) > chunk_size:
        chunks = [doc[i:i + chunk_size] for i in range(0, len(doc), chunk_size)]
        results = [translate_text(chunk) for chunk in chunks]
        return ' '.join(results)
    else:
        return translate_text(doc)"""


# Synonym replacement
def synonym_replacement(doc, synonym_dict):
    """Replace every token of ``doc`` that has known synonyms with a random one.

    ``doc`` is split into tokens with ``word_tokenize``. Each token is looked
    up in ``synonym_dict`` by its lower-cased form; when the lookup yields a
    non-empty collection, one synonym is picked with ``random.choice``.
    Every other token is kept exactly as it appeared in the input.

    Args:
        doc: The text to augment.
        synonym_dict: Mapping from a lower-cased word to a collection of
            candidate synonyms for it.

    Returns:
        The resulting tokens joined with single spaces.
    """
    new_tokens = []
    for word in word_tokenize(doc):
        candidates = synonym_dict.get(word.lower())
        if candidates:
            new_tokens.append(random.choice(list(candidates)))
        else:
            # No usable synonym (unknown word or an empty entry): keep the
            # token untouched. Empty entries used to be lower-cased, which was
            # inconsistent with how unknown words are handled.
            new_tokens.append(word)
    return ' '.join(new_tokens)


# Data augmentation
def augmentation_text(data, option,  synonym_dict, num_new_texts, max_workers = 10):
    """Generate ``num_new_texts`` synonym-augmented texts for each label value.

    For every distinct value in ``data[option]``, rows carrying that value are
    sampled with replacement, and the ``'Text'`` of each sampled row is passed
    through ``synonym_replacement`` on a thread pool.

    Args:
        data: DataFrame with a ``'Text'`` column and a column named ``option``.
            Rows are addressed through ``data.loc``, so index labels are
            assumed to be unique (TODO confirm for callers other than the
            train/test split results used in this notebook).
        option: Name of the label column.
        synonym_dict: Synonym mapping forwarded to ``synonym_replacement``.
        num_new_texts: Number of augmented texts to produce per label value.
        max_workers: Size of the thread pool.

    Returns:
        A tuple ``(augmented_docs, labels)`` of two equally long lists, where
        ``labels[i]`` is the label of the row ``augmented_docs[i]`` came from.
        If augmenting one label value fails, the error is printed and that
        label contributes nothing to either list.
    """
    augmented_docs = []
    labels = []

    def process_document(idx):
        # Fetch the source text by index label and augment it.
        doc = data.loc[idx, 'Text']
        #new_doc = back_translation_deep_translator(new_doc)
        return synonym_replacement(doc, synonym_dict)

    for label in data[option].unique():
        label_data = data[data[option] == label]
        if label_data.empty:
            # Happens for NaN labels (NaN never compares equal to itself);
            # sampling from an empty index would raise.
            continue

        indices = [random.choice(label_data.index) for _ in range(num_new_texts)]
        try:
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                results = list(executor.map(process_document, indices))
        except Exception as e:
            print(f"Lỗi xảy ra trong ThreadPoolExecutor: {e}")
            # Skip this label entirely so texts and labels stay aligned;
            # previously the labels were already recorded and `results` was
            # either undefined or left over from the previous label.
            continue

        augmented_docs.extend(results)
        labels.extend(data.loc[indices, option])

    return augmented_docs, labels


In [3]:
# Build the synonym dictionary
def create_synonyms_dict(directory):
    """Build a word -> set-of-synonyms mapping from the ``.csv`` files in ``directory``.

    Each non-empty line is treated as one comma-separated group of words that
    are synonyms of each other. Every word of a group is mapped to the other
    words of that group; groups found on different lines or in different
    files are merged per word.

    Args:
        directory: Path of the folder to scan. Only entries whose name ends
            with ``.csv`` are read.

    Returns:
        A dict mapping each word to a ``set`` of its synonyms. A file that
        cannot be processed is reported with ``print`` and otherwise ignored.
    """
    synonyms_dict = {}

    for filename in os.listdir(directory):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(directory, filename)
        try:
            # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading BOM
            # that would otherwise be glued to the first word of the file.
            with open(file_path, 'r', encoding='utf-8-sig') as file:
                for line in file:
                    # Strip every field and drop empty ones up front, so that
                    # "a,,b" or a trailing comma cannot register '' as a synonym.
                    words = [field.strip() for field in line.split(',')]
                    words = [word for word in words if word]
                    if len(words) < 2:
                        # Blank line or a single word: nothing to pair up.
                        continue

                    for word in words:
                        synonyms_dict.setdefault(word, set()).update(
                            other for other in words if other != word
                        )
        except Exception as e:
            print(f"Lỗi {e}")

    return synonyms_dict

In [4]:


#translator = GoogleTranslator()
# Folder scanned for synonym CSV files. Built with os.path.join so the
# relative path resolves on any OS, not only where '\\' is the separator.
directory = os.path.join('..', '..', 'vi-wordnet')
synonyms_dict = create_synonyms_dict(directory)
num_news_train = 1000  # augmented texts generated per label value for the training split
num_news_test = 300    # augmented texts generated per label value for the test split
max_worker = 100       # thread-pool size passed to augmentation_text



In [5]:
# Folder that holds the labelled input and receives the per-option outputs.
# Paths are built with os.path.join so they resolve on any OS; with literal
# backslashes, a non-Windows system would treat the whole string as one
# file name.
processed_dir = os.path.join('..', '..', 'data', 'processed')
data = pd.read_csv(os.path.join(processed_dir, 'labeled_data.csv'))

for option in ['Near', 'Mid', 'Far', 'Potential']:
    # One independent dataset per label column: the text plus that label.
    df = data[['Text', option]]
    doc_train, doc_test, label_train, label_test = train_test_split(
        df, data[option], test_size=0.3, random_state=42
    )

    # Augment after splitting, each split from its own rows only, so texts
    # derived from training documents never end up in the test set.
    train_texts, train_labels = augmentation_text(
        doc_train, option, synonyms_dict,
        num_new_texts=num_news_train, max_workers=max_worker,
    )
    test_texts, test_labels = augmentation_text(
        doc_test, option, synonyms_dict,
        num_new_texts=num_news_test, max_workers=max_worker,
    )

    # 'origin' marks provenance: 0 = original document, 1 = augmented text.
    augmented_train = pd.DataFrame({'Text': train_texts, option: train_labels}).assign(origin=1)
    augmented_test = pd.DataFrame({'Text': test_texts, option: test_labels}).assign(origin=1)

    # assign() returns new frames instead of writing into the split results.
    doc_train = doc_train.assign(origin=0)
    doc_test = doc_test.assign(origin=0)
    doc_test_gr = pd.concat([doc_test, augmented_test], axis=0)
    doc_train_gr = pd.concat([doc_train, augmented_train], axis=0)

    # The index is still written as the first CSV column, as before.
    doc_train_gr.to_csv(os.path.join(processed_dir, f'{option}_train.csv'))
    doc_test_gr.to_csv(os.path.join(processed_dir, f'{option}_test.csv'))

