In [6]:
import os
from googletrans import Translator

In [7]:
# Extract Texts from Folders
def read_files_from_directory(reviews_path, encoding='ISO-8859-1'):
    """Read every ``.txt`` file in a directory and return the file contents.

    Files are visited in sorted filename order so that the returned list is
    the same on every run (``os.listdir`` makes no ordering guarantee).

    Args:
        reviews_path: Path of the directory that holds the review files.
        encoding: Text encoding used to decode each file.
            Defaults to ``'ISO-8859-1'``.

    Returns:
        A list of strings, one per ``.txt`` file, ordered by filename.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``open`` when the directory
            cannot be listed or a file cannot be read.
    """
    reviews = []
    for filename in sorted(os.listdir(reviews_path)):
        file_path = os.path.join(reviews_path, filename)
        # Only regular files count: a sub-directory named "*.txt" would make open() fail.
        if not (filename.endswith(".txt") and os.path.isfile(file_path)):
            continue
        with open(file_path, 'r', encoding=encoding) as file:
            reviews.append(file.read())
    return reviews

In [8]:
# Define the paths to the positive and negative reviews.
# The dataset root defaults to the original author's local checkout, but can be
# overridden with the REVIEWS_DATASET_DIR environment variable so the notebook
# can run on another machine without editing this cell.
dataset_dir = os.environ.get(
    'REVIEWS_DATASET_DIR',
    r'D:\Gethub\Sentiment-Analysis-of-Movie-Reviews\Dataset',
)
pos_reviews_path = os.path.join(dataset_dir, 'pos')
neg_reviews_path = os.path.join(dataset_dir, 'neg')
# Read the positive and negative reviews
pos_reviews = read_files_from_directory(pos_reviews_path)
neg_reviews = read_files_from_directory(neg_reviews_path)

In [9]:
# Data Augmentation By Back Translation
def _split_into_chunks(text, max_chunk_size):
    """Split ``text`` into pieces of at most ``max_chunk_size`` characters, breaking at whitespace when possible."""
    chunks = []
    start = 0
    while len(text) - start > max_chunk_size:
        window_end = start + max_chunk_size
        # Last whitespace character that still keeps the piece within the size limit.
        split_at = max(text.rfind(sep, start + 1, window_end + 1) for sep in (' ', '\n', '\t'))
        if split_at == -1:
            # No whitespace in the window: fall back to a hard cut.
            piece, start = text[start:window_end], window_end
        else:
            # Drop the separator itself; the pieces are re-joined with a space later.
            piece, start = text[start:split_at], split_at + 1
        if piece.strip():
            chunks.append(piece)
    remainder = text[start:]
    if remainder.strip():
        chunks.append(remainder)
    return chunks


def back_translate(texts, target_language='fr', source_language='en', max_chunk_size=4000):
    """Augment texts by translating them to another language and back.

    Each text is translated from ``source_language`` to ``target_language``
    and back again. Long texts are split into chunks of at most
    ``max_chunk_size`` characters (at whitespace where possible, so words are
    not cut in half) and the back-translated chunks are re-joined with a
    single space.

    A back-translation is only added when every chunk of the text was
    translated successfully; a text that is empty or whose translation fails
    is kept as-is without an augmented copy, so no empty or truncated samples
    end up in the result.

    Args:
        texts: Iterable of strings to augment.
        target_language: Language code of the intermediate language.
        source_language: Language code of the original texts.
        max_chunk_size: Maximum number of characters sent per translation
            request. Must be positive.

    Returns:
        A list containing every original text, each immediately followed by
        its back-translation when one could be produced.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")

    all_texts = []
    translator = Translator()

    for i, text in enumerate(texts):
        # The original text is always kept.
        all_texts.append(text)

        chunks = _split_into_chunks(text, max_chunk_size)
        if len(chunks) > 1:
            print(f"Text number {i + 1} was splited into {len(chunks)} chunks.")

        translated_chunks = []
        complete = bool(chunks)
        for j, chunk in enumerate(chunks):
            try:
                # Pass the languages explicitly instead of relying on auto-detection.
                translated_chunk = translator.translate(
                    chunk, src=source_language, dest=target_language
                ).text
                back_translated_chunk = translator.translate(
                    translated_chunk, src=target_language, dest=source_language
                ).text
                if not back_translated_chunk:
                    raise ValueError("empty translation result")
                translated_chunks.append(back_translated_chunk)
            except Exception as e:
                print(f"Error occurred for chunck number {j + 1}: {e}")
                # A partial back-translation would be a truncated sample; give up on this text.
                complete = False
                break

        if not complete:
            print(f"Text number {i + 1} was skipped (no complete back translation).")
            continue

        all_texts.append(' '.join(translated_chunks))
        print(f"Text number {i + 1} is done.")

    return all_texts

In [10]:
print("Lenght Of Positive Reviews Before Augmentation: ", len(pos_reviews))
print("Lenght Of Negative Reviews Before Augmentation: ", len(neg_reviews))
# back_translate returns the original texts together with their back-translations.
# NOTE: the result overwrites the input lists, so re-running this cell feeds the
# already augmented lists back in; re-run the loading cell first.
pos_reviews = back_translate(pos_reviews)
neg_reviews = back_translate(neg_reviews)
print("Lenght Of Positive Reviews After Augmentation: ", len(pos_reviews))
# This count is taken after augmentation, so it is labelled "After" (it used to say "Before").
print("Lenght Of Negative Reviews After Augmentation: ", len(neg_reviews))

Lenght Of Positive Reviews Before Augmentation:  1000
Lenght Of Negative Reviews Before Augmentation:  1000
Text number 1 was splited into 2 chunks.
Text number 1 is done.
Text number 2 was splited into 2 chunks.
Text number 2 is done.
Text number 3 is done.
Text number 4 was splited into 2 chunks.
Text number 4 is done.
Text number 5 is done.
Text number 6 was splited into 2 chunks.
Text number 6 is done.
Text number 7 was splited into 2 chunks.
Text number 7 is done.
Text number 8 is done.
Text number 9 is done.
Text number 10 is done.
Text number 11 was splited into 2 chunks.
Text number 11 is done.
Text number 12 was splited into 2 chunks.
Text number 12 is done.
Text number 13 is done.
Text number 14 is done.
Text number 15 was splited into 2 chunks.
Text number 15 is done.
Text number 16 is done.
Text number 17 is done.
Text number 18 was splited into 2 chunks.
Text number 18 is done.
Text number 19 is done.
Text number 20 is done.
Text number 21 is done.
Text number 22 is done.


In [11]:
# Write the augmented reviews to text files
def write_reviews_to_files(reviews, directory):
    """Write each review to its own UTF-8 text file inside ``directory``.

    Files are named ``review_1.txt``, ``review_2.txt``, ... following the
    order of ``reviews``. Existing files with the same names are overwritten.

    Args:
        reviews: Iterable of strings to write.
        directory: Target directory; created (including parents) if missing.

    Raises:
        OSError: Propagated from ``os.makedirs`` / ``open`` when the directory
            cannot be created or a file cannot be written.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(directory, exist_ok=True)

    # Write each review to a separate text file
    for i, review in enumerate(reviews, start=1):
        filename = os.path.join(directory, f"review_{i}.txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(review)

In [12]:
# Root folder of the augmented dataset, with one sub-folder per sentiment label
main_directory = 'Augmented_Dataset'
pos_directory, neg_directory = (
    os.path.join(main_directory, label) for label in ('pos', 'neg')
)

# Make sure the root and both label folders exist (no error if already present)
for _output_dir in (main_directory, pos_directory, neg_directory):
    os.makedirs(_output_dir, exist_ok=True)

In [13]:
# Persist the positive reviews first, then the negative ones, each into its own folder
for _reviews, _target_directory in (
    (pos_reviews, pos_directory),
    (neg_reviews, neg_directory),
):
    write_reviews_to_files(_reviews, _target_directory)

print("Reviews have been written to files successfully!!")

Reviews have been written to files successfully!!
