In [2]:
import pandas as pd
from datasets import Dataset
from datasets import load_dataset
from langdetect import detect
from tqdm import tqdm 

In [3]:
# Build one combined DataFrame from the first `records_to_download` training
# rows of each MLSUM language configuration.

# Accumulates one DataFrame per language; concatenated after the loop.
dataframes = []

# MLSUM configuration names to load. The same strings are written verbatim
# into the `language` column below.
# NOTE(review): 'tu' is the configuration name used here for Turkish; it is
# not the ISO 639-1 code ('tr'), which matters for any later code lookup.
languages = ['de', 'es', 'fr', 'ru', 'tu']

# Number of records to download for each language (used in the split slice).
records_to_download = 1200

# Load the dataset for each language configuration
for lang in languages:
    print(f"Loading {records_to_download} records from MLSUM dataset for language: {lang}")
    # The split string `train[:N]` requests only the first N training rows.
    # NOTE(review): trust_remote_code=True is passed to load_dataset — confirm
    # that executing the dataset's loading script is acceptable here.
    dataset = load_dataset('mlsum', lang, split=f'train[:{records_to_download}]', trust_remote_code=True)

    # Convert to pandas DataFrame for easier manipulation
    df = pd.DataFrame(dataset)

    # Tag every row with the configuration name it was loaded from.
    df['language'] = lang

    # Append the DataFrame to the list
    dataframes.append(df)

# Concatenate all DataFrames into one; ignore_index=True renumbers the rows
# 0..N-1 instead of repeating each per-language index.
data = pd.concat(dataframes, ignore_index=True)

# Drop the 'url' column; errors='ignore' makes this a no-op if it is absent.
data = data.drop('url', axis=1, errors='ignore')

Loading 1200 records from MLSUM dataset for language: de
Loading 1200 records from MLSUM dataset for language: es
Loading 1200 records from MLSUM dataset for language: fr
Loading 1200 records from MLSUM dataset for language: ru
Loading 1200 records from MLSUM dataset for language: tu


In [4]:
data

Unnamed: 0,text,summary,topic,title,date,language
0,"Transport im Viehwaggon, Fleischgeruch in der ...","Transport im Viehwaggon, Fleischgeruch in der ...",politik,So war Auschwitz: Erinnerungen einer Holocaust...,00/01/2010,de
1,"Marmorner Zebrastreifen, pompöse Gebäude: Sind...","Marmorner Zebrastreifen, pompöse Gebäude: Sind...",politik,Kommunen in Not (3): Sindelfingen - Jenseits g...,00/01/2010,de
2,Wenn an diesem Montag die Landesvorsitzenden d...,Oskar Lafontaine gibt den Parteivorsitz der Li...,politik,Personaldebatte bei der Linken - Wer kommt nac...,00/01/2010,de
3,Das Portrait von 1791 zeigt Haitis Nationalhel...,Die Wurzeln des Elends liegen in der Vergangen...,politik,Geschichte von Haiti - Napoleons Schmach,00/01/2010,de
4,Neue Köpfe und alte Bekannte: Die neue Regieru...,Schwarz-Gelb ist noch nicht jene Traumkoalitio...,politik,Schwarz-gelbes Kabinett - Merkels Mannschaft i...,00/01/2010,de
...,...,...,...,...,...,...
5995,"Bodrum'da etkisini yitiren yağışlı hava, manda...",Muğla'nın Bodrum ilçesinde etkili olan olumsuz...,unknown,Yağış mandalina bahçelerini vurdu,00/01/2010,tu
5996,"Edinilen bilgiye göre, Adapazarı'ndan Bilecik ...",Sakarya'nın Pamukova ilçesindeki trafik kazası...,unknown,Pamukovada trafik kazası,00/01/2010,tu
5997,- İstanbul kent genelinde Saat 18:18 itibari i...,Yoğun kar yağışı ve tipinin etkisi altındaki İ...,unknown,İstanbulda trafik durumu,00/01/2010,tu
5998,Tükettiğimiz gıdaların hijyenik ve temiz olmas...,işte güvenli ve sağlıklı yemek hazırlamak için...,unknown,Güvenli yemek için 10 altın öneri,00/01/2010,tu


In [5]:
# Convert the combined pandas DataFrame into a Hugging Face `Dataset`.
# NOTE: this rebinds `dataset`, which the loading loop above last assigned to
# the final per-language split.
dataset = Dataset.from_pandas(data)

In [6]:
dataset

Dataset({
    features: ['text', 'summary', 'topic', 'title', 'date', 'language'],
    num_rows: 6000
})

In [7]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
from tqdm import tqdm

# Load translation model and tokenizer.
# Both objects are notebook-level globals: `translate_text` reads
# `translation_model` and mutates `translation_tokenizer.src_lang` per call.
translation_model_name = "facebook/mbart-large-50-many-to-one-mmt"
translation_model = MBartForConditionalGeneration.from_pretrained(translation_model_name)
# The tokenizer is created with English ("en_XX") as its target language.
translation_tokenizer = MBart50Tokenizer.from_pretrained(translation_model_name, tgt_lang="en_XX")

2024-12-04 19:28:10.222821: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-04 19:28:12.033172: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2024-12-04 19:28:12.033391: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/loca

#### Translate dataset to English

In [8]:
# Map language codes to the mBART-50 source-language codes set on the tokenizer.
# `translate_text` looks these keys up with the value returned by
# `langdetect.detect`, so the keys must be the codes langdetect emits.
# langdetect reports Turkish as "tr" (ISO 639-1), whereas the MLSUM
# configuration name stored in the `language` column is "tu". Both are mapped:
# with only "tu" present, detected Turkish text fell through to the "en_XX"
# default and was tokenized as if it were English.
lang_code_mapping = {
    "de": "de_DE",  # German
    "es": "es_XX",  # Spanish
    "fr": "fr_XX",  # French
    "ru": "ru_RU",  # Russian
    "tr": "tr_TR",  # Turkish (code returned by langdetect)
    "tu": "tr_TR",  # Turkish (MLSUM configuration name)
}

# Define a function to translate text
def translate_text(text):
    """Translate ``text`` to English with the notebook-level mBART model.

    The source language is guessed with ``langdetect.detect`` and looked up in
    ``lang_code_mapping``; codes missing from the mapping fall back to
    ``"en_XX"``. The chosen code is written to ``translation_tokenizer.src_lang``
    (shared state), the input is tokenized with truncation at 128 tokens, and
    the first generated sequence is decoded without special tokens.

    Args:
        text: The string to translate.

    Returns:
        The decoded translation. ``text`` itself is returned unchanged when it
        is not a string, is empty or whitespace-only, or when language
        detection / translation raises (the error is printed in that case).
    """
    # Nothing to detect or translate: skip the model entirely. This also keeps
    # non-string values (None, NaN, numbers) from raising inside the error
    # handler below, where the message slices `text`.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        # Detect the language
        detected_lang = detect(text)
        # Map detected language to MBART code, default to English if not found
        lang_code = lang_code_mapping.get(detected_lang, "en_XX")
        # Set the source language dynamically
        translation_tokenizer.src_lang = lang_code
        # max_length=128 with truncation=True: longer inputs are cut to their
        # first 128 tokens before being sent to the model.
        inputs = translation_tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
        outputs = translation_model.generate(**inputs)
        translation = translation_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return translation
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the original text
        # so one bad record does not abort a long translation run.
        print(f"Error translating text: {text[:30]}... - {e}")
        return text  # Return the original text if translation fails
    
# Translate the dataset
def translate_dataset(dataset):
    """Return the translations of every entry in ``dataset["text"]``.

    Each entry is passed through ``translate_text`` one at a time while a tqdm
    progress bar labelled "Translating Texts" tracks progress.

    Args:
        dataset: Any object whose ``["text"]`` item is an iterable of texts.

    Returns:
        A list of ``translate_text`` results, in the same order as the column.
    """
    progress = tqdm(dataset["text"], desc="Translating Texts")
    return [translate_text(source_text) for source_text in progress]

In [None]:
# Example usage: translate the `text` column and attach the result to the
# dataset as a new `translated_text` column.
# NOTE(review): this rebinds `dataset`, so re-running the cell would call
# add_column on a dataset that already has `translated_text` — presumably an
# error; confirm before re-executing.
translated_texts = translate_dataset(dataset)
dataset = dataset.add_column("translated_text", translated_texts)

In [67]:
# Save the translated dataset to a local directory in the `datasets` on-disk
# format (reloaded later with load_from_disk).
# NOTE(review): the message below says "All columns", but the cells visible
# up to this point only translate the `text` column.
dataset.save_to_disk("./fully_translated_dataset_v2")
print("All columns translated and dataset saved!")

Saving the dataset (0/1 shards):   0%|          | 0/6000 [00:00<?, ? examples/s]

All columns translated and dataset saved!


In [None]:
# NOTE(review): no cell above this one assigns `translated_dataset`; the only
# visible assignments are in the later "compress and upload" cells. On a fresh
# top-to-bottom run this display would presumably raise NameError.
translated_dataset

In [None]:
# export GOOGLE_APPLICATION_CREDENTIALS="nlp_proj/service-account-key/nlp-translate-summarize-ea64d2a44317.json"

### Compress and upload to GCS

In [68]:
from google.cloud import storage
from datasets import load_from_disk
import os
import shutil

# Re-save the translated dataset into a clean staging directory and pack that
# directory into a gzip-compressed tarball for upload.

# stdlib; imported here so this step stays self-contained within the cell.
import subprocess

# Load the translated dataset
translated_dataset_path = "./fully_translated_dataset_v2"
translated_dataset = load_from_disk(translated_dataset_path)

# Create a new directory for saving the compressed dataset
compressed_dataset_dir = "./compressed_translated_dataset_v2"
if os.path.exists(compressed_dataset_dir):
    shutil.rmtree(compressed_dataset_dir)  # Remove the directory if it already exists
os.makedirs(compressed_dataset_dir, exist_ok=True)

# Save the dataset to the new directory
translated_dataset.save_to_disk(compressed_dataset_dir)

# Compress the directory into a tar.gz file.
# The command is passed as an argument list (no shell string interpolation)
# and check=True raises CalledProcessError when tar exits non-zero, so a
# failed or partial archive is never handed to the upload step.
# `-C <dir> .` archives the directory contents relative to the directory, so
# the tarball unpacks directly into whatever extraction path is chosen.
compressed_file = "translated_dataset_v2.tar.gz"
subprocess.run(
    ["tar", "-czvf", compressed_file, "-C", compressed_dataset_dir, "."],
    check=True,
)

# Upload to GCS
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Send a local file to a Google Cloud Storage bucket.

    A new ``storage.Client`` is created on every call.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define GCS bucket and file details
bucket_name = "nlp_proj"  # Target bucket; replace with your bucket name
destination_blob_name = "datasets/translated_dataset_v2.tar.gz"  # Object path inside the bucket

# Upload the tarball produced above (`compressed_file`) to GCS
upload_to_gcs(bucket_name, compressed_file, destination_blob_name)

Saving the dataset (0/1 shards):   0%|          | 0/6000 [00:00<?, ? examples/s]

./
./state.json
./data-00000-of-00001.arrow
./dataset_info.json
File translated_dataset_v2.tar.gz uploaded to datasets/translated_dataset_v2.tar.gz.


### load and unzip data

In [14]:
from google.cloud import storage
import os
import tarfile

# Define GCS bucket and file details
bucket_name = "nlp_proj"  # Source bucket; replace with your bucket name
source_blob_name = "datasets/translated_dataset_v2.tar.gz"  # Object path in GCS
# Local file name. The suffix is ".tar_v2.gz" rather than ".tar.gz"; the
# extraction helper below opens the file with an explicit "r:gz" mode, so the
# unusual extension does not affect reading it.
destination_file_name = "translated_dataset.tar_v2.gz"

# Function to download a file from GCS
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Fetch one object from a Google Cloud Storage bucket into a local file.

    A new ``storage.Client`` is created on every call.

    Args:
        bucket_name: Name of the bucket holding the object.
        source_blob_name: Object name inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    remote_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    remote_blob.download_to_filename(destination_file_name)
    print(f"File {source_blob_name} downloaded to {destination_file_name}.")

# Download the tarball from GCS to the local file named above
download_from_gcs(bucket_name, source_blob_name, destination_file_name)

# Unzip the downloaded tar.gz file
def extract_tar_gz(file_name, extract_path):
    """Extract a gzip-compressed tar archive into ``extract_path``.

    The archive is opened with an explicit ``"r:gz"`` mode, so the file's
    extension is irrelevant. When the running Python supports tarfile
    extraction filters, the ``"data"`` filter is applied so that members which
    would land outside ``extract_path`` (absolute paths, ``..`` components,
    links pointing outside) are rejected instead of being written.

    Args:
        file_name: Path of the ``.tar.gz`` archive to read.
        extract_path: Directory the archive members are written into.

    Raises:
        tarfile.TarError: If the archive is unreadable or, with filtering
            available, a member is rejected by the ``"data"`` filter.
    """
    with tarfile.open(file_name, "r:gz") as tar:
        # `tarfile.data_filter` only exists on Python versions that accept the
        # `filter` argument; older runtimes keep the previous behaviour.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_path, filter="data")
        else:
            tar.extractall(path=extract_path)
    print(f"Extracted {file_name} to {extract_path}.")

# Define extraction path (also used by the next cell's load_from_disk call)
extracted_path = "./extracted_translated_dataset_v2"

# Ensure the directory exists; exist_ok=True makes re-running this cell safe
os.makedirs(extracted_path, exist_ok=True)

# Extract the downloaded tarball into that directory
extract_tar_gz(destination_file_name, extracted_path)


File datasets/translated_dataset_v2.tar.gz downloaded to translated_dataset.tar_v2.gz.
Extracted translated_dataset.tar_v2.gz to ./extracted_translated_dataset_v2.


In [15]:
from datasets import load_from_disk

# Load the dataset from the extracted path.
# NOTE: this rebinds the notebook-level `dataset` to the copy read from disk.
dataset = load_from_disk(extracted_path)
print(dataset)

Dataset({
    features: ['text', 'summary', 'topic', 'title', 'date', 'language', 'translated_text'],
    num_rows: 6000
})


In [16]:
# Convert the reloaded dataset to a pandas DataFrame and print its first rows
# as plain text. This rebinds `df`, which the loading loop used earlier.
df = dataset.to_pandas()
print(df.head())

                                                text  \
0  Transport im Viehwaggon, Fleischgeruch in der ...   
1  Marmorner Zebrastreifen, pompöse Gebäude: Sind...   
2  Wenn an diesem Montag die Landesvorsitzenden d...   
3  Das Portrait von 1791 zeigt Haitis Nationalhel...   
4  Neue Köpfe und alte Bekannte: Die neue Regieru...   

                                             summary    topic  \
0  Transport im Viehwaggon, Fleischgeruch in der ...  politik   
1  Marmorner Zebrastreifen, pompöse Gebäude: Sind...  politik   
2  Oskar Lafontaine gibt den Parteivorsitz der Li...  politik   
3  Die Wurzeln des Elends liegen in der Vergangen...  politik   
4  Schwarz-Gelb ist noch nicht jene Traumkoalitio...  politik   

                                               title        date language  \
0  So war Auschwitz: Erinnerungen einer Holocaust...  00/01/2010       de   
1  Kommunen in Not (3): Sindelfingen - Jenseits g...  00/01/2010       de   
2  Personaldebatte bei der Linken - Wer k

In [17]:
# from tqdm import tqdm

# # Define a function to translate an entire column
# def translate_column(column_data):
#     translated_column = []
#     for value in tqdm(column_data, desc=f"Translating Column"):
#         if isinstance(value, str) and value.strip():  # Translate only non-empty strings
#             translated_value = translate_text(value)
#         else:
#             translated_value = value  # Leave non-string or empty values as is
#         translated_column.append(translated_value)
#     return translated_column

# # Translate only the specified columns
# columns_to_translate = ['summary', 'topic', 'title']
# translated_columns = {}

# for column_name in columns_to_translate:
#     print(f"Translating column: {column_name}")
#     translated_columns[column_name] = translate_column(dataset[column_name])

# # Create a new dataset with the desired columns
# desired_columns = ['summary', 'topic', 'title', 'date', 'translated_text']
# final_dataset_dict = {column: dataset[column] for column in desired_columns if column != 'translated_text'}

# # Add translated columns to the final dataset dictionary
# for column_name, translated_data in translated_columns.items():
#     final_dataset_dict[column_name] = translated_data

# # Convert the final dataset dictionary to a Dataset object
# from datasets import Dataset
# final_dataset = Dataset.from_dict(final_dataset_dict)

In [23]:
from tqdm import tqdm
from datasets import Dataset

# Define a function to translate an entire column
def translate_column(column_data):
    """Translate every non-blank string in ``column_data``.

    Values that are not strings, or that are empty / whitespace-only, are
    passed through untouched; everything else goes through ``translate_text``.
    Progress is shown with a tqdm bar labelled "Translating Column".

    Args:
        column_data: Iterable of column values.

    Returns:
        A list with one entry per input value, in the original order.
    """
    return [
        translate_text(entry) if isinstance(entry, str) and entry.strip() else entry
        for entry in tqdm(column_data, desc=f"Translating Column")
    ]

# Build `final_dataset`: the first `record_limit` rows with `summary`, `topic`
# and `title` translated, `date` copied as-is, and `translated_text` either
# reused from the dataset or created from `text`.

# Define the number of records to process
# NOTE(review): the source data was assembled as 1200 rows per language in the
# order de, es, fr, ru, tu, so the first 3000 rows would appear to cover only
# de, es and half of fr — confirm that excluding ru and tu is intended.
record_limit = 3000

# Slice the dataset to limit the number of records.
# NOTE(review): select(range(record_limit)) presumably fails if `dataset` has
# fewer than `record_limit` rows — verify before lowering the input size.
dataset_limited = dataset.select(range(record_limit))

# Reuse the existing 'translated_text' column when present; otherwise create
# it by translating 'text'.
if "translated_text" not in dataset_limited.column_names:
    print("Translating 'text' column to create 'translated_text'...")
    translated_text_column = translate_column(dataset_limited["text"])
else:
    translated_text_column = dataset_limited["translated_text"]

# Translate only the specified columns
columns_to_translate = ['summary', 'topic', 'title']
# Maps column name -> list of translated values.
translated_columns = {}

for column_name in columns_to_translate:
    print(f"Translating column: {column_name}")
    translated_columns[column_name] = translate_column(dataset_limited[column_name])

# Start the output dict from the untranslated columns. 'translated_text' is
# skipped here because it is added explicitly further down; the three
# translated columns are overwritten by the loop that follows, so in effect
# only 'date' survives from this step.
desired_columns = ['summary', 'topic', 'title', 'date', 'translated_text']
final_dataset_dict = {column: dataset_limited[column] for column in desired_columns if column != 'translated_text'}

# Replace the original columns with their translated versions
for column_name, translated_data in translated_columns.items():
    final_dataset_dict[column_name] = translated_data

# Add the translated_text column
final_dataset_dict["translated_text"] = translated_text_column

# Convert the final dataset dictionary to a Dataset object
final_dataset = Dataset.from_dict(final_dataset_dict)

Translating column: summary


Translating Column: 100%|██████████| 3000/3000 [2:40:32<00:00,  3.21s/it]  


Translating column: topic


Translating Column: 100%|██████████| 3000/3000 [2:24:55<00:00,  2.90s/it]  


Translating column: title


Translating Column: 100%|██████████| 3000/3000 [2:15:53<00:00,  2.72s/it]  


In [24]:
# Save the final dataset to a local directory in the `datasets` on-disk
# format; the upload cell below reloads it from this path.
final_dataset.save_to_disk("./final_translated_dataset_v3")
print("Specified columns translated, final dataset saved!")

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

Specified columns translated, final dataset saved!


### Compress and upload to GCS

In [26]:
from google.cloud import storage
from datasets import load_from_disk
import os
import shutil

# Re-save the final translated dataset into a clean staging directory and pack
# that directory into a gzip-compressed tarball for upload.

# stdlib; imported here so this step stays self-contained within the cell.
import subprocess

# Load the translated dataset
translated_dataset_path = "./final_translated_dataset_v3"
translated_dataset = load_from_disk(translated_dataset_path)

# Create a new directory for saving the compressed dataset
compressed_dataset_dir = "./compressed_translated_dataset_v3"
if os.path.exists(compressed_dataset_dir):
    shutil.rmtree(compressed_dataset_dir)  # Remove the directory if it already exists
os.makedirs(compressed_dataset_dir, exist_ok=True)

# Save the dataset to the new directory
translated_dataset.save_to_disk(compressed_dataset_dir)

# Compress the directory into a tar.gz file.
# The command is passed as an argument list (no shell string interpolation)
# and check=True raises CalledProcessError when tar exits non-zero, so a
# failed or partial archive is never handed to the upload step.
# `-C <dir> .` archives the directory contents relative to the directory, so
# the tarball unpacks directly into whatever extraction path is chosen.
compressed_file = "translated_dataset_v3.tar.gz"
subprocess.run(
    ["tar", "-czvf", compressed_file, "-C", compressed_dataset_dir, "."],
    check=True,
)

# Upload to GCS
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Send a local file to a Google Cloud Storage bucket.

    A new ``storage.Client`` is created on every call.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    print(f"File {source_file_name} uploaded to {destination_blob_name}.")

# Define GCS bucket and file details
bucket_name = "nlp_proj"  # Target bucket; replace with your bucket name
destination_blob_name = "datasets/translated_dataset_v3.tar.gz"  # Object path inside the bucket

# Upload the tarball produced above (`compressed_file`) to GCS
upload_to_gcs(bucket_name, compressed_file, destination_blob_name)

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

./
./state.json
./data-00000-of-00001.arrow
./dataset_info.json
File translated_dataset_v3.tar.gz uploaded to datasets/translated_dataset_v3.tar.gz.


### load and unzip 

In [27]:
from google.cloud import storage
import os
import tarfile

# Define GCS bucket and file details
bucket_name = "nlp_proj"  # Source bucket; replace with your bucket name
source_blob_name = "datasets/translated_dataset_v3.tar.gz"  # Object path in GCS
# Local file name. The suffix is ".tar_v3.gz" rather than ".tar.gz"; the
# extraction helper below opens the file with an explicit "r:gz" mode, so the
# unusual extension does not affect reading it.
destination_file_name = "translated_dataset.tar_v3.gz"

# Function to download a file from GCS
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Fetch one object from a Google Cloud Storage bucket into a local file.

    A new ``storage.Client`` is created on every call.

    Args:
        bucket_name: Name of the bucket holding the object.
        source_blob_name: Object name inside the bucket.
        destination_file_name: Local path the object is written to.
    """
    remote_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    remote_blob.download_to_filename(destination_file_name)
    print(f"File {source_blob_name} downloaded to {destination_file_name}.")

# Download the tarball from GCS to the local file named above
download_from_gcs(bucket_name, source_blob_name, destination_file_name)

# Unzip the downloaded tar.gz file
def extract_tar_gz(file_name, extract_path):
    """Extract a gzip-compressed tar archive into ``extract_path``.

    The archive is opened with an explicit ``"r:gz"`` mode, so the file's
    extension is irrelevant. When the running Python supports tarfile
    extraction filters, the ``"data"`` filter is applied so that members which
    would land outside ``extract_path`` (absolute paths, ``..`` components,
    links pointing outside) are rejected instead of being written.

    Args:
        file_name: Path of the ``.tar.gz`` archive to read.
        extract_path: Directory the archive members are written into.

    Raises:
        tarfile.TarError: If the archive is unreadable or, with filtering
            available, a member is rejected by the ``"data"`` filter.
    """
    with tarfile.open(file_name, "r:gz") as tar:
        # `tarfile.data_filter` only exists on Python versions that accept the
        # `filter` argument; older runtimes keep the previous behaviour.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_path, filter="data")
        else:
            tar.extractall(path=extract_path)
    print(f"Extracted {file_name} to {extract_path}.")

# Define extraction path (also used by the next cell's load_from_disk call)
extracted_path = "./extracted_translated_dataset_v3"

# Ensure the directory exists; exist_ok=True makes re-running this cell safe
os.makedirs(extracted_path, exist_ok=True)

# Extract the downloaded tarball into that directory
extract_tar_gz(destination_file_name, extracted_path)

File datasets/translated_dataset_v3.tar.gz downloaded to translated_dataset.tar_v3.gz.
Extracted translated_dataset.tar_v3.gz to ./extracted_translated_dataset_v3.


In [28]:
from datasets import load_from_disk

# Load the dataset from the extracted path.
# NOTE: this rebinds the notebook-level `dataset` to the v3 copy read from
# disk, replacing the dataset the translation cell above sliced from.
dataset = load_from_disk(extracted_path)
print(dataset)

Dataset({
    features: ['summary', 'topic', 'title', 'date', 'translated_text'],
    num_rows: 3000
})


In [29]:
dataset[:1]

{'summary': ['Transport in a cattle carriage, smell of meat in the air, selection with Dr Mengele: Holocaust survivor Lisa Miková remembers the Auschwitz-Birkenau extermination camp.'],
 'topic': ['Politics'],
 'title': ['Auschwitz: Memories of a Holocaust Survivor'],
 'date': ['00/01/2010'],
 'translated_text': ['Transport in a cattle carriage, smell of meat in the air, selection with Dr Mengele: Holocaust survivor Lisa Miková remembers the Auschwitz-Birkenau extermination camp. Lisa Miková was born in 1922 in Prague. In her largely secular Jewish family, German and Czech were spoken. In 1942, she was deported to the Theresienstadt concentration camp. From there, her parents were first deported, then, in the autumn of 1944, her husband František to the Auschwitz extermination camp. Lisa Miková volunteered shortly afterwards.']}