# Preparing data 

This script will read in the separate files with news coverage from the different countries. 
In addition, it will create a random sample + translate those sample texts to English (for validation purposes)

## Load packages + set paths

In [11]:
import os
import logging
import pandas as pd
from transformers import MarianMTModel, MarianTokenizer
import logging
import torch  # This is the missing import for PyTorch
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("translation")

n_samples_per_country = 10  # Adjust this number as needed

## Read the full data --> find these datasets on Research Drive
path_to_news = "/home/akroon/data/volume_2/RESPONDE/data/data_conbined/"
path_to_RESPOND_data = '/home/akroon/data/volume_2/RESPONDE/'

countries = ["Sweden", "Netherlands", "United_Kingdom", "Hungary", "Italy", 
             "France", "Ukraine", "Serbia", "Bulgaria"]

In [None]:
dataframes = []

# Read and load data for each country
for country in countries:
    filename = f'{country}_combined_news.csv'
    file_path = os.path.join(path_to_news, filename)
    logger.info(f"Reading data from {file_path}")
    
    # Load CSV with appropriate dtypes and memory optimization
    news_data = pd.read_csv(file_path, dtype={'title': str, 'body': str}, low_memory=False)
    news_data['country'] = country
    dataframes.append(news_data)

combined_news_data = pd.concat(dataframes, ignore_index=True)
logger.info("Data combined successfully.")
combined_news_data['date'] = pd.to_datetime(combined_news_data['date'], errors='coerce')
combined_news_data['year'] = combined_news_data['date'].dt.year

# Fill NaN values in 'title' and 'body' and create 'combined_text' column
combined_news_data['title'] = combined_news_data['title'].fillna('').astype(str)
combined_news_data['body'] = combined_news_data['body'].fillna('').astype(str)
combined_news_data['combined_text'] = combined_news_data['title'] + ' ' + combined_news_data['body']

# Save a random sample of data for easy access later if needed
combined_news_data.to_csv(f'{path_to_RESPOND_data}combined_sample.csv', index=False)

In [None]:
combined_news_data = pd.read_csv(f'{path_to_RESPOND_data}combined_sample.csv')
print(len(combined_news_data))

In [None]:
combined_news_data['country'].value_counts()

## Take a stratified sample with 100 cases per country

In [None]:
# Sample data to a manageable size and retrieve 'combined_text' with equal samples from each country
n_samples_per_country = 100  # Adjust this number as needed

# Take equal size samples from each country
sample_df = combined_news_data.groupby('country').apply(lambda x: x.sample(n=n_samples_per_country, random_state=42) if len(x) >= n_samples_per_country else x)
sample_df = sample_df.reset_index(drop=True)  # Reset index after groupby operation

sample_df.to_csv(f'{path_to_RESPOND_data}sample.csv', index=False)

### Translate all texts to English

In [13]:
sample_df = pd.read_csv(f'{path_to_RESPOND_data}sample.csv')
# List of countries and their corresponding language codes for MarianMT
country_to_lang = {
    "Sweden": "sv",
    "Netherlands": "nl",
    "United_Kingdom": "en",
    "Hungary": "hu",
    "Italy": "it",
    "France": "fr",
    "Ukraine": "uk",
    "Serbia": "sr",
    "Bulgaria": "bg"
}

In [14]:
# Retrieve 'combined_text'
texts = sample_df['combined_text'].tolist()
# Cache for loaded models
models_cache = {}

# Step 1: Initialize MarianMT translation model (dynamically selecting based on source language)
def get_translation_model(source_lang):
    if source_lang not in models_cache:
        model_name = f'Helsinki-NLP/opus-mt-{source_lang}-en'  # Use language to English
        try:
            model = MarianMTModel.from_pretrained(model_name)
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            models_cache[source_lang] = (model, tokenizer)
        except Exception as e:
            logger.error(f"Failed to load model for {source_lang}: {e}")
            return None, None
    return models_cache[source_lang]

# Function to translate text (from any of the supported languages to English)
def translate_text(text, source_lang='es'):
    try:
        # Load the correct translation model based on source language
        translation_model, translation_tokenizer = get_translation_model(source_lang)
        
        if translation_model is None or translation_tokenizer is None:
            return text  # If model loading failed, return the original text
        
        # Move inputs to GPU if available
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        translation_model.to(device)

        # Split long text into smaller chunks if needed
        max_input_length = translation_tokenizer.model_max_length
        input_chunks = [text[i:i + max_input_length] for i in range(0, len(text), max_input_length)]
        
        translated_chunks = []
        for chunk in input_chunks:
            inputs = translation_tokenizer(chunk, return_tensors='pt', padding=True, truncation=True).to(device)
            translated = translation_model.generate(
                **inputs,
                num_beams=4,  # Use beam search to prevent repetitive output
                no_repeat_ngram_size=3,  # Prevent repetition of 3-grams
                temperature=1.0,  # Control randomness
                top_k=50,  # Top-k sampling for more diverse translations
                top_p=0.95,  # Top-p sampling for more diverse translations
            )
            translated_text = translation_tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_chunks.append(translated_text)
        
        # Combine all chunks into one translated text
        return " ".join(translated_chunks)
    
    except Exception as e:
        logger.error(f"Error translating text: {e}")
        return text  # Return original text in case of failure

# Step 2: Translate texts in the 'combined_text' column to English
translated_texts = []
total_texts = len(texts)

# Use tqdm for a progress bar
for i, (text, country) in tqdm(enumerate(zip(texts, sample_df['country']), 1), total=total_texts, desc="Translating Texts"):
    if country not in country_to_lang:
        logger.warning(f"Country {country} not found in country_to_lang dictionary.")
        translated_texts.append(text)
    else:
        source_lang = country_to_lang[country]  # Get language code for translation
        translated_text = translate_text(text, source_lang=source_lang)
        translated_texts.append(translated_text)

    # Log progress and save intermediate results every 10,000 translations
    if i % 10000 == 0:
        logger.info(f"{i}/{total_texts} texts translated...")
        partial_output_file = f'{path_to_RESPOND_data}partial_translation_{i}.csv'
        sample_df.iloc[:i].to_csv(partial_output_file, index=False)
        logger.info(f"Intermediate translations saved: {partial_output_file}")

# Add translated texts to the dataframe
sample_df['translated_text'] = translated_texts

Translating Texts:  56%|█████▌    | 500/900 [35:50<20:45,  3.11s/it]  ERROR:translation:Failed to load model for sr: Helsinki-NLP/opus-mt-sr-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
Translating Texts:  56%|█████▌    | 501/900 [35:50<14:42,  2.21s/it]ERROR:translation:Failed to load model for sr: Helsinki-NLP/opus-mt-sr-en is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.
Translating Texts:  56%|█████▌    | 502/900 [35:50<10:28,  1.58s/it]ERROR:translation:Failed to load model for sr: Helsinki-NLP/opus-mt-sr-en is not

In [16]:
output_path =  f'{path_to_RESPOND_data}translated_sample_df.csv'
sample_df.to_csv(output_path, index=False)

In [18]:
sample_df[['combined_text', 'translated_text']]

Unnamed: 0,combined_text,translated_text
0,"""Нова демокрация"" прекрати партийното членство...","""New Democracy"" ended the party membership of ..."
1,Съдът намали паричната гаранция на Васил Божко...,The court reduced Vasil Bozhkov's monetary gua...
2,"""Равен мач"" за Зеленски, но всъщност - победа ...","""Equal match"" for Zelinski, but in fact - vict..."
3,Трима задържани за измама с евросредства за зе...,Three detained for fraud with euro funds for a...
4,Окончателно: Стайко Стайков ще се лекува под д...,Final: Staiko Staykov will be treated under ho...
...,...,...
895,Deep in the weeds: How to solve the legal prob...,Deep in the weeds: How to solve the legal prob...
896,Martin Canning hoping duo can show fighting sp...,Martin Canning hoping duo can show fighting sp...
897,BBC to pay 'substantial' sum for story libelli...,BBC to pay 'substantial' sum for story libelli...
898,German officer 'posed as Syrian refugee in plo...,German officer 'posed as Syrian refugee in plo...
