# BiasBios Translation

Target languages: Icelandic, German, Dutch, Turkish, Italian and Norwegian

In [1]:
import pandas as pd
import os

## Load the dataset

In [2]:
# Specify whether to translate masked or unmasked data.
# This flag is read again further down: it selects the text column that is
# counted/translated and the file names used when saving.
masked = True

# Pick the English source CSVs that match the flag
if masked:
    train_data = "../data/bios/en/train_split_balanced_masked.csv"
    test_data = "../data/bios/en/test_split_balanced_masked.csv"
else:
    train_data = "../data/bios/en/train_split_balanced.csv"
    test_data = "../data/bios/en/test_split_balanced.csv"

# Load data as pandas dataframes
train_df = pd.read_csv(train_data)
test_df = pd.read_csv(test_data)
# Show the first rows of the training split as the cell output
train_df.head(10)

Unnamed: 0,hard_text,profession,gender,hard_text_masked
0,He is Advanced Jivamukti Certified and a mento...,yoga_teacher,Male,they are Advanced Jivamukti Certified and a me...
1,Damien is a graduate of the University of Illi...,paralegal,Male,[NAME] is a graduate of the University of Illi...
2,"Having lost his dad at just 11 years old, he b...",poet,Male,"Having lost their dad at just 11 years old, th..."
3,"He brings his particular brand of dark, irreve...",comedian,Male,"they bring their particular brand of dark, irr..."
4,"He is an expert on healthy eating, food, and n...",dietitian,Male,"they are an expert on healthy eating, food, an..."
5,He graduated with honors in 2001. Having more ...,nurse,Male,they graduated with honors in 2001. Having mor...
6,Tarra is onto her 4th year in business and has...,interior_designer,Female,Tarra is onto them 4th year in business and ha...
7,He specializes in the therapy of younger child...,psychologist,Male,they specialize in the therapy of younger chil...
8,He studied at the University of Anatolu from w...,painter,Male,they studied at the University of Anatolu from...
9,"She has a calling to young women, with the man...",pastor,Female,"they have a calling to young women, with the m..."


In [3]:
# Root of the bios data tree, i.e. the parent of the "en" folder that holds the
# source CSVs. It is derived structurally (parent of the file's directory)
# rather than by splitting the path string on "en", which would cut at the
# first "en" found anywhere in the path. The trailing separator is kept so the
# value has the same "../data/bios/" form as before.
data_dir = os.path.join(os.path.dirname(os.path.dirname(train_data)), "")
print(data_dir)

../data/bios/


In [4]:
# Print the (rows, columns) shape of each split as a quick sanity check
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (4480, 4)
Test shape: (1120, 4)


## Translate

In [5]:
# Target language code for this run.
# - 'IS' is routed to Google Cloud Translation by translate_bios_with_monitoring;
#   any other value is passed to DeepL as `target_lang`.
# - The lower-cased code is used as the output folder name when saving.
# - translate_bios_with_monitoring captures this value as a default argument
#   when its cell runs, so re-run that cell after changing the value here.
target_language = 'NB'

In [None]:
# Import the Google Cloud Translation library.
from google.cloud import translate_v3

# Google Cloud project id; translate_text uses it to build the request parent path
PROJECT_ID = "biasbios-translation"
# Load Google Cloud credentials securely
import sys
import os
# Put the current working directory on the import path so the `utils` package
# below can be imported (presumably the notebook is run from the project folder
# that contains `utils/` — TODO confirm)
sys.path.append('.')

try:
    from utils.config_loader import setup_google_credentials
    setup_google_credentials()
    print("✅ Google Cloud credentials loaded securely")
except Exception as e:
    # Failure is only reported, not raised: the notebook keeps running and any
    # later Google Translation call may fail if no credentials are available
    print(f"⚠️ Warning: Could not load Google credentials: {e}")
    # Manually set if needed:
    # os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "path/to/your/credentials.json"

# Translating with the Google Cloud Translation API
_translation_client = None  # shared client, created lazily by _get_translation_client()


def _get_translation_client():
    """Return the shared TranslationServiceClient, creating it on first use."""
    global _translation_client
    if _translation_client is None:
        # Only cached once construction succeeds, so a failed attempt
        # (e.g. missing credentials) is retried on the next call.
        _translation_client = translate_v3.TranslationServiceClient()
    return _translation_client


def translate_text(
    text: str = "YOUR_TEXT_TO_TRANSLATE",
    source_language_code: str = "en-US",
    target_language_code: str = "fr",
) -> str:
    """Translate Text from a Source language to a Target language.

    Args:
        text: The content to translate.
        source_language_code: The code of the source language.
        target_language_code: The code of the target language.
            For example: "fr" for French, "es" for Spanish, etc.
            Find available languages and codes here:
            https://cloud.google.com/translate/docs/languages#neural_machine_translation_model

    Returns:
        The translated text of the single item sent in the request.
    """
    # Reuse one client across calls instead of building a new one per text;
    # this function is called once per row when translating a whole dataset.
    client = _get_translation_client()
    parent = f"projects/{PROJECT_ID}/locations/global"

    # MIME type of the content to translate.
    # Supported MIME types:
    # https://cloud.google.com/translate/docs/supported-formats
    mime_type = "text/plain"

    # Translate text from the source to the target language.
    response = client.translate_text(
        contents=[text],
        parent=parent,
        mime_type=mime_type,
        source_language_code=source_language_code,
        target_language_code=target_language_code,
    )

    # Exactly one item was sent, so the first translation is the result
    return response.translations[0].translated_text

In [None]:
import deepl
import requests
import os

# Load DeepL API key securely
# Result of this cell is the notebook-level `translator`: a deepl.DeepLClient
# when a key is found, otherwise None.
try:
    # Preferred source: the project's config loader
    from utils.config_loader import get_api_key
    deepl_api_key = get_api_key('deepl')
    translator = deepl.DeepLClient(deepl_api_key)
    print("✅ DeepL translator initialized securely")
except Exception as e:
    # Any failure above (import, key lookup, client construction) lands here
    print(f"⚠️ Warning: Could not load DeepL API key: {e}")
    # Fallback to environment variable
    deepl_api_key = os.getenv('DEEPL_API_KEY')
    if deepl_api_key:
        translator = deepl.DeepLClient(deepl_api_key)
        print("✅ DeepL loaded from environment variable")
    else:
        # No key anywhere: later cells see translator as None
        translator = None
        print("❌ DeepL translator not available - please configure API key")

In [8]:
# count the number of characters in the dataset

def count_characters_in_dataset(df, text_column, cost_per_char_eur=0.000020):
    """
    Count total characters in a specific text column of a dataframe.

    Args:
        df (pd.DataFrame): The dataframe containing text data
        text_column (str): Name of the column containing text to count
        cost_per_char_eur (float): Price per character in EUR used for the cost
            estimate. The default corresponds to the DeepL Pro pricing of
            €20 per 1M characters.

    Returns:
        dict: Statistics about character counts with the keys
            'total_characters', 'average_characters', 'max_characters',
            'min_characters', 'total_texts', 'non_empty_texts' and
            'estimated_cost_eur'.
    """
    # Missing values are treated as empty strings, so they contribute 0 characters
    char_counts = df[text_column].fillna('').str.len()
    total_chars = char_counts.sum()

    return {
        'total_characters': total_chars,
        'average_characters': char_counts.mean(),
        'max_characters': char_counts.max(),
        'min_characters': char_counts.min(),
        # Every row counts as a text, including rows with missing/empty text
        'total_texts': len(df),
        'non_empty_texts': (char_counts > 0).sum(),
        'estimated_cost_eur': total_chars * cost_per_char_eur,
    }

# Character-count report for the column that will be translated
if masked:
    text_col = 'hard_text_masked'
else:
    text_col = 'hard_text'


def _print_split_report(heading, stats):
    """Print one split's character statistics, followed by a blank line."""
    print(heading)
    print(f"  Total characters: {stats['total_characters']:,}")
    print(f"  Total texts: {stats['total_texts']:,}")
    print(f"  Average chars per text: {stats['average_characters']:.1f}")
    print(f"  Min chars: {stats['min_characters']:,}")
    print(f"  Max chars: {stats['max_characters']:,}")
    print(f"  Estimated cost: €{stats['estimated_cost_eur']:.2f}")
    print()


print("=== CHARACTER COUNT ANALYSIS ===")
print(f"Text column: {text_col}")
print()

# Per-split statistics (train first, then test)
train_stats = count_characters_in_dataset(train_df, text_col)
_print_split_report("TRAIN DATASET:", train_stats)

test_stats = count_characters_in_dataset(test_df, text_col)
_print_split_report("TEST DATASET:", test_stats)

# Totals across both splits
total_chars = train_stats['total_characters'] + test_stats['total_characters']
total_texts = train_stats['total_texts'] + test_stats['total_texts']
total_cost = train_stats['estimated_cost_eur'] + test_stats['estimated_cost_eur']

print("COMBINED TOTALS:")
print(f"  Total characters: {total_chars:,}")
print(f"  Total texts: {total_texts:,}")
print(f"  Estimated total cost: €{total_cost:.2f}")
print()

=== CHARACTER COUNT ANALYSIS ===
Text column: hard_text_masked

TRAIN DATASET:
  Total characters: 1,668,632
  Total texts: 4,480
  Average chars per text: 372.5
  Min chars: 133
  Max chars: 957
  Estimated cost: €33.37

TEST DATASET:
  Total characters: 406,158
  Total texts: 1,120
  Average chars per text: 362.6
  Min chars: 140
  Max chars: 970
  Estimated cost: €8.12

COMBINED TOTALS:
  Total characters: 2,074,790
  Total texts: 5,600
  Estimated total cost: €41.50



In [9]:
# Add these utility functions before translation

def check_deepl_usage(translator):
    """Print and return the current DeepL API character usage.

    Args:
        translator: DeepL client exposing ``get_usage()``. May be ``None`` when
            no API key was configured.

    Returns:
        The usage object returned by ``translator.get_usage()``, or ``None``
        when no translator is available or the usage query fails.
    """
    if translator is None:
        print("Could not check usage: DeepL translator is not configured")
        return None

    try:
        usage = translator.get_usage()
        used = usage.character.count
        limit = usage.character.limit
    except Exception as e:
        print(f"Could not check usage: {e}")
        return None

    print("DeepL API Usage:")
    if used is None:
        print("  Characters used: not reported")
        return usage
    print(f"  Characters used: {used:,}")

    # Without a positive limit the remaining amount and the percentage are
    # undefined; report that instead of failing on a division by zero.
    if not limit:
        print("  Character limit: not reported")
        return usage

    print(f"  Character limit: {limit:,}")
    print(f"  Remaining: {limit - used:,}")
    print(f"  Usage: {(used/limit)*100:.1f}%")
    return usage

def translate_bios_with_monitoring(df, target_language=target_language, batch_size=100):
    """
    Translate the bio text of every row in ``df`` and store it in a new column.

    The source column is ``hard_text_masked`` when the notebook-level ``masked``
    flag is true, otherwise ``hard_text``. The translation is written to
    ``<source column>_<target_language>``. 'IS' is translated with Google Cloud
    Translation (``translate_text``); every other code is sent to the
    notebook-level DeepL ``translator``.

    Args:
        df (pd.DataFrame): Frame holding the source column. It is modified in
            place (the translated column is added to it).
        target_language (str): Target language code. The default is the value
            the notebook-level ``target_language`` had when this cell was run.
        batch_size (int): Print a progress line every ``batch_size`` rows. Rows
            are still sent to the API one at a time.

    Returns:
        pd.DataFrame: The same ``df`` object, with the translated column added.
        Rows whose translation failed contain "[TRANSLATION_ERROR]"; rows with
        no text (e.g. NaN) are left empty.

    Raises:
        RuntimeError: If DeepL is required but no translator is configured.

    NOTE(review): the sample output saved in this notebook shows the ``[NAME]``
    mask token coming back as ``[NAVN]`` for NB, i.e. placeholders are not
    protected from translation here. Confirm that downstream code expects this.
    """
    use_google = target_language == 'IS'
    source_col = 'hard_text_masked' if masked else 'hard_text'
    target_col = f'{source_col}_{target_language}'

    # Fail once up front instead of marking every row as a translation error
    if not use_google and translator is None:
        raise RuntimeError(
            "DeepL translator is not configured; set the DeepL API key before "
            f"translating to '{target_language}'."
        )

    total_chars_translated = 0

    print(f"Starting translation of {len(df)} texts...")
    print(f"Target language: {target_language}")

    # DeepL quota is only relevant when DeepL does the translating
    if not use_google:
        check_deepl_usage(translator)

    # Create the output column up front so rows without text stay empty
    if target_col not in df.columns:
        df[target_col] = None

    for i, (index, row) in enumerate(df.iterrows()):
        text = row[source_col]

        if isinstance(text, str):
            # Characters submitted for translation (counted before the call)
            total_chars_translated += len(text)
            try:
                if use_google:
                    # Use google translate for Icelandic
                    translated = translate_text(
                        text, source_language_code='en', target_language_code='is'
                    )
                else:
                    # Use DeepL for other languages
                    translated = translator.translate_text(
                        text, target_lang=target_language
                    ).text
                df.at[index, target_col] = translated
            except Exception as e:
                # Best effort: record the failure and keep going with the next row
                print(f"Error translating row {index}: {e}")
                df.at[index, target_col] = "[TRANSLATION_ERROR]"
        else:
            # Missing text (e.g. NaN) cannot be measured or translated
            print(f"Skipping row {index}: no text to translate")
            df.at[index, target_col] = None

        # Progress reporting, independent of which backend handled the row
        if batch_size and (i + 1) % batch_size == 0:
            print(f"Translated {i+1}/{len(df)} texts ({total_chars_translated:,} chars)")

    print(f"Translation complete! Total characters: {total_chars_translated:,}")

    # Check final usage
    if not use_google:
        check_deepl_usage(translator)

    return df

In [10]:
# Translate the data.
# translate_bios_with_monitoring writes the new column into the frame it is
# given and returns that same object, so train_df / test_df also gain the
# translated column and the *_translated names refer to the same frames.
train_df_translated = translate_bios_with_monitoring(train_df)
test_df_translated = translate_bios_with_monitoring(test_df)

Starting translation of 4480 texts...
Target language: NB
DeepL API Usage:
  Characters used: 10,774,351
  Character limit: 1,000,000,000,000
  Remaining: 999,989,225,649
  Usage: 0.0%
Translated 100/4480 texts (40,492 chars)
Translated 200/4480 texts (77,193 chars)
Translated 300/4480 texts (113,862 chars)
Translated 400/4480 texts (148,909 chars)
Translated 500/4480 texts (188,892 chars)
Translated 600/4480 texts (227,923 chars)
Translated 700/4480 texts (264,690 chars)
Translated 800/4480 texts (301,520 chars)
Translated 900/4480 texts (340,170 chars)
Translated 1000/4480 texts (377,623 chars)
Translated 1100/4480 texts (414,448 chars)
Translated 1200/4480 texts (451,880 chars)
Translated 1300/4480 texts (493,251 chars)
Translated 1400/4480 texts (529,278 chars)
Translated 1500/4480 texts (568,068 chars)
Translated 1600/4480 texts (605,310 chars)
Translated 1700/4480 texts (638,338 chars)
Translated 1800/4480 texts (677,519 chars)
Translated 1900/4480 texts (713,356 chars)
Translate

In [11]:
# Preview the first rows of the translated test split
test_df_translated.head(10)

Unnamed: 0,hard_text,profession,gender,hard_text_masked,hard_text_masked_NB
0,His Requiem was the result of a joint commissi...,composer,Male,their [NAME] was the result of a joint commiss...,deres [NAVN] var et resultat av en felles best...
1,Fleming used YouTube to hone his skills throug...,comedian,Male,Fleming used YouTube to hone their skills thro...,Fleming brukte YouTube til å finpusse ferdighe...
2,"He is a Preacher of the end time Gospel, prepa...",pastor,Male,"they are a Preacher of the end time [NAME], pr...","de er en forkynner av endetiden [NAVN], og for..."
3,Marine graduated from the University of Califo...,attorney,Female,Marine graduated from the University of Califo...,Marine ble uteksaminert fra University of Cali...
4,He is director of Westminster Therapy Associat...,psychologist,Male,they are director of Westminster Therapy Assoc...,de er direktør for Westminster Therapy Associa...
5,He has worked with the firm for 14 years and a...,paralegal,Male,they have worked with the firm for 14 years an...,De har jobbet i firmaet i 14 år og bistår firm...
6,"Educated at St John's College, Cambridge, choi...",composer,Male,"Educated at St John's College, Cambridge, choi...","John's College, Cambridge, korskole og Lancing..."
7,"She serves as music administrator, playing the...",pastor,Female,"they serve as music administrator, playing the...","de er musikkadministrator, spiller orgel og sy..."
8,Marc graduated from UC Davis with a degree in ...,pastor,Male,[NAME] graduated from UC Davis with a degree i...,[NAVN] ble uteksaminert fra UC Davis med en gr...
9,Although hearing her material is akin to conve...,comedian,Female,Although hearing their material is akin to con...,Selv om det er som å snakke med en moden vokse...


In [12]:
# Save the translated data

# Output folder is named after the lower-cased target language code.
# os.path.join is used instead of string concatenation, which produced a
# doubled separator ("../data/bios//nb/") because data_dir already ends with
# one. The trailing separator is kept on target_path.
target_path = os.path.join(data_dir, target_language.lower(), "")
os.makedirs(target_path, exist_ok=True)

# File names mirror the English source files
if masked:
    print("Saving masked translated datasets...")
    train_file = 'train_split_balanced_masked.csv'
    test_file = 'test_split_balanced_masked.csv'
else:
    print("Saving unmasked translated datasets...")
    train_file = 'train_split_balanced.csv'
    test_file = 'test_split_balanced.csv'

# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; pass index=False if consumers of these files do not expect it.
train_df_translated.to_csv(os.path.join(target_path, train_file))
test_df_translated.to_csv(os.path.join(target_path, test_file))
print("Datasets saved.")

Saving masked translated datasets...
Datasets saved.
