In [None]:
!pip install tmx
!pip install beautifulsoup4
!pip install bitsandbytes
!pip install --upgrade transformers
!pip install --upgrade peft
!pip install datasets
!pip install sacrebleu
!pip install bert-score
!pip install langdetect

In [None]:
from bs4 import BeautifulSoup
import random
import json

# Parse the TMX translation memory. The "xml" feature selects BeautifulSoup's
# XML parser so the <tu>/<tuv>/<seg> elements are read as XML rather than HTML.
with open("de-fr_04-11-2015_Website_final.tmx", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "xml")

# Each <tu> (translation unit) is expected to hold one German and one French <tuv>.
translation_units = soup.find_all("tu")
print("Number of translation units found:", len(translation_units))

# Collected {"german": ..., "french": ...} dictionaries
sentence_pairs = []

for tu in translation_units:
    tuv_tags = tu.find_all("tuv")
    # Only units with exactly two variants can form a German-French pair
    if len(tuv_tags) != 2:
        continue

    # The language code may be exposed as "xml:lang" or as plain "lang";
    # upper-casing lets "de-DE", "DE" and "de" all match the prefix test below.
    lang1 = (tuv_tags[0].get("xml:lang") or tuv_tags[0].get("lang") or "").upper()
    lang2 = (tuv_tags[1].get("xml:lang") or tuv_tags[1].get("lang") or "").upper()

    seg1 = tuv_tags[0].find("seg")
    seg2 = tuv_tags[1].find("seg")
    if seg1 is None or seg2 is None:
        continue
    text1 = seg1.text.strip()
    text2 = seg2.text.strip()

    # A blank side would produce an empty source or reference sentence, which is
    # useless for both fine-tuning and BLEU/BERTScore evaluation.
    if not text1 or not text2:
        continue

    # Normalise the order so "german"/"french" are correct whichever comes first
    if lang1.startswith("DE") and lang2.startswith("FR"):
        sentence_pairs.append({"german": text1, "french": text2})
    elif lang1.startswith("FR") and lang2.startswith("DE"):
        sentence_pairs.append({"german": text2, "french": text1})
    # Any other language combination is ignored

print("Number of German-French pairs found:", len(sentence_pairs))

# Sample up to 1000 pairs (or all of them if fewer). A dedicated, seeded RNG
# makes Dataset A identical on every Restart & Run All.
SAMPLE_SEED = 42
sampled_pairs = random.Random(SAMPLE_SEED).sample(sentence_pairs, min(1000, len(sentence_pairs)))

# Persist Dataset A for the following cells
with open("dataset_a.json", "w", encoding="utf-8") as json_file:
    json.dump(sampled_pairs, json_file, ensure_ascii=False, indent=4)

# Preview the first 5 pairs
print(sampled_pairs[:5])


In [None]:
import json
import random

import os

# Dataset A is the sample of (up to) 1000 German-French pairs stored in
# dataset_a.json. If the file already exists it is reused, so the train/test
# split stays the same across re-runs and across the models compared later.
# Only when the file is missing is a new sample drawn from `sentence_pairs`.
DATASET_A_PATH = "dataset_a.json"
DATASET_A_SIZE = 1000

if os.path.exists(DATASET_A_PATH):
    with open(DATASET_A_PATH, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)
    sampled_pairs = data
else:
    # min() guards against corpora with fewer than DATASET_A_SIZE pairs, for
    # which random.sample would raise ValueError.
    sampled_pairs = random.sample(sentence_pairs, min(DATASET_A_SIZE, len(sentence_pairs)))
    with open(DATASET_A_PATH, "w", encoding="utf-8") as json_file:
        json.dump(sampled_pairs, json_file, ensure_ascii=False, indent=4)
    data = sampled_pairs

# Split ratio: 80% training, 20% testing. The pairs were sampled randomly, so
# slicing by position already yields a random split.
split_ratio = 0.8
train_size = int(len(data) * split_ratio)
train_data = data[:train_size]
test_data = data[train_size:]

# Save the training and testing sets into separate JSON files
with open("dataset_a_train.json", "w", encoding="utf-8") as train_file:
    json.dump(train_data, train_file, ensure_ascii=False, indent=4)
with open("dataset_a_test.json", "w", encoding="utf-8") as test_file:
    json.dump(test_data, test_file, ensure_ascii=False, indent=4)

print("Number of training pairs:", len(train_data))
print("Number of testing pairs:", len(test_data))


# **Model A**

In [None]:
import json
import torch
import re
from langdetect import detect
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import Dataset
import sacrebleu
from bert_score import score

# Checkpoint evaluated in this section
model_name = "microsoft/phi-2"

# Held-out pairs used to score the model
with open("dataset_a_test.json", "r", encoding="utf-8") as file:
    test_data = json.load(file)

# Tokenizer: EOS is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 8-bit quantization settings. The second flag allows modules that are placed
# on the CPU by the device map to be kept there in fp32.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# device_map="auto" lets the library decide where each module is placed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)


# Function to filter out non-French parts while keeping valid French sentences
def filter_french_text(text):
    """Return only the sentences of ``text`` that langdetect classifies as French.

    The text is split after '.', '!' or '?' followed by whitespace; every
    fragment is classified on its own and the French ones are re-joined with a
    single space.

    Args:
        text: Generated text that may mix French with other languages.

    Returns:
        The French sentences joined by spaces, or "" when none is detected.
    """
    # Local import keeps the cell self-contained; the file only imports `detect`.
    from langdetect.lang_detect_exception import LangDetectException

    french_sentences = []
    for sent in re.split(r'(?<=[.!?])\s+', text):
        # Blank fragments (e.g. from an empty generation) carry no language
        if not sent.strip():
            continue
        try:
            if detect(sent) == "fr":
                french_sentences.append(sent)
        except LangDetectException:
            # langdetect raises when a fragment has no usable features
            # (digits, punctuation only); such a fragment is simply not French.
            continue
    return " ".join(french_sentences).strip()

# Function to generate translation and retry if needed
def generate_translation(input_text, max_new_tokens=50, max_attempts=2):
    """Translate a German sentence into French with the global ``model``.

    Uses the module-level ``tokenizer``, ``model`` and ``filter_french_text``.

    Args:
        input_text: German source sentence; any "->" marker is removed first.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_attempts: How many generations to try before giving up.

    Returns:
        The French-only part of the first generation that contains French. If
        no attempt yields French, the last raw generation is returned (an empty
        string when ``max_attempts`` is 0).
    """
    input_text = input_text.replace("->", "").strip()
    conditioned_text = "Translate the following German text into French: " + input_text

    # The prompt does not change between attempts, so it is tokenized once
    inputs = tokenizer(conditioned_text, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    translation = ""
    for attempt in range(1, max_attempts + 1):
        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=2.0,  # Discourage repetition
            length_penalty=1.5,      # Favour longer, more complete hypotheses
            pad_token_id=tokenizer.eos_token_id,
        )
        if attempt > 1:
            # Plain beam search is deterministic, so repeating it would return
            # exactly the same text. Retries sample instead; temperature only
            # has an effect when sampling is enabled.
            generation_kwargs.update(do_sample=True, temperature=0.7)

        output_ids = model.generate(**inputs, **generation_kwargs)

        # A causal LM returns prompt + continuation. Keep only the continuation,
        # otherwise the English instruction and the German source end up in the
        # "translation" and distort language filtering and the metrics.
        new_token_ids = output_ids[0][prompt_length:]
        translation = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        translation = translation.replace("->", "").strip()

        # Keep only the sentences detected as French
        filtered_translation = filter_french_text(translation)
        if filtered_translation:
            return filtered_translation

        print(f"Attempt {attempt}: No French detected. Retrying...")

    # No attempt produced French: hand back the last raw generation
    print(f"Max retries reached. Returning last attempt: {translation}")
    return translation

# Hypotheses and references, kept index-aligned (one reference per hypothesis)
generated_translations = []
reference_translations = []

# Translate every test pair
for idx, sample in enumerate(test_data):
    try:
        german_input = sample["german"]
        ref_french = sample["french"]
        generated = generate_translation(german_input, max_new_tokens=50)

        if generated:  # Only score non-empty translations
            print(f"Sample {idx}:")
            print("German input:", german_input)
            print("Generated translation (validated):", generated)
            print("Reference translation:", ref_french)
            print("-" * 50)

            generated_translations.append(generated)
            reference_translations.append(ref_french)
        else:
            print(f"Skipping Sample {idx} due to repeated non-French output.")
    except Exception as e:
        # Best effort: one failing sample must not abort the whole evaluation
        print(f"Error processing sample {idx}: {e}")

print("Number of valid French translations:", len(generated_translations))

# Compute BLEU score only if translations exist
if generated_translations:
    # sacreBLEU expects a list of reference *streams*, each stream holding one
    # reference per hypothesis. With a single reference per sentence that is
    # [reference_translations], not one single-item list per sentence.
    bleu = sacrebleu.corpus_bleu(generated_translations, [reference_translations])
    print("BLEU score on quick test set:", bleu.score)
else:
    print("No valid translations generated. Skipping BLEU score.")

# Compute BERTScore only if translations exist
if generated_translations:
    P, R, F1 = score(generated_translations, reference_translations, lang="fr", verbose=True)
    print("Average BERTScore F1 on test set:", F1.mean().item())
else:
    print("No valid translations generated. Skipping BERTScore.")


# **Model B**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Base checkpoint that receives the LoRA adapters in this section
model_name = "microsoft/phi-2"

# Tokenizer: EOS is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# 8-bit quantization settings. The second flag allows modules that are placed
# on the CPU by the device map to be kept there in fp32.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# device_map="auto" lets the library decide where each module is placed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)


In [None]:
from datasets import Dataset

def tokenize_function(examples):
    """Tokenize a batch of training strings, truncating each to 128 tokens."""
    return tokenizer(examples["text"], max_length=128, truncation=True)


# One training string per pair, in the form "<german> -> <french>"
train_texts = [" -> ".join((pair["german"], pair["french"])) for pair in train_data]

# Wrap the strings in a Hugging Face Dataset with a single "text" column
train_dataset = Dataset.from_dict({"text": train_texts})

# Tokenize in batches and drop the raw text so only model inputs remain
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)



In [None]:
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

# Prepare the 8-bit base model for training before adapters are attached. Per
# the PEFT documentation this helper freezes the base weights, casts the
# layer norms to fp32 and enables gradient checkpointing by default, which is
# the recommended setup for fine-tuning a quantized model.
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling
    inference_mode=False,          # Adapters are created in training mode
    r=32,                          # LoRA rank
    lora_alpha=64,                 # LoRA scaling factor
    lora_dropout=0.1,              # Dropout applied inside the adapters
)

model = get_peft_model(model, peft_config)

# Defensive: make sure that only LoRA parameters remain trainable
for name, param in model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False

# Sanity check: report how many parameters will actually be updated
model.print_trainable_parameters()



In [None]:
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
import json
from datasets import Dataset
from transformers import TrainerCallback

# Release cached GPU blocks before training starts
torch.cuda.empty_cache()

# Read the held-out pairs and wrap them in a Dataset
test_set_path = "/content/dataset_a_test.json"  # Update path if needed
with open(test_set_path, "r", encoding="utf-8") as file:
    test_data = json.load(file)
test_dataset = Dataset.from_list(test_data)

# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

def preprocess_function(example):
    """Tokenize evaluation pairs in the same format used for training.

    The training strings are built as "<german> -> <french>" and truncated to
    128 tokens, so the evaluation examples are built the same way. Tokenizing
    only the French side (padded to 512 tokens) would make the validation loss
    measure something different from what the model is trained on.

    Args:
        example: A single row or, with ``batched=True``, a batch whose
            "german" and "french" entries are lists of strings.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" (a copy of the
        input ids).
    """
    if isinstance(example["french"], str):
        # Un-batched call: a single pair
        texts = example["german"] + " -> " + example["french"]
    else:
        texts = [
            german + " -> " + french
            for german, french in zip(example["german"], example["french"])
        ]

    # Fixed-length padding keeps every row the same size, as before
    tokenized = tokenizer(
        texts, truncation=True, padding="max_length", max_length=128
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": tokenized["input_ids"]  # Labels needed for causal LM
    }

# Tokenize test dataset
tokenized_eval = test_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs, as PyTorch tensors
tokenized_eval.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Define a custom callback to monitor GPU utilization
class GPUUsageCallback(TrainerCallback):
    """Trainer callback that records GPU memory usage with every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Attach allocated/reserved GPU memory (in GB) to the current log entry.

        The values are written both to ``logs`` and to the latest entry of
        ``state.log_history``.
        """
        if logs is None:
            # Nothing to annotate; the default logs=None must not crash
            return

        gpu_stats = {
            'gpu_memory_allocated_gb': torch.cuda.memory_allocated() / 1024 ** 3,
            'gpu_memory_reserved_gb': torch.cuda.memory_reserved() / 1024 ** 3,
        }
        logs.update(gpu_stats)

        # NOTE(review): in current transformers versions Trainer.log appends a
        # separate dict to state.log_history before callbacks run, so updating
        # `logs` alone would not reach the history that the plots read.
        if state.log_history:
            state.log_history[-1].update(gpu_stats)

# Optimisation: 5 epochs, 1 sequence per device and 4 accumulation steps
_optimisation = dict(
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    fp16=False,
)

# Checkpointing: save every 100 steps, keep 2 checkpoints, and reload the one
# with the lowest eval_loss when training ends
_checkpointing = dict(
    output_dir="./phi2-8bit-lora-finetuned",
    overwrite_output_dir=True,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)

# Evaluation and logging: both every 100 steps, reported to TensorBoard
_monitoring = dict(
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    logging_dir='./logs',
    prediction_loss_only=False,
    report_to=["tensorboard"],
)

training_args = TrainingArguments(**_optimisation, **_checkpointing, **_monitoring)

# Wire model, data, collator and the GPU-usage callback into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    callbacks=[GPUUsageCallback()],
)

# Run the fine-tuning
trainer.train()

# Store the resulting model (with its LoRA adapter weights) locally
trainer.save_model("./my_finetuned_phi2")


In [None]:

import matplotlib.pyplot as plt
import pandas as pd

# Logged history of the finished run, as a DataFrame (one row per log event)
log_history = trainer.state.log_history
df = pd.DataFrame(log_history)


def _plot_logged_metrics(frame, labels_by_column, ylabel, title):
    """Plot logged columns against 'step' in one figure.

    Training and evaluation events are separate rows, so every metric column
    is NaN on the rows of the other event type. Rows where the metric is NaN
    are dropped per column; otherwise the interleaved NaNs break the line into
    isolated points and nothing visible is drawn.

    Args:
        frame: DataFrame built from the Trainer log history.
        labels_by_column: Mapping of column name -> legend label.
        ylabel: Label of the y axis.
        title: Figure title.

    Returns:
        True if a figure was drawn, False if none of the columns is present.
    """
    present = {col: label for col, label in labels_by_column.items() if col in frame.columns}
    if not present or 'step' not in frame.columns:
        return False

    plt.figure(figsize=(10, 5))
    for column, label in present.items():
        rows = frame[frame[column].notna()]
        plt.plot(rows['step'], rows[column], label=label)
    plt.xlabel("Steps")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()
    return True


# Training & validation loss
_plot_logged_metrics(
    df,
    {'loss': "Training Loss", 'eval_loss': "Validation Loss"},
    ylabel="Loss",
    title="Training vs Validation Loss",
)

# Learning rate schedule (only if it was logged)
_plot_logged_metrics(
    df,
    {'learning_rate': "Learning Rate"},
    ylabel="Learning Rate",
    title="Learning Rate Schedule",
)

# GPU memory usage (only if it was logged)
_plot_logged_metrics(
    df,
    {
        'gpu_memory_allocated_gb': "GPU Memory Allocated (GB)",
        'gpu_memory_reserved_gb': "GPU Memory Reserved (GB)",
    },
    ylabel="Memory (GB)",
    title="GPU Memory Usage Over Training",
)

In [None]:
import torch
from langdetect import detect
import sacrebleu
from bert_score import score
import re

# Function to filter out non-French parts while keeping valid French sentences
def filter_french_text(text):
    """Return only the sentences of ``text`` that langdetect classifies as French.

    The text is split after '.', '!' or '?' followed by whitespace; every
    fragment is classified on its own and the French ones are re-joined with a
    single space.

    Args:
        text: Generated text that may mix French with other languages.

    Returns:
        The French sentences joined by spaces, or "" when none is detected.
    """
    # Local import keeps the cell self-contained; the file only imports `detect`.
    from langdetect.lang_detect_exception import LangDetectException

    french_sentences = []
    for sent in re.split(r'(?<=[.!?])\s+', text):
        # Blank fragments (e.g. from an empty generation) carry no language
        if not sent.strip():
            continue
        try:
            if detect(sent) == "fr":
                french_sentences.append(sent)
        except LangDetectException:
            # langdetect raises when a fragment has no usable features
            # (digits, punctuation only); such a fragment is simply not French.
            continue
    return " ".join(french_sentences).strip()

# Function to generate translation and retry if needed
def generate_translation(input_text, max_new_tokens=50, max_attempts=5):
    """Translate a German sentence into French with the global ``model``.

    Uses the module-level ``tokenizer``, ``model`` and ``filter_french_text``.

    Args:
        input_text: German source sentence; any "->" marker is removed first.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_attempts: How many generations to try before giving up.

    Returns:
        The French-only part of the first generation that contains French. If
        no attempt yields French, the last raw generation is returned (an empty
        string when ``max_attempts`` is 0).
    """
    input_text = input_text.replace("->", "").strip()
    # NOTE(review): the fine-tuning strings in this notebook are built as
    # "<german> -> <french>", while this prompt uses an instruction sentence.
    # TODO confirm the mismatch between training and inference format is intended.
    conditioned_text = "Translate the following German text into French: " + input_text

    # The prompt does not change between attempts, so it is tokenized once
    inputs = tokenizer(conditioned_text, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    translation = ""
    for attempt in range(1, max_attempts + 1):
        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,
            num_beams=5,
            early_stopping=True,
            repetition_penalty=2.0,  # Discourage repetition
            length_penalty=1.5,      # Favour longer, more complete hypotheses
            pad_token_id=tokenizer.eos_token_id,
        )
        if attempt > 1:
            # Plain beam search is deterministic, so repeating it would return
            # exactly the same text. Retries sample instead; temperature only
            # has an effect when sampling is enabled.
            generation_kwargs.update(do_sample=True, temperature=0.7)

        output_ids = model.generate(**inputs, **generation_kwargs)

        # A causal LM returns prompt + continuation. Keep only the continuation,
        # otherwise the English instruction and the German source end up in the
        # "translation" and distort language filtering and the metrics.
        new_token_ids = output_ids[0][prompt_length:]
        translation = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        translation = translation.replace("->", "").strip()

        # Keep only the sentences detected as French
        filtered_translation = filter_french_text(translation)
        if filtered_translation:
            return filtered_translation

        print(f"Attempt {attempt}: No French detected. Retrying...")

    # No attempt produced French: hand back the last raw generation
    print(f"Max retries reached. Returning last attempt: {translation}")
    return translation

# Hypotheses and references, kept index-aligned (one reference per hypothesis)
generated_translations = []
reference_translations = []

# Translate every test pair
for idx, sample in enumerate(test_data):
    try:
        german_input = sample["german"]
        ref_french = sample["french"]
        generated = generate_translation(german_input, max_new_tokens=50)

        if generated:  # Only score non-empty translations
            print(f"Sample {idx}:")
            print("German input:", german_input)
            print("Generated translation (validated):", generated)
            print("Reference translation:", ref_french)
            print("-" * 50)

            generated_translations.append(generated)
            reference_translations.append(ref_french)
        else:
            print(f"Skipping Sample {idx} due to repeated non-French output.")
    except Exception as e:
        # Best effort: one failing sample must not abort the whole evaluation
        print(f"Error processing sample {idx}: {e}")

print("Number of valid French translations:", len(generated_translations))

# Compute BLEU score only if translations exist
if generated_translations:
    # sacreBLEU expects a list of reference *streams*, each stream holding one
    # reference per hypothesis. With a single reference per sentence that is
    # [reference_translations], not one single-item list per sentence.
    bleu = sacrebleu.corpus_bleu(generated_translations, [reference_translations])
    print("BLEU score on quick test set:", bleu.score)
else:
    print("No valid translations generated. Skipping BLEU score.")

# Compute BERTScore only if translations exist
if generated_translations:
    P, R, F1 = score(generated_translations, reference_translations, lang="fr", verbose=True)
    print("Average BERTScore F1 on test set:", F1.mean().item())
else:
    print("No valid translations generated. Skipping BERTScore.")


# **Dataset B**

In [None]:
import openai
import os
import time

from google.colab import drive  # Import Google Drive for saving files

# Mount Google Drive so the raw API responses outlive the Colab session
drive.mount('/content/drive')

# Folder in Google Drive that receives one text file per API response
GDRIVE_PATH = "/content/drive/MyDrive/raw_responses"
os.makedirs(GDRIVE_PATH, exist_ok=True)

# SECURITY: the API key must never be written into the notebook. It is read
# from the SAMBANOVA_API_KEY environment variable, with an interactive prompt
# as fallback. The key that used to be hard-coded in this cell has been exposed
# to everyone with access to the notebook and should be revoked.
SAMBANOVA_API_KEY = os.environ.get("SAMBANOVA_API_KEY")
if not SAMBANOVA_API_KEY:
    from getpass import getpass
    SAMBANOVA_API_KEY = getpass("SambaNova API key: ")

# OpenAI-compatible client pointed at the SambaNova endpoint
client = openai.Client(
    api_key=SAMBANOVA_API_KEY,
    base_url="https://api.sambanova.ai/v1",
)

# Raw responses already received, used to skip exact repeats
existing_sentences = set()

def generate_synthetic_data(prompt, max_tokens=4096, temperature=0.9, top_p=0.95, retries=5, retry_delay=30):
    """Request translation pairs from the SambaNova API, retrying on rate limits.

    Uses the module-level ``client``.

    Args:
        prompt: User prompt sent to the model.
        max_tokens: Maximum number of tokens in the completion.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        retries: Maximum number of requests to attempt.
        retry_delay: Seconds to wait after a rate-limit error before retrying.

    Returns:
        The text of the first completion choice, or None when a non-rate-limit
        API error occurs or every attempt is rate-limited.
    """
    for attempt in range(retries):
        try:
            response = client.chat.completions.create(
                model="Meta-Llama-3.3-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are a highly creative assistant that generates advanced, unique, and diverse translations."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            )
            return response.choices[0].message.content
        except openai.APIError as e:
            # Prefer the dedicated exception type; the substring test is kept
            # for providers that report the condition only in the message.
            rate_limited = isinstance(e, openai.RateLimitError) or "rate_limit_exceeded" in str(e)
            if not rate_limited:
                print(f"API request failed due to unexpected error: {e}")
                return None
            print(f"Rate limit exceeded. Retrying... (Attempt {attempt+1}/{retries})")
            # No point in waiting after the final attempt
            if attempt + 1 < retries:
                time.sleep(retry_delay)
    print("Max retries reached. Skipping this batch.")
    return None

# === SETTINGS ===
TOTAL_PAIRS = 2000  # Total translation pairs
BATCH_SIZE = 20     # Number of pairs requested per API call
NUM_CALLS = TOTAL_PAIRS // BATCH_SIZE  # Total API calls
SECONDS_BETWEEN_CALLS = 10  # Pause between requests to stay below the rate limit

# The prompt is identical for every batch, so it is built once. Both mentions
# of the batch size come from BATCH_SIZE so they cannot drift apart.
prompt_text = (
    "IMPORTANT: Output ONLY a valid JSON array and nothing else. Do not include markdown formatting, "
    "explanation, chain-of-thought, or extra text. The output must start with '[' and end with ']'.\n\n"
    f"Generate exactly {BATCH_SIZE} completely unique and diverse German–French translation pairs. "
    "Each element must be a JSON object with two keys: 'german' and 'french'.\n\n"
    "### **INSTRUCTIONS FOR COMPLEXITY & UNIQUENESS:**\n"
    "- **Ensure every sentence is completely unique and does not repeat previous structures.**\n"
    "- **Use varied sentence structures** (passive voice, indirect speech, complex clauses, figurative speech).\n"
    "- **Incorporate idioms, rich vocabulary, and professional/formal phrasing**.\n"
    "- **Use complex word order, inversion, and subjunctive mood (Konjunktiv).**\n"
    "- **Avoid repetitive sentence patterns. Every sentence must feel fresh.**\n"
    "- **Do not use common sentences. Push creativity and originality.**\n\n"
    "**EXAMPLES:**\n"
    "[\n"
    "  {\"german\": \"Wäre ich damals nicht in die falsche Richtung gelaufen, hätte ich den letzten Bus noch erwischt, doch stattdessen musste ich die Nacht auf einer Parkbank verbringen.\",\n"
    "   \"french\": \"Si je n'avais pas pris la mauvaise direction à ce moment-là, j'aurais encore attrapé le dernier bus, mais à la place, j'ai dû passer la nuit sur un banc dans le parc.\"},\n"
    "  {\"german\": \"Die alte Villa am See, die seit Jahren verlassen stand, wirkte in der Dämmerung fast gespenstisch, als ob die Wände die Flüstereien vergangener Zeiten bewahrten.\",\n"
    "   \"french\": \"L'ancienne villa au bord du lac, abandonnée depuis des années, semblait presque hantée au crépuscule, comme si ses murs avaient conservé les murmures du passé.\"},\n"
    "  {\"german\": \"Sollte es jemals regnen, wenn wir auf dem Gipfel des Berges sind, dann werde ich mich daran erinnern, wie du mir erzählt hast, dass die beste Aussicht immer nach dem Sturm kommt.\",\n"
    "   \"french\": \"S'il devait pleuvoir lorsque nous serons au sommet de la montagne, alors je me souviendrai que tu m'as dit que la plus belle vue vient toujours après la tempête.\"}\n"
    "]\n\n"
    "### **IMPORTANT RULES:**\n"
    f"- **MUST contain exactly {BATCH_SIZE} pairs. No more, no less.**\n"
    "- **DO NOT generate repetitive structures or phrases.**\n"
    "- **Every sentence must be unique, no slight variations of the same sentence.**\n"
    "- **Output only the JSON array.**\n"
)

# === BATCH REQUEST LOOP ===
for i in range(NUM_CALLS):
    print(f"Requesting batch {i+1}/{NUM_CALLS} ({BATCH_SIZE} pairs)...")

    response_text = generate_synthetic_data(prompt_text, max_tokens=4096, temperature=0.9, top_p=0.95)

    if response_text is None:
        print(f"Skipping batch {i+1} due to API request failure.")
        continue

    # Skip a response that is byte-identical to an earlier one. This compares
    # whole responses only; pair-level duplicates are checked in a later cell.
    if response_text in existing_sentences:
        print(f"Duplicate response detected in batch {i+1}. Skipping.")
        continue
    existing_sentences.add(response_text)

    # Save the raw response to a text file in Google Drive
    raw_filename = f"{GDRIVE_PATH}/raw_batch_{i+1}.txt"
    with open(raw_filename, "w", encoding="utf-8") as raw_file:
        raw_file.write(response_text)
    print(f"Batch {i+1} saved to {raw_filename}")

    # Pause between API calls to reduce the chance of hitting the rate limit
    time.sleep(SECONDS_BETWEEN_CALLS)

print("\n All batches completed! Raw JSON text files saved in Google Drive under 'raw_responses/'.")


In [None]:
import shutil
from google.colab import drive

import os

# Mount Google Drive
drive.mount('/content/drive')

# Folder in Colab's local storage and its destination in Google Drive
LOCAL_FOLDER = "/content/raw_responses 5"
GDRIVE_FOLDER = "/content/drive/MyDrive/raw_responses 5"

# Move the folder, but keep the cell safe to re-run: after a successful move
# the source no longer exists, and moving onto an existing directory would
# place the source *inside* it instead of replacing it.
if not os.path.isdir(LOCAL_FOLDER):
    print(f"Folder '{LOCAL_FOLDER}' not found; nothing to move.")
elif os.path.exists(GDRIVE_FOLDER):
    print(f"Destination '{GDRIVE_FOLDER}' already exists; skipping move to avoid nesting the folder inside it.")
else:
    shutil.move(LOCAL_FOLDER, GDRIVE_FOLDER)
    print(f"Folder '{LOCAL_FOLDER}' moved to '{GDRIVE_FOLDER}' in Google Drive!")


In [None]:
import os
import shutil
from google.colab import drive

import re

# Mount Google Drive
drive.mount('/content/drive')

# Source folder with the additional responses and the main destination folder
source_folder = "/content/drive/MyDrive/raw_responses 5"
destination_folder = "/content/drive/MyDrive/raw_responses"

# Ensure destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# NOTE(review): the code has always taken the last 58 files and numbered them
# from 55 upwards (i.e. up to raw_batch_112), while earlier comments mentioned
# 45/48 files and a range ending at raw_batch_100 — TODO confirm the count.
NUM_FILES_TO_COPY = 58
start_batch_number = 55


def _natural_sort_key(file_name):
    """Sort key that orders 'raw_batch_2.txt' before 'raw_batch_10.txt'.

    Digit runs are compared as integers; plain string sorting would put
    'raw_batch_10' ahead of 'raw_batch_2' and select the wrong "last" files.
    """
    return [int(part) if part.isdigit() else part for part in re.split(r'(\d+)', file_name)]


# List the source files in numeric batch order and keep the last ones
files = sorted(os.listdir(source_folder), key=_natural_sort_key)
files_to_copy = files[-NUM_FILES_TO_COPY:]

for file_name in files_to_copy:
    old_path = os.path.join(source_folder, file_name)  # Original file path
    new_file_name = f"raw_batch_{start_batch_number}.txt"  # New name format
    new_path = os.path.join(destination_folder, new_file_name)  # Destination path

    # Copy file to destination folder under its new name
    shutil.copy(old_path, new_path)
    print(f" Copied '{file_name}' → '{new_file_name}'")

    start_batch_number += 1  # Next batch number

# Report the number of files actually copied
print(f"\n All {len(files_to_copy)} files copied and renamed successfully!")


In [None]:
import os
import json
from collections import defaultdict
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Folder that holds one raw API response per file
dataset_folder = "/content/drive/MyDrive/raw_responses"

# Ensure the folder exists
if not os.path.exists(dataset_folder):
    raise FileNotFoundError(f"Folder '{dataset_folder}' not found in Google Drive.")

# Tracking state
total_files_checked = 0
total_pairs = 0
all_pairs = set()                    # Every distinct (german, french) pair seen
first_seen_in = {}                   # pair -> file where it first appeared
duplicate_pairs = defaultdict(list)  # pair -> all files containing it
corrupted_files = []

# Loop through all .txt files in the dataset folder
for file_name in sorted(os.listdir(dataset_folder)):
    if not file_name.endswith(".txt"):
        continue
    file_path = os.path.join(dataset_folder, file_name)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        # Extracting the pairs here also validates the structure: a response
        # that is valid JSON but not a list of {"german", "french"} objects
        # raises KeyError/TypeError and is reported instead of aborting the run.
        file_pairs = [(pair["german"], pair["french"]) for pair in data]
    except (json.JSONDecodeError, KeyError, TypeError):
        print(f"Error: {file_name} is not a valid JSON file or is corrupted.")
        corrupted_files.append(file_name)
        continue

    # Each response was requested with exactly 20 pairs
    if len(file_pairs) != 20:
        print(f"Warning: {file_name} contains {len(file_pairs)} pairs instead of 20.")

    for pair_tuple in file_pairs:
        if pair_tuple in all_pairs:
            # Include the file of the first occurrence so both locations of a
            # duplicate are listed, not only the later one(s).
            if not duplicate_pairs[pair_tuple]:
                duplicate_pairs[pair_tuple].append(first_seen_in[pair_tuple])
            duplicate_pairs[pair_tuple].append(file_name)
        else:
            all_pairs.add(pair_tuple)
            first_seen_in[pair_tuple] = file_name

    total_pairs += len(file_pairs)
    total_files_checked += 1

# Validation Results
print("\n=== Dataset Validation Summary ===")
print(f"Checked {total_files_checked} files.")
print(f"Total pairs found: {total_pairs}.")

# Check for duplicate pairs
if duplicate_pairs:
    print(f"Found {len(duplicate_pairs)} duplicate pairs. Review the dataset:")
    for pair, file_list in duplicate_pairs.items():
        print(f"Duplicate pair found in files: {', '.join(file_list)}")
        print(f"German: {pair[0]}")
        print(f"French: {pair[1]}")
        print("-" * 50)
else:
    print("No duplicate pairs found. Dataset is unique.")

# Check for corrupted files
if corrupted_files:
    print(f"Found {len(corrupted_files)} corrupted or unreadable files:")
    for file in corrupted_files:
        print(f" - {file}")
else:
    print("No corrupted files detected.")

print("\nDataset verification complete.")


In [None]:
import os
import json
from google.colab import drive

# Make Google Drive available
drive.mount('/content/drive')

# Input folder with the raw responses and the combined output file
dataset_folder = "/content/drive/MyDrive/raw_responses"
output_file = "/content/drive/MyDrive/dataset_b.json"

# Response files that are left out of the combined dataset
excluded_files = {"raw_batch_16.txt", "raw_batch_28.txt", "raw_batch_95.txt"}

# Abort early when the input folder is missing
if not os.path.exists(dataset_folder):
    raise FileNotFoundError(f"Folder '{dataset_folder}' not found in Google Drive.")


def _is_pair_list(candidate):
    """Return True if ``candidate`` is a list of dicts having 'german' and 'french'."""
    if not isinstance(candidate, list):
        return False
    return all(
        isinstance(pair, dict) and "german" in pair and "french" in pair
        for pair in candidate
    )


# All accepted pairs, in sorted-file order
combined_data = []

for file_name in sorted(os.listdir(dataset_folder)):
    # Only non-excluded .txt files are considered
    if not file_name.endswith(".txt") or file_name in excluded_files:
        continue

    file_path = os.path.join(dataset_folder, file_name)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError:
        print(f"Skipping unreadable JSON file: {file_name}")
        continue

    # Keep the file only if every entry has the expected shape
    if _is_pair_list(data):
        combined_data.extend(data)
    else:
        print(f"Skipping invalid or malformed file: {file_name}")

# Write the combined dataset
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(combined_data, json_file, ensure_ascii=False, indent=4)

print(f"Dataset successfully saved as 'dataset_b.json' in Google Drive.")
print(f"Total translation pairs: {len(combined_data)}")


In [None]:
import json
import os
from collections import defaultdict
from google.colab import drive

# Make Google Drive available
drive.mount('/content/drive')

# Location of the combined dataset
dataset_path = "/content/drive/MyDrive/dataset_b.json"

# Abort early when the file is missing
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"File '{dataset_path}' not found in Google Drive.")

# Read the combined dataset
with open(dataset_path, "r", encoding="utf-8") as file:
    dataset = json.load(file)

# seen_pairs holds every distinct pair; duplicate_pairs counts how many
# *additional* times a pair occurs after its first appearance
seen_pairs = set()
duplicate_pairs = defaultdict(int)

for pair in dataset:
    pair_tuple = (pair["german"], pair["french"])
    if pair_tuple not in seen_pairs:
        seen_pairs.add(pair_tuple)
        continue
    duplicate_pairs[pair_tuple] += 1

# Number of surplus entries across the dataset
total_duplicates = sum(duplicate_pairs.values())

print("\n=== Dataset Validation Summary ===")
print(f"Total pairs in dataset: {len(dataset)}")
print(f"Total unique pairs: {len(seen_pairs)}")
print(f"Total duplicate pairs: {total_duplicates}")

if total_duplicates == 0:
    print("No duplicate pairs found. Dataset is unique.")
else:
    print("\nDuplicate Pairs Found:")
    for pair, count in duplicate_pairs.items():
        # count excludes the first occurrence, hence the +1
        print(
            f"German: {pair[0]}",
            f"French: {pair[1]}",
            f"Occurrences: {count + 1}",
            "-" * 50,
            sep="\n",
        )

print("\nDataset verification complete.")


# **Model C**

In [None]:
import gc
import torch
def _release_memory():
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()


_release_memory()
print("Memory cleared!")


In [None]:
import json
import random
import os

from datasets import Dataset

# dataset_b.json is written to Google Drive by the dataset-building cell. Use
# the local copy if present, otherwise fall back to the Drive location so the
# cell also works when the file was never copied to /content by hand.
local_path = "/content/dataset_b.json"
drive_path = "/content/drive/MyDrive/dataset_b.json"
if not os.path.exists(local_path) and os.path.exists(drive_path):
    local_path = drive_path

# Load dataset_b (for training)
with open(local_path, "r", encoding="utf-8") as file:
    dataset_b = json.load(file)

# Use at most 1600 training samples. A dedicated, seeded RNG makes the subset
# identical on every run.
TRAIN_SAMPLE_SIZE = 1600
TRAIN_SAMPLE_SEED = 42
if len(dataset_b) > TRAIN_SAMPLE_SIZE:
    train_data = random.Random(TRAIN_SAMPLE_SEED).sample(dataset_b, TRAIN_SAMPLE_SIZE)
else:
    train_data = dataset_b

print(f"Training samples from dataset_b: {len(train_data)}")

# Load test dataset from "/content/dataset_a_test.json" (used for validation)
test_set_path = "/content/dataset_a_test.json"
with open(test_set_path, "r", encoding="utf-8") as file:
    test_data = json.load(file)

print(f"Testing samples from dataset_a_test.json: {len(test_data)}")

# Training and evaluation strings share the "<german> -> <french>" format
train_texts = [sample["german"] + " -> " + sample["french"] for sample in train_data]
eval_texts = [sample["german"] + " -> " + sample["french"] for sample in test_data]

train_dataset = Dataset.from_dict({"text": train_texts})
eval_dataset = Dataset.from_dict({"text": eval_texts})


In [None]:
# Load model and tokenizer with quantization configuration
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,                      # load the model in 8-bit mode
    llm_int8_enable_fp32_cpu_offload=True,    # offload some operations to CPU for stability
)
model_name = "microsoft/phi-2"
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=quant_config, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

def tokenize_function(examples):
    """Tokenize the "text" entries of a batch, truncating each to 128 tokens."""
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, max_length=128, truncation=True)
    return encoded

# Tokenize both splits in batches; the raw "text" column is dropped afterwards
tokenized_train = train_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)
tokenized_eval = eval_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

# Wrap the model with LoRA adapters for causal language modeling
from peft import get_peft_model, LoraConfig, TaskType
peft_config = LoraConfig(
    r=32,                          # adapter rank
    lora_alpha=64,                 # scaling factor
    lora_dropout=0.1,              # dropout probability
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
)
model = get_peft_model(model, peft_config)

# Every parameter whose name does not contain "lora" is frozen
for name, param in model.named_parameters():
    if "lora" in name:
        continue
    param.requires_grad = False

model.train()

In [None]:
import gc
# Define a custom callback to record training and validation losses
from transformers import TrainerCallback

class LossPlotCallback(TrainerCallback):
    """Collect (global_step, loss) pairs while the Trainer runs.

    ``train_losses`` gets one entry for each log record that contains "loss";
    ``eval_losses`` gets one entry for each evaluation whose metrics contain
    "eval_loss".
    """

    def __init__(self):
        self.train_losses, self.eval_losses = [], []

    def on_log(self, args, state, control, logs=None, **kwargs):
        # Records without a training loss are ignored
        if logs is None or "loss" not in logs:
            return
        self.train_losses.append((state.global_step, logs["loss"]))

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        # Evaluations without an "eval_loss" metric are ignored
        if metrics is None or "eval_loss" not in metrics:
            return
        self.eval_losses.append((state.global_step, metrics["eval_loss"]))

loss_callback = LossPlotCallback()

# Clear garbage and GPU cache
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared!")

import inspect
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Causal-LM collator (mlm=False): pads each batch and derives labels from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword may no longer be accepted. Because the notebook
# installs the latest version, pick whichever name the installed
# TrainingArguments actually exposes.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Training arguments with evaluation strategy enabled
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/model_c",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    fp16=False,
    save_steps=50,
    save_total_limit=2,
    prediction_loss_only=True,
    logging_steps=50,
    eval_steps=50,                      # Evaluate every 50 steps
    **{eval_strategy_key: "steps"},     # Enable evaluation during training
)

# Initialize Trainer with both training and eval datasets and custom callback
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    callbacks=[loss_callback]
)

# Start training (evaluates at each eval_steps)
trainer.train()

# Save the fine-tuned model
trainer.save_model("/content/model_c")

import matplotlib.pyplot as plt
# Free Python garbage and cached CUDA memory again before plotting
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared!")

# Every record the Trainer logged during the run
log_history = trainer.state.log_history

# Training records carry 'loss' but no 'eval_loss'; evaluation records carry
# 'eval_loss'. Records without a 'step' are ignored in both cases.
train_records = [
    rec for rec in log_history
    if 'step' in rec and 'loss' in rec and 'eval_loss' not in rec
]
eval_records = [
    rec for rec in log_history
    if 'step' in rec and 'eval_loss' in rec
]

train_steps = [rec['step'] for rec in train_records]
train_losses = [rec['loss'] for rec in train_records]
eval_steps = [rec['step'] for rec in eval_records]
eval_losses = [rec['eval_loss'] for rec in eval_records]

plt.figure(figsize=(10,5))
plt.plot(train_steps, train_losses, label='Training Loss')
plt.plot(eval_steps, eval_losses, marker='o', label='Validation Loss')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()

import torch
from langdetect import detect
import sacrebleu
from bert_score import score
import re
import gc

def filter_french_text(text):
    """Keep only the sentences of ``text`` that langdetect classifies as French.

    The text is split after sentence-ending punctuation (., !, ?). Fragments
    that are empty or that langdetect cannot classify are dropped instead of
    aborting the whole call.

    Args:
        text: Model output that may mix French with other languages.

    Returns:
        The French sentences joined by single spaces ("" if there are none).
    """
    # Local import: only this function needs the exception type
    from langdetect.lang_detect_exception import LangDetectException

    french_sentences = []
    for sent in re.split(r'(?<=[.!?])\s+', text):  # Split into sentences
        if not sent.strip():
            continue  # nothing to classify
        try:
            if detect(sent) == "fr":
                french_sentences.append(sent)
        except LangDetectException:
            # Raised when a fragment has no usable features
            # (for example only digits or punctuation)
            continue
    return " ".join(french_sentences).strip()

def generate_translation(input_text, max_new_tokens=50, max_attempts=5):
    """Translate one German text into French with the current ``model``.

    Args:
        input_text: German source text; any "->" separator is removed.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_attempts: Maximum number of generation attempts.

    Returns:
        The French-only part of the generated continuation. If no attempt
        yields French text, the unfiltered continuation of the last attempt
        is returned ("" when ``max_attempts`` is not positive).
    """
    input_text = input_text.replace("->", "").strip()
    # NOTE(review): this prompt differs from the text format used to build the
    # fine-tuning examples in this notebook - confirm that this is intended.
    conditioned_text = "Translate the following German text into French: " + input_text

    # The prompt is identical for every attempt, so tokenize it once
    inputs = tokenizer(conditioned_text, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Decode in eval mode so dropout layers are inactive; the previous mode is
    # restored afterwards.
    was_training = model.training
    model.eval()
    translation = ""
    try:
        for attempt in range(1, max_attempts + 1):
            # Beam search without sampling is deterministic (and ignores
            # `temperature`), so repeating it would reproduce the same output.
            # Retries therefore sample to get a different candidate.
            sampling_kwargs = {"do_sample": True, "temperature": 0.7} if attempt > 1 else {}
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=5,
                early_stopping=True,
                repetition_penalty=2.0,  # Increase penalty to discourage repetition
                length_penalty=1.5,      # Encourage longer, more complete responses
                **sampling_kwargs,
            )
            # A causal LM returns prompt + continuation; keep only the new
            # tokens so the prompt does not leak into the translation.
            new_token_ids = output_ids[0][prompt_length:]
            translation = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            translation = translation.replace("->", "").strip()

            # Filter out non-French parts
            filtered_translation = filter_french_text(translation)

            # If there is at least some French text, return it
            if filtered_translation:
                return filtered_translation

            print(f"Attempt {attempt}: No French detected. Retrying...")
    finally:
        model.train(was_training)

    print(f"Max retries reached. Returning last attempt: {translation}")
    return translation

# Hypotheses and their references, kept index-aligned (one reference per hypothesis)
generated_translations = []
reference_translations = []

# Loop through test data (raw JSON loaded from dataset_a_test.json)
for idx, sample in enumerate(test_data):
    try:
        german_input = sample["german"]
        ref_french = sample["french"]
        generated = generate_translation(german_input, max_new_tokens=50)

        if generated:  # Only add non-empty translations
            print(f"Sample {idx}:")
            print("German input:", german_input)
            print("Generated translation (validated):", generated)
            print("Reference translation:", ref_french)
            print("-" * 50)

            generated_translations.append(generated)
            reference_translations.append(ref_french)
        else:
            print(f"Skipping Sample {idx} due to repeated non-French output.")
    except Exception as e:
        # Best effort: report the failing sample and continue with the rest
        print(f"Error processing sample {idx}: {e}")

print("Number of valid French translations:", len(generated_translations))

# Compute BLEU score only if translations exist
if generated_translations:
    # sacreBLEU expects a list of reference *streams*, each aligned with the
    # hypotheses. With one reference per sentence that is a single stream
    # holding all references, not one single-item list per sentence.
    bleu = sacrebleu.corpus_bleu(generated_translations, [reference_translations])
    print("BLEU score on test set:", bleu.score)
else:
    print("No valid translations generated. Skipping BLEU score.")

# Compute BERTScore only if translations exist
if generated_translations:
    P, R, F1 = score(generated_translations, reference_translations, lang="fr", verbose=True)
    print("Average BERTScore F1 on test set:", F1.mean().item())
else:
    print("No valid translations generated. Skipping BERTScore.")

# **Model D**

In [None]:
import json
import random
import os
import shutil
import torch
from sklearn.model_selection import train_test_split
import gc
# 1. Dataset A: read it, hold out 20% as the test set, and persist that test set
with open('dataset_a.json', 'r', encoding='utf-8') as src:
    dataset_a = json.loads(src.read())

train_data, test_data = train_test_split(dataset_a, random_state=42, test_size=0.2)

with open('dataset_a_test.json', 'w', encoding='utf-8') as dst:
    dst.write(json.dumps(test_data, ensure_ascii=False, indent=2))

# 2. Dataset C = training part of Dataset A + all of Dataset B, shuffled with a fixed seed
with open('dataset_b.json', 'r', encoding='utf-8') as src:
    dataset_b = json.loads(src.read())

dataset_c = train_data + dataset_b
random.seed(42)
random.shuffle(dataset_c)

with open('dataset_c.json', 'w', encoding='utf-8') as dst:
    dst.write(json.dumps(dataset_c, ensure_ascii=False, indent=2))

# Optionally copy Dataset C to Google Drive (adjust path if needed)
# shutil.copy('dataset_c.json', '/content/drive/MyDrive/dataset_c.json')

# 3. Load the tokenizer and preprocess Dataset C
from transformers import AutoTokenizer
from datasets import Dataset

# Base checkpoint; the same name is used further down to load the model
model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register a dedicated padding token only when the tokenizer defines none
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def preprocess_function(example):
    """Build and tokenize one "prompt + French target" training text.

    Args:
        example: Mapping with the keys "german" and "french".

    Returns:
        The tokenizer output for the full text, truncated to 512 tokens and
        left unpadded.
    """
    prompt = f"Translate German to French: {example['german']}\nFrench:"
    target = f" {example['french']}"
    # End the target with the end-of-sequence token so the model gets a signal
    # for where a translation stops (skipped if the tokenizer defines none).
    eos = tokenizer.eos_token or ""
    full_text = prompt + target + eos
    # No padding here: DataCollatorForLanguageModeling pads every batch to its
    # longest member, which avoids training on 512-token rows of padding.
    tokenized = tokenizer(full_text, truncation=True, max_length=512)
    return tokenized

# Tokenize Dataset C one example at a time
ds = Dataset.from_list(dataset_c)
tokenized_dataset = ds.map(function=preprocess_function, batched=False)

# Hold out 10% of the tokenized examples for evaluation (fixed seed)
split_dataset = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = (split_dataset[part] for part in ('train', 'test'))

# 4. Load the model with quantization, configure LoRA, and fine-tune
from transformers import (
    AutoModelForCausalLM, TrainingArguments, Trainer,
    BitsAndBytesConfig, DataCollatorForLanguageModeling
)


# Clear garbage and GPU cache
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
print("Memory cleared!")

from peft import LoraConfig, get_peft_model

# 8-bit weights; the fp32 CPU-offload flag allows modules that
# device_map="auto" places on the CPU to be kept there in fp32.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load base model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)

# Only grow the embedding matrix when the tokenizer really has more tokens
# than the model has embedding rows. A checkpoint may ship more rows than the
# tokenizer uses (presumably the case for phi-2 - verify); an unconditional
# resize would then shrink the pretrained matrix.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

# (Optional) Inspect the model modules to find the appropriate target module names:
# for name, module in model.named_modules():
#     print(name)

# LoRA adapters on the "q_proj" and "v_proj" modules; the names must match
# modules that exist in the loaded model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)
print("LoRA model configured successfully!")

# Causal-LM collator (mlm=False): pads each batch and derives labels from input_ids
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old keyword may no longer be accepted. Because the notebook
# installs the latest version, pick whichever name the installed
# TrainingArguments actually exposes.
import inspect
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./phi2_finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    eval_steps=20,
    save_steps=100,
    fp16=True,
    push_to_hub=False,
    **{eval_strategy_key: "steps"},  # evaluate every `eval_steps` steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
train_result = trainer.train()
trainer.save_model("/content/model_d")

# 5. Plot training and validation loss graphs
import matplotlib.pyplot as plt

# Every record the Trainer logged during the run
log_history = trainer.state.log_history

# Training records carry 'loss' but no 'eval_loss'; evaluation records carry
# 'eval_loss'. Records without a 'step' are ignored in both cases.
train_records = [
    rec for rec in log_history
    if 'step' in rec and 'loss' in rec and 'eval_loss' not in rec
]
eval_records = [
    rec for rec in log_history
    if 'step' in rec and 'eval_loss' in rec
]

train_steps = [rec['step'] for rec in train_records]
train_losses = [rec['loss'] for rec in train_records]
eval_steps = [rec['step'] for rec in eval_records]
eval_losses = [rec['eval_loss'] for rec in eval_records]

plt.figure(figsize=(10,5))
plt.plot(train_steps, train_losses, label='Training Loss')
plt.plot(eval_steps, eval_losses, marker='o', label='Validation Loss')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.show()


In [None]:

import torch
from langdetect import detect
import sacrebleu
from bert_score import score
import re
import gc


# Collect unreachable Python objects first, then release PyTorch's cached
# CUDA memory when a GPU is present.
gc.collect()
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()
print("Memory cleared!")

def filter_french_text(text):
    """Keep only the sentences of ``text`` that langdetect classifies as French.

    The text is split after sentence-ending punctuation (., !, ?). Fragments
    that are empty or that langdetect cannot classify are dropped instead of
    aborting the whole call.

    Args:
        text: Model output that may mix French with other languages.

    Returns:
        The French sentences joined by single spaces ("" if there are none).
    """
    # Local import: only this function needs the exception type
    from langdetect.lang_detect_exception import LangDetectException

    french_sentences = []
    for sent in re.split(r'(?<=[.!?])\s+', text):  # Split into sentences
        if not sent.strip():
            continue  # nothing to classify
        try:
            if detect(sent) == "fr":
                french_sentences.append(sent)
        except LangDetectException:
            # Raised when a fragment has no usable features
            # (for example only digits or punctuation)
            continue
    return " ".join(french_sentences).strip()

def generate_translation(input_text, max_new_tokens=50, max_attempts=5):
    """Translate one German text into French with the current ``model``.

    Args:
        input_text: German source text; any "->" separator is removed.
        max_new_tokens: Upper bound on the number of generated tokens.
        max_attempts: Maximum number of generation attempts.

    Returns:
        The French-only part of the generated continuation. If no attempt
        yields French text, the unfiltered continuation of the last attempt
        is returned ("" when ``max_attempts`` is not positive).
    """
    input_text = input_text.replace("->", "").strip()
    # NOTE(review): this prompt differs from the text format used to build the
    # fine-tuning examples in this notebook - confirm that this is intended.
    conditioned_text = "Translate the following German text into French: " + input_text

    # The prompt is identical for every attempt, so tokenize it once
    inputs = tokenizer(conditioned_text, return_tensors="pt", truncation=True, max_length=128).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Decode in eval mode so dropout layers are inactive; the previous mode is
    # restored afterwards.
    was_training = model.training
    model.eval()
    translation = ""
    try:
        for attempt in range(1, max_attempts + 1):
            # Beam search without sampling is deterministic (and ignores
            # `temperature`), so repeating it would reproduce the same output.
            # Retries therefore sample to get a different candidate.
            sampling_kwargs = {"do_sample": True, "temperature": 0.7} if attempt > 1 else {}
            output_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                num_beams=5,
                early_stopping=True,
                repetition_penalty=2.0,  # Increase penalty to discourage repetition
                length_penalty=1.5,      # Encourage longer, more complete responses
                **sampling_kwargs,
            )
            # A causal LM returns prompt + continuation; keep only the new
            # tokens so the prompt does not leak into the translation.
            new_token_ids = output_ids[0][prompt_length:]
            translation = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
            translation = translation.replace("->", "").strip()

            # Filter out non-French parts
            filtered_translation = filter_french_text(translation)

            # If there is at least some French text, return it
            if filtered_translation:
                return filtered_translation

            print(f"Attempt {attempt}: No French detected. Retrying...")
    finally:
        model.train(was_training)

    print(f"Max retries reached. Returning last attempt: {translation}")
    return translation

# Hypotheses and their references, kept index-aligned (one reference per hypothesis)
generated_translations = []
reference_translations = []

# Loop through test data (raw JSON loaded from dataset_a_test.json)
for idx, sample in enumerate(test_data):
    try:
        german_input = sample["german"]
        ref_french = sample["french"]
        generated = generate_translation(german_input, max_new_tokens=50)

        if generated:  # Only add non-empty translations
            print(f"Sample {idx}:")
            print("German input:", german_input)
            print("Generated translation (validated):", generated)
            print("Reference translation:", ref_french)
            print("-" * 50)

            generated_translations.append(generated)
            reference_translations.append(ref_french)
        else:
            print(f"Skipping Sample {idx} due to repeated non-French output.")
    except Exception as e:
        # Best effort: report the failing sample and continue with the rest
        print(f"Error processing sample {idx}: {e}")

print("Number of valid French translations:", len(generated_translations))

# Compute BLEU score only if translations exist
if generated_translations:
    # sacreBLEU expects a list of reference *streams*, each aligned with the
    # hypotheses. With one reference per sentence that is a single stream
    # holding all references, not one single-item list per sentence.
    bleu = sacrebleu.corpus_bleu(generated_translations, [reference_translations])
    print("BLEU score on test set:", bleu.score)
else:
    print("No valid translations generated. Skipping BLEU score.")

# Compute BERTScore only if translations exist
if generated_translations:
    P, R, F1 = score(generated_translations, reference_translations, lang="fr", verbose=True)
    print("Average BERTScore F1 on test set:", F1.mean().item())
else:
    print("No valid translations generated. Skipping BERTScore.")