In [25]:
from datasets import load_dataset
import re
import pandas as pd
import os 
import json
import os
import torch
import traceback
import warnings
import time
import re
from pathlib import Path
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple
from deep_translator import GoogleTranslator
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')

In [26]:
# create a folder with the name of data
os.makedirs('data', exist_ok=True)

#### **Stage 0: Data Preprocessing to Extract the Python Code**
1. Downloaded the Dataset: Downloaded the "jtatman/python-code-dataset-500k" from Hugging Face.
2. Data Cleansing: Cleansed the data and extracted just the Python code using regex where the Python code was enclosed in between <pythoncode>.
3. Store Extracted Code: Stored all the Python code in python_code_dataset.csv file with the column name English_code.


In [27]:
# Load dataset from Hugging Face Datasets and store the output column after extracting the python code into a csv file 
ds = load_dataset("jtatman/python-code-dataset-500k")
ds = pd.DataFrame(ds['train'])
ds = ds.drop(['instruction', 'system'], axis=1)
ds['English_code'] = ds['output'].apply(lambda x: re.search(r'```python(.*?)```', x, re.DOTALL).group(1) if re.search(r'```python(.*?)```', x, re.DOTALL) else None)
ds = ds.drop(['output'], axis=1)
ds = ds.dropna()
ds.to_csv('data/python_code_dataset.csv', index=False)
ds.head()

Unnamed: 0,English_code
0,\nfor i in range(10): # First digit\n for ...
1,\ndef count_distinct_states(matrix):\n coun...
2,\ndef remove_spaces_and_punctuation(s):\n r...
3,\nimport math\n\ndef is_prime(n):\n # Check...
4,"\nclass String:\n def __init__(self, string..."


In [28]:
ds.describe()

Unnamed: 0,English_code
count,67063
unique,25549
top,\ndef factorial(n):\n if n == 0:\n r...
freq,41


### **Stage 1: Initial Translation**

1. **Initial Translation:** Used the ```python_code_dataset.csv``` for the initial translation using Google Translator.

2. **Keyword Dictionary Creation:** Created a keyword dictionary using the curated dataset by Joshua Otten, which includes files like ```FrenchKey.txt```, ```SpanishKey.txt```, ```KurdishKey.txt```, ```BengaliKey.txt```, ```MandarinKey.txt```, ```GreekKey.txt```, ```EnglishKey.txt```, and ```HindiKey.txt```.

3. **Mapping Keywords:** Mapped keywords from ```EnglishKey.txt``` to```HindiKey.txt``` for partial translation.

4. **Partial Translation:** Parsed the code one by one, replacing English keywords with their Hindi counterparts from the dictionary.

5. **Google Translator:** Provided the partially translated code to Google Translator, ensuring:
    - Comments and strings were translated as a whole.
    - Variables separated by underscores were split, translated, and rejoined to maintain consistency.

6. **Final Outcome:** The final outcome was named ```Hindi_code_version_1```.

In [31]:
class Config:
    def __init__(self, max_rows=None):
        self.input_path = 'data/python_code_dataset.csv'
        self.output_path = 'data/google_code_translations.csv'
        self.checkpoint_path = 'data/translation_checkpoint.json'
        self.keywords_path = './Joshua_Keywords.csv'
        self.batch_size = 5
        self.source_lang = 'en'
        self.target_lang = 'hi'
        self.sleep_time = 0.2
        self.max_retries = 3
        self.max_rows = max_rows

class CheckpointManager:
    """Manages saving and loading of translation progress"""
    def __init__(self, checkpoint_path: str):
        self.checkpoint_path = checkpoint_path
        self.processed_indices = set()
        self._load_checkpoint()

    def _load_checkpoint(self) -> None:
        """Load existing checkpoint if available"""
        if os.path.exists(self.checkpoint_path):
            try:
                with open(self.checkpoint_path, 'r') as f:
                    data = json.load(f)
                self.processed_indices = set(data.get('processed_indices', []))
                print(f"Loaded checkpoint with {len(self.processed_indices)} processed items")
            except Exception as e:
                print(f"Error loading checkpoint: {str(e)}")
                self.processed_indices = set()

    def save_checkpoint(self) -> None:
        """Save current progress to checkpoint file"""
        try:
            data = {'processed_indices': list(self.processed_indices)}
            with open(self.checkpoint_path, 'w') as f:
                json.dump(data, f)
        except Exception as e:
            print(f"Error saving checkpoint: {str(e)}")

    def mark_processed(self, index: int) -> None:
        """Mark an item as processed and save checkpoint"""
        self.processed_indices.add(index)
        self.save_checkpoint()

    def is_processed(self, index: int) -> bool:
        """Check if an item has been processed"""
        return index in self.processed_indices

    def get_unprocessed_indices(self, total_items: int) -> List[int]:
        """Get list of indices that haven't been processed yet"""
        return [i for i in range(total_items) if not self.is_processed(i)]

class KeywordManager:
    """Manages programming keyword translations"""
    def __init__(self, keywords_path: str):
        self.keywords_path = keywords_path
        self.keywords = self._load_keywords()
        self._add_special_cases()

    def _load_keywords(self) -> Dict[str, str]:
        """Load keyword translations from file"""
        try:
            df = pd.read_csv(self.keywords_path)
            # Drop non-Hindi translations
            columns_to_drop = [
                'FrenchKey.txt', 'SpanishKey.txt', 'KurdishKey.txt',
                'BengaliKey.txt', 'MandarinKey.txt', 'GreekKey.txt'
            ]
            df.drop(columns=columns_to_drop, inplace=True)
            df.dropna(inplace=True)
            return {row['EnglishKey.txt']: row['HindiKey.txt'] for _, row in df.iterrows()}
        except Exception as e:
            print(f"Error loading keywords: {str(e)}")
            return {}

    def _add_special_cases(self) -> None:
        """Add special case translations"""
        special_cases = {
            'i': 'ई',
            'j': 'जे',
            'k': 'के'
        }
        self.keywords.update(special_cases)

    def get_translation(self, word: str) -> Optional[str]:
        """Get translation for a keyword if available"""
        return self.keywords.get(word)

class CodeDataset(Dataset):
    """Dataset for code translation"""
    def __init__(self, codes: List[str], indices: List[int]):
        self.codes = codes
        self.indices = indices

    def __len__(self) -> int:
        return len(self.codes)

    def __getitem__(self, idx: int) -> Dict[str, any]:
        return {
            'index': self.indices[idx],
            'code': self.codes[idx]
        }

def custom_collate(batch: List[Dict]) -> Dict[str, List]:
    """Custom collate function for DataLoader"""
    return {
        'indices': [item['index'] for item in batch],
        'codes': [item['code'] for item in batch]
    }

class CodeTranslator:
    def __init__(self, config: Config, keyword_manager: KeywordManager):
        self.config = config
        self.keyword_manager = keyword_manager
        self.translator = GoogleTranslator(
            source=config.source_lang,
            target=config.target_lang
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

    def process_compound_word(self, word: str) -> str:
        """Handle translation of compound words with underscores"""
        if '_' in word:
            parts = word.split('_')
            translated_parts = []
            for part in parts:
                translated = self.safe_translate(part)
                # If translation contains space, replace with underscore
                translated = translated.replace(' ', '_') if translated else part
                translated_parts.append(translated)
            return '_'.join(translated_parts)
        return word

    def translate_token(self, token: str) -> str:
        if token.isspace():
            return token
        elif '_' in token:
            parts = token.split('_')
            translated_parts = []
            for part in parts:
                if part:
                    keyword_trans = self.keyword_manager.get_translation(part)
                    if keyword_trans:
                        translated_parts.append(keyword_trans)
                    else:
                        trans = self.safe_translate(part)
                        if ' ' in trans:
                            trans = trans.replace(' ', '_')
                        translated_parts.append(trans)
            return '_'.join(translated_parts)
        elif token.isalpha():
            keyword_trans = self.keyword_manager.get_translation(token)
            if keyword_trans:
                return keyword_trans.replace(' ', '_')
            translation = self.safe_translate(token)
            return translation.replace(' ', '_')
        return token

    def safe_translate(self, text: str) -> str:
        if not text or not isinstance(text, str):
            return text

        for attempt in range(self.config.max_retries):
            try:
                translated = self.translator.translate(text)
                if ' ' in translated:
                    translated = translated.replace(' ', '_')
                if any(c.isascii() and c.isalpha() for c in translated):
                    translated = self.translator.translate(text.lower()).replace(' ', '_')
                return translated
            except Exception as e:
                if attempt == self.config.max_retries - 1:
                    return text
        return text

    def translate_line(self, line: str) -> str:
        indent = len(line) - len(line.lstrip())
        line = line.lstrip()

        if not line:
            return line

        try:
            if '#' in line:
                code_part, comment_part = line.split('#', 1)
                translated_comment = self.safe_translate(comment_part.strip())

                if code_part:
                    tokens = re.findall(r'[a-zA-Z_]+|\d+|[^\w\s]|\s+', code_part)
                    translated_tokens = [self.translate_token(token) for token in tokens]
                    translated_code = ''.join(translated_tokens)
                    return ' ' * indent + translated_code.rstrip() + ' #' + translated_comment
                return ' ' * indent + '#' + translated_comment

            tokens = re.findall(r'[a-zA-Z_]+|\d+|[^\w\s]|\s+', line)
            translated_tokens = [self.translate_token(token) for token in tokens]
            return ' ' * indent + ''.join(translated_tokens)

        except Exception as e:
            print(f"Line translation error: {str(e)}")
            return line
    def translate_code(self, code: str) -> str:
        if not isinstance(code, str):
            return ""

        if '\\n' in code:
            lines = code.strip("'\"").split('\\n')
            translated_lines = [self.translate_line(line.strip()) for line in lines]
            return '\\n '.join(translated_lines)

        lines = code.split('\n')
        translated_lines = [self.translate_line(line) for line in lines]
        return '\n'.join(translated_lines)

    def process_batch(self, batch: Dict[str, List]) -> Tuple[List[int], List[str]]:
        """Process a batch of code samples"""
        indices = batch['indices']
        codes = batch['codes']

        translated_batch = []
        for code in codes:
            if isinstance(code, torch.Tensor):
                code = code.cpu().numpy().item()
            translated_code = self.translate_code(code)
            translated_batch.append(translated_code)

        return indices, translated_batch

class TranslationManager:
    """Manages the overall translation process"""
    def __init__(self, config: Config):
        self.config = config
        self.checkpoint_manager = CheckpointManager(config.checkpoint_path)
        self.keyword_manager = KeywordManager(config.keywords_path)
        self.translator = CodeTranslator(config, self.keyword_manager)

    def prepare_data(self) -> Tuple[pd.DataFrame, List[int]]:
        if os.path.exists(self.config.output_path):
            results_df = pd.read_csv(self.config.output_path)
            input_df = pd.read_csv(self.config.input_path)
        else:
            input_df = pd.read_csv(self.config.input_path)
            if self.config.max_rows:
                input_df = input_df.head(self.config.max_rows)
            results_df = pd.DataFrame({
                'English_code': input_df['English_code'],
                'Hindi_code': [None] * len(input_df)
            })

        unprocessed_indices = self.checkpoint_manager.get_unprocessed_indices(len(input_df))
        if self.config.max_rows:
            unprocessed_indices = unprocessed_indices[:self.config.max_rows]
        return results_df, unprocessed_indices

    def process_translations(self) -> Optional[pd.DataFrame]:
        """Process all translations with checkpointing"""
        try:
            results_df, unprocessed_indices = self.prepare_data()

            if not unprocessed_indices:
                print("All items have been processed!")
                return results_df

            print(f"Found {len(unprocessed_indices)} unprocessed items")

            # Create dataset and dataloader
            unprocessed_codes = [
                results_df.iloc[i]['English_code'] for i in unprocessed_indices
            ]
            dataset = CodeDataset(unprocessed_codes, unprocessed_indices)
            dataloader = DataLoader(
                dataset,
                batch_size=self.config.batch_size,
                shuffle=False,
                collate_fn=custom_collate
            )

            # Process batches
            try:
                with tqdm(total=len(unprocessed_indices), desc="Translating code") as pbar:
                    for batch in dataloader:
                        indices, translated_codes = self.translator.process_batch(batch)

                        # Update results and save progress
                        for idx, translated_code in zip(indices, translated_codes):
                            results_df.at[idx, 'Hindi_code'] = translated_code
                            self.checkpoint_manager.mark_processed(idx)

                        # Save intermediate results
                        results_df.to_csv(self.config.output_path, index=False)
                        pbar.update(len(indices))

            except KeyboardInterrupt:
                print("\nProcess interrupted by user. Saving progress...")
                results_df.to_csv(self.config.output_path, index=False)
                return results_df

            print(f"\nProcessing completed! Results saved to: {self.config.output_path}")
            return results_df

        except Exception as e:
            print(f"Error during processing: {str(e)}")
            traceback.print_exc()
            if 'results_df' in locals():
                results_df.to_csv(self.config.output_path, index=False)
                return results_df
            return None


"""Main entry point"""
# Create config
config = Config(max_rows=50)

# Create data directory if it doesn't exist
Path('data').mkdir(exist_ok=True)

# Clean up GPU memory if available
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Initialize and run translation manager
manager = TranslationManager(config)
processed_df = manager.process_translations()

if processed_df is not None:
    print("\nProcessing completed successfully!")


Using device: cpu
Found 50 unprocessed items


Translating code: 100%|██████████| 50/50 [05:00<00:00,  6.02s/it]


Processing completed! Results saved to: data/google_code_translations.csv

Processing completed successfully!





### **Stage 2: GPT Enhancement: Example-based Translation using GPT 4o Mini**

1. **Base Example:** Used the ```Hindi_code_version_1``` from Stage 1 as a base example for the model.

2. **Partial Translation:** Took unseen English Python code and performed partial translation using the Keyword Dictionary, similar to Stage 1, Step 4.

3. **Model Input:** Provided both the example pair ```Hindi_code_version_1``` and a different partially translated code (test case) to the GPT model with a customized prompt.

4. **Translation Request:** Asked the GPT model to translate the test case using the example pair as a reference.

In [32]:
#Load API Key from a text file
with open('../API.txt', 'r') as f:
    API_key = f.read().strip()

In [33]:
from openai import OpenAI
import pandas as pd
import re
from tqdm import tqdm

class Config:
    def __init__(self, max_rows=None, example_count = 5, output_path = 'data/translations_gpt.csv'):
        self.input_path = '../few_shot_Learning_data.csv'
        self.output_path = output_path
        self.checkpoint_path = 'data/checkpoint_gpt.json'
        self.max_rows = max_rows
        self.batch_size = 5
        self.openai_api_key = API_key
        self.examples_count = example_count
        self.client = OpenAI(api_key=self.openai_api_key)

class KeywordReplacer:
    def __init__(self):
        self.keywords = self._load_keywords()
        
    def _load_keywords(self):
        df = pd.read_csv('./Joshua_Keywords.csv')
        columns_to_drop = [
            'FrenchKey.txt', 'SpanishKey.txt', 'KurdishKey.txt',
            'BengaliKey.txt', 'MandarinKey.txt', 'GreekKey.txt'
        ]
        df.drop(columns=columns_to_drop, inplace=True)
        df.dropna(inplace=True)
        return {row['EnglishKey.txt']: row['HindiKey.txt'] for _, row in df.iterrows()}
    
    def replace_keywords(self, code):

        # Split code into tokens while preserving structure
        tokens = re.findall(r'[a-zA-Z_]+|\d+|[^\w\s]|\s+', code)
        translated_tokens = []
        
        for token in tokens:
            if token in self.keywords:
                translated_tokens.append(self.keywords[token])
            elif token == 'True':
                translated_tokens.append('सत्य')
            elif token == 'False':
                translated_tokens.append('असत्य')
            else:
                translated_tokens.append(token)
                
        return ''.join(translated_tokens)

class GPTTranslator:
    def __init__(self, config):
        self.config = config
        self.examples = self.load_examples()
        self.keyword_replacer = KeywordReplacer()

    def load_examples(self):
        df = pd.read_csv(self.config.input_path)
        return df.head(self.config.examples_count)[['English_code', 'Hindi_code']]

    def create_prompt(self, code_to_translate):
        examples_text = ""
        for i, row in self.examples.iterrows():
            examples_text += f"\n\nExample {i+1}:\n"
            examples_text += f"English code:\n{row['English_code']}\n"
            examples_text += f"Hindi translated code:\n{row['Hindi_code']}\n------------------------\n"
            
        prompt = f"""Complete the translation of this partially English Python code to completely Hindi python code:
        - Translate variable names, function names, strings and comments to Hindi
        - Join multi-word Hindi translations with underscores
        - Break down compound English words separated by underscores and translate each part into sensible Hindi and join them back with underscores
        - Preserve code structure and syntax
        - Here are some examples of translations:
    
        {examples_text}
        
        Now translate partially translated code to completely in Hindi:
        {code_to_translate}"""
        
        return prompt

    def translate_code(self, code):
        # First replace known keywords
        partially_translated = self.keyword_replacer.replace_keywords(code)

        # Then use GPT to complete the translation
        prompt = self.create_prompt(partially_translated)
        try:
            # print(f"Prompt:\n{prompt}")
            response = self.config.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a Expert Python code translator who understands the nuanses of language in coding and converts code from English to  Hindi code while preserving functionality. Return only the translated code without any explanation."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            translated_code = response.choices[0].message.content.strip()

            # Clean up the response to extract just the code
            if "```python" in translated_code:
                translated_code = translated_code.split("```python")[1].split("```")[0].strip()
            elif "```" in translated_code:
                translated_code = translated_code.split("```")[1].strip()
                
            return translated_code
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return code

def run_translation(max_rows=None, example_count=5, file_path='data/translations_gpt.csv'):
    config = Config(max_rows=max_rows, example_count=example_count, output_path=file_path)
    translator = GPTTranslator(config)
    
    df = pd.read_csv(config.input_path)
    if max_rows:
        df = df.iloc[30:max_rows+30] # it will always start from 30th row and take 10 rows ahead of it
    
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        translated = translator.translate_code(row['English_code'])
        results.append({
            'English_code': row['English_code'],
            'Hindi_code': translated
        })
        
    results_df = pd.DataFrame(results)
    results_df.to_csv(config.output_path, index=False)
    return results_df



In [34]:
gpt_translation_file_path = [
    'data/translations_gpt_10r_5e.csv',
    'data/translations_gpt_10r_10e.csv',
    'data/translations_gpt_10r_20e.csv',
    'data/translations_gpt_10r_30e.csv'
]
example_counts = [5, 10, 20, 30]
# Usage
'''
max_rows is the number of rows to translate. If None, all rows will be translated.
example_count is the number of examples to use for GPT prompt. It should be less than max_rows.
'''
for file_path, example_count in zip(gpt_translation_file_path, example_counts):
    translated_df = run_translation(max_rows=10, example_count=example_count, file_path=file_path)
    print(f"Translation results saved to: {file_path}")

100%|██████████| 10/10 [00:43<00:00,  4.36s/it]


Translation results saved to: data/translations_gpt_10r_5e.csv


100%|██████████| 10/10 [00:48<00:00,  4.88s/it]


Translation results saved to: data/translations_gpt_10r_10e.csv


100%|██████████| 10/10 [00:53<00:00,  5.35s/it]


Translation results saved to: data/translations_gpt_10r_20e.csv


100%|██████████| 10/10 [01:02<00:00,  6.24s/it]

Translation results saved to: data/translations_gpt_10r_30e.csv





In [37]:
from tqdm import tqdm
import re


def keyword_reverse_translation(code, reverse_keywords):
    tokens = re.findall(r'[\u0900-\u097F_]+|[a-zA-Z_]+|\d+|[^\w\s]|\s+', code)
    translated_tokens = []
    
    for token in tokens:
        if token in reverse_keywords:
            translated_tokens.append(reverse_keywords[token])
        else:
            translated_tokens.append(token)
    
    return ''.join(translated_tokens)

class TranslationEvaluator:
    def __init__(self, config):
        self.config = config
        self.keyword_replacer = KeywordReplacer()
        self.reverse_keywords = {v: k for k, v in self.keyword_replacer.keywords.items()}

    def reverse_translate_code(self, hindi_code):
        # First replace known keywords
        partially_translated = keyword_reverse_translation(hindi_code, self.reverse_keywords)
        
        prompt = f"""Complete the translation of this partially translated Python code to English:
        - The code already has Python keywords translated to English
        - Translate remaining variable names and comments
        - Convert Hindi compound words (with underscores) to appropriate English terms
        - Preserve code structure and syntax
        
        Partially translated code:
        {partially_translated}"""
        
        try:
            response = self.config.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a Python code translator converting Hindi code to English."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            return self.clean_code(response.choices[0].message.content.strip())
        except Exception as e:
            print(f"Reverse translation error: {str(e)}")
            return hindi_code
    def clean_code(self, code):
        """Remove markdown and normalize code"""
        if "```python" in code:
            code = code.split("```python")[1].split("```")[0].strip()
        elif "```" in code:
            code = code.split("```")[1].strip()
        return code.strip()


    def evaluate_translations(self, df):
        """Evaluate translations using round-trip and BLEU score"""
        results = []
        
        for _, row in tqdm(df.iterrows(), desc="Evaluating translations"):
            original = row['English_code']
            hindi = row['Hindi_code']
            
            # Round-trip translation
            back_translated = self.reverse_translate_code(hindi)
            
            results.append({
                'original': original,
                'hindi': hindi,
                'back_translated': back_translated,
            })
            
        return pd.DataFrame(results)

# Usage example
def run_evaluation( gpt_translation_file_path = '', evaluation_results_file_path = ''):
    config = Config()
    evaluator = TranslationEvaluator(config)
    
    # Load translations
    df = pd.read_csv(gpt_translation_file_path) # change the file name to the file you want to evaluate
    
    # Run evaluation
    results = evaluator.evaluate_translations(df)
    
    # Save results
    results.to_csv(evaluation_results_file_path, index=False) # change the file name to save the results accordingly
    
    return results

In [36]:
# run evaluation
evaluation_results_file_path = [
    'data/backtranslation_results_10r_5e.csv',
    'data/backtranslation_results_10r_10e.csv',
    'data/backtranslation_results_10r_20e.csv',
    'data/backtranslation_results_10r_30e.csv'
]

for input_file, output_file in zip(gpt_translation_file_path, evaluation_results_file_path):
    evaluation_results = run_evaluation(input_file, output_file)
    print(f"Evaluation results saved to: {output_file}")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Evaluating translations: 10it [00:31,  3.19s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results saved to: data/backtranslation_results_10r_5e.csv


Evaluating translations: 10it [00:28,  2.80s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results saved to: data/backtranslation_results_10r_10e.csv


Evaluating translations: 10it [00:37,  3.78s/it]
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Evaluation results saved to: data/backtranslation_results_10r_20e.csv


Evaluating translations: 10it [00:32,  3.28s/it]

Evaluation results saved to: data/backtranslation_results_10r_30e.csv





In [None]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
import ast
import tokenize
from io import StringIO
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import re

class CodeTranslationEvaluator:
    def __init__(self):
        self.semantic_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        
    def evaluate_translation(self, original_code: str, back_translated_code: str) -> dict:
        """Main evaluation function that combines all metrics"""
        results = {
            'bleu_score': self.calculate_bleu_score(original_code, back_translated_code),
            'syntax_validation': self.validate_syntax(back_translated_code),
            'semantic_similarity': self.evaluate_semantic_similarity(original_code, back_translated_code)
        }
        
        # Calculate overall score
        valid_scores = [score for score in [
            results['bleu_score'],
            1.0 if results['syntax_validation']['is_valid'] else 0.0,
            results['semantic_similarity']['overall_semantic_score']
        ] if score is not None]
        
        results['overall_score'] = np.mean(valid_scores) if valid_scores else 0.0
        return results

    def calculate_bleu_score(self, original_code: str, back_translated_code: str) -> float:
        """Calculate BLEU score between original and back-translated code"""
        def tokenize_code(code):
            return code.replace('\n', ' ').split()
        
        reference = [tokenize_code(original_code)]
        candidate = tokenize_code(back_translated_code)
        
        return sentence_bleu(reference, candidate)

    def validate_syntax(self, code: str) -> dict:
        """Check if the code is syntactically valid Python"""
        try:
            ast.parse(code)
            return {
                'is_valid': True,
                'error': None,
                'token_structure': self._analyze_token_structure(code)
            }
        except SyntaxError as e:
            return {
                'is_valid': False,
                'error': str(e),
                'token_structure': None
            }

    def _analyze_token_structure(self, code: str) -> dict:
        """Analyze the token structure of the code"""
        try:
            tokens = list(tokenize.generate_tokens(StringIO(code).readline))
            return {
                'total_tokens': len(tokens),
                'token_types': {
                    'NAME': sum(1 for tok in tokens if tok.type == tokenize.NAME),
                    'STRING': sum(1 for tok in tokens if tok.type == tokenize.STRING),
                    'NUMBER': sum(1 for tok in tokens if tok.type == tokenize.NUMBER),
                    'NEWLINE': sum(1 for tok in tokens if tok.type == tokenize.NEWLINE),
                }
            }
        except tokenize.TokenError:
            return None

    def evaluate_semantic_similarity(self, original_code: str, back_translated_code: str) -> dict:
        """Evaluate semantic similarity of code components"""
        # Extract components
        orig_comments = self._extract_comments(original_code)
        trans_comments = self._extract_comments(back_translated_code)
        
        orig_functions = self._extract_function_names(original_code)
        trans_functions = self._extract_function_names(back_translated_code)
        
        orig_strings = self._extract_string_literals(original_code)
        trans_strings = self._extract_string_literals(back_translated_code)
        
        # Calculate similarities
        comment_similarity = self._calculate_text_similarity(orig_comments, trans_comments)
        function_similarity = self._calculate_text_similarity(orig_functions, trans_functions)
        string_similarity = self._calculate_text_similarity(orig_strings, trans_strings)
        
        # Calculate overall semantic score
        semantic_scores = [score for score in [
            comment_similarity.get('similarity') if comment_similarity else None,
            function_similarity.get('similarity') if function_similarity else None,
            string_similarity.get('similarity') if string_similarity else None
        ] if score is not None]
        
        overall_semantic_score = np.mean(semantic_scores) if semantic_scores else 0.0
        
        return {
            'comments': comment_similarity,
            'functions': function_similarity,
            'strings': string_similarity,
            'overall_semantic_score': overall_semantic_score
        }

    def _extract_comments(self, code: str) -> list:
        """Extract comments from code"""
        comments = []
        for line in code.split('\n'):
            if '#' in line:
                comment = line[line.index('#')+1:].strip()
                if comment:
                    comments.append(comment)
        return comments

    def _extract_function_names(self, code: str) -> list:
        """Extract function names from code"""
        function_names = []
        try:
            tree = ast.parse(code)
            for node in ast.walk(tree):
                if isinstance(node, ast.FunctionDef):
                    function_names.append(node.name)
        except:
            pass
        return function_names

    def _extract_string_literals(self, code: str) -> list:
        """Extract string literals from code"""
        return re.findall(r'"([^"]*)"', code) + re.findall(r"'([^']*)'", code)

    def _calculate_text_similarity(self, texts1: list, texts2: list) -> dict:
        """Calculate semantic similarity between two lists of texts"""
        if not texts1 or not texts2:
            return None
            
        # Get embeddings for all texts
        embeddings1 = self.semantic_model.encode(texts1)
        embeddings2 = self.semantic_model.encode(texts2)
        
        # Calculate average similarity
        similarities = []
        for emb1, emb2 in zip(embeddings1, embeddings2):
            similarity = 1 - cosine(emb1, emb2)
            similarities.append(similarity)
        
        return {
            'similarity': np.mean(similarities),
            'individual_scores': list(zip(texts1, texts2, similarities))
        }

def evaluate_translations(input_file: str, output_file: str):
    """Evaluate translations from a file and save results"""
    # Initialize evaluator
    evaluator = CodeTranslationEvaluator()
    
    # Read translations
    df = pd.read_csv(input_file)
    results = []
    
    # Evaluate each translation
    for _, row in df.iterrows():
        evaluation = evaluator.evaluate_translation(
            row['original'],
            row['back_translated']
        )
        
        # Flatten the nested dictionary for DataFrame storage
        flattened_result = {
            'original': row['original'],
            'hind_translated': row['hindi'],
            'back_translated': row['back_translated'],
            'bleu_score': evaluation['bleu_score'],
            'syntax_valid': evaluation['syntax_validation']['is_valid'],
            'syntax_error': evaluation['syntax_validation']['error'],
            'semantic_score': evaluation['semantic_similarity']['overall_semantic_score'],
            'overall_score': evaluation['overall_score']
        }
        results.append(flattened_result)
    
    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_file, index=False)

    # Print summary
    print(f"\nEvaluation Summary for {input_file}:")
    print(f"Average BLEU Score: {results_df['bleu_score'].mean():.4f}")
    print(f"Syntax Valid Rate: {results_df['syntax_valid'].mean():.4f}")
    print(f"Average Semantic Similarity: {results_df['semantic_score'].mean():.4f}")
    print(f"Overall Score: {results_df['overall_score'].mean():.4f}")

# Example usage
if __name__ == "__main__":
    evaluation_files = [
        'data/backtranslation_results_10r_5e.csv',
        'data/backtranslation_results_10r_10e.csv',
        'data/backtranslation_results_10r_20e.csv',
        'data/backtranslation_results_10r_30e.csv'
    ]
    
    output_files = [
        'data/comprehensive_evaluation_5e.csv',
        'data/comprehensive_evaluation_10e.csv',
        'data/comprehensive_evaluation_20e.csv',
        'data/comprehensive_evaluation_30e.csv'
    ]
    
    for input_file, output_file in zip(evaluation_files, output_files):
        evaluate_translations(input_file, output_file)


Evaluation Summary for data/backtranslation_results_10r_5e.csv:
Average BLEU Score: 0.7150
Syntax Valid Rate: 1.0000
Average Semantic Similarity: 0.9296
Overall Score: 0.8815

Evaluation Summary for data/backtranslation_results_10r_10e.csv:
Average BLEU Score: 0.7035
Syntax Valid Rate: 1.0000
Average Semantic Similarity: 0.9370
Overall Score: 0.8802

Evaluation Summary for data/backtranslation_results_10r_20e.csv:
Average BLEU Score: 0.6860
Syntax Valid Rate: 0.9000
Average Semantic Similarity: 0.8662
Overall Score: 0.8174

Evaluation Summary for data/backtranslation_results_10r_30e.csv:
Average BLEU Score: 0.7357
Syntax Valid Rate: 1.0000
Average Semantic Similarity: 0.9070
Overall Score: 0.8809


In [44]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path

class EvaluationVisualizer:
    def __init__(self):
        Path('evaluation_graphs').mkdir(exist_ok=True)
        sns.set_palette("husl")

    def load_evaluation_data(self):
        """Load evaluation results for all configurations"""
        configs = ['5e', '10e', '20e', '30e']
        data = {}
        for config in configs:
            try:
                df = pd.read_csv(f'data/comprehensive_evaluation_{config}.csv')
                data[f'{config[:-1]} Examples'] = df
            except FileNotFoundError:
                print(f"Warning: Could not find data file for {config} configuration")
        return data

    def plot_overall_metrics_comparison(self, data):
        """Create bar plot comparing metrics across configurations"""
        metrics = {
            'Configuration': [],
            'BLEU Score': [],
            'Syntax Valid Rate': [],
            'Semantic Score': [],
            'Overall Score': []
        }
        
        for config, df in data.items():
            metrics['Configuration'].append(config)
            metrics['BLEU Score'].append(df['bleu_score'].mean())
            metrics['Syntax Valid Rate'].append(df['syntax_valid'].mean())
            metrics['Semantic Score'].append(df['semantic_score'].mean())
            metrics['Overall Score'].append(df['overall_score'].mean())
        
        metrics_df = pd.DataFrame(metrics)
        
        plt.figure(figsize=(12, 6))
        ax = metrics_df.set_index('Configuration').plot(kind='bar', width=0.8)
        plt.title('Comparison of Evaluation Metrics Across Configurations')
        plt.xlabel('Number of Examples')
        plt.ylabel('Score')
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        
        # Add value labels on bars
        for container in ax.containers:
            ax.bar_label(container, fmt='%.3f')
            
        plt.tight_layout()
        plt.savefig('evaluation_graphs/overall_metrics_comparison.png')
        plt.close()

    def plot_score_distributions(self, data):
        """Create violin plots showing score distributions"""
        # Prepare data for plotting
        plot_data = []
        
        for config, df in data.items():
            for score_type in ['bleu_score', 'semantic_score', 'overall_score']:
                score_name = score_type.replace('_', ' ').title()
                scores = df[score_type].tolist()
                plot_data.extend([{
                    'Configuration': config,
                    'Score Type': score_name,
                    'Score': score
                } for score in scores])
        
        plot_df = pd.DataFrame(plot_data)
        
        plt.figure(figsize=(12, 6))
        sns.violinplot(data=plot_df, x='Configuration', y='Score', hue='Score Type')
        plt.title('Score Distributions Across Configurations')
        plt.xticks(rotation=45)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig('evaluation_graphs/score_distributions.png')
        plt.close()

    def plot_syntax_analysis(self, data):
        """Create visualization for syntax validation results"""
        syntax_data = {
            'Configuration': [],
            'Valid Syntax': [],
            'Invalid Syntax': []
        }
        
        for config, df in data.items():
            valid_rate = df['syntax_valid'].mean()
            syntax_data['Configuration'].append(config)
            syntax_data['Valid Syntax'].append(valid_rate)
            syntax_data['Invalid Syntax'].append(1 - valid_rate)
        
        syntax_df = pd.DataFrame(syntax_data)
        
        plt.figure(figsize=(10, 6))
        ax = syntax_df.set_index('Configuration').plot(kind='bar', stacked=True)
        plt.title('Syntax Validation Results')
        plt.xlabel('Number of Examples')
        plt.ylabel('Proportion')
        
        # Add percentage labels
        for container in ax.containers:
            ax.bar_label(container, fmt='%.1f%%', label_type='center')
            
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig('evaluation_graphs/syntax_analysis.png')
        plt.close()

    def plot_semantic_similarity_heatmap(self, data):
        """Create heatmap for semantic similarity scores"""
        semantic_data = []
        
        for config, df in data.items():
            semantic_data.append({
                'Configuration': config,
                'Mean Score': df['semantic_score'].mean(),
                'Min Score': df['semantic_score'].min(),
                'Max Score': df['semantic_score'].max(),
                'Std Dev': df['semantic_score'].std()
            })
        
        semantic_df = pd.DataFrame(semantic_data).set_index('Configuration')
        
        plt.figure(figsize=(10, 6))
        sns.heatmap(semantic_df, annot=True, cmap='YlOrRd', fmt='.3f',
                   cbar_kws={'label': 'Score'})
        plt.title('Semantic Similarity Analysis')
        plt.tight_layout()
        plt.savefig('evaluation_graphs/semantic_similarity_heatmap.png')
        plt.close()

    def generate_all_visualizations(self):
        """Generate all visualization graphs"""
        print("Loading evaluation data...")
        data = self.load_evaluation_data()
        
        if not data:
            print("No evaluation data found!")
            return
            
        print("Generating visualizations...")
        self.plot_overall_metrics_comparison(data)
        self.plot_score_distributions(data)
        self.plot_syntax_analysis(data)
        self.plot_semantic_similarity_heatmap(data)
        print("All visualizations have been generated in 'evaluation_graphs' directory")

# Usage
if __name__ == "__main__":
    visualizer = EvaluationVisualizer()
    visualizer.generate_all_visualizations()
# Usage
if __name__ == "__main__":
    visualizer = EvaluationVisualizer()
    visualizer.generate_all_visualizations()

Loading evaluation data...
Generating visualizations...
All visualizations have been generated in 'evaluation_graphs' directory
Loading evaluation data...
Generating visualizations...
All visualizations have been generated in 'evaluation_graphs' directory


<Figure size 1200x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>