In [14]:
from datasets import load_dataset
import re
import pandas as pd
import os 
import json
import os
import torch
import traceback
import warnings
import time
import re
from pathlib import Path
from tqdm import tqdm
from typing import Dict, List, Optional, Tuple
from deep_translator import GoogleTranslator
from torch.utils.data import Dataset, DataLoader

# Install an "ignore" filter with no category/module restriction, so warnings
# raised by later cells are not displayed.
warnings.filterwarnings('ignore')

In [15]:
# Create the 'data' directory used for all CSV/JSON artifacts written below
# (exist_ok=True makes re-running this cell a no-op).
os.makedirs('data', exist_ok=True)

In [16]:
# Load the dataset from Hugging Face Datasets, keep only the Python code found
# inside the first ```python fenced block of each 'output' entry, and store the
# result in a CSV file.

# Compiled once instead of being rebuilt (and searched twice) for every row.
_PYTHON_FENCE = re.compile(r'```python(.*?)```', re.DOTALL)


def _extract_python_code(text):
    """Return the body of the first ```python fenced block in `text`.

    Returns None when `text` is not a string or contains no such block.
    """
    if not isinstance(text, str):
        return None
    match = _PYTHON_FENCE.search(text)
    return match.group(1) if match else None


ds = load_dataset("jtatman/python-code-dataset-500k")
ds = pd.DataFrame(ds['train'])
ds = ds.drop(['instruction', 'system'], axis=1)
ds['English_code'] = ds['output'].apply(_extract_python_code)
ds = ds.drop(['output'], axis=1)
ds.to_csv('data/python_code_dataset.csv', index=False)
ds.head()

Unnamed: 0,English_code
0,\nfor i in range(10): # First digit\n for ...
1,\ndef count_distinct_states(matrix):\n coun...
2,\ndef remove_spaces_and_punctuation(s):\n r...
3,\nimport math\n\ndef is_prime(n):\n # Check...
4,"\nclass String:\n def __init__(self, string..."


In [None]:
class Config:
    """Settings for the Google-Translate based code translation run."""

    def __init__(self, max_rows=None):
        # Optional cap on the number of rows to handle (None means no cap).
        self.max_rows = max_rows

        # Files read and written by the pipeline.
        self.input_path = 'data/python_code_dataset.csv'
        self.output_path = 'data/google_code_translations.csv'
        self.checkpoint_path = 'data/translation_checkpoint.json'
        self.keywords_path = './Joshua_Keywords.csv'

        # Translation direction: English -> Hindi.
        self.source_lang, self.target_lang = 'en', 'hi'

        # Batching and retry parameters.
        self.batch_size = 5
        self.sleep_time = 0.2
        self.max_retries = 3

class CheckpointManager:
    """Manages saving and loading of translation progress.

    Progress is the set of already-processed row indices, stored as JSON in the
    form ``{"processed_indices": [...]}`` at ``checkpoint_path``.
    """

    def __init__(self, checkpoint_path: str):
        """Remember the checkpoint location and load any existing progress."""
        self.checkpoint_path = checkpoint_path
        self.processed_indices = set()
        self._load_checkpoint()

    def _load_checkpoint(self) -> None:
        """Load existing checkpoint if available.

        An unreadable or malformed checkpoint is reported and treated as
        "nothing processed yet" rather than aborting the run.
        """
        if not os.path.exists(self.checkpoint_path):
            return
        try:
            with open(self.checkpoint_path, 'r') as f:
                data = json.load(f)
            self.processed_indices = set(data.get('processed_indices', []))
            print(f"Loaded checkpoint with {len(self.processed_indices)} processed items")
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # ValueError covers invalid JSON; TypeError/AttributeError cover JSON
            # that parses but does not have the expected shape.
            print(f"Error loading checkpoint: {str(e)}")
            self.processed_indices = set()

    def save_checkpoint(self) -> None:
        """Save current progress to checkpoint file.

        The data is written to a sibling ``.tmp`` file and then moved over the
        real checkpoint with ``os.replace``, so an interruption during the write
        cannot leave a truncated checkpoint behind.
        """
        tmp_path = f"{self.checkpoint_path}.tmp"
        try:
            # int() makes integer-like values (e.g. numpy integers) JSON-serializable;
            # sorting keeps the file contents deterministic.
            data = {'processed_indices': sorted(int(i) for i in self.processed_indices)}
            with open(tmp_path, 'w') as f:
                json.dump(data, f)
            os.replace(tmp_path, self.checkpoint_path)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving checkpoint: {str(e)}")

    def mark_processed(self, index: int) -> None:
        """Mark an item as processed and save checkpoint"""
        self.processed_indices.add(index)
        self.save_checkpoint()

    def is_processed(self, index: int) -> bool:
        """Check if an item has been processed"""
        return index in self.processed_indices

    def get_unprocessed_indices(self, total_items: int) -> List[int]:
        """Get list of indices in ``range(total_items)`` that haven't been processed yet"""
        return [i for i in range(total_items) if not self.is_processed(i)]

class KeywordManager:
    """Manages programming keyword translations (English -> Hindi)."""

    # Names of the two columns of the keyword CSV that this class uses.
    ENGLISH_COLUMN = 'EnglishKey.txt'
    HINDI_COLUMN = 'HindiKey.txt'

    def __init__(self, keywords_path: str):
        """Load the keyword table from `keywords_path` and add the special cases."""
        self.keywords_path = keywords_path
        self.keywords = self._load_keywords()
        self._add_special_cases()

    def _load_keywords(self) -> Dict[str, str]:
        """Load keyword translations from file.

        Only the English and Hindi columns are read, so other language columns
        may be present, absent or partially empty without affecting the result.
        Rows lacking either value are skipped. On failure the error is printed
        and an empty mapping is returned.
        """
        try:
            df = pd.read_csv(
                self.keywords_path,
                dtype=str,
                # Treat only empty cells as missing: pandas' default NA markers
                # include ordinary words such as "None", "null" and "nan", which
                # can legitimately occur in a table of programming keywords.
                keep_default_na=False,
                na_values=[''],
            )
            pairs = df[[self.ENGLISH_COLUMN, self.HINDI_COLUMN]].dropna()
            return dict(zip(pairs[self.ENGLISH_COLUMN], pairs[self.HINDI_COLUMN]))
        except (OSError, KeyError, ValueError) as e:
            # OSError: file missing/unreadable; KeyError: required column absent;
            # ValueError: pandas parsing/decoding problems.
            print(f"Error loading keywords: {str(e)}")
            return {}

    def _add_special_cases(self) -> None:
        """Add special case translations for the single-letter names i, j and k."""
        special_cases = {
            'i': 'ई',
            'j': 'जे',
            'k': 'के'
        }
        self.keywords.update(special_cases)

    def get_translation(self, word: str) -> Optional[str]:
        """Get translation for a keyword if available, otherwise None"""
        return self.keywords.get(word)

class CodeDataset(Dataset):
    """Dataset for code translation.

    Pairs each code sample with the row index it came from, so results can be
    written back to the right row after batching.
    """

    def __init__(self, codes: List[str], indices: List[int]):
        """Store parallel lists of code samples and their row indices.

        Raises:
            ValueError: if `codes` and `indices` differ in length.
        """
        if len(codes) != len(indices):
            raise ValueError(
                f"codes and indices must have the same length "
                f"(got {len(codes)} and {len(indices)})"
            )
        self.codes = codes
        self.indices = indices

    def __len__(self) -> int:
        return len(self.codes)

    def __getitem__(self, idx: int) -> Dict[str, object]:
        """Return ``{'index': row index, 'code': code sample}`` for position `idx`."""
        return {
            'index': self.indices[idx],
            'code': self.codes[idx]
        }

def custom_collate(batch: List[Dict]) -> Dict[str, List]:
    """Collate function for DataLoader.

    Turns a list of ``{'index': ..., 'code': ...}`` items into a single dict
    holding two parallel lists, ``'indices'`` and ``'codes'``.
    """
    collated: Dict[str, List] = {'indices': [], 'codes': []}
    collated['indices'].extend(sample['index'] for sample in batch)
    collated['codes'].extend(sample['code'] for sample in batch)
    return collated

class CodeTranslator:
    """Token-by-token translator that turns English Python source into Hindi.

    Words are looked up in the keyword table supplied by `keyword_manager`
    first; anything not found there is sent to Google Translate through
    `deep_translator.GoogleTranslator`.
    """

    # Tokenizer used for code fragments. The alternatives are, in order:
    # a backslash escape (kept intact so that e.g. "\n" inside a string literal
    # is never translated), a run of ASCII letters/underscores, a run of digits,
    # one punctuation character, a run of whitespace, and finally any other run
    # of word characters (e.g. accented letters). The last alternative makes
    # the split lossless: ''.join(tokens) always reproduces the input.
    _TOKEN_PATTERN = re.compile(r'\\.|[a-zA-Z_]+|\d+|[^\w\s]|\s+|\w+')

    def __init__(self, config: 'Config', keyword_manager: 'KeywordManager'):
        """Create the Google translator for the configured language pair.

        Args:
            config: object providing `source_lang`, `target_lang`,
                `max_retries` and `sleep_time`.
            keyword_manager: object providing `get_translation(word)`.
        """
        self.config = config
        self.keyword_manager = keyword_manager
        self.translator = GoogleTranslator(
            source=config.source_lang,
            target=config.target_lang
        )
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")
        # text -> translation; a word that recurs is sent to the service only once.
        self._translation_cache: Dict[str, str] = {}

    def process_compound_word(self, word: str) -> str:
        """Handle translation of compound words with underscores.

        Each underscore-separated part is passed to `safe_translate` (the
        keyword table is not consulted here); words without an underscore are
        returned unchanged.
        """
        if '_' not in word:
            return word
        translated_parts = []
        for part in word.split('_'):
            translated = self.safe_translate(part)
            # If translation contains space, replace with underscore
            translated_parts.append(translated.replace(' ', '_') if translated else part)
        return '_'.join(translated_parts)

    def _translate_word(self, word: str) -> str:
        """Translate one underscore-free word: keyword table first, Google second."""
        if not word.isalpha():
            return word
        keyword_trans = self.keyword_manager.get_translation(word)
        translated = keyword_trans if keyword_trans else self.safe_translate(word)
        # A translation must stay a single identifier, so spaces become underscores.
        return translated.replace(' ', '_')

    def translate_token(self, token: str) -> str:
        """Translate a single token produced by the tokenizer.

        Whitespace, digits, punctuation and escape sequences are returned
        unchanged. Identifiers containing underscores are translated part by
        part; empty parts are kept, so leading, trailing and doubled
        underscores survive (``__init__`` keeps its four underscores and a
        lone ``_`` stays ``_``).
        """
        if token.isspace():
            return token
        if '_' in token:
            return '_'.join(
                self._translate_word(part) if part else ''
                for part in token.split('_')
            )
        if token.isalpha():
            return self._translate_word(token)
        return token

    def _translate_once(self, text: str) -> str:
        """Call the translation service once and normalise spaces to underscores.

        Raises:
            ValueError: if the service returns something other than a
                non-empty string.
        """
        result = self.translator.translate(text)
        if not isinstance(result, str) or not result:
            raise ValueError(f"translator returned {result!r}")
        return result.replace(' ', '_')

    def safe_translate(self, text: str) -> str:
        """Translate `text`, falling back to `text` itself when translation fails.

        Spaces in the translation are replaced by underscores. If the first
        result still contains ASCII letters, the lower-cased text is translated
        instead. Up to `config.max_retries` attempts are made, pausing
        `config.sleep_time` seconds between attempts. Successful results are
        cached; failures are not, so they are retried on the next call.
        """
        if not text or not isinstance(text, str):
            return text

        if text in self._translation_cache:
            return self._translation_cache[text]

        last_error = None
        for attempt in range(self.config.max_retries):
            try:
                translated = self._translate_once(text)
                if any(c.isascii() and c.isalpha() for c in translated):
                    translated = self._translate_once(text.lower())
            except Exception as e:
                # Deliberately broad: any failure of the external service must
                # degrade to "leave the text untranslated", not stop the run.
                last_error = e
                if attempt < self.config.max_retries - 1:
                    time.sleep(self.config.sleep_time)
                continue
            self._translation_cache[text] = translated
            return translated

        if last_error is not None:
            print(f"Translation failed for {text!r}: {last_error}")
        return text

    @staticmethod
    def _find_comment_start(line: str) -> int:
        """Return the index of the first '#' outside a quoted string, or -1.

        Quote tracking is per line only, so a string literal that spans several
        lines is not recognised as such.
        """
        quote = None
        i = 0
        while i < len(line):
            ch = line[i]
            if quote is not None:
                if ch == '\\':
                    i += 2  # skip the escaped character as well
                    continue
                if ch == quote:
                    quote = None
            elif ch in ('"', "'"):
                quote = ch
            elif ch == '#':
                return i
            i += 1
        return -1

    def _translate_fragment(self, fragment: str) -> str:
        """Tokenize a code fragment and translate every token."""
        return ''.join(
            self.translate_token(token)
            for token in self._TOKEN_PATTERN.findall(fragment)
        )

    def translate_line(self, line: str) -> str:
        """Translate one line of code, keeping its leading indentation.

        A trailing comment is translated as one piece of text with
        `safe_translate` and re-attached as ``' #' + translation``. Lines that
        contain only whitespace are returned as an empty string. If anything
        goes wrong, the original line is returned unchanged.
        """
        stripped = line.lstrip()
        if not stripped:
            return stripped

        # Reuse the original leading whitespace verbatim (tabs stay tabs).
        indent = line[:len(line) - len(stripped)]

        try:
            comment_at = self._find_comment_start(stripped)
            if comment_at >= 0:
                code_part = stripped[:comment_at]
                comment_part = stripped[comment_at + 1:]
                translated_comment = self.safe_translate(comment_part.strip())

                if code_part:
                    translated_code = self._translate_fragment(code_part)
                    return indent + translated_code.rstrip() + ' #' + translated_comment
                return indent + '#' + translated_comment

            return indent + self._translate_fragment(stripped)

        except Exception as e:
            print(f"Line translation error: {str(e)}")
            return line

    def translate_code(self, code: str) -> str:
        """Translate a whole code sample line by line.

        Non-string input yields an empty string. Text that contains literal
        backslash-n sequences and no real newline is split on those sequences,
        outer quotes are stripped, every piece is stripped and translated, and
        the pieces are joined with a literal backslash-n followed by a space.
        All other text is split and re-joined on real newlines.
        """
        if not isinstance(code, str):
            return ""

        # Only text without real newlines is handled as "escaped"; real
        # multi-line code that merely contains "\n" inside a string literal
        # keeps its line structure and indentation.
        if '\\n' in code and '\n' not in code:
            lines = code.strip("'\"").split('\\n')
            translated_lines = [self.translate_line(line.strip()) for line in lines]
            return '\\n '.join(translated_lines)

        return '\n'.join(self.translate_line(line) for line in code.split('\n'))

    def process_batch(self, batch: Dict[str, List]) -> Tuple[List[int], List[str]]:
        """Process a batch of code samples.

        Args:
            batch: dict with parallel lists under ``'indices'`` and ``'codes'``.

        Returns:
            The batch indices and the translated code samples, in the same order.
        """
        indices = batch['indices']
        codes = batch['codes']

        translated_batch = []
        for code in codes:
            if isinstance(code, torch.Tensor):
                code = code.cpu().numpy().item()
            translated_batch.append(self.translate_code(code))

        return indices, translated_batch

class TranslationManager:
    """Manages the overall translation process"""

    def __init__(self, config: Config):
        """Build the checkpoint manager, keyword manager and translator from `config`."""
        self.config = config
        self.checkpoint_manager = CheckpointManager(config.checkpoint_path)
        self.keyword_manager = KeywordManager(config.keywords_path)
        self.translator = CodeTranslator(config, self.keyword_manager)

    def prepare_data(self) -> Tuple[pd.DataFrame, List[int]]:
        """Load the input, load or create the results frame, and list pending rows.

        `config.max_rows` (when set) limits the input rows in every case, so a
        resumed run works on the same rows as a fresh one. An existing results
        file that has fewer rows than the selected input is extended with the
        missing rows.

        Returns:
            The results frame (columns 'English_code' and 'Hindi_code') and the
            row indices not yet recorded in the checkpoint.
        """
        input_df = pd.read_csv(self.config.input_path)
        if self.config.max_rows:
            input_df = input_df.head(self.config.max_rows)

        if os.path.exists(self.config.output_path):
            results_df = pd.read_csv(self.config.output_path)
            existing_rows = len(results_df)
            if existing_rows < len(input_df):
                # Append empty rows, then fill in their English code.
                results_df = results_df.reindex(range(len(input_df)))
                results_df.loc[existing_rows:, 'English_code'] = (
                    input_df['English_code'].iloc[existing_rows:].to_numpy()
                )
        else:
            results_df = pd.DataFrame({
                'English_code': input_df['English_code'],
                'Hindi_code': [None] * len(input_df)
            })

        # A 'Hindi_code' column read back from CSV can come out as float (all
        # NaN); make it object-typed so translated strings can be stored in it.
        results_df['Hindi_code'] = results_df['Hindi_code'].astype(object)

        unprocessed_indices = self.checkpoint_manager.get_unprocessed_indices(len(input_df))
        return results_df, unprocessed_indices

    def process_translations(self) -> Optional[pd.DataFrame]:
        """Process all translations with checkpointing.

        After each batch the results CSV is written first and only then are the
        batch's rows marked as processed, so the checkpoint never claims a row
        whose translation has not been saved.

        Returns:
            The results frame, or None if an error occurred before it existed.
        """
        results_df = None
        try:
            results_df, unprocessed_indices = self.prepare_data()

            if not unprocessed_indices:
                print("All items have been processed!")
                return results_df

            print(f"Found {len(unprocessed_indices)} unprocessed items")

            # Create dataset and dataloader
            unprocessed_codes = results_df['English_code'].iloc[unprocessed_indices].tolist()
            dataset = CodeDataset(unprocessed_codes, unprocessed_indices)
            dataloader = DataLoader(
                dataset,
                batch_size=self.config.batch_size,
                shuffle=False,
                collate_fn=custom_collate
            )

            # Process batches
            try:
                with tqdm(total=len(unprocessed_indices), desc="Translating code") as pbar:
                    for batch in dataloader:
                        indices, translated_codes = self.translator.process_batch(batch)

                        for idx, translated_code in zip(indices, translated_codes):
                            results_df.at[idx, 'Hindi_code'] = translated_code

                        # Save intermediate results, then record progress
                        results_df.to_csv(self.config.output_path, index=False)
                        for idx in indices:
                            self.checkpoint_manager.mark_processed(idx)

                        pbar.update(len(indices))

            except KeyboardInterrupt:
                print("\nProcess interrupted by user. Saving progress...")
                results_df.to_csv(self.config.output_path, index=False)
                return results_df

            print(f"\nProcessing completed! Results saved to: {self.config.output_path}")
            return results_df

        except Exception as e:
            print(f"Error during processing: {str(e)}")
            traceback.print_exc()
            if results_df is not None:
                results_df.to_csv(self.config.output_path, index=False)
                return results_df
            return None


"""Main entry point"""
# Run the Google-Translate pipeline with a 50-row limit.
config = Config(max_rows=50)

# The pipeline writes its CSV and checkpoint files into 'data'.
Path('data').mkdir(exist_ok=True)

# Release cached GPU memory, but only when CUDA is present.
cuda_present = torch.cuda.is_available()
if cuda_present:
    torch.cuda.empty_cache()

# Build the manager and translate every row not yet in the checkpoint.
manager = TranslationManager(config)
processed_df = manager.process_translations()

# process_translations() returns None only when it failed before any results
# frame existed.
if processed_df is not None:
    print("\nProcessing completed successfully!")


Using device: cpu
Found 50 unprocessed items


Translating code:  10%|█         | 5/50 [00:12<01:53,  2.53s/it]

In [20]:
# Read the API key from a text file located one directory above the working
# directory; surrounding whitespace/newlines are removed.
API_key = Path('../API.txt').read_text().strip()

In [None]:
from openai import OpenAI
import pandas as pd
import re
from tqdm import tqdm

class Config:
    """Settings for the GPT translation and evaluation cells.

    NOTE: an earlier cell of this notebook defines another class that is also
    named `Config`; executing this cell rebinds the name to this class.
    """

    def __init__(self, max_rows=None, example_count = 5):
        # Run parameters.
        self.max_rows = max_rows
        self.examples_count = example_count
        self.batch_size = 5

        # Files: the input is the Google-translated CSV; GPT results and the
        # checkpoint get their own paths.
        self.input_path = 'data/google_code_translations.csv'
        self.output_path = 'data/translations_gpt.csv'
        self.checkpoint_path = 'data/checkpoint_gpt.json'

        # OpenAI client built from the module-level `API_key`.
        self.openai_api_key = API_key
        self.client = OpenAI(api_key=self.openai_api_key)

class KeywordReplacer:
    """Replaces known English Python keywords in code with their Hindi equivalents."""

    # Lossless tokenizer: ASCII identifier runs, digit runs, single punctuation
    # characters, whitespace runs, and finally any other run of word characters
    # (e.g. accented letters), so ''.join(tokens) always reproduces the input.
    _TOKEN_PATTERN = re.compile(r'[a-zA-Z_]+|\d+|[^\w\s]|\s+|\w+')

    def __init__(self):
        self.keywords = self._load_keywords()

    def _load_keywords(self):
        """Load the English -> Hindi keyword mapping from './Joshua_Keywords.csv'.

        Only the English and Hindi columns are used, so other language columns
        may be present, absent or partially empty. Rows lacking either value
        are skipped.
        """
        df = pd.read_csv(
            './Joshua_Keywords.csv',
            dtype=str,
            # Treat only empty cells as missing: pandas' default NA markers
            # include ordinary words such as "None", "null" and "nan".
            keep_default_na=False,
            na_values=[''],
        )
        pairs = df[['EnglishKey.txt', 'HindiKey.txt']].dropna()
        return dict(zip(pairs['EnglishKey.txt'], pairs['HindiKey.txt']))

    def replace_keywords(self, code):
        """Return `code` with every token that is a known keyword replaced.

        Tokens found in `self.keywords` are replaced by their mapped value;
        otherwise 'True' and 'False' are replaced by fixed Hindi words; all
        other tokens are kept as they are. Non-string input yields an empty
        string.
        """
        if not isinstance(code, str):
            return ''

        translated_tokens = []
        # Split code into tokens while preserving structure
        for token in self._TOKEN_PATTERN.findall(code):
            if token in self.keywords:
                translated_tokens.append(self.keywords[token])
            elif token == 'True':
                translated_tokens.append('सत्य')
            elif token == 'False':
                translated_tokens.append('असत्य')
            else:
                translated_tokens.append(token)

        return ''.join(translated_tokens)

class GPTTranslator:
    """Translates English Python code to Hindi with a chat-completion model.

    Known keywords are replaced locally first; the model is then asked to
    finish the translation, guided by a few example pairs taken from the top of
    the input CSV.
    """

    def __init__(self, config):
        self.config = config
        self.examples = self.load_examples()
        self.keyword_replacer = KeywordReplacer()

    def load_examples(self):
        """Return the first `config.examples_count` English/Hindi pairs of the input CSV."""
        df = pd.read_csv(self.config.input_path)
        return df.head(self.config.examples_count)[['English_code', 'Hindi_code']]

    def create_prompt(self, code_to_translate):
        """Build the user prompt: instructions, the example pairs, then the code."""
        examples_text = ""
        for i, row in self.examples.iterrows():
            examples_text += f"\n\nExample {i+1}:\n"
            examples_text += f"English code:\n{row['English_code']}\n"
            examples_text += f"Hindi translated code:\n{row['Hindi_code']}\n------------------------\n"

        prompt = f"""Complete the translation of this partially English Python code to completely Hindi python code:
        - Translate variable names, function names, strings and comments to Hindi
        - Join multi-word Hindi translations with underscores
        - Break down compound English words separated by underscores and translate each part into sensible Hindi and join them back with underscores
        - Preserve code structure and syntax
        - Here are some examples of translations:
    
        {examples_text}
        
        Now translate partially translated code to completely in Hindi:
        {code_to_translate}"""

        return prompt

    @staticmethod
    def _strip_code_fence(text):
        """Extract the code from a Markdown-fenced reply; unfenced text is returned as is.

        For a generic fence, a first line consisting only of letters/digits
        (a language tag such as "py") is dropped.
        """
        if "```python" in text:
            return text.split("```python")[1].split("```")[0].strip()
        if "```" in text:
            body = text.split("```")[1]
            first_line, newline, rest = body.partition("\n")
            if newline and first_line.strip().isalnum():
                body = rest
            return body.strip()
        return text

    def translate_code(self, code):
        """Translate one code sample.

        Returns an empty string for non-string input (e.g. NaN read from a
        CSV). If the API call fails, the error is printed and the original
        `code` is returned.
        """
        if not isinstance(code, str):
            return ""

        # First replace known keywords
        partially_translated = self.keyword_replacer.replace_keywords(code)

        # Then use GPT to complete the translation
        prompt = self.create_prompt(partially_translated)
        try:
            response = self.config.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a Expert Python code translator who understands the nuanses of language in coding and converts code from English to  Hindi code while preserving functionality. Return only the translated code without any explanation."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            translated_code = response.choices[0].message.content.strip()

            # Clean up the response to extract just the code
            return self._strip_code_fence(translated_code)
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return code

def run_translation(max_rows=None):
    """Translate rows of the Google-translated CSV with GPT and save the result.

    The first `config.examples_count` rows serve as few-shot examples inside
    the prompt, so they are always left out of the rows to translate. Rows
    without English code are skipped.

    Args:
        max_rows: maximum number of rows (after the example rows) to consider;
            None or 0 means all remaining rows.

    Returns:
        DataFrame with columns 'English_code' and 'Hindi_code'; it is also
        written to `config.output_path`.
    """
    config = Config(max_rows=max_rows, example_count=5)
    translator = GPTTranslator(config)

    df = pd.read_csv(config.input_path)
    # Never translate the rows that are shown to the model as examples.
    df = df.iloc[config.examples_count:]
    if max_rows:
        df = df.head(max_rows)
    # A missing code cell is read back as NaN and cannot be translated.
    df = df.dropna(subset=['English_code'])

    results = [
        {
            'English_code': english_code,
            'Hindi_code': translator.translate_code(english_code),
        }
        for english_code in tqdm(df['English_code'], total=len(df))
    ]

    results_df = pd.DataFrame(results, columns=['English_code', 'Hindi_code'])
    results_df.to_csv(config.output_path, index=False)
    return results_df

# Usage: run the GPT translation with max_rows=5 and keep the returned
# DataFrame (run_translation also writes it to the configured output CSV).
translated_df = run_translation(max_rows=5)

100%|██████████| 5/5 [00:31<00:00,  6.33s/it]


In [46]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
from tqdm import tqdm
import re


def keyword_reverse_translation(code, reverse_keywords):
    """Replace every token of `code` that is a key of `reverse_keywords`.

    The code is split into runs of Devanagari characters/underscores, runs of
    ASCII letters/underscores, digit runs, single punctuation characters and
    whitespace runs. A final catch-all alternative keeps any other word
    characters (e.g. accented letters), so text that is not replaced is
    reproduced exactly.

    Args:
        code: source text to process.
        reverse_keywords: mapping from token to its replacement.

    Returns:
        The text with matching tokens replaced.
    """
    tokens = re.findall(r'[\u0900-\u097F_]+|[a-zA-Z_]+|\d+|[^\w\s]|\s+|\w+', code)
    return ''.join(reverse_keywords.get(token, token) for token in tokens)

class TranslationEvaluator:
    """Scores Hindi translations by translating them back to English (round trip)
    and comparing the result with the original code using BLEU."""

    def __init__(self, config):
        self.config = config
        self.keyword_replacer = KeywordReplacer()
        nltk.download('punkt')
        # Hindi -> English keyword table (inverse of the replacer's mapping).
        self.reverse_keywords = {v: k for k, v in self.keyword_replacer.keywords.items()}

    def reverse_translate_code(self, hindi_code):
        """Translate Hindi code back to English.

        Known keywords are mapped back locally, then the chat model completes
        the translation. Non-string input (e.g. NaN read from a CSV) yields an
        empty string. If the API call fails, the error is printed and
        `hindi_code` is returned unchanged.
        """
        if not isinstance(hindi_code, str):
            return ""

        # First replace known keywords
        partially_translated = keyword_reverse_translation(hindi_code, self.reverse_keywords)

        prompt = f"""Complete the translation of this partially translated Python code to English:
        - The code already has Python keywords translated to English
        - Translate remaining variable names and comments
        - Convert Hindi compound words (with underscores) to appropriate English terms
        - Preserve code structure and syntax
        
        Partially translated code:
        {partially_translated}"""

        try:
            response = self.config.client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a Python code translator converting Hindi code to English."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0
            )
            return self.clean_code(response.choices[0].message.content.strip())
        except Exception as e:
            print(f"Reverse translation error: {str(e)}")
            return hindi_code

    def clean_code(self, code):
        """Remove markdown fences and surrounding whitespace.

        For a generic fence, a first line consisting only of letters/digits
        (a language tag such as "py") is dropped.
        """
        if "```python" in code:
            code = code.split("```python")[1].split("```")[0].strip()
        elif "```" in code:
            code = code.split("```")[1]
            first_line, newline, rest = code.partition("\n")
            if newline and first_line.strip().isalnum():
                code = rest
        return code.strip()

    def calculate_bleu(self, original, translated):
        """Calculate BLEU score between original and translated code.

        Both texts are tokenized with `nltk.word_tokenize`; non-string values
        are treated as empty text.
        """
        smooth = SmoothingFunction().method1

        # Tokenize the code
        def tokenize(code):
            return nltk.word_tokenize(code if isinstance(code, str) else "")

        reference = [tokenize(original)]
        candidate = tokenize(translated)

        return sentence_bleu(reference, candidate, smoothing_function=smooth)

    def evaluate_translations(self, df):
        """Evaluate translations using round-trip and BLEU score.

        Args:
            df: DataFrame with columns 'English_code' and 'Hindi_code'.

        Returns:
            DataFrame with columns 'original', 'hindi', 'back_translated' and
            'bleu_score', one row per input row.
        """
        results = []

        # total= lets tqdm show a percentage and an ETA
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating translations"):
            original = row['English_code']
            hindi = row['Hindi_code']

            # Round-trip translation
            back_translated = self.reverse_translate_code(hindi)

            # Calculate BLEU score
            bleu_score = self.calculate_bleu(original, back_translated)

            results.append({
                'original': original,
                'hindi': hindi,
                'back_translated': back_translated,
                'bleu_score': bleu_score
            })

        return pd.DataFrame(results)

# Usage example
def run_evaluation():
    """Score the GPT translations and report the mean BLEU.

    Reads 'data/translations_gpt.csv', evaluates every row with
    `TranslationEvaluator`, writes the per-row results to
    'data/evaluation_results.csv', prints the average BLEU score and returns
    the results DataFrame.
    """
    evaluator = TranslationEvaluator(Config())

    # Evaluate the saved translations row by row.
    translations = pd.read_csv('data/translations_gpt.csv')
    results = evaluator.evaluate_translations(translations)

    # Persist the per-row scores.
    results.to_csv('data/evaluation_results.csv', index=False)

    # Report the mean score.
    print(f"Average BLEU Score: {results['bleu_score'].mean():.4f}")

    return results

# Run the evaluation and keep the per-row results DataFrame
# (run_evaluation also saves it to CSV and prints the average BLEU score).
evaluation_results = run_evaluation()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ankitkumar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Evaluating translations: 5it [00:12,  2.59s/it]

Average BLEU Score: 0.8150



