In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib.ticker as mtick
import os
import re

# Set the style for the plots
sns.set(style="whitegrid")
plt.rcParams.update({'font.size': 12})

# Define a custom color palette for the models
palette = {"Google": "#4285F4", "ChatGPT": "#19C37D", "Claude": "#8C43EA"}

# Function to clean numeric values
# Function to clean numeric values - FIXED to handle trailing commas and other issues
def clean_numeric_value(value):
    if value is None or pd.isna(value):
        return 0
        
    if isinstance(value, (int, float)):
        return float(value)
        
    if isinstance(value, str):
        # Remove newlines, commas, and any whitespace
        cleaned = value.replace('\n', '').replace(',', '.').strip()
        # Handle multiple decimal points by keeping only the first one
        if cleaned.count('.') > 1:
            parts = cleaned.split('.', 1)
            cleaned = parts[0] + '.' + parts[1].replace('.', '')
        # Make sure we have a valid number
        try:
            return float(cleaned)
        except ValueError:
            # If conversion fails, try to extract the first number found
            match = re.search(r'(\d+\.?\d*)', cleaned)
            if match:
                return float(match.group(1))
            return 0
    
    return 0

# Read and process Google data
def process_google_data(file_path):
    google_df = pd.read_csv(file_path)
    processed_data = []
    
    for _, row in google_df.iterrows():
        processed_data.append({
            'model': 'Google',
            'language': row['Language'],
            'examples': 30,  # Fixed at 30 examples based on filename
            'bleu_score': clean_numeric_value(row['Bleu Score']),
            'syntax_valid_rate': clean_numeric_value(row['Syntax Valid Rate']),
            'structure_score': clean_numeric_value(row['Structure Score']),
            'semantic_score': clean_numeric_value(row['Semantic Scire ']),  # Note the space
            'token_match': clean_numeric_value(row['Token Match']),
            'overall': clean_numeric_value(row['Overall'])
        })
    
    return pd.DataFrame(processed_data)

# Read and process ChatGPT/Claude data
def process_model_data(file_path, model_name):
    # Read the CSV file without header first to inspect the structure
    raw_data = pd.read_csv(file_path, header=None)
    
    # Find the header row (which has 'Language' and 'No of Examples')
    header_idx = None
    for i, row in raw_data.iterrows():
        if 'Language' in row.values and 'No of Examples' in row.values:
            header_idx = i
            break
    
    if header_idx is None:
        raise ValueError(f"Could not find header row in {model_name} data")
    
    # Extract headers and use them to read the data properly
    headers = raw_data.iloc[header_idx].tolist()
    data = raw_data.iloc[header_idx+1:].reset_index(drop=True)
    data.columns = headers
    
    # Process the data
    processed_data = []
    current_language = None
    
    for _, row in data.iterrows():
        if not pd.isna(row['Language']) and row['Language']:
            current_language = row['Language']
        
        if not pd.isna(row['No of Examples']):
            processed_data.append({
                'model': model_name,
                'language': current_language,
                'examples': clean_numeric_value(row['No of Examples']),
                'bleu_score': clean_numeric_value(row['Bleu Score']),
                'syntax_valid_rate': clean_numeric_value(row['Syntax Valid Rate']),
                'structure_score': clean_numeric_value(row['Structure Score']),
                'semantic_score': clean_numeric_value(row['Semantic Scire ']),  # Note the space
                'token_match': clean_numeric_value(row['Token Match']),
                'overall': clean_numeric_value(row['Overall'])
            })
    
    return pd.DataFrame(processed_data)

# Main function to generate visualizations
def generate_visualizations():
    # Read and process all data
    google_df = process_google_data('./Results/Google-Google translator using 30 code snippets.csv')
    chatgpt_df = process_model_data('./Results/chatGPT-Table 1.csv', 'ChatGPT')
    claude_df = process_model_data('./Results/Claude-Table 1.csv', 'Claude')
    
    # Combine all data
    df = pd.concat([google_df, chatgpt_df, claude_df], ignore_index=True)
    
    # Clean language names for better display
    df['language_display'] = df['language'].str.replace(r'\(.*\)', '', regex=True).str.strip()
    
    # Create a directory for saving the visualizations
    os.makedirs('visualizations', exist_ok=True)
    
    # 1. Model comparison chart - compare models across languages
    # For fair comparison, use examples=0 for ChatGPT and Claude
    comparison_df = pd.DataFrame()
    
    # Add Google data (always 30 examples)
    comparison_df = pd.concat([comparison_df, google_df], ignore_index=True)
    
    # Add ChatGPT data with 0 examples
    for lang in chatgpt_df['language'].unique():
        lang_data = chatgpt_df[chatgpt_df['language'] == lang]
        zero_examples = lang_data[lang_data['examples'] == 0]
        if not zero_examples.empty:
            comparison_df = pd.concat([comparison_df, zero_examples.iloc[[0]]], ignore_index=True)
    
    # Add Claude data with 0 examples
    for lang in claude_df['language'].unique():
        lang_data = claude_df[claude_df['language'] == lang]
        zero_examples = lang_data[lang_data['examples'] == 0]
        if not zero_examples.empty:
            comparison_df = pd.concat([comparison_df, zero_examples.iloc[[0]]], ignore_index=True)
    
    # Add language_display column
    comparison_df['language_display'] = comparison_df['language'].str.replace(r'\(.*\)', '', regex=True).str.strip()
    
    plt.figure(figsize=(14, 8))
    ax = sns.barplot(
        x='language_display',
        y='overall',
        hue='model',
        data=comparison_df,
        palette=palette
    )
    
    # Customize the plot
    ax.set_title('AI Model Translation Performance by Language', fontsize=16)
    ax.set_xlabel('Language', fontsize=14)
    ax.set_ylabel('Overall Performance Score', fontsize=14)
    ax.set_ylim(0, 1)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    
    plt.tight_layout()
    plt.savefig('visualizations/model_comparison.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 2. Create a heatmap for better visualization of performance differences
    pivot_df = comparison_df.pivot_table(
        index='language_display',
        columns='model',
        values='overall'
    )
    
    plt.figure(figsize=(12, 8))
    ax = sns.heatmap(
        pivot_df,
        annot=True,
        fmt='.2f',
        cmap='YlGnBu',
        vmin=0,
        vmax=1,
        linewidths=.5
    )
    
    ax.set_title('AI Model Translation Performance Heatmap', fontsize=16)
    plt.tight_layout()
    plt.savefig('visualizations/performance_heatmap.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 3. Create line plots showing performance by example count for each language
    languages = df['language'].unique()
    
    for lang in languages:
        # Filter data for this language
        lang_df = df[df['language'] == lang]
        
        # Skip if we don't have multiple example counts
        if len(lang_df['examples'].unique()) <= 1:
            continue
            
        plt.figure(figsize=(10, 6))
        ax = sns.lineplot(
            x='examples',
            y='overall',
            hue='model',
            style='model',
            markers=True,
            dashes=False,
            data=lang_df,
            palette=palette
        )
        
        # Format the plot
        display_lang = lang.replace('(', '').replace(')', '').strip()
        ax.set_title(f'Performance by Example Count: {display_lang}', fontsize=16)
        ax.set_xlabel('Number of Examples', fontsize=14)
        ax.set_ylabel('Overall Performance Score', fontsize=14)
        ax.set_ylim(0, 1)
        ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        
        plt.tight_layout()
        clean_lang = lang.replace('(', '').replace(')', '').replace(' ', '_').lower()
        plt.savefig(f'visualizations/example_count_{clean_lang}.png', dpi=300, bbox_inches='tight')
        plt.close()
    
    # 4. Create a summary plot comparing average model performance
    model_avg = comparison_df.groupby('model')['overall'].mean().reset_index()
    
    plt.figure(figsize=(10, 6))
    ax = sns.barplot(
        x='model',
        y='overall',
        data=model_avg,
        palette=palette
    )
    
    # Add value labels on top of bars
    for i, v in enumerate(model_avg['overall']):
        ax.text(i, v + 0.02, f'{v:.2%}', ha='center', fontsize=12)
    
    # Customize the plot
    ax.set_title('Average Performance Across All Languages', fontsize=16)
    ax.set_xlabel('AI Model', fontsize=14)
    ax.set_ylabel('Average Performance Score', fontsize=14)
    ax.set_ylim(0, 1)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    
    plt.tight_layout()
    plt.savefig('visualizations/average_performance.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 5. Create radar charts for multi-dimensional performance comparison
    # First, prepare the data by averaging metrics across languages for each model
    metrics = ['bleu_score', 'syntax_valid_rate', 'structure_score', 'semantic_score', 'token_match', 'overall']
    radar_df = comparison_df.groupby('model')[metrics].mean().reset_index()
    
    # Normalize syntax_valid_rate to 0-1 scale
    radar_df['syntax_valid_rate'] = radar_df['syntax_valid_rate'] / 100
    
    # Create the radar chart
    labels = ['BLEU Score', 'Syntax Valid Rate', 'Structure Score', 'Semantic Score', 'Token Match', 'Overall']
    num_vars = len(labels)
    
    # Compute angle for each axis
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]  # Close the loop
    
    # Set up the plot
    fig, ax = plt.subplots(figsize=(12, 10), subplot_kw=dict(polar=True))
    
    # Plot data for each model
    for i, model in enumerate(radar_df['model']):
        values = radar_df[radar_df['model'] == model][metrics].values.flatten().tolist()
        values += values[:1]  # Close the loop
        
        color = palette[model]
        ax.plot(angles, values, 'o-', linewidth=2, label=model, color=color)
        ax.fill(angles, values, alpha=0.1, color=color)
    
    # Set chart properties
    ax.set_thetagrids(np.degrees(angles[:-1]), labels)
    ax.set_ylim(0, 1)
    ax.set_title('AI Model Performance Across Metrics', fontsize=16, pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    
    plt.tight_layout()
    plt.savefig('visualizations/radar_chart.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    # 6. Create a facet grid showing all metrics by model and language
    # Melt the dataframe to get all metrics in one column
    melted_df = pd.melt(
        comparison_df, 
        id_vars=['model', 'language_display'], 
        value_vars=metrics,
        var_name='metric', 
        value_name='score'
    )
    
    # Rename metrics for better display
    metric_names = {
        'bleu_score': 'BLEU Score',
        'syntax_valid_rate': 'Syntax Valid Rate',
        'structure_score': 'Structure Score',
        'semantic_score': 'Semantic Score',
        'token_match': 'Token Match',
        'overall': 'Overall'
    }
    melted_df['metric'] = melted_df['metric'].map(metric_names)
    
    # Scale syntax_valid_rate to 0-1
    melted_df.loc[melted_df['metric'] == 'Syntax Valid Rate', 'score'] = melted_df.loc[melted_df['metric'] == 'Syntax Valid Rate', 'score'] / 100
    
    # Create the facet grid
    g = sns.catplot(
        data=melted_df,
        x='language_display',
        y='score',
        hue='model',
        col='metric',
        kind='bar',
        height=4,
        aspect=1.2,
        sharey=False,
        palette=palette,
        col_wrap=3
    )
    
    # Customize the plot
    g.set_xticklabels(rotation=45, ha='right')
    g.set_titles("{col_name}")
    g.set_axis_labels("Language", "Score")
    
    # Format y-axis as percentage where appropriate
    for ax in g.axes.flat:
        ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
        ax.set_ylim(0, 1)
    
    plt.tight_layout()
    plt.savefig('visualizations/metrics_by_language.png', dpi=300, bbox_inches='tight')
    plt.close()
    
    print("All visualizations have been generated and saved to the 'visualizations' folder!")

if __name__ == "__main__":
    generate_visualizations()

  ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.barplot(


All visualizations have been generated and saved to the 'visualizations' folder!
