# Experimental Analysis for CWE Classification (zero-shot and few-shot)

In [33]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [12]:
def _binarize_true_label(label):
    """Map a ground-truth label to 'not vulnerable' for 'non-vul', else 'vulnerable'."""
    return 'vulnerable' if label != 'non-vul' else 'not vulnerable'


def _load_model_frames(base_dir, file_prefix):
    """
    Read one result workbook per model from the sub-folders of ``base_dir``.

    Only files whose name starts with ``file_prefix`` and ends with ``.xlsx``
    are considered; everything else in the folders is skipped. Each frame that
    is read gets a ``true_label_binary`` column derived from ``true_label``.
    For gemini files the column ``vul_binary_classification`` is renamed to
    ``vul_file_class`` so that it matches the other models.

    Folders and files are visited in sorted order, so when several files match
    the same model the lexicographically last one is kept.

    Parameters
    ----------
    base_dir : str
        Directory that contains one sub-folder per model.
    file_prefix : str
        Required beginning of the result file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gpt35, gpt4, gpt4o, codellama7b, codellama13b, gemini)``.
        A "text-bison" (palm2) file is read when present but is not returned.

    Raises
    ------
    FileNotFoundError
        If no matching file was found for one of the returned models.
    """
    # (substring looked up in the file name, internal key, log message).
    # Order matters: the first marker contained in the file name wins.
    model_markers = (
        ("gpt35", "gpt35", "reading file in gpt35:"),
        ("gpt4", "gpt4", "reading file in gpt4:"),
        ("gpt-4o", "gpt4o", "reading file in gpt-4o:"),
        ("CodeLlama-7b", "codellama7b", "reading file in CodeLlama-7b:"),
        ("CodeLlama-13b", "codellama13b", "reading file in CodeLlama-13b:"),
        ("text-bison", "palm2", "reading filein palm2:"),
        ("gemini", "gemini", "reading file in gemini:"),
    )
    returned_models = ("gpt35", "gpt4", "gpt4o", "codellama7b", "codellama13b", "gemini")

    frames = {}
    for folder in sorted(os.listdir(base_dir)):
        folder_path = os.path.join(base_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            # Skip anything that is not a result workbook of the requested kind,
            # so that no unrelated file is ever read.
            if not (file.startswith(file_prefix) and file.endswith('.xlsx')):
                continue
            marker = next((entry for entry in model_markers if entry[0] in file), None)
            if marker is None:
                continue
            _, model_key, log_message = marker

            file_path = os.path.join(folder_path, file)
            print(log_message, file_path)
            frame = pd.read_excel(file_path)
            # Assign the new column: `true_label_binary` to the data frame
            frame['true_label_binary'] = frame['true_label'].apply(_binarize_true_label)
            if model_key == "gemini":
                # Rename `vul_binary_classification` to `vul_file_class`
                frame = frame.rename(columns={'vul_binary_classification': 'vul_file_class'})
            frames[model_key] = frame

    missing = [model_key for model_key in returned_models if model_key not in frames]
    if missing:
        raise FileNotFoundError(
            f"No '{file_prefix}*.xlsx' result file found under {base_dir!r} for: {', '.join(missing)}"
        )
    return tuple(frames[model_key] for model_key in returned_models)


def load_data_to_df_cwe(base_dir):
    """
    Load the zero-shot CWE classification results (``cwe_classification*.xlsx``).

    Returns the data frames in the order
    ``(df_gpt35, df_gpt4, df_gpt4o, df_codellama7b, df_codellama13b, df_gemini)``.
    See ``_load_model_frames`` for the details of file selection and errors.
    """
    return _load_model_frames(base_dir, 'cwe_classification')


def load_data_to_df_cwe_few(base_dir):
    """
    Load the few-shot CWE classification results (``cwe_few_shot_classification*.xlsx``).

    Returns the data frames in the order
    ``(df_gpt35, df_gpt4, df_gpt4o, df_codellama7b, df_codellama13b, df_gemini)``.
    See ``_load_model_frames`` for the details of file selection and errors.
    """
    return _load_model_frames(base_dir, 'cwe_few_shot_classification')


# Helper function to calculate the metrics
def calculate_metrics(df, metrics_dict, model_name, output=False):
    """
    Score one model's predictions and append the results to ``metrics_dict``.

    The column ``true_label`` of ``df`` is compared against ``vul_file_class``.
    Accuracy plus macro-averaged precision, recall and F1 are each rounded to
    two decimals and appended, in that order, to ``metrics_dict[model_name]``.
    With ``output=True`` the four values are also printed.
    """
    truth = df['true_label']
    predicted = df['vul_file_class']

    # Accuracy takes no averaging argument; the other three are macro-averaged.
    scores = [round(accuracy_score(truth, predicted), 2)]
    for scorer in (precision_score, recall_score, f1_score):
        scores.append(round(scorer(truth, predicted, average='macro'), 2))

    if output:
        print(f"Metrics for {model_name}:")
        for caption, value in zip(("Accuracy:", "Precision:", "Recall:", "F1-score:"), scores):
            print(caption, value)

    # Append accuracy, precision, recall, F1 to this model's list
    collected = metrics_dict[model_name]
    for value in scores:
        collected.append(value)


def calculate_metrics_per_language(df, model_name, metrics_dict, output=False):
    """
    Score one model's predictions separately for every value of ``df['language']``.

    For each language the rows are filtered, ``true_label`` is compared with
    ``vul_file_class`` and ``[accuracy, precision, recall, f1]`` (macro-averaged
    where applicable, rounded to two decimals, plain floats) is stored in
    ``metrics_dict[model_name][language]``. With ``output=True`` the values are
    printed as well.
    """
    distinct_languages = df['language'].unique()
    if output:
        print(f"Result for {model_name}:")

    macro_scorers = (precision_score, recall_score, f1_score)
    captions = ("Accuracy:", "Precision:", "Recall:", "F1-score:")

    for lang in distinct_languages:
        if output:
            print(f"Metrics for language: {lang}")

        # Restrict the frame to the rows of the current language
        subset = df[df['language'] == lang]
        truth = subset['true_label']
        predicted = subset['vul_file_class']

        # Accuracy first, then the macro-averaged scores in fixed order
        scores = [float(round(accuracy_score(truth, predicted), 2))]
        for scorer in macro_scorers:
            scores.append(float(round(scorer(truth, predicted, average='macro'), 2)))

        if output:
            for caption, value in zip(captions, scores):
                print(caption, value)
            print("=========================================")

        # Store the four metrics under the current language
        metrics_dict[model_name][lang] = scores


## <ins>CWE Classification (Zero-Shot)</ins>

### Experiment 1: CWE-Sys1 + CWE-UserZ

In [13]:
# Experiment 1 (CWE-Sys1 + CWE-UserZ): folder holding the zero-shot result workbooks
base_dir = './excel_results/experiment_CWE-Sys1_CWE-UserZ'
# One data frame per model, in the order returned by the loader
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_cwe(base_dir)
                

reading file in CodeLlama-7b: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/codellama7b/cwe_classification_results_CodeLlama-7b-Instruct-hf_20240307_043843.xlsx
reading file in gpt35: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/gpt35/cwe_classification_results_gpt35-turbo_20240211_180544.xlsx
reading file in gpt-4o: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/gpt4o/cwe_classification_results_gpt-4o_20240707_144502.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/codellama13b/cwe_classification_results_CodeLlama-13b-Instruct-hf_20240307_060448.xlsx
reading file in gpt4: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/gpt4/cwe_classification_results_gpt4-turbo_20240307_220849.xlsx
reading file in gemini: ./excel_results/experiment_CWE-Sys1_CWE-UserZ/gemini1.5pro/cwe_classification_results_gemini-1.5-pro-001_20240709_132658.xlsx


In [14]:
# Overall metrics of experiment 1: model name -> list of metric values
metrics_dict = {
    name: []
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics of experiment 1: model name -> {language -> list of metric values}
metrics_per_language_dict = {
    name: {}
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [15]:
# Pair every model name with the data frame loaded for experiment 1
model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for model_name, model_df in model_frames:
    calculate_metrics(model_df, metrics_dict, model_name)

# Metrics per programming language for each model
for model_name, model_df in model_frames:
    calculate_metrics_per_language(model_df, model_name, metrics_per_language_dict)


In [16]:
# Tabulate the metrics: one row per model, one column per metric
print("Experiment results: CWE-Sys1 + CWE-UserZ")
df_metrics = pd.DataFrame.from_dict(
    metrics_dict,
    orient='index',
    columns=['accuracy', 'precision', 'recall', 'f1-score'],
)
df_metrics

Experiment results: CWE-Sys1 + CWE-UserZ


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.45,0.14,0.12,0.12
gpt4,0.56,0.21,0.18,0.18
gpt4o,0.62,0.23,0.19,0.2
CodeLlama-7b,0.22,0.12,0.09,0.08
CodeLlama-13b,0.33,0.09,0.08,0.08
gemini,0.53,0.15,0.13,0.13


### Experiment 2: CWE-Sys2 + CWE-UserZ

In [17]:
# Experiment 2 (CWE-Sys2 + CWE-UserZ): folder holding the zero-shot result workbooks
base_dir = './excel_results/experiment_CWE-Sys2_CWE-UserZ'

# One data frame per model, in the order returned by the loader
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_cwe(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/codellama7b/cwe_classification_results_CodeLlama-7b-Instruct-hf_20240304_064953.xlsx
reading file in gpt35: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/gpt35/cwe_classification_results_gpt35-turbo_20240226_171613.xlsx
reading file in gpt-4o: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/gpt4o/cwe_classification_results_gpt-4o_20240708_154331.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/codellama13b/cwe_classification_results_CodeLlama-13b-Instruct-hf_20240304_114823.xlsx
reading file in gpt4: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/gpt4/cwe_classification_results_gpt4-turbo_20240307_211950.xlsx
reading file in gemini: ./excel_results/experiment_CWE-Sys2_CWE-UserZ/gemini1.5pro/cwe_classification_results_gemini-1.5-pro-001_20240710_153505.xlsx


In [18]:
# Overall metrics of experiment 2: model name -> list of metric values
metrics_dict2 = {
    name: []
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics of experiment 2: model name -> {language -> list of metric values}
metrics_per_language_dict2 = {
    name: {}
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [19]:
# Pair every model name with the data frame loaded for experiment 2
model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for model_name, model_df in model_frames:
    calculate_metrics(model_df, metrics_dict2, model_name)

# Metrics per programming language for each model
for model_name, model_df in model_frames:
    calculate_metrics_per_language(model_df, model_name, metrics_per_language_dict2)

# Tabulate the metrics: one row per model, one column per metric
print("Experiment results: CWE-Sys2 + CWE-UserZ")
df_metrics2 = pd.DataFrame.from_dict(
    metrics_dict2,
    orient='index',
    columns=['accuracy', 'precision', 'recall', 'f1-score'],
)
df_metrics2


Experiment results: CWE-Sys2 + CWE-UserZ


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.38,0.14,0.12,0.12
gpt4,0.67,0.3,0.23,0.25
gpt4o,0.52,0.22,0.19,0.19
CodeLlama-7b,0.29,0.08,0.05,0.06
CodeLlama-13b,0.25,0.08,0.05,0.05
gemini,0.55,0.16,0.15,0.14


### Experiment 3: CWE-Sys1 + CWE-UserF

In [20]:
# Experiment 3 (CWE-Sys1 + CWE-UserF): folder holding the few-shot result workbooks
base_dir = './excel_results/experiment_CWE-Sys1_CWE-UserF'

# One data frame per model, in the order returned by the few-shot loader
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_cwe_few(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_CWE-Sys1_CWE-UserF/codellama7b/cwe_few_shot_classification_results_CodeLlama-7b-Instruct-hf_20240307_044555.xlsx
reading file in gpt35: ./excel_results/experiment_CWE-Sys1_CWE-UserF/gpt35/cwe_few_shot_classification_results_gpt35-turbo_20240211_181050.xlsx
reading file in gpt-4o: ./excel_results/experiment_CWE-Sys1_CWE-UserF/gpt4o/cwe_few_shot_classification_results_gpt-4o_20240707_171140.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_CWE-Sys1_CWE-UserF/codellama13b/cwe_few_shot_classification_results_CodeLlama-13b-Instruct-hf_20240307_063542.xlsx
reading file in gpt4: ./excel_results/experiment_CWE-Sys1_CWE-UserF/gpt4/cwe_few_shot_classification_results_gpt4-turbo_20240311_181323.xlsx
reading file in gemini: ./excel_results/experiment_CWE-Sys1_CWE-UserF/gemini1.5pro/cwe_few_shot_classification_results_gemini-1.5-pro-001_20240709_174749.xlsx


In [21]:
# Overall metrics of experiment 3: model name -> list of metric values
metrics_dict3 = {
    name: []
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics of experiment 3: model name -> {language -> list of metric values}
metrics_per_language_dict3 = {
    name: {}
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [22]:
# Pair every model name with the data frame loaded for experiment 3
model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for model_name, model_df in model_frames:
    calculate_metrics(model_df, metrics_dict3, model_name)

# Metrics per programming language for each model
for model_name, model_df in model_frames:
    calculate_metrics_per_language(model_df, model_name, metrics_per_language_dict3)

# Tabulate the metrics: one row per model, one column per metric
print("Experiment results: CWE-Sys1_CWE-UserF")
df_metrics3 = pd.DataFrame.from_dict(
    metrics_dict3,
    orient='index',
    columns=['accuracy', 'precision', 'recall', 'f1-score'],
)
df_metrics3


Experiment results: CWE-Sys1_CWE-UserF


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.52,0.34,0.35,0.32
gpt4,0.61,0.42,0.52,0.44
gpt4o,0.67,0.61,0.69,0.61
CodeLlama-7b,0.11,0.13,0.1,0.08
CodeLlama-13b,0.23,0.09,0.07,0.07
gemini,0.66,0.52,0.56,0.51


### Experiment 4: CWE-Sys2 + CWE-UserF

In [23]:
# Experiment 4 (CWE-Sys2 + CWE-UserF): folder holding the few-shot result workbooks
base_dir = './excel_results/experiment_CWE-Sys2_CWE-UserF'

# One data frame per model, in the order returned by the few-shot loader
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_cwe_few(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_CWE-Sys2_CWE-UserF/codellama7b/cwe_few_shot_classification_results_CodeLlama-7b-Instruct-hf_20240304_075019.xlsx
reading file in gpt35: ./excel_results/experiment_CWE-Sys2_CWE-UserF/gpt35/cwe_few_shot_classification_results_gpt35-turbo_20240307_161036.xlsx
reading file in gpt-4o: ./excel_results/experiment_CWE-Sys2_CWE-UserF/gpt4o/cwe_few_shot_classification_results_gpt-4o_20240708_165251.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_CWE-Sys2_CWE-UserF/codellama13b/cwe_few_shot_classification_results_CodeLlama-13b-Instruct-hf_20240304_122855.xlsx
reading file in gpt4: ./excel_results/experiment_CWE-Sys2_CWE-UserF/gpt4/cwe_few_shot_classification_results_gpt4-turbo_20240317_005738.xlsx
reading file in gemini: ./excel_results/experiment_CWE-Sys2_CWE-UserF/gemini1.5pro/cwe_few_shot_classification_results_gemini-1.5-pro-001_20240710_221336.xlsx


In [24]:
# Overall metrics of experiment 4: model name -> list of metric values
metrics_dict4 = {
    name: []
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics of experiment 4: model name -> {language -> list of metric values}
metrics_per_language_dict4 = {
    name: {}
    for name in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [25]:
# Pair every model name with the data frame loaded for experiment 4
model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for model_name, model_df in model_frames:
    calculate_metrics(model_df, metrics_dict4, model_name)

# Metrics per programming language for each model
for model_name, model_df in model_frames:
    calculate_metrics_per_language(model_df, model_name, metrics_per_language_dict4)

# Tabulate the metrics: one row per model, one column per metric
print("Experiment results: CWE-Sys2 CWE-UserF")
df_metrics4 = pd.DataFrame.from_dict(
    metrics_dict4,
    orient='index',
    columns=['accuracy', 'precision', 'recall', 'f1-score'],
)
df_metrics4


Experiment results: CWE-Sys2 CWE-UserF


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.44,0.24,0.23,0.21
gpt4,0.69,0.54,0.58,0.53
gpt4o,0.65,0.6,0.74,0.63
CodeLlama-7b,0.13,0.11,0.08,0.06
CodeLlama-13b,0.21,0.1,0.09,0.07
gemini,0.69,0.56,0.52,0.49


In [27]:
# Make sure the output directory exists (no-op when it is already there)
os.makedirs('experiment_results_cwe', exist_ok=True)

# Concatenate the dataframes horizontally to compare the results.
# Column groups follow the experiment order 1-4, so the metric names repeat four times.
df_metrics_all = pd.concat([df_metrics, df_metrics2, df_metrics3, df_metrics4], axis=1)
# Save the combined metrics to a CSV file
df_metrics_all.to_csv('experiment_results_cwe/experiment_results_cwe.csv')
df_metrics_all

Unnamed: 0,accuracy,precision,recall,f1-score,accuracy.1,precision.1,recall.1,f1-score.1,accuracy.2,precision.2,recall.2,f1-score.2,accuracy.3,precision.3,recall.3,f1-score.3
gpt35,0.45,0.14,0.12,0.12,0.38,0.14,0.12,0.12,0.52,0.34,0.35,0.32,0.44,0.24,0.23,0.21
gpt4,0.56,0.21,0.18,0.18,0.67,0.3,0.23,0.25,0.61,0.42,0.52,0.44,0.69,0.54,0.58,0.53
gpt4o,0.62,0.23,0.19,0.2,0.52,0.22,0.19,0.19,0.67,0.61,0.69,0.61,0.65,0.6,0.74,0.63
CodeLlama-7b,0.22,0.12,0.09,0.08,0.29,0.08,0.05,0.06,0.11,0.13,0.1,0.08,0.13,0.11,0.08,0.06
CodeLlama-13b,0.33,0.09,0.08,0.08,0.25,0.08,0.05,0.05,0.23,0.09,0.07,0.07,0.21,0.1,0.09,0.07
gemini,0.53,0.15,0.13,0.13,0.55,0.16,0.15,0.14,0.66,0.52,0.56,0.51,0.69,0.56,0.52,0.49


## Analyse the results of the experiments per programming language

In [50]:

def test_dataframe_matches_dict(df_dict, original_dict):
    """
    Compare dataframes to corresponding values in an original dictionary and report mismatches.
    
    This function iterates through a dictionary of dataframes, comparing the values of specific columns
    ('Accuracy', 'Precision', 'Recall', 'F1-Score') in each dataframe to the values stored in an original dictionary.
    If there are discrepancies, it reports the mismatches.
    """

    mismatches = []
    
    for language, df in df_dict.items():
        for index, row in df.iterrows():
            model = row['Model']
            if model not in original_dict:
                mismatches.append(f"Model {model} not found in original dictionary for {language}")
                continue
            
            original_values = original_dict[model][language]
            df_values = row[['Accuracy', 'Precision', 'Recall', 'F1-Score']].values
            
            # Check if original_values and df_values are not the same
            if not np.array_equal(original_values, df_values):
                mismatches.append(f"Values for {model} do not match for {language}")

    if mismatches:
        print("Mismatches found:")
        for mismatch in mismatches:
            print(mismatch)
    else:
        print("All values match!")

In [51]:
languages = ['python', 'c', 'cpp', 'javascript', 'java']
all_dicts = [metrics_per_language_dict, metrics_per_language_dict2, metrics_per_language_dict3, metrics_per_language_dict4]

# One entry per experiment; each entry maps a language to its metrics table
all_dfs = []

for m_dict in all_dicts:
    # For every language: one row per model, the model name followed by its four metrics
    dfs = {
        language: pd.DataFrame(
            [[model] + metrics[language] for model, metrics in m_dict.items()],
            columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
        )
        for language in languages
    }
    all_dfs.append(dfs)

In [52]:
# all_dfs[i] was built from the i-th per-language dictionary, so check each pair in order
source_dicts = (
    metrics_per_language_dict,
    metrics_per_language_dict2,
    metrics_per_language_dict3,
    metrics_per_language_dict4,
)
for experiment_dfs, source_dict in zip(all_dfs, source_dicts):
    test_dataframe_matches_dict(experiment_dfs, source_dict)

All values match!
All values match!
All values match!
All values match!


In [53]:
# Drop the 'Model' column from all_dfs[1] to all_dfs[3]; all_dfs[0] keeps it, so the
# model names still appear once when the frames are concatenated side by side.
# errors='ignore' keeps this cell re-runnable after the column has already been removed.
for language in languages:
    for later_dfs in all_dfs[1:4]:
        later_dfs[language] = later_dfs[language].drop(columns='Model', errors='ignore')

## Result for Python

In [54]:
# all_dfs[0], metrics_per_language_dict -> CWE-Sys1 + CWE-UserZ
# all_dfs[1], metrics_per_language_dict2 -> CWE-Sys2 + CWE-UserZ
# all_dfs[2], metrics_per_language_dict3 -> CWE-Sys1 + CWE-UserF
# all_dfs[3], metrics_per_language_dict4 -> CWE-Sys2 + CWE-UserF

# Create the folder that stores all csv files (no-op when it already exists)
os.makedirs('experiment_results_cwe', exist_ok=True)

# Concatenate the dataframes horizontally to compare the results for Python
df_python = pd.concat([all_dfs[0]['python'], all_dfs[1]['python'], all_dfs[2]['python'], all_dfs[3]['python']], axis=1)
# Save the metrics to a CSV file
df_python.to_csv('experiment_results_cwe/python.csv')
print("CWE experiment results for Python:")
df_python

CWE experiment results for Python:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.39,0.25,0.21,0.22,0.38,0.22,0.2,0.2,0.47,0.27,0.32,0.28,0.46,0.32,0.3,0.29
1,gpt4,0.61,0.43,0.42,0.4,0.66,0.42,0.35,0.36,0.51,0.53,0.54,0.48,0.59,0.59,0.55,0.52
2,gpt4o,0.54,0.36,0.26,0.27,0.51,0.35,0.31,0.3,0.53,0.58,0.52,0.51,0.58,0.6,0.65,0.6
3,CodeLlama-7b,0.13,0.09,0.07,0.07,0.26,0.16,0.1,0.11,0.01,0.0,0.02,0.01,0.11,0.15,0.14,0.11
4,CodeLlama-13b,0.29,0.16,0.15,0.14,0.24,0.1,0.09,0.08,0.21,0.17,0.13,0.11,0.18,0.17,0.18,0.12
5,gemini,0.54,0.24,0.21,0.2,0.5,0.25,0.22,0.22,0.67,0.61,0.64,0.58,0.71,0.72,0.62,0.62


## Result for C

In [55]:
# Place the C tables of the four experiments side by side (experiment order 1-4)
df_c = pd.concat([experiment_dfs['c'] for experiment_dfs in all_dfs], axis=1)
# Write the combined table to a CSV file
df_c.to_csv('experiment_results_cwe/c.csv')
print("CWE experiment results for C")
df_c

CWE experiment results for C


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.43,0.21,0.2,0.19,0.34,0.2,0.19,0.19,0.44,0.46,0.43,0.39,0.41,0.32,0.33,0.31
1,gpt4,0.6,0.38,0.37,0.36,0.7,0.46,0.42,0.43,0.65,0.56,0.66,0.58,0.7,0.67,0.69,0.66
2,gpt4o,0.61,0.37,0.31,0.32,0.48,0.33,0.28,0.29,0.66,0.64,0.74,0.65,0.6,0.62,0.76,0.64
3,CodeLlama-7b,0.22,0.19,0.15,0.15,0.19,0.08,0.06,0.07,0.16,0.12,0.2,0.13,0.15,0.12,0.17,0.12
4,CodeLlama-13b,0.31,0.18,0.15,0.15,0.22,0.15,0.12,0.12,0.22,0.14,0.13,0.12,0.24,0.2,0.21,0.18
5,gemini,0.4,0.22,0.17,0.18,0.53,0.29,0.26,0.26,0.67,0.57,0.6,0.56,0.7,0.67,0.62,0.61


## Result for C++


In [56]:
# Place the C++ tables of the four experiments side by side (experiment order 1-4)
df_cpp = pd.concat([experiment_dfs['cpp'] for experiment_dfs in all_dfs], axis=1)
# Write the combined table to a CSV file
df_cpp.to_csv('experiment_results_cwe/cpp.csv')
print("CWE experiment results for C++")
df_cpp

CWE experiment results for C++


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.43,0.21,0.19,0.19,0.36,0.25,0.22,0.22,0.54,0.43,0.46,0.41,0.43,0.32,0.32,0.29
1,gpt4,0.53,0.39,0.39,0.37,0.62,0.44,0.42,0.41,0.64,0.59,0.68,0.6,0.68,0.57,0.66,0.57
2,gpt4o,0.65,0.42,0.39,0.39,0.51,0.4,0.38,0.36,0.65,0.61,0.75,0.64,0.6,0.59,0.76,0.62
3,CodeLlama-7b,0.33,0.17,0.15,0.14,0.36,0.18,0.12,0.14,0.15,0.17,0.18,0.16,0.11,0.09,0.13,0.09
4,CodeLlama-13b,0.29,0.18,0.15,0.15,0.25,0.12,0.11,0.1,0.26,0.22,0.2,0.19,0.24,0.27,0.22,0.19
5,gemini,0.58,0.32,0.28,0.29,0.62,0.32,0.3,0.3,0.6,0.45,0.4,0.39,0.64,0.45,0.42,0.41


## Result for JavaScript

In [57]:
# Place the JavaScript tables of the four experiments side by side (experiment order 1-4)
df_js = pd.concat([experiment_dfs['javascript'] for experiment_dfs in all_dfs], axis=1)
# Write the combined table to a CSV file
df_js.to_csv('experiment_results_cwe/javascript.csv')
print("CWE experiment results for JavaScript")
df_js

CWE experiment results for JavaScript


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.53,0.25,0.26,0.24,0.47,0.27,0.27,0.25,0.64,0.42,0.5,0.44,0.5,0.41,0.4,0.37
1,gpt4,0.52,0.28,0.28,0.26,0.76,0.52,0.52,0.5,0.53,0.52,0.64,0.53,0.79,0.68,0.71,0.66
2,gpt4o,0.69,0.37,0.42,0.38,0.53,0.29,0.29,0.27,0.74,0.59,0.62,0.59,0.69,0.53,0.61,0.55
3,CodeLlama-7b,0.21,0.16,0.2,0.17,0.36,0.25,0.25,0.22,0.14,0.13,0.19,0.13,0.14,0.05,0.12,0.06
4,CodeLlama-13b,0.41,0.26,0.3,0.26,0.38,0.27,0.27,0.23,0.17,0.08,0.07,0.06,0.14,0.13,0.11,0.09
5,gemini,0.66,0.31,0.35,0.33,0.59,0.27,0.28,0.26,0.72,0.6,0.73,0.63,0.76,0.52,0.52,0.49


## Result for Java

In [58]:
# Place the Java tables of the four experiments side by side (experiment order 1-4)
df_java = pd.concat([experiment_dfs['java'] for experiment_dfs in all_dfs], axis=1)
# Write the combined table to a CSV file
df_java.to_csv('experiment_results_cwe/java.csv')
print("CWE experiment results for Java")
df_java

CWE experiment results for Java


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.46,0.19,0.16,0.16,0.39,0.22,0.18,0.19,0.55,0.28,0.31,0.29,0.42,0.22,0.23,0.21
1,gpt4,0.51,0.29,0.23,0.25,0.63,0.42,0.35,0.37,0.68,0.49,0.55,0.5,0.69,0.48,0.52,0.48
2,gpt4o,0.62,0.35,0.3,0.31,0.58,0.36,0.32,0.32,0.76,0.59,0.63,0.59,0.77,0.63,0.68,0.64
3,CodeLlama-7b,0.21,0.14,0.1,0.1,0.31,0.11,0.11,0.1,0.1,0.07,0.11,0.07,0.14,0.09,0.13,0.08
4,CodeLlama-13b,0.36,0.17,0.18,0.17,0.23,0.16,0.16,0.14,0.29,0.14,0.15,0.12,0.24,0.15,0.12,0.12
5,gemini,0.51,0.28,0.26,0.26,0.51,0.29,0.26,0.26,0.65,0.59,0.69,0.59,0.65,0.52,0.6,0.52
