# Experimental Analysis for Vulnerability Detection on our Dataset

In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
def _read_vd_results(file_path, rename_prediction_column=False):
    """Read one result workbook and add the binary ground-truth column.

    Args:
        file_path: Path of the ``.xlsx`` file to read.
        rename_prediction_column: When true, the column ``vul_file_class`` is
            renamed to ``vul_binary_classification``.

    Returns:
        The loaded data frame with an extra ``true_label_binary`` column.
    """
    df = pd.read_excel(file_path)
    # Every true label other than 'non-vul' is treated as 'vulnerable'
    df['true_label_binary'] = df['true_label'].apply(
        lambda x: 'vulnerable' if x != 'non-vul' else 'not vulnerable'
    )
    if rename_prediction_column:
        df = df.rename(columns={'vul_file_class': 'vul_binary_classification'})
    return df


def load_data_to_df_vd(base_dir):
    """Load the binary-classification result workbook of every model.

    Each direct sub-folder of ``base_dir`` is scanned for files whose name
    starts with ``binary`` and ends with ``.xlsx``. The model a file belongs to
    is decided by a marker substring in the file name.

    Args:
        base_dir: Directory containing one sub-folder per model.

    Returns:
        A tuple ``(df_gpt35, df_gpt4, df_gpt4o, df_codellama7b,
        df_codellama13b, df_gemini)``.

    Raises:
        FileNotFoundError: If no matching file was found for at least one
            model (previously this surfaced as an ``UnboundLocalError``).
    """
    # (file-name marker, rename prediction column?) in return order.
    # The first marker contained in the file name wins, so the order matters.
    model_files = (
        ("gpt35", False),
        ("gpt4", False),
        ("gpt-4o", False),
        ("CodeLlama-7b", True),
        ("CodeLlama-13b", True),
        ("gemini", False),
    )
    frames = {}

    # Iterate through the folders and files
    for folder in os.listdir(base_dir):
        folder_path = os.path.join(base_dir, folder)
        if not os.path.isdir(folder_path):
            continue
        for file in os.listdir(folder_path):
            if not (file.startswith('binary') and file.endswith('.xlsx')):
                continue
            file_path = os.path.join(folder_path, file)
            for marker, needs_rename in model_files:
                if marker in file:
                    print(f"reading file in {marker}:", file_path)
                    frames[marker] = _read_vd_results(file_path, needs_rename)
                    break

    # Fail with a clear message instead of an unbound local at return time
    missing = [marker for marker, _ in model_files if marker not in frames]
    if missing:
        raise FileNotFoundError(
            f"No 'binary*.xlsx' result file found under {base_dir!r} for: "
            + ", ".join(missing)
        )

    return tuple(frames[marker] for marker, _ in model_files)


# Helper function to calculate the metrics
def calculate_metrics(df, metrics_dict, model_name, output=False):
    """Compute accuracy, precision, recall and F1 for one model.

    Args:
        df: Data frame with the columns ``true_label_binary`` (ground truth)
            and ``vul_binary_classification`` (prediction).
        metrics_dict: Dictionary whose ``model_name`` entry is a list; the four
            metrics are appended to it in the order accuracy, precision,
            recall, F1-score.
        model_name: Key of the list in ``metrics_dict`` to extend.
        output: When true, the metrics are also printed.
    """
    y_true = df['true_label_binary']
    y_pred = df['vul_binary_classification']

    # Round to two decimals and store plain Python floats, matching what
    # calculate_metrics_per_language stores
    accuracy = float(round(accuracy_score(y_true, y_pred), 2))
    precision = float(round(precision_score(y_true, y_pred, pos_label='vulnerable'), 2))
    recall = float(round(recall_score(y_true, y_pred, pos_label='vulnerable'), 2))
    f1 = float(round(f1_score(y_true, y_pred, pos_label='vulnerable'), 2))

    if output:
        # Print the metrics
        print(f"Metrics for {model_name}:")
        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)

    # Append the metrics to the list in the dictionary
    metrics_dict[model_name].extend([accuracy, precision, recall, f1])


def calculate_metrics_per_language(df, model_name, metrics_dict, output=False):
    """Compute accuracy, precision, recall and F1 separately per language.

    For every distinct value in ``df['language']`` the four metrics are
    computed on the matching rows, rounded to two decimals, and stored as
    ``metrics_dict[model_name][language] = [accuracy, precision, recall, f1]``.
    When ``output`` is true the values are printed as well.
    """
    distinct_languages = df['language'].unique()
    if output:
        print(f"Result for {model_name}:")

    for lang in distinct_languages:
        if output:
            print(f"Metrics for language: {lang}")

        # Rows belonging to the current language only
        subset = df.loc[df['language'] == lang]
        truth = subset['true_label_binary']
        predicted = subset['vul_binary_classification']

        # Accuracy first, then the three scores that need the positive label
        scores = [float(round(accuracy_score(truth, predicted), 2))]
        for scorer in (precision_score, recall_score, f1_score):
            scores.append(float(round(scorer(truth, predicted, pos_label='vulnerable'), 2)))

        if output:
            for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F1-score:"), scores):
                print(label, value)
            print("=========================================")

        # Store the metric list under the current language
        metrics_dict[model_name][lang] = scores


## <ins>Binary Classification Analysis (Vulnerability Detection)</ins>

### Experiment 1: VD-Sys1 + VD-User1

In [3]:
# Folder holding the result workbooks of experiment VD-Sys1 + VD-User1
base_dir = './excel_results/experiment_VD-Sys1_VD-User1'

# Unpack one data frame per model, in the order the loader returns them
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_vd(base_dir)
                

reading file in CodeLlama-7b: ./excel_results/experiment_VD-Sys1_VD-User1/codellama7b/binary_classification_results_CodeLlama-7b-Instruct-hf_20240307_043231.xlsx
reading file in gpt35: ./excel_results/experiment_VD-Sys1_VD-User1/gpt35/binary_classification_results_gpt35-turbo_20240211_151835.xlsx
reading file in gpt-4o: ./excel_results/experiment_VD-Sys1_VD-User1/gpt4o/binary_classification_results_gpt-4o_20240706_211109.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_VD-Sys1_VD-User1/codellama13b/binary_classification_results_CodeLlama-13b-Instruct-hf_20240307_054418.xlsx
reading file in gpt4: ./excel_results/experiment_VD-Sys1_VD-User1/gpt4/binary_classification_results_gpt4-turbo_20240211_185602.xlsx
reading file in gemini: ./excel_results/experiment_VD-Sys1_VD-User1/gemini1.5pro/binary_classification_results_gemini-1.5-pro-001_20240708_182324.xlsx


In [4]:
# Overall metrics: model name -> list that receives the metric values
metrics_dict = {
    model: []
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics: model name -> {language: list of metric values}
metrics_per_language_dict = {
    model: {}
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [5]:
# Pair every model key with the data frame loaded for it
_model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for _model, _frame in _model_frames:
    calculate_metrics(_frame, metrics_dict, _model)

# Metrics per language for each model
for _model, _frame in _model_frames:
    calculate_metrics_per_language(_frame, _model, metrics_per_language_dict)


In [6]:
# Label the table that is displayed below
print("Experiment results: VD-Sys1 + VD-User1")

# One row per model, one column per metric
df_metrics = pd.DataFrame(metrics_dict).transpose()
df_metrics.columns = ['accuracy', 'precision', 'recall', 'f1-score']
df_metrics

Experiment results: VD-Sys1 + VD-User1


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.62,0.59,0.77,0.67
gpt4,0.81,0.81,0.82,0.81
gpt4o,0.8,0.72,0.97,0.83
CodeLlama-7b,0.62,0.58,0.89,0.7
CodeLlama-13b,0.6,0.76,0.28,0.41
gemini,0.8,0.79,0.83,0.81


### Experiment 2: VD-Sys1 + VD-User2

In [7]:
# Folder holding the result workbooks of experiment VD-Sys1 + VD-User2
base_dir = './excel_results/experiment_VD-Sys1_VD-User2'

# Unpack one data frame per model, in the order the loader returns them
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_vd(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_VD-Sys1_VD-User2/codellama7b/binary_classification_results_CodeLlama-7b-Instruct-hf_20240718_192855.xlsx
reading file in gpt35: ./excel_results/experiment_VD-Sys1_VD-User2/gpt35/binary_r1_b2_classification_results_gpt35-turbo_20240719_115110.xlsx
reading file in gpt-4o: ./excel_results/experiment_VD-Sys1_VD-User2/gpt4o/binary_r1_b2_classification_results_gpt-4o_20240718_174733.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_VD-Sys1_VD-User2/codellama13b/binary_classification_results_CodeLlama-13b-Instruct-hf_20240718_203431.xlsx
reading file in gpt4: ./excel_results/experiment_VD-Sys1_VD-User2/gpt4/binary_r1_b2_classification_results_gpt4-turbo_20240717_171713.xlsx
reading file in gemini: ./excel_results/experiment_VD-Sys1_VD-User2/gemini1.5pro/binary_r1_b2_classification_results_gemini-1.5-pro-001_20240723_154510.xlsx


In [8]:
# Overall metrics: model name -> list that receives the metric values
metrics_dict2 = {
    model: []
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics: model name -> {language: list of metric values}
metrics_per_language_dict2 = {
    model: {}
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [9]:
# Pair every model key with the data frame loaded for it
_model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for _model, _frame in _model_frames:
    calculate_metrics(_frame, metrics_dict2, _model)

# Metrics per language for each model
for _model, _frame in _model_frames:
    calculate_metrics_per_language(_frame, _model, metrics_per_language_dict2)

# One row per model, one column per metric
print("Experiment results: VD-Sys1 + VD-User2")
df_metrics2 = pd.DataFrame(metrics_dict2).transpose()
df_metrics2.columns = ['accuracy', 'precision', 'recall', 'f1-score']
df_metrics2


Experiment results: VD-Sys1 + VD-User2


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.57,0.55,0.82,0.66
gpt4,0.79,0.73,0.92,0.81
gpt4o,0.71,0.64,0.99,0.78
CodeLlama-7b,0.57,0.54,0.93,0.68
CodeLlama-13b,0.66,0.67,0.65,0.66
gemini,0.69,0.65,0.84,0.73


### Experiment 3: VD-Sys2 + VD-User1

In [10]:
# Folder holding the result workbooks of experiment VD-Sys2 + VD-User1
base_dir = './excel_results/experiment_VD-Sys2_VD-User1'

# Unpack one data frame per model, in the order the loader returns them
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_vd(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_VD-Sys2_VD-User1/codellama7b/binary_classification_results_CodeLlama-7b-Instruct-hf_20240719_131326.xlsx
reading file in gpt35: ./excel_results/experiment_VD-Sys2_VD-User1/gpt35/binary_r2_b1_classification_results_gpt35-turbo_20240719_135632.xlsx
reading file in gpt-4o: ./excel_results/experiment_VD-Sys2_VD-User1/gpt4o/binary_r2_b1_classification_results_gpt-4o_20240718_185808.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_VD-Sys2_VD-User1/codellama13b/binary_classification_results_CodeLlama-13b-Instruct-hf_20240719_150001.xlsx
reading file in gpt4: ./excel_results/experiment_VD-Sys2_VD-User1/gpt4/binary_r2_b1_classification_results_gpt4-turbo_20240717_181240.xlsx
reading file in gemini: ./excel_results/experiment_VD-Sys2_VD-User1/gemini1.5pro/binary_r2_b1_classification_results_gemini-1.5-pro-001_20240723_155008.xlsx


In [11]:
# Overall metrics: model name -> list that receives the metric values
metrics_dict3 = {
    model: []
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics: model name -> {language: list of metric values}
metrics_per_language_dict3 = {
    model: {}
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [12]:
# Pair every model key with the data frame loaded for it
_model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for _model, _frame in _model_frames:
    calculate_metrics(_frame, metrics_dict3, _model)

# Metrics per language for each model
for _model, _frame in _model_frames:
    calculate_metrics_per_language(_frame, _model, metrics_per_language_dict3)

# One row per model, one column per metric
print("Experiment results: VD-Sys2 + VD-User1")
df_metrics3 = pd.DataFrame(metrics_dict3).transpose()
df_metrics3.columns = ['accuracy', 'precision', 'recall', 'f1-score']
df_metrics3


Experiment results: VD-Sys2 + VD-User1


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.72,0.73,0.7,0.71
gpt4,0.79,0.75,0.87,0.81
gpt4o,0.62,0.57,1.0,0.72
CodeLlama-7b,0.54,0.52,0.98,0.68
CodeLlama-13b,0.65,0.62,0.77,0.69
gemini,0.75,0.71,0.83,0.77


### Experiment 4: VD-Sys2 + VD-User2

In [13]:
# Folder holding the result workbooks of experiment VD-Sys2 + VD-User2
base_dir = './excel_results/experiment_VD-Sys2_VD-User2'

# Unpack one data frame per model, in the order the loader returns them
(
    df_gpt35,
    df_gpt4,
    df_gpt4o,
    df_codellama7b,
    df_codellama13b,
    df_gemini,
) = load_data_to_df_vd(base_dir)

reading file in CodeLlama-7b: ./excel_results/experiment_VD-Sys2_VD-User2/codellama7b/binary_classification_results_CodeLlama-7b-Instruct-hf_20240307_101900.xlsx
reading file in gpt35: ./excel_results/experiment_VD-Sys2_VD-User2/gpt35/binary_classification_results_gpt35-turbo_20240311_193511.xlsx
reading file in gpt-4o: ./excel_results/experiment_VD-Sys2_VD-User2/gpt4o/binary_classification_results_gpt-4o_20240708_114034.xlsx
reading file in CodeLlama-13b: ./excel_results/experiment_VD-Sys2_VD-User2/codellama13b/binary_classification_results_CodeLlama-13b-Instruct-hf_20240304_105709.xlsx
reading file in gpt4: ./excel_results/experiment_VD-Sys2_VD-User2/gpt4/binary_classification_results_gpt4-turbo_20240311_202341.xlsx
reading file in gemini: ./excel_results/experiment_VD-Sys2_VD-User2/gemini1.5pro/binary_classification_results_gemini-1.5-pro-001_20240710_152744.xlsx


In [14]:
# Overall metrics: model name -> list that receives the metric values
metrics_dict4 = {
    model: []
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

# Per-language metrics: model name -> {language: list of metric values}
metrics_per_language_dict4 = {
    model: {}
    for model in ("gpt35", "gpt4", "gpt4o", "CodeLlama-7b", "CodeLlama-13b", "gemini")
}

In [15]:
# Pair every model key with the data frame loaded for it
_model_frames = (
    ("gpt35", df_gpt35),
    ("gpt4", df_gpt4),
    ("gpt4o", df_gpt4o),
    ("CodeLlama-7b", df_codellama7b),
    ("CodeLlama-13b", df_codellama13b),
    ("gemini", df_gemini),
)

# Overall metrics for each model
for _model, _frame in _model_frames:
    calculate_metrics(_frame, metrics_dict4, _model)

# Metrics per language for each model
for _model, _frame in _model_frames:
    calculate_metrics_per_language(_frame, _model, metrics_per_language_dict4)

# One row per model, one column per metric
print("Experiment results: VD-Sys2 + VD-User2")
df_metrics4 = pd.DataFrame(metrics_dict4).transpose()
df_metrics4.columns = ['accuracy', 'precision', 'recall', 'f1-score']
df_metrics4


Experiment results: VD-Sys2 + VD-User2


Unnamed: 0,accuracy,precision,recall,f1-score
gpt35,0.61,0.58,0.77,0.66
gpt4,0.78,0.71,0.93,0.81
gpt4o,0.63,0.57,0.99,0.73
CodeLlama-7b,0.57,0.55,0.84,0.66
CodeLlama-13b,0.66,0.62,0.81,0.71
gemini,0.68,0.62,0.97,0.75


In [16]:
# Put the four experiment tables side by side (experiments 1 to 4, left to right)
_experiment_tables = [df_metrics, df_metrics2, df_metrics3, df_metrics4]
df_metrics_all = pd.concat(_experiment_tables, axis=1)

# NOTE(review): the four tables share the same column names, so the combined
# frame has repeated column labels; consider labelling each experiment.
# Write the combined table to an Excel file
df_metrics_all.to_excel('experiment_results_vd_all.xlsx')
df_metrics_all

Unnamed: 0,accuracy,precision,recall,f1-score,accuracy.1,precision.1,recall.1,f1-score.1,accuracy.2,precision.2,recall.2,f1-score.2,accuracy.3,precision.3,recall.3,f1-score.3
gpt35,0.62,0.59,0.77,0.67,0.57,0.55,0.82,0.66,0.72,0.73,0.7,0.71,0.61,0.58,0.77,0.66
gpt4,0.81,0.81,0.82,0.81,0.79,0.73,0.92,0.81,0.79,0.75,0.87,0.81,0.78,0.71,0.93,0.81
gpt4o,0.8,0.72,0.97,0.83,0.71,0.64,0.99,0.78,0.62,0.57,1.0,0.72,0.63,0.57,0.99,0.73
CodeLlama-7b,0.62,0.58,0.89,0.7,0.57,0.54,0.93,0.68,0.54,0.52,0.98,0.68,0.57,0.55,0.84,0.66
CodeLlama-13b,0.6,0.76,0.28,0.41,0.66,0.67,0.65,0.66,0.65,0.62,0.77,0.69,0.66,0.62,0.81,0.71
gemini,0.8,0.79,0.83,0.81,0.69,0.65,0.84,0.73,0.75,0.71,0.83,0.77,0.68,0.62,0.97,0.75


## Analyse the results of the experiments per programming language

In [17]:
def test_dataframe_matches_dict(df_dict, original_dict):
    mismatches = []
    
    for language, df in df_dict.items():
        for index, row in df.iterrows():
            model = row['Model']
            if model not in original_dict:
                mismatches.append(f"Model {model} not found in original dictionary for {language}")
                continue
            
            original_values = original_dict[model][language]
            df_values = row[['Accuracy', 'Precision', 'Recall', 'F1-Score']].values
            
            # Check if original_values and df_values are not the same
            if not np.array_equal(original_values, df_values):
                mismatches.append(f"Values for {model} do not match for {language}")

    if mismatches:
        print("Mismatches found:")
        for mismatch in mismatches:
            print(mismatch)
    else:
        print("All values match!")

In [18]:
languages = ['python', 'c', 'cpp', 'javascript', 'java']
all_dicts = [metrics_per_language_dict, metrics_per_language_dict2, metrics_per_language_dict3, metrics_per_language_dict4]

# One entry per metrics dictionary; each entry maps a language to a data frame
# with one row per model: the model name followed by its metric values
all_dfs = [
    {
        language: pd.DataFrame(
            [[model] + metrics[language] for model, metrics in m_dict.items()],
            columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
        )
        for language in languages
    }
    for m_dict in all_dicts
]

In [19]:
# all_dfs[i] is expected to correspond to the i-th dictionary listed here
_source_dicts = (
    metrics_per_language_dict,
    metrics_per_language_dict2,
    metrics_per_language_dict3,
    metrics_per_language_dict4,
)
for _position, _source in enumerate(_source_dicts):
    test_dataframe_matches_dict(all_dfs[_position], _source)

All values match!
All values match!
All values match!
All values match!


In [20]:
# Drop the 'Model' column from all_dfs[1] to all_dfs[3]; all_dfs[0] keeps it,
# so the horizontally concatenated tables show the model name only once.
# errors='ignore' keeps this cell re-runnable: a second run would otherwise
# raise KeyError because the column is already gone.
for language in languages:
    for experiment_index in (1, 2, 3):
        all_dfs[experiment_index][language] = all_dfs[experiment_index][language].drop(
            columns='Model', errors='ignore'
        )

## Result for Python

In [21]:
# all_dfs[0], metrics_per_language_dict -> VD-Sys1 + VD-User1
# all_dfs[1], metrics_per_language_dict2 -> VD-Sys1 + VD-User2
# all_dfs[2], metrics_per_language_dict3 -> VD-Sys2 + VD-User1
# all_dfs[3], metrics_per_language_dict4 -> VD-Sys2 + VD-User2

# Create the folder that stores all CSV files; exist_ok avoids the separate
# existence check and makes re-running the cell safe
os.makedirs('experiment_results_vd', exist_ok=True)

# Concatenate the dataframes horizontally to compare the results for Python
df_python = pd.concat([all_dfs[0]['python'], all_dfs[1]['python'], all_dfs[2]['python'], all_dfs[3]['python']], axis=1)
# Save the metrics to a CSV file
df_python.to_csv('experiment_results_vd/python.csv')
print("Vulnerability Detection Experiment Results for Python")
df_python

Vulnerability Detection Experiment Results for Python


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.63,0.59,0.87,0.7,0.57,0.54,0.89,0.67,0.74,0.75,0.71,0.73,0.61,0.57,0.87,0.69
1,gpt4,0.75,0.73,0.79,0.76,0.78,0.72,0.89,0.8,0.78,0.73,0.87,0.8,0.71,0.66,0.87,0.75
2,gpt4o,0.76,0.7,0.92,0.8,0.66,0.6,0.95,0.73,0.58,0.54,1.0,0.7,0.59,0.55,0.97,0.7
3,CodeLlama-7b,0.7,0.67,0.76,0.72,0.58,0.56,0.79,0.65,0.54,0.52,0.97,0.68,0.57,0.55,0.76,0.64
4,CodeLlama-13b,0.57,0.69,0.24,0.35,0.59,0.59,0.61,0.6,0.59,0.59,0.63,0.61,0.54,0.53,0.63,0.58
5,gemini,0.76,0.79,0.71,0.75,0.61,0.59,0.71,0.64,0.74,0.7,0.84,0.76,0.66,0.6,0.95,0.73


## Result for C

In [22]:
# Collect the C table of each of the four experiments and join them side by side
_c_tables = [all_dfs[position]['c'] for position in range(4)]
df_c = pd.concat(_c_tables, axis=1)

# Write the combined table to a CSV file
df_c.to_csv('experiment_results_vd/c.csv')
print("Vulnerability Detection Experiment Results for C")
df_c

Vulnerability Detection Experiment Results for C


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.56,0.55,0.68,0.61,0.47,0.48,0.7,0.57,0.69,0.68,0.73,0.7,0.57,0.56,0.68,0.61
1,gpt4,0.83,0.78,0.91,0.84,0.75,0.67,0.98,0.8,0.8,0.75,0.89,0.81,0.78,0.7,0.98,0.82
2,gpt4o,0.83,0.75,1.0,0.85,0.7,0.63,1.0,0.77,0.64,0.58,1.0,0.73,0.66,0.59,1.0,0.75
3,CodeLlama-7b,0.59,0.55,0.93,0.69,0.59,0.55,0.98,0.7,0.52,0.51,0.95,0.67,0.52,0.51,0.86,0.64
4,CodeLlama-13b,0.65,0.93,0.32,0.47,0.69,0.67,0.77,0.72,0.65,0.6,0.86,0.71,0.77,0.72,0.89,0.8
5,gemini,0.77,0.73,0.86,0.79,0.7,0.67,0.8,0.73,0.78,0.79,0.77,0.78,0.73,0.65,1.0,0.79


## Result for C++

In [23]:
# Collect the C++ table of each of the four experiments and join them side by side
_cpp_tables = [all_dfs[position]['cpp'] for position in range(4)]
df_cpp = pd.concat(_cpp_tables, axis=1)

# Write the combined table to a CSV file
df_cpp.to_csv('experiment_results_vd/cpp.csv')
print("Vulnerability Detection Experiment Results for C++")
df_cpp

Vulnerability Detection Experiment Results for C++


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.6,0.57,0.75,0.65,0.6,0.56,0.86,0.68,0.72,0.71,0.75,0.73,0.61,0.59,0.75,0.66
1,gpt4,0.85,0.82,0.89,0.85,0.83,0.76,0.97,0.85,0.78,0.71,0.94,0.81,0.81,0.73,0.97,0.83
2,gpt4o,0.86,0.78,1.0,0.88,0.75,0.67,1.0,0.8,0.62,0.57,1.0,0.73,0.6,0.55,1.0,0.71
3,CodeLlama-7b,0.61,0.57,0.92,0.7,0.57,0.54,0.94,0.69,0.57,0.54,1.0,0.7,0.6,0.56,0.94,0.7
4,CodeLlama-13b,0.61,0.83,0.28,0.42,0.62,0.63,0.61,0.62,0.68,0.64,0.83,0.72,0.69,0.64,0.89,0.74
5,gemini,0.88,0.85,0.92,0.88,0.72,0.67,0.89,0.76,0.72,0.67,0.89,0.76,0.68,0.61,0.97,0.75


## Result for JavaScript

In [24]:
# Collect the JavaScript table of each of the four experiments and join them side by side
_js_tables = [all_dfs[position]['javascript'] for position in range(4)]
df_js = pd.concat(_js_tables, axis=1)

# Write the combined table to a CSV file
df_js.to_csv('experiment_results_vd/javascript.csv')
print("Vulnerability Detection Experiment Results for JavaScript")
df_js

Vulnerability Detection Experiment Results for JavaScript


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.74,0.69,0.86,0.77,0.69,0.63,0.93,0.75,0.81,0.85,0.76,0.8,0.72,0.67,0.9,0.76
1,gpt4,0.84,0.88,0.79,0.84,0.83,0.77,0.93,0.84,0.78,0.74,0.86,0.79,0.72,0.67,0.9,0.76
2,gpt4o,0.67,0.61,0.97,0.75,0.66,0.59,1.0,0.74,0.53,0.52,1.0,0.68,0.53,0.52,0.97,0.67
3,CodeLlama-7b,0.64,0.58,0.97,0.73,0.55,0.53,0.97,0.68,0.52,0.51,1.0,0.67,0.66,0.6,0.9,0.72
4,CodeLlama-13b,0.53,0.57,0.28,0.37,0.69,0.72,0.62,0.67,0.66,0.64,0.72,0.68,0.62,0.58,0.86,0.69
5,gemini,0.88,0.87,0.9,0.88,0.64,0.6,0.86,0.7,0.71,0.67,0.83,0.74,0.6,0.56,1.0,0.72


## Result for Java

In [25]:
# Collect the Java table of each of the four experiments and join them side by side
_java_tables = [all_dfs[position]['java'] for position in range(4)]
df_java = pd.concat(_java_tables, axis=1)

# Write the combined table to a CSV file
df_java.to_csv('experiment_results_vd/java.csv')
print("Vulnerability Detection Experiment Results for Java")
df_java

Vulnerability Detection Experiment Results for Java


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Accuracy.1,Precision.1,Recall.1,F1-Score.1,Accuracy.2,Precision.2,Recall.2,F1-Score.2,Accuracy.3,Precision.3,Recall.3,F1-Score.3
0,gpt35,0.6,0.57,0.74,0.65,0.58,0.56,0.76,0.65,0.65,0.69,0.57,0.62,0.58,0.57,0.69,0.62
1,gpt4,0.8,0.86,0.71,0.78,0.79,0.76,0.83,0.8,0.82,0.83,0.81,0.82,0.83,0.78,0.93,0.85
2,gpt4o,0.82,0.75,0.98,0.85,0.79,0.7,1.0,0.82,0.68,0.61,1.0,0.76,0.71,0.64,1.0,0.78
3,CodeLlama-7b,0.56,0.54,0.88,0.67,0.54,0.52,0.95,0.67,0.54,0.52,1.0,0.68,0.56,0.54,0.76,0.63
4,CodeLlama-13b,0.6,0.75,0.29,0.41,0.71,0.78,0.6,0.68,0.68,0.65,0.79,0.71,0.65,0.62,0.79,0.69
5,gemini,0.75,0.74,0.76,0.75,0.76,0.7,0.93,0.8,0.77,0.74,0.83,0.79,0.71,0.65,0.95,0.77
