<a href="https://colab.research.google.com/github/aleksandarmanev01/spam-detection-final/blob/main/src/classification/analysis_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load the necessary libraries
import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Function to calculate and return performance metrics
def evaluate_performance(file_name):
    """Compute accuracy, recall, precision and F1 score for one predictions file.

    Parameters
    ----------
    file_name : str or path-like
        CSV file containing an 'actual_label' and a 'predicted_label' column.
        Values must be 'ham' / 'spam' (values already stored as 0 / 1 are
        accepted unchanged).

    Returns
    -------
    dict
        Keys 'Accuracy', 'Recall', 'Precision' and 'F1 Score'. 'spam' is
        encoded as 1, which scikit-learn's binary metrics treat as the
        positive class by default.

    Raises
    ------
    ValueError
        If either label column contains a value that is not ham/spam/0/1
        (including missing values).
    """
    # Load the CSV file into a pandas DataFrame
    predictions = pd.read_csv(file_name)

    # Convert the labels to 0s and 1s. The encoding is done explicitly (and
    # finished with an integer cast) instead of relying on Series.replace to
    # silently downcast an object column, which pandas has deprecated.
    label_codes = {'ham': 0, 'spam': 1}
    encoded = {}
    for column in ('actual_label', 'predicted_label'):
        codes = predictions[column].map(lambda value: label_codes.get(value, value))

        # Fail early with a readable message rather than letting a stray
        # label surface as an opaque error inside the metric functions.
        unknown = codes[~codes.isin([0, 1])]
        if not unknown.empty:
            raise ValueError(
                f"{file_name}: column '{column}' contains labels other than "
                f"'ham'/'spam': {sorted(set(map(str, unknown)))}"
            )
        encoded[column] = codes.astype(int)

    actual_labels = encoded['actual_label']
    predicted_labels = encoded['predicted_label']

    # Calculate metrics
    precision = precision_score(actual_labels, predicted_labels)
    recall = recall_score(actual_labels, predicted_labels)
    accuracy = accuracy_score(actual_labels, predicted_labels)
    f1 = f1_score(actual_labels, predicted_labels)

    # Return the performance metrics
    return {
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1
    }

In [3]:
# Mapping for file names to model and prompt. This is the single source of
# truth for which prediction files are evaluated.
file_mapping = {
    'predictions_prompt1_flan-t5.csv': ('FLAN-T5', 'Prompt 1'),
    'predictions_prompt2_flan-t5.csv': ('FLAN-T5', 'Prompt 2'),
    'predictions_prompt3_flan-t5.csv': ('FLAN-T5', 'Prompt 3'),
    'predictions_prompt4_flan-t5.csv': ('FLAN-T5', 'Prompt 4'),
    'predictions_prompt2_llama2.csv': ('LLaMA 2', 'Prompt 2'),
    'predictions_prompt3_llama2.csv': ('LLaMA 2', 'Prompt 3'),
    'predictions_prompt4_llama2.csv': ('LLaMA 2', 'Prompt 4'),
}

# List of files for all prompts and models, derived from the mapping (dicts
# keep insertion order) so the list and the mapping cannot drift apart
files = list(file_mapping)

# Initialize a list to store the performance metrics for all prompts and models
performance_metrics = []

# Evaluate performance for all files
for file_name in files:
    model, prompt = file_mapping[file_name]
    metrics = evaluate_performance(file_name)
    metrics['Model'] = model
    metrics['Prompt'] = prompt
    performance_metrics.append(metrics)

# Convert the performance metrics to a DataFrame indexed by model and prompt.
# set_index returns a new frame, so nothing is mutated in place.
df_performance = pd.DataFrame(performance_metrics).set_index(['Model', 'Prompt'])

# Display the combined performance metrics table
df_performance

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Recall,Precision,F1 Score
Model,Prompt,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FLAN-T5,Prompt 1,0.909548,0.977242,0.599836,0.743381
FLAN-T5,Prompt 2,0.970388,0.947791,0.848921,0.895636
FLAN-T5,Prompt 3,0.971823,0.962517,0.847877,0.901567
FLAN-T5,Prompt 4,0.931084,0.978581,0.66515,0.791983
LLaMA 2,Prompt 2,0.925879,0.713521,0.728142,0.720757
LLaMA 2,Prompt 3,0.585068,0.941098,0.236621,0.37816
LLaMA 2,Prompt 4,0.228464,0.995984,0.147619,0.257128


In [4]:
# Function to calculate and return false positives and false negatives
def evaluate_fp_fn(file_name):
    """Count false positives and false negatives for one predictions file.

    Parameters
    ----------
    file_name : str or path-like
        CSV file containing an 'actual_label' and a 'predicted_label' column.
        Values must be 'ham' / 'spam' (values already stored as 0 / 1 are
        accepted unchanged).

    Returns
    -------
    dict
        'False Positives' (actual ham predicted as spam) and
        'False Negatives' (actual spam predicted as ham).

    Raises
    ------
    ValueError
        If either label column contains a value that is not ham/spam/0/1
        (including missing values).
    """
    # Load the CSV file into a pandas DataFrame
    predictions = pd.read_csv(file_name)

    # Convert the labels to 0s and 1s. The encoding is done explicitly (and
    # finished with an integer cast) instead of relying on Series.replace to
    # silently downcast an object column, which pandas has deprecated.
    label_codes = {'ham': 0, 'spam': 1}
    encoded = {}
    for column in ('actual_label', 'predicted_label'):
        codes = predictions[column].map(lambda value: label_codes.get(value, value))

        # Fail early with a readable message rather than letting a stray
        # label surface as an opaque error inside confusion_matrix.
        unknown = codes[~codes.isin([0, 1])]
        if not unknown.empty:
            raise ValueError(
                f"{file_name}: column '{column}' contains labels other than "
                f"'ham'/'spam': {sorted(set(map(str, unknown)))}"
            )
        encoded[column] = codes.astype(int)

    # Calculate confusion matrix. Passing labels=[0, 1] pins the matrix to
    # 2x2 (rows = actual, columns = predicted) even when a file happens to
    # contain only one class; without it the matrix can shrink to 1x1 and the
    # indexing below would raise IndexError.
    conf_matrix = confusion_matrix(
        encoded['actual_label'], encoded['predicted_label'], labels=[0, 1]
    )

    # False Positives are at [0,1] and False Negatives are at [1,0] in the confusion matrix
    false_positives = conf_matrix[0][1]
    false_negatives = conf_matrix[1][0]

    # Return the counts of false positives and false negatives
    return {
        'False Positives': false_positives,
        'False Negatives': false_negatives
    }

In [5]:
# Initialize a list to store the FP and FN for all prompts and models
fp_fn_metrics = []

# List of files for all prompts and models. Derived from file_mapping rather
# than typed out again, so every listed file is guaranteed to have a
# (model, prompt) entry; register new prediction files in file_mapping.
files = list(file_mapping)

# Evaluate FP and FN for all files
for file_name in files:
    model, prompt = file_mapping[file_name]
    fp_fn = evaluate_fp_fn(file_name)
    fp_fn['Model'] = model
    fp_fn['Prompt'] = prompt
    fp_fn_metrics.append(fp_fn)

# Convert the FP and FN metrics to a DataFrame indexed by model and prompt.
# set_index returns a new frame, so nothing is mutated in place.
df_fp_fn = pd.DataFrame(fp_fn_metrics).set_index(['Model', 'Prompt'])

# Display the combined FP and FN table
df_fp_fn

Unnamed: 0_level_0,Unnamed: 1_level_0,False Positives,False Negatives
Model,Prompt,Unnamed: 2_level_1,Unnamed: 3_level_1
FLAN-T5,Prompt 1,487,17
FLAN-T5,Prompt 2,126,39
FLAN-T5,Prompt 3,129,28
FLAN-T5,Prompt 4,368,16
LLaMA 2,Prompt 2,199,214
LLaMA 2,Prompt 3,2268,44
LLaMA 2,Prompt 4,4296,3
