In [13]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Location of the fold-1 validation-split predictions CSV evaluated by this cell.
# The raw string (r'...') keeps the Windows backslashes from being interpreted as
# escape sequences.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs on a
# machine where this exact file exists.
file_path = r'f:\VERO UTENTE\Desktop\Uni\dissertation\main\K-fold\large-cross-entropy--5e-5\data_splits\fold1_validation_split_predictions.csv'

def print_metrics(data_frame, source=None):
    """Print the class balance and classification metrics of a predictions frame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        When non-empty it must contain a ``labels`` column (ground truth) and a
        ``predicted_label`` column (model output).
    source : str or path-like, optional
        Where the data came from; shown in the metrics header. When omitted,
        the module-level ``file_path`` is used if it is defined, otherwise the
        header carries no source.

    Returns
    -------
    None
        Everything is written to standard output.

    Raises
    ------
    KeyError
        If a non-empty frame lacks the ``labels`` or ``predicted_label`` column.
    """
    print(f"\nNumber of examples: {len(data_frame)}")

    # Guard clause: there is nothing to score on an empty frame.
    if data_frame.empty:
        print("\nNo examples in data_frame.")
        return

    # Class balance of the ground-truth labels, most frequent first.
    print("Proportion and number of examples per class:")
    counts = data_frame['labels'].value_counts()
    total = counts.sum()
    for label, count in counts.items():
        print(f"{label}: {count / total:.2%} ({count})")
    print("-" * 30)

    y_true = data_frame['labels']
    y_pred = data_frame['predicted_label']

    # Score over the union of true and predicted labels so that a class which
    # only shows up in the predictions still enters the averages and the report.
    labels = sorted(pd.concat([y_true, y_pred]).unique())
    shared_kwargs = {'labels': labels, 'zero_division': 0}

    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted', **shared_kwargs)
    macro_f1 = f1_score(y_true, y_pred, average='macro', **shared_kwargs)
    report = classification_report(y_true, y_pred, **shared_kwargs)

    # The header used to say "filtered data" unconditionally, which was wrong
    # for the unfiltered frame; the caller already announces which frame it is.
    if source is None:
        source = globals().get('file_path')
    header = "Metrics" if source is None else f"Metrics for data from: {source}"

    print(f"\n{header}\n")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")
    print(report)

def threshold_filter(data_frame, threshold, prob_cols=None):
    """Keep only the rows whose highest class probability reaches ``threshold``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding one probability column per class. It is not modified.
    threshold : float
        Minimum value (inclusive) the row-wise maximum probability must reach.
    prob_cols : str or iterable of str, optional
        Names of the probability columns. Defaults to ``prob_indicator``,
        ``prob_ideation``, ``prob_behavior`` and ``prob_attempt``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained rows (original index preserved) and an
        extra ``max_prob`` column holding the row-wise maximum probability.

    Raises
    ------
    KeyError
        If any of the probability columns is absent from ``data_frame``.
    """
    if prob_cols is None:
        prob_cols = ['prob_indicator', 'prob_ideation', 'prob_behavior', 'prob_attempt']
    elif isinstance(prob_cols, str):
        # A bare string would otherwise be split into single characters.
        prob_cols = [prob_cols]
    else:
        prob_cols = list(prob_cols)

    # Validate before doing any work so an unusable frame is never copied.
    missing = [col for col in prob_cols if col not in data_frame.columns]
    if missing:
        raise KeyError(
            f"One or more probability columns are missing. "
            f"Required: {prob_cols}. Missing: {missing}"
        )

    # ``assign`` returns a new frame, so the caller's DataFrame stays untouched.
    scored = data_frame.assign(max_prob=data_frame[prob_cols].max(axis=1))

    # Rows whose maximum is NaN compare False and are therefore dropped.
    return scored[scored['max_prob'] >= threshold].copy()

# Read the per-example predictions for this fold from disk.
df = pd.read_csv(file_path)

# Baseline: metrics computed over every row of the file.
print("Metrics for the full DataFrame:")
print_metrics(df)

# Keep only the rows whose highest class probability reaches the cut-off.
threshold = 0.999
filtered_df = threshold_filter(df, threshold)

# Report the cut-off, then the metrics of the retained subset.
print(f"\nThreshold used for filtering: {threshold}")
print("\nMetrics for the filtered DataFrame:")
print_metrics(filtered_df)

Metrics for the full DataFrame:

Number of examples: 100
Proportion and number of examples per class:
ideation: 40.00% (40)
behavior: 27.00% (27)
indicator: 26.00% (26)
attempt: 7.00% (7)
------------------------------

Metrics for filtered data from: f:\VERO UTENTE\Desktop\Uni\dissertation\main\K-fold\large-cross-entropy--5e-5\data_splits\fold1_validation_split_predictions.csv

Accuracy: 0.7100
Weighted F1 Score: 0.7026
Macro F1 Score: 0.6892
              precision    recall  f1-score   support

     attempt       0.55      0.86      0.67         7
    behavior       0.77      0.74      0.75        27
    ideation       0.69      0.82      0.75        40
   indicator       0.80      0.46      0.59        26

    accuracy                           0.71       100
   macro avg       0.70      0.72      0.69       100
weighted avg       0.73      0.71      0.70       100


Threshold used for filtering: 0.999

Metrics for the filtered DataFrame:

Number of examples: 25
Proportion and numb

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Location of the fold-1 validation-split predictions CSV evaluated by this cell.
# The raw string (r'...') keeps the Windows backslashes from being interpreted as
# escape sequences.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs on a
# machine where this exact file exists.
file_path = r'f:\VERO UTENTE\Desktop\Uni\dissertation\main\K-fold\large-cross-entropy--5e-5\data_splits\fold1_validation_split_predictions.csv'

def print_metrics(data_frame, source=None):
    """Print the class balance and classification metrics of a predictions frame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        When non-empty it must contain a ``labels`` column (ground truth) and a
        ``predicted_label`` column (model output).
    source : str or path-like, optional
        Where the data came from; shown in the metrics header. When omitted,
        the module-level ``file_path`` is used if it is defined, otherwise the
        header carries no source.

    Returns
    -------
    None
        Everything is written to standard output.

    Raises
    ------
    KeyError
        If a non-empty frame lacks the ``labels`` or ``predicted_label`` column.
    """
    print(f"\nNumber of examples: {len(data_frame)}")

    # Guard clause: there is nothing to score on an empty frame.
    if data_frame.empty:
        print("\nNo examples in data_frame.")
        return

    # Class balance of the ground-truth labels, most frequent first.
    print("Proportion and number of examples per class:")
    counts = data_frame['labels'].value_counts()
    total = counts.sum()
    for label, count in counts.items():
        print(f"{label}: {count / total:.2%} ({count})")
    print("-" * 30)

    y_true = data_frame['labels']
    y_pred = data_frame['predicted_label']

    # Score over the union of true and predicted labels so that a class which
    # only shows up in the predictions still enters the averages and the report.
    labels = sorted(pd.concat([y_true, y_pred]).unique())
    shared_kwargs = {'labels': labels, 'zero_division': 0}

    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted', **shared_kwargs)
    macro_f1 = f1_score(y_true, y_pred, average='macro', **shared_kwargs)
    report = classification_report(y_true, y_pred, **shared_kwargs)

    # The header used to say "filtered data" unconditionally, which was wrong
    # for the unfiltered frame; the caller already announces which frame it is.
    if source is None:
        source = globals().get('file_path')
    header = "Metrics" if source is None else f"Metrics for data from: {source}"

    print(f"\n{header}\n")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Weighted F1 Score: {weighted_f1:.4f}")
    print(f"Macro F1 Score: {macro_f1:.4f}")
    print(report)

def threshold_filter(data_frame, threshold, prob_cols=None):
    """Keep only the rows whose highest class probability reaches ``threshold``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame holding one probability column per class. It is not modified.
    threshold : float
        Minimum value (inclusive) the row-wise maximum probability must reach.
    prob_cols : str or iterable of str, optional
        Names of the probability columns. Defaults to ``prob_indicator``,
        ``prob_ideation``, ``prob_behavior`` and ``prob_attempt``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained rows (original index preserved) and an
        extra ``max_prob`` column holding the row-wise maximum probability.

    Raises
    ------
    KeyError
        If any of the probability columns is absent from ``data_frame``.
    """
    if prob_cols is None:
        prob_cols = ['prob_indicator', 'prob_ideation', 'prob_behavior', 'prob_attempt']
    elif isinstance(prob_cols, str):
        # A bare string would otherwise be split into single characters.
        prob_cols = [prob_cols]
    else:
        prob_cols = list(prob_cols)

    # Validate before doing any work so an unusable frame is never copied.
    missing = [col for col in prob_cols if col not in data_frame.columns]
    if missing:
        raise KeyError(
            f"One or more probability columns are missing. "
            f"Required: {prob_cols}. Missing: {missing}"
        )

    # ``assign`` returns a new frame, so the caller's DataFrame stays untouched.
    scored = data_frame.assign(max_prob=data_frame[prob_cols].max(axis=1))

    # Rows whose maximum is NaN compare False and are therefore dropped.
    return scored[scored['max_prob'] >= threshold].copy()

# Read the per-example predictions for this fold from disk.
df = pd.read_csv(file_path)

# Baseline: metrics computed over every row of the file.
print("Metrics for the full DataFrame:")
print_metrics(df)

# Keep only the rows whose highest class probability reaches the cut-off.
threshold = 0.999
filtered_df = threshold_filter(df, threshold)

# Report the cut-off, then the metrics of the retained subset.
print(f"\nThreshold used for filtering: {threshold}")
print("\nMetrics for the filtered DataFrame:")
print_metrics(filtered_df)