# **Introduction**
* Abusive relationships are a critical issue that affects people worldwide.
* With the rise of social media, more people are sharing personal stories.
* Using NLP, we can detect signs of abuse and identify key risk factors.
* Allows for efficient identification of abusive patterns and better support for victims.

# **Data**
Dataset of 10k Reddit posts which has been processed to:
* Post Metadata
* Relationship and Demographic data generated by Gemini pro
* Contextual Risk Factors generated by Gemini pro
notice


# **Installations & Imports**

In [None]:
# Installations
!pip install matplotlib seaborn
!pip install upgrade pandas
!pip install openpyxl
!pip install lime


In [None]:
# Imports
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.colors import LogNorm
import matplotlib.cm as cm

import plotly.express as px
from plotly.subplots import make_subplots

import nltk
from nltk.corpus import words

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, \
  precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.svm import SVC

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader, TensorDataset

from collections import Counter, defaultdict
import seaborn as sns
from tabulate import tabulate
import statsmodels.api as sm
from tqdm import tqdm
import zipfile
import random

from transformers import RobertaTokenizer, DistilBertTokenizer, RobertaForSequenceClassification, \
    DistilBertForSequenceClassification, AdamW

import gc
from lime.lime_text import LimeTextExplainer

# **Part 1 - Data Validation**
**Zoom out on the distributions of the data**


In [None]:
# Read the 'All' sheet of the labeled workbook; its first row supplies the column names
df = pd.read_excel('Labeled.xlsx', sheet_name='All', engine='openpyxl')

# Candidate columns are addressed by position (23..69)
columns_range = df.columns[23:70]

# Columns on a different value scale; they get their own plots in the next cell
exclude_columns = ['author_gender', 'age_female', 'age_male', 'author_role', 'relationship_type']

# Keep every column of the positional range that is not excluded
selected_columns = list(filter(lambda name: name not in exclude_columns, columns_range))
df_selected = df[selected_columns]

# The label values whose frequency is plotted
labels = ['yes', 'plausibly', 'cannot be inferred', 'no', 'irrelevant']

# One row per column, one count per label
label_counts = pd.DataFrame(
    {label: df_selected.eq(label).sum() for label in labels},
    index=selected_columns,
)

# Normalise each row so the stacked bars sum to 100
row_totals = label_counts.sum(axis=1)
label_percentages = label_counts.div(row_totals, axis=0) * 100

# Horizontal stacked bars, one bar per column
fig, ax = plt.subplots(figsize=(12, 10))
label_percentages.plot(kind='barh', stacked=True, ax=ax, colormap='viridis')
ax.set(xlabel='Percentage', ylabel='Risk factor', title="Distribution of risk factors")

# One tick per column; inverting the axis puts the first column at the top
ax.set_yticks(range(len(selected_columns)))
ax.set_yticklabels(selected_columns, fontsize=9)
ax.invert_yaxis()

# Legend outside the axes, then widen the left margin for the long tick labels
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.subplots_adjust(left=0.3)
plt.show()


**Distribution of columns with a different range**

In [None]:
# Plot the value distribution of every column listed in `exclude_columns`
# (defined in the previous cell), grouping the two age columns into bins first.

# Load the Excel file
df_excluded = pd.read_excel('Labeled.xlsx', sheet_name='All', engine='openpyxl')

# Define age bins and labels for grouping ages (pd.cut intervals are right-inclusive)
age_bins = [0, 20, 30, 40, 50, float('inf')]
age_labels = ['<=20', '20-30', '30-40', '40-50', '>50']

raw_age_columns = ['age_female', 'age_male']
grouped_age_columns = [f'{col}_group' for col in raw_age_columns]

# Build the list of columns to plot. An age column is binned when either its raw
# or its grouped name is listed, so re-running this cell (after `exclude_columns`
# has already been rewritten below) gives the same plots as the first run.
columns_to_plot = []
for raw_col, grouped_col in zip(raw_age_columns, grouped_age_columns):
    if raw_col in exclude_columns or grouped_col in exclude_columns:
        df_excluded[grouped_col] = pd.cut(df_excluded[raw_col], bins=age_bins, labels=age_labels)
        columns_to_plot.append(grouped_col)
    else:
        # Not requested for grouping: fall back to the raw column
        columns_to_plot.append(raw_col)

age_related = set(raw_age_columns) | set(grouped_age_columns)
columns_to_plot.extend(col for col in exclude_columns if col not in age_related)

# Keep the same post-cell state as before: the age columns are replaced by their groups
exclude_columns = columns_to_plot

# One bar chart per column
for column in exclude_columns:
    # Skip columns that don't exist in the sheet
    if column not in df_excluded:
        continue

    # Share of each distinct value, in percent
    label_counts = df_excluded[column].value_counts(normalize=True) * 100

    fig, ax = plt.subplots(figsize=(10, 6))
    label_counts.plot(kind='bar', ax=ax)

    # Add labels and title
    ax.set_xlabel('Labels')
    ax.set_ylabel('Percentage')
    ax.set_title(f"Distribution of Labels in {column}")

    # Rotate x-axis labels so long values stay readable
    plt.xticks(rotation=45, ha='right')

    # Adjust layout and display the plot
    plt.tight_layout()
    plt.show()


Notice that there are no missing values in the Relationship, Demographic and risk factor columns, as expected, since the data was generated.

# **Part 2 - Data Exploration**
* Sampling 100 examples.
* Verifying that the data is representative.
* Manually classifying based on the stories.

**2.1 We randomly selected 100 examples and examined their representation based on subreddits.**

In [None]:
# Compare the subreddit mix of the 100-row sample ('Test') with the full dataset ('All').

# Define the sheets to process and the fixed row count for the sampled data
sheets = ['Test', 'All']  # Test = sample, All = full dataset
fixed_row_counts = {'Test': 100}  # Only the first 100 rows of 'Test' are read

# Dictionary to store extracted subreddit values from each sheet
subreddit_data = {}

# Open the workbook once and close it as soon as both sheets have been read
with pd.ExcelFile('Labeled.xlsx') as excel_file:
    for sheet_name in sheets:
        df = excel_file.parse(sheet_name=sheet_name)

        # Use the fixed count when one is configured, otherwise the whole sheet
        num_rows = fixed_row_counts.get(sheet_name, len(df))

        # The subreddit is the column at position 2; read it with one slice
        # instead of a per-row lookup
        subreddit_data[sheet_name] = df.iloc[:num_rows, 2].tolist()

# Access extracted subreddit values
sampled_subreddits = subreddit_data['Test']
full_subreddits = subreddit_data['All']

# Count the occurrences of each unique subreddit
sampled_counts = Counter(sampled_subreddits)
full_counts = Counter(full_subreddits)

# Combine "survivinginfidelity" into the key "Infidelity" for consistency
sampled_counts['Infidelity'] += sampled_counts.pop('survivinginfidelity', 0)
full_counts['Infidelity'] += full_counts.pop('survivinginfidelity', 0)

# Calculate the total counts for normalization
total_sampled = sum(sampled_counts.values())
total_full = sum(full_counts.values())

# Convert counts to percentages
sampled_percentages = {subreddit: (count / total_sampled) * 100 for subreddit, count in sampled_counts.items()}
full_percentages = {subreddit: (count / total_full) * 100 for subreddit, count in full_counts.items()}

# Union of the subreddits seen in either dataset, ordered by their share of the full data
all_subreddits = set(sampled_percentages) | set(full_percentages)
sorted_subreddits = sorted(all_subreddits, key=lambda sub: full_percentages.get(sub, 0), reverse=True)

# Prepare data for the grouped bar chart (0 when a subreddit is absent from a dataset)
sampled_percentages_sorted = [sampled_percentages.get(sub, 0) for sub in sorted_subreddits]
full_percentages_sorted = [full_percentages.get(sub, 0) for sub in sorted_subreddits]

# Plot the grouped bar chart: two bars per subreddit, centred on the tick
x_positions = range(len(sorted_subreddits))
bar_width = 0.35  # Width of the bars

fig, ax = plt.subplots(figsize=(12, 8))
bars_sampled = ax.bar([pos - bar_width/2 for pos in x_positions], sampled_percentages_sorted, bar_width, label='Sampled Data', color='skyblue')
bars_full = ax.bar([pos + bar_width/2 for pos in x_positions], full_percentages_sorted, bar_width, label='Full Data', color='lightgreen')

# Add labels, title, and legend
ax.set_xlabel('Subreddit')
ax.set_ylabel('Percentage')
ax.set_title('Comparison of Subreddit Percentages: Sampled vs Full Data')
ax.set_xticks(x_positions)
ax.set_xticklabels(sorted_subreddits, rotation=90)
ax.legend()

# Annotate the bars with the percentage values
for bars in [bars_sampled, bars_full]:
    for bar in bars:
        y_value = bar.get_height()
        if y_value > 0:  # Only annotate bars with a value greater than 0
            ax.text(
                bar.get_x() + bar.get_width() / 2, y_value / 2, f'{round(y_value)}%',
                ha='center', va='center', color='black'
            )

# Adjust layout for better spacing
plt.tight_layout(pad=3)
plt.show()


**2.2 We compared Gemini Pro predictions with the true labels manually annotated by us.**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score
from matplotlib.colors import LogNorm

# Compare the model-generated labels (first 100 rows of 'Test') with the manual
# annotations stored further down the same sheet (index labels 109..208).

# Load the Excel file
df = pd.read_excel('Labeled.xlsx', sheet_name='Test', engine='openpyxl')

# Define the contexts to search for in the column headers
contexts_to_find = ["Overall", "mental_condition", "jealousy", "emotional_violence"]

# "Overall" is the whole positional range 23..69; every other context is the set of
# columns whose name contains the context string
contexts = {
    context: (df.columns[23:70] if context == "Overall"
              else [col for col in df.columns if context in col])
    for context in contexts_to_find
}

# Define number of rows for predicted and true values
num_rows = 100

# Define the labels for the confusion matrix
labels = ['yes', 'plausibly', 'cannot be inferred', 'no']

# Define colors for each plot
plot_colors = {
    "Overall": "Blues",
    "mental_condition": "Greens",
    "jealousy": "Purples",
    "emotional_violence": "Oranges"
}

# Loop through each context and generate a plot
for context_name, columns_range in contexts.items():
    # Predicted values: rows 0..99, flattened row by row
    Predicted_values = [
        value for i in range(num_rows) for value in df.loc[i, columns_range].tolist()
    ]

    # True values: rows 109..208, flattened the same way so positions line up
    True_values = [
        value for i in range(num_rows) for value in df.loc[i + 109, columns_range].tolist()
    ]

    # Predictions are passed first on purpose: rows = predicted, columns = true,
    # matching the axis labels set below.
    # The result is NOT named `cm`: that would shadow the `matplotlib.cm as cm`
    # import that a later plotting cell relies on.
    conf_matrix = confusion_matrix(Predicted_values, True_values, labels=labels)

    # Log colour scale; vmax is clamped to vmin so an all-zero matrix still renders
    plt.figure(figsize=(10, 8))
    plt.imshow(
        conf_matrix, interpolation='nearest', cmap=plot_colors[context_name],
        norm=LogNorm(vmin=1, vmax=max(conf_matrix.max(), 1))
    )
    plt.colorbar()

    # Annotate the cells with counts
    for i in range(len(labels)):
        for j in range(len(labels)):
            plt.text(j, i, conf_matrix[i, j], ha='center', va='center', color='black', fontweight='bold')

    # Add gridlines between the cells
    for i in range(len(labels) - 1):
        plt.axhline(i + 0.5, color='black', linestyle='-', linewidth=1)
        plt.axvline(i + 0.5, color='black', linestyle='-', linewidth=1)

    # Add title and axis labels
    plt.title(f'Confusion Matrix: {context_name}')
    plt.xlabel('True Labels')
    plt.ylabel('Predicted Labels')
    plt.xticks(ticks=np.arange(len(labels)), labels=labels)
    plt.yticks(ticks=np.arange(len(labels)), labels=labels)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

    # Calculate and print F1 scores
    f1_weighted = f1_score(True_values, Predicted_values, average='weighted')
    f1_macro = f1_score(True_values, Predicted_values, average='macro')
    print(f"F1 Score (Weighted) for {context_name}: {f1_weighted:.2f}")
    print(f"F1 Score (Macro) for {context_name}: {f1_macro:.2f}")


In the comparison, we evaluated two metrics: weighted F1 and unweighted F1. The blue graph shows a general trend, while the following graphs are examples where the metrics evaluate similarly or differently.

# **Part 3 - Data Preprocessing for models’ prediction**



* Comparing differences in evaluation metrics.
* Merging label categories into binary classifications.
* Selecting the most reliable risk factors for the following parts.



**3.1 Comparison between weighted and unweighted F1**

In [None]:
# Per-column agreement between the generated labels (rows 0..99) and the manual
# labels (rows 109..208) of the 'Test' sheet, as weighted and macro F1.
df = pd.read_excel('Labeled.xlsx', sheet_name='Test')

num_rows = 100
results = []

# Walk the 47 columns at positions 23..69
for j in range(47):
    column_name = df.columns[23 + j]

    # Age and gender columns are not scored
    if column_name not in ['age_female', 'age_male', 'author_gender']:
        column_values = df[column_name]
        Predicted_values = column_values.iloc[:num_rows].tolist()
        True_values = column_values.iloc[109:109 + num_rows].tolist()

        f1_weighted = f1_score(True_values, Predicted_values, average='weighted')
        f1_macro = f1_score(True_values, Predicted_values, average='macro')

        results.append({
            'Risk Factor': column_name,
            'F1 Score - weighted': round(f1_weighted, 2),
            'F1 Score - macro': round(f1_macro, 2)
        })

# Best weighted F1 first
results_df = pd.DataFrame(results).sort_values(by='F1 Score - weighted', ascending=False)

# Cut the table in two halves so each printout stays short
midpoint = len(results_df) // 2
df1, df2 = results_df.iloc[:midpoint], results_df.iloc[midpoint:]

for heading, table in (("Table 1:", df1), ("\nTable 2:", df2)):
    print(heading)
    print(tabulate(table, headers='keys', tablefmt='pretty', showindex=False))


**3.2 Comparison of unweighted F1 (macro) before and after merging labels**

In [None]:
# Macro F1 per column before and after merging the four labels into two groups.
df = pd.read_excel('Labeled.xlsx', sheet_name='Test')

num_rows = 100

# ---- Pass 1: scores on the original labels ----
results = []
for j in range(47):
    column_name = df.columns[23 + j]
    if column_name not in ['age_female', 'age_male', 'author_gender']:
        # Rows 0..99 are the generated labels, rows 109..208 the manual ones
        Predicted_values = df[column_name].iloc[:num_rows].tolist()
        True_values = df[column_name].iloc[109:109 + num_rows].tolist()

        f1_weighted = f1_score(True_values, Predicted_values, average='weighted')
        f1_macro = f1_score(True_values, Predicted_values, average='macro')

        results.append({
            'Risk Factor': column_name,
            'F1 Score - weighted': round(f1_weighted, 2),
            'F1 Score - macro old': round(f1_macro, 2)
        })

results_df = pd.DataFrame(results)

# ---- Merge labels in place: 'no' -> 'cannot be inferred', 'yes' -> 'plausibly' ----
# Only columns at positions 34..69 and the first 211 rows are touched
columns_range = df.columns[34:70]
num_rows_replace = 211
merged_block = df.loc[:num_rows_replace-1, columns_range].replace(
    {'no': 'cannot be inferred', 'yes': 'plausibly'}
)
df.loc[:num_rows_replace-1, columns_range] = merged_block

# ---- Pass 2: macro F1 on the merged labels ----
new_results = []
for j in range(47):
    column_name = df.columns[23 + j]
    if column_name not in ['age_female', 'age_male', 'author_gender']:
        Predicted_values_new = df[column_name].iloc[:num_rows].tolist()
        True_values_new = df[column_name].iloc[109:109 + num_rows].tolist()

        f1_macro_new = f1_score(True_values_new, Predicted_values_new, average='macro')

        new_results.append({
            'Risk Factor': column_name,
            'F1 Score - macro new': round(f1_macro_new, 2)
        })

results_new_df = pd.DataFrame(new_results)

# Join old and new scores on the column name and shorten the headers
final_df = pd.merge(results_df[['Risk Factor', 'F1 Score - macro old']], results_new_df, on='Risk Factor')
final_df = final_df.rename(columns={
    'F1 Score - macro old': 'F1 - old',
    'F1 Score - macro new': 'F1 - new'
})

# Gain (or loss) from merging, to two decimals
final_df['Difference'] = (final_df['F1 - new'] - final_df['F1 - old']).round(2)

# Highest new score first, with the new score shown before the old one
final_df = final_df.sort_values(by='F1 - new', ascending=False)
final_df = final_df[['Risk Factor', 'F1 - new', 'F1 - old', 'Difference']]

# Print in two halves
midpoint = len(final_df) // 2
df1 = final_df.iloc[:midpoint].reset_index(drop=True)
df2 = final_df.iloc[midpoint:].reset_index(drop=True)

for heading, table in (("Table 1:", df1), ("\nTable 2:", df2)):
    print(heading)
    print(tabulate(table, headers='keys', tablefmt='pretty', showindex=False))


**3.3 The final metric display - F1 macro after merging labels**


In [None]:
# Macro F1 per column after collapsing the labels to binary 0/1.
df = pd.read_excel('Labeled.xlsx', sheet_name='Test', engine='openpyxl')

# Relabelling covers the columns at positions 34..69 and the first 211 rows
columns_range = df.columns[34:70]
num_rows = 211

# Negative-like labels become 0, positive-like labels become 1
binary_block = df.loc[:num_rows-1, columns_range].replace(
    {'no': 0, 'cannot be inferred': 0,'irrelevant':0, 'yes': 1, 'plausibly': 1}
)
df.loc[:num_rows-1, columns_range] = binary_block

results = []
num_rows = 100  # from here on: number of rows compared per column

for j in range(47):
    column_name = df.columns[23 + j]

    # Age and gender columns are not scored
    if column_name in ['age_female', 'age_male', 'author_gender']:
        continue

    # Rows 0..99 are the generated labels, rows 109..208 the manual ones
    Predicted_values = df[column_name].iloc[:num_rows].tolist()
    True_values = df[column_name].iloc[109:109 + num_rows].tolist()
    macro_f1 = f1_score(True_values, Predicted_values, average='macro')
    results.append({
        'Risk Factor': column_name,
        'F1 Score - macro': round(macro_f1, 2),
    })

# Highest score first; `results_df` is read by the next cell
results_df = pd.DataFrame(results).sort_values(by='F1 Score - macro', ascending=False)

print(tabulate(results_df, headers='keys', tablefmt='pretty', showindex=False))



**3.4 Keep the reliable risk factors (>=0.9) for the models**

In [None]:
# Rows of the score table whose (rounded) macro F1 is at least 0.9
is_reliable = results_df['F1 Score - macro'] >= 0.9
high_f1_scores = results_df[is_reliable]

# 'author_role' is left out of the selection
keep_row = high_f1_scores['Risk Factor'] != 'author_role'
high_f1_scores_filtered = high_f1_scores[keep_row]

# Names of the selected risk factors, in table order
high_f1_risk_factors_list = high_f1_scores_filtered['Risk Factor'].tolist()

# Show the list as the cell output
high_f1_risk_factors_list


Correlation of the selected columns with the new labels

# **Part 4 - Baselines**
We predicted the risk factor: physical violence.

Goals:
* Establish a baseline for score.
* Data Insights: Identify key features and important words.



**4.1 Logistic regression**


Calculated correlation

In [None]:
# Heatmap of pairwise correlations between the reliable risk factors (binary labels).
df_all = pd.read_excel('Labeled.xlsx', sheet_name='All', engine='openpyxl')

# Restrict to the selected columns and collapse the labels to 0/1
binary_labels = {'no': 0, 'cannot be inferred': 0,'irrelevant':0, 'yes': 1, 'plausibly': 1}
df_top_columns = df_all[high_f1_risk_factors_list].replace(binary_labels)

# Pairwise correlation of the binary columns
correlation_matrix = df_top_columns.corr()

# Large canvas and font scale so every annotated cell stays legible
plt.figure(figsize=(22, 20))
sns.set(font_scale=1.2)

# Diverging colour scale fixed to [-1, 1] and centred on 0
heatmap_options = dict(
    annot=True, cmap='coolwarm', vmin=-1, vmax=1, center=0, fmt='.1f', linewidths=0.5
)
heatmap = sns.heatmap(correlation_matrix, **heatmap_options)

# Slanted x tick labels, upright y tick labels
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

plt.title('Correlation Matrix for Reliable Risk Factors', fontsize=18)

# Fit everything inside the figure, save to disk, then display
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.show()


In [None]:
# Logistic-regression baseline: predict one risk factor from the other reliable ones.
df_all = pd.read_excel('Labeled.xlsx', sheet_name='All', engine='openpyxl')

# Collapse the labels to binary 0/1
replacement_dict = {
    'no': 0,
    'cannot be inferred': 0,
    'irrelevant': 0,
    'yes': 1,
    'plausibly': 1
}
df_all = df_all[high_f1_risk_factors_list].replace(replacement_dict)

# Targets to model (other candidates: 'aggressive_behavior', 'gaslighting', 'narcissistic_traits')
Target_to_predict = ['physical_violence']

for target_column in Target_to_predict:
    print(f"\nThe target risk factor is: {target_column}")

    # Every other selected column is a feature
    X = df_all.drop(columns=[target_column])
    y = df_all[target_column]

    # A Logit model needs exactly two outcome values
    if y.nunique() != 2:
        print(f"Skipping '{target_column}' as it is not binary.")
        continue

    # Intercept column
    X = sm.add_constant(X)

    # ---- Step 1: fit on all features ----
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    model = sm.Logit(y_train, X_train)
    result = model.fit()

    # Coefficients, z-values and p-values
    print(result.summary2())

    # ---- Step 2: keep the features with p <= 0.05 (the intercept is re-added below) ----
    pvalues = result.pvalues
    significant_columns = [name for name in pvalues[pvalues <= 0.05].index.tolist() if name != 'const']
    print(f"Significant columns for {target_column}: {significant_columns}")

    X_significant = sm.add_constant(X[significant_columns])

    # ---- Step 3: refit on the reduced feature set, same split parameters ----
    X_train, X_test, y_train, y_test = train_test_split(X_significant, y, test_size=0.3, random_state=42)
    model_significant = sm.Logit(y_train, X_train)
    result_significant = model_significant.fit()
    print(result_significant.summary2())

    # ---- Step 4: evaluate on the held-out 30% with a 0.5 probability cut-off ----
    y_pred_significant = result_significant.predict(X_test)
    y_pred_binary_significant = [int(prob > 0.5) for prob in y_pred_significant]

    accuracy_significant = accuracy_score(y_test, y_pred_binary_significant)
    report_significant = classification_report(y_test, y_pred_binary_significant, target_names=['0', '1'])
    print(f"\n The target risk factor is: {target_column}")
    print(report_significant)
    print(f"Accuracy with significant columns: {accuracy_significant:.2f}")
    print("\n" + "=" * 60 + "\n")


**4.2 TF-IDF + SVM**

In [None]:
# Prepare the text and label columns for the TF-IDF + SVM baseline and measure how
# many vocabulary words are frequent enough to be worth keeping.

# read the excel file
df = pd.read_excel('Labeled.xlsx', sheet_name='All', engine='openpyxl')

# keep only relevant columns (columns that are interesting to predict and, per the
# original author's note, scored a high F1)
feature_column = ['post_body']
abusive_types_columns = ['physical_violence','sexual_violence']
risk_factors_columns = ['social_isolation', 'gaslighting', 'mental_condition', 'daily_activity_control', 'aggressive_behavior', 'narcissistic_traits']

# .copy() so the column assignments below write to an independent frame
df = df[feature_column + abusive_types_columns + risk_factors_columns].copy()

# re-label the data and unite labels - ('no' + 'cannot be inferred' + 'irrelevant')  and  ('yes' + 'plausibly')
converts_dict = {
    'no': 0,
    'cannot be inferred': 0,
    'irrelevant': 0,
    'yes': 1,
    'plausibly': 1
}

# Relabel the label columns only: a post whose whole body is e.g. "no" must stay text
target_columns = abusive_types_columns + risk_factors_columns
df[target_columns] = df[target_columns].replace(converts_dict)

# lower-case the text before counting words
df['post_body'] = df['post_body'].str.lower()

# Calculate Word Frequencies (raw counts, English stop words removed)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['post_body'])

# Sum the counts per vocabulary word on the sparse matrix itself; converting to a
# dense array first would allocate documents x vocabulary cells for the same result
word_counts = np.asarray(X.sum(axis=0)).ravel()
vocab = vectorizer.get_feature_names_out()

# Count the number of unique words
vocabulary_size = len(vocab)
print(f'There are {vocabulary_size} different words in the vocabulary')

# Create a dictionary with word frequencies
word_freq = dict(zip(vocab, word_counts))

# Keep the words that occur at least 200 times in total
frequency_threshold = 200
filtered_words = {word: freq for word, freq in word_freq.items() if freq >= frequency_threshold}

print(len(filtered_words))

# Set max_features Based on Filtered Words
max_features = len(filtered_words)


In [None]:
# TF-IDF representation of the posts; max_features is the embedding dimension.
# NOTE(review): the `max_features` value computed in the previous cell is not used
# here - the dimension is hard-coded to 1000. Confirm which one is intended.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
post_body_vectors = vectorizer.fit_transform(df['post_body'])

# Per-label containers for the train/test features and targets
X_train_dict, X_test_dict = {}, {}
y_train_dict, y_test_dict = {}, {}

# One stratified 70/30 split per label, so each label keeps its own class balance
labels_columns = abusive_types_columns + risk_factors_columns
for label in labels_columns:
    target = df[label]
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
    train_indices, test_indices = next(sss.split(post_body_vectors, target))

    X_train_dict[label] = post_body_vectors[train_indices]
    X_test_dict[label] = post_body_vectors[test_indices]
    y_train_dict[label] = target.iloc[train_indices].values
    y_test_dict[label] = target.iloc[test_indices].values


**Plotting the data distribution in a 2D space**


In [None]:
# Project the TF-IDF vectors to two dimensions with t-SNE (dense input, fixed seed)
tsne = TSNE(n_components=2, random_state=42)
dense_vectors = post_body_vectors.toarray()
reduced_vectors = tsne.fit_transform(dense_vectors)

# One row per post: first component as 'x', second as 'y'
plot_df = pd.DataFrame(reduced_vectors[:, :2], columns=['x', 'y'])

In [None]:
# One 2D scatter per label column, coloured by the binary label
binary_names = {1: 'yes', 0: 'no'}

for column in labels_columns:
    # Attach the readable label of the current column to the 2D points
    plot_df['label'] = df[column].map(binary_names)
    point_colors = plot_df['label'].astype(str)

    # plot the reduced vectors using Plotly
    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        color=point_colors,
        title=f'2D representation of TF-IDF vectors, classified by {column}',
        labels={'color': 'Label'},
        width=800,
        height=600,
    )
    fig.show()

**Creating the SVM classifier**

In [None]:
# Train one SVM per (kernel, label) pair and record the macro F1 on the test split.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
f1_per_kernel = {kernel: {} for kernel in kernels}

for kernel in kernels:
    # Balanced class weights; the same estimator object is refitted for every label
    svm_classifier = SVC(kernel=kernel, C=1.0, random_state=42, class_weight='balanced')

    # Each label maps to a one-element list holding its score
    f1_scores = {label: [] for label in labels_columns}

    progress = tqdm(labels_columns, desc=f"Processing label columns for {kernel} kernel")
    for column in progress:
        fitted = svm_classifier.fit(X_train_dict[column], y_train_dict[column])
        y_pred = fitted.predict(X_test_dict[column])
        macro_f1 = f1_score(y_test_dict[column], y_pred, average='macro')
        f1 = round(macro_f1, 4)
        f1_scores[column].append(f1)

    f1_per_kernel[kernel] = f1_scores


**Adding plots for the different f1 scores per label and kernel**

In [None]:
def plot_f1_scores(column_list, title):
    """Draw a grouped bar chart of the macro F1 score per label and SVM kernel.

    Reads the module-level `kernels` list and `f1_per_kernel` mapping
    (kernel -> label -> one-element list holding the score).

    Args:
        column_list: label column names to show, one bar group per label.
        title: title of the figure.
    """
    # Rows = kernels, columns = labels
    plot_data = {label: [f1_per_kernel[kernel][label][0] for kernel in kernels] for label in column_list}
    df_plot = pd.DataFrame(plot_data, index=kernels)

    fig, ax = plt.subplots(figsize=(12, 8))

    bar_width = 0.2
    index = np.arange(len(column_list))

    # Shades of green, one per kernel. `plt.get_cmap` is used instead of
    # `cm.get_cmap`: the latter was removed in Matplotlib 3.9, and the name `cm`
    # may be rebound by other cells of this notebook.
    # The +3 / +1 offsets skip the palest shades and stay away from the darkest.
    colormap = plt.get_cmap('Greens', len(kernels) + 3)
    green_shades = [colormap(i + 1) for i in range(len(kernels))]

    # One bar per kernel inside each label group
    for i, kernel in enumerate(kernels):
        ax.bar(index + i * bar_width, df_plot.loc[kernel], bar_width, color=green_shades[i], label=kernel)

    ax.set_xlabel('Label Columns')
    ax.set_ylabel('F1 Score')
    ax.set_title(title)
    # Centre each tick under its group of bars
    ax.set_xticks(index + bar_width * (len(kernels) - 1) / 2)
    ax.set_xticklabels(column_list)

    # Position the legend outside the plot area
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

# Plot for abusive_types_columns
plot_f1_scores(abusive_types_columns, 'F1 Scores for Abusive Types Columns by Kernel')

# Plot for risk_factors_columns
plot_f1_scores(risk_factors_columns, 'F1 Scores for Risk Factors Columns by Kernel')

**Getting TF-IDF statistics - the scores of the words**


In [None]:
# Fetch the NLTK 'words' corpus used for the vocabulary check below
nltk.download('words')

# Set of words from the NLTK corpus. Terms outside this set are treated as non-English
# and filtered out of the term tables; this might harm reliability, but it makes the
# results more interpretable.
english_words = set(words.words())


In [None]:
# Mean TF-IDF score per term, computed separately on the positive posts of each label.
# The label list is reset here because this analysis may use different columns.
labels_columns = abusive_types_columns
terms_score_dict = {label: None for label in labels_columns}

# The same vectorizer object is refitted for every label
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)

for label in labels_columns:
    # Posts marked positive (1) for this label, lower-cased
    positive_on_label_posts = df.loc[df[label] == 1, 'post_body'].str.lower()

    tfidf_matrix = vectorizer.fit_transform(positive_on_label_posts)
    terms = vectorizer.get_feature_names_out()

    # Running sum of scores and number of posts containing each term
    term_scores = defaultdict(float)
    term_counts = defaultdict(int)

    for row in tfidf_matrix.toarray():
        # Visit only the terms that occur in this post, in ascending index order
        for term_idx in np.flatnonzero(row > 0):
            term = terms[term_idx]
            term_scores[term] += row[term_idx]
            term_counts[term] += 1

    # Mean over the posts that contain the term, rounded to 3 decimals;
    # terms missing from the English word list are dropped
    mean_term_scores = {
        term: round(total / term_counts[term], 3)
        for term, total in term_scores.items()
        if term in english_words
    }

    term_score_df = pd.DataFrame(list(mean_term_scores.items()), columns=['Term', 'Mean TF-IDF Score'])
    terms_score_dict[label] = term_score_df.sort_values(by='Mean TF-IDF Score', ascending=False)


# Show the ten best-scoring terms of every label
for label, ranked_terms in terms_score_dict.items():
    print(f"Top 10 terms for {label}:")
    print(ranked_terms[['Term', 'Mean TF-IDF Score']].head(10))
    print()

In [None]:
# Print the 10 vocabulary terms with the highest IDF weight in `vectorizer`.
# NOTE(review): `vectorizer` was last fitted inside the per-label loop of the previous
# cell (on the positive posts of the last label only), so these IDF values describe
# that subset rather than the whole corpus - confirm this is intended.
feature_names = vectorizer.get_feature_names_out()
idf_scores = vectorizer.idf_

idf_df = pd.DataFrame({'term': feature_names, 'score': idf_scores})

print(idf_df.sort_values(by='score', ascending=False).head(10))

# **Part 5 - LIME analysis**

In [None]:
# Unpack the saved model archive into /content/
with zipfile.ZipFile("distilbert_model.zip", "r") as zip_ref:
    zip_ref.extractall("/content/")


**0. Hyper-parameters and models**

Here you need to select the model you want to use. Modify this cell before using the code.

In [None]:
# Select the compute device: CUDA when available, otherwise CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Insert the path to the xlsx file and the name of the relevant sheet
# NOTE(review): the data-loading cell that follows reads 'Labeled.xlsx' / 'Test'
# directly and does not reference these two variables - confirm which source is intended.
file_path = '../content/Abusive Relationship Stories.xlsx'
sheet_name = 'Abusive Relationship Stories'

### If using RoBERTa (disabled) ###
##tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
##model = RobertaForSequenceClassification.from_pretrained('model', output_attentions=True, num_labels=1)

### If using DistilBERT ###
# Tokenizer comes from the pretrained hub checkpoint; the classifier weights are loaded
# from the local 'distilbert_model' directory, with a single output label and
# attention outputs enabled
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert_model',  output_attentions=True, num_labels=1)



**1. Load Dataset and Preprocessing**

We will be focusing on classifying the posts to positive and negative physical violence.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Load the labelled sheet and keep only the columns used below.
# .copy() makes the selection an independent frame, so the column assignment
# that follows does not write into a slice (avoids SettingWithCopyWarning).
df = pd.read_excel('Labeled.xlsx', sheet_name='Test', engine='openpyxl')
df = df[['title', 'body', 'physical_violence']].copy()

# Collapse the annotation values into a binary target (1 = physical violence).
# Any value that is not listed here maps to NaN and is dropped further down.
df['physical_violence'] = df['physical_violence'].map({
    'no': 0,
    'cannot be inferred': 0,
    'irrelevant': 0,
    'yes': 1,
    'plausibly': 1
})

# Rename the target column to 'label'
df = df.rename(columns={'physical_violence': 'label'})

# Remove rows with missing values, then fix the dtypes in one step:
# map() yields floats whenever NaN is present, so the label is cast back to
# int once the NaN rows are gone; title and body are forced to str.
df = df.dropna().astype({'title': str, 'body': str, 'label': int})

# Plot the label distribution
sns.countplot(x='label', data=df)
plt.show()


**2. Tokenization and Length Distribution Analysis**


In [None]:
# Token count of every post body; truncation is disabled so that over-long
# posts report their full length
token_lengths = list(map(lambda text: len(tokenizer.encode(text, truncation=False)), df['body']))

# How many posts exceed 512 tokens (the limit of the tokenizer), in absolute
# numbers and as a percentage of all posts
long_texts_count = len([length for length in token_lengths if length > 512])
long_texts_percentage = round(100 * long_texts_count / len(df['body']), 3)

print(f'Number of texts longer than 512 tokens: {long_texts_count}, which are {long_texts_percentage}% of the data.')

# Histogram of the token lengths
plt.figure(figsize=(10, 6))
plt.hist(token_lengths, bins=50, color='green', alpha=0.7, label='Post body')
plt.title('Text Token Length Distribution (Before Summarization)')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.grid(True)
plt.legend(loc='upper right')
plt.show()


**3. Filter Texts by Token Length**


In [None]:
# Exclusive upper bound on the untruncated token count of a post body.
# Posts below the bound are kept; bodies that are still longer than the
# tokenizer limit are truncated later by the calls that pass truncation=True.
max_filter_tokens = 1000

# Tokenize every body once and reuse the lengths for the filter and the plot
body_token_lengths = np.array(
    [len(tokenizer.encode(text, truncation=False)) for text in df['body']],
    dtype=int,
)
keep_mask = body_token_lengths < max_filter_tokens

# Keep only the rows whose body is shorter than the bound
filtered_df = df[keep_mask]

# Show the number of rows in the new filtered dataset
print(f'Number of texts with less than {max_filter_tokens} tokens: {len(filtered_df)}')

# Token lengths of the rows that survived the filter
filtered_token_lengths = body_token_lengths[keep_mask].tolist()

# Plot token length distribution for the filtered dataset
plt.figure(figsize=(10, 6))
plt.hist(filtered_token_lengths, bins=50, alpha=0.7, label='Post body (filtered)', color='green')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.title(f'Text Token Length Distribution (Filtered, <{max_filter_tokens} tokens)')
plt.legend(loc='upper right')
plt.grid(True)
plt.show()

# Check the effect of the data filtering on the label distribution
sns.countplot(x='label', data=filtered_df)
plt.show()


As we can see, dropping the long posts (1000 tokens or more) helps a bit to balance the labels.


**4. Hyperparameters and Data Split**
* We will use the train set solely for training the model.
* We will use the test set both as a validation set during training and to analyze the model with LIME.

**Note:** The GPU we used had low vRAM, so we limited the samples in the test set to at most 150 tokens via `test_max_length` (the cell below currently sets it to 512); the test set is used later during LIME. If you have a bigger GPU, don't run this cell. Instead, activate the next cell for a regular train-test split.

In [None]:
# Define hyperparameters
hyperparameters = {
    'batch_size': 16,
    'epochs': 2,
    'tokenizer_max_length': 512,  # Max length for training samples
    'test_max_length': 512,       # Max length for test samples
    'class_1_weight': 3.0,
    'learning_rate': 1e-5,
    'test_size': 0.2
}

# Set the random seed for reproducibility
random.seed(42)

# Filter the dataset to separate samples that fit the test max length
filtered_texts = filtered_df['body'].tolist()
filtered_labels = filtered_df['label'].tolist()

short_texts, short_labels = [], []
long_texts, long_labels = [], []

# NOTE: the length is measured after truncation to 'tokenizer_max_length', so
# a sample can only land in the "long" bucket when 'test_max_length' is smaller
# than 'tokenizer_max_length'. With the two values equal (as configured above)
# every sample counts as "short".
for text, label in zip(filtered_texts, filtered_labels):
    token_ids = tokenizer(
        text,
        truncation=True,
        max_length=hyperparameters['tokenizer_max_length'],
    )['input_ids']
    if len(token_ids) <= hyperparameters['test_max_length']:
        short_texts.append(text)
        short_labels.append(label)
    else:
        long_texts.append(text)
        long_labels.append(label)

# Determine the number of samples needed for the test set (20% of the data),
# capped by the number of short samples that are actually available
test_size = int(hyperparameters['test_size'] * len(filtered_texts))
actual_test_size = min(test_size, len(short_texts))

# Randomly sample from short texts for the test set
test_indices = random.sample(range(len(short_texts)), actual_test_size)
test_texts = [short_texts[i] for i in test_indices]
test_labels = [short_labels[i] for i in test_indices]

# Use remaining samples for training. Membership is tested against a set:
# looking each index up in the list would make this step quadratic.
test_index_set = set(test_indices)
train_texts = long_texts + [text for i, text in enumerate(short_texts) if i not in test_index_set]
train_labels = long_labels + [label for i, label in enumerate(short_labels) if i not in test_index_set]

# Display statistics
print(f'train_size: {len(train_texts)}')
print(f'train 0 label count: {train_labels.count(0)}')
print(f'train 1 label count: {train_labels.count(1)}')
print()
print(f'test_size: {len(test_texts)}')
print(f'test 0 label count: {test_labels.count(0)}')
print(f'test 1 label count: {test_labels.count(1)}')
print()

**Activate this cell to run a regular train-test split**


In [None]:
# Disabled cell: alternative "regular" train/test split. The code is kept in a
# string literal so it does not run; remove the surrounding triple quotes to
# activate it. (The literal previously ended with four quotes, which left an
# unterminated string and made the cell a SyntaxError.)
"""
from sklearn.model_selection import train_test_split
# Define hyperparameters
hyperparameters = {
    'batch_size': 16,
    'epochs': 2,
    'tokenizer_max_length': 512,
    'class_1_weight': 3.0,
    'learning_rate': 1e-5
}

# Split the filtered data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    filtered_df['body'].tolist(),
    filtered_df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

print(f'train_size: {len(train_texts)}')
print(f'train 0 label count: {train_labels.count(0)}')
print(f'train 1 label count: {train_labels.count(1)}')
print()

print(f'test_size: {len(test_texts)}')
print(f'test 0 label count: {test_labels.count(0)}')
print(f'test 1 label count: {test_labels.count(1)}')
print()
"""

**5. Tokenization and DataLoader Setup**


In [None]:
# Tokenization function
def tokenize_data(texts, labels, tokenizer, max_length=None):
    """Tokenize texts and bundle them with their labels into a TensorDataset.

    Args:
        texts: List of strings to encode.
        labels: One numeric label per text.
        tokenizer: Callable that is invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``
            and returns a mapping with 'input_ids' and 'attention_mask'.
        max_length: Maximum sequence length passed to the tokenizer. When
            None, ``hyperparameters['tokenizer_max_length']`` is used, which
            is what the function always did before this parameter existed.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, label)`` per
        sample; labels are float32 with a trailing dimension of size 1.
    """
    if max_length is None:
        max_length = hyperparameters['tokenizer_max_length']

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    input_ids = torch.tensor(encodings['input_ids'])
    attention_masks = torch.tensor(encodings['attention_mask'])
    # unsqueeze(1) turns the labels into a column so they line up with the
    # one-logit-per-sample output used with BCEWithLogitsLoss during training
    label_tensor = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)
    return TensorDataset(input_ids, attention_masks, label_tensor)

# Build the tensor datasets for the train and test splits (in that order)
train_dataset, test_dataset = (
    tokenize_data(texts, labels, tokenizer)
    for texts, labels in ((train_texts, train_labels), (test_texts, test_labels))
)

# Wrap both datasets in DataLoaders; only the training batches are shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=hyperparameters['batch_size'])
test_loader = DataLoader(test_dataset, batch_size=hyperparameters['batch_size'])

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


**6. Model Training and Evaluations, and saving**

(Activating the training is not in this cell, but in the next one)


In [None]:
# Training function
def train_model(model, train_loader, test_loader, device, epochs=hyperparameters['epochs']):
    """Fine-tune the model and evaluate it on the test loader after every epoch.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        train_loader: DataLoader yielding (input_ids, attention_mask, labels).
        test_loader: DataLoader used for the per-epoch evaluation.
        device: Device the model and the batches are moved to for training.
        epochs: Number of passes over ``train_loader``.

    The loss is BCEWithLogitsLoss with the positive class weighted by
    ``hyperparameters['class_1_weight']``; the optimizer is AdamW with
    ``hyperparameters['learning_rate']``. The model is moved back to the CPU
    when training finishes. Nothing is returned; progress is printed.
    """
    model.to(device)

    # Define loss function and optimizer
    pos_weight = torch.tensor([hyperparameters['class_1_weight']]).to(device)
    loss_fn = BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = AdamW(model.parameters(), lr=hyperparameters['learning_rate'])

    for epoch in range(epochs):
        # evaluate_model() puts the model in eval mode at the end of every
        # epoch, so training mode has to be re-enabled here. Setting it only
        # once before the loop would run every epoch after the first with
        # dropout disabled.
        model.train()

        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)

        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            progress_bar.set_postfix({'Batch Loss': f'{loss.item():.4f}'})

        avg_train_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}')

        # Evaluate on test set after each epoch
        evaluate_model(model, test_loader, loss_fn)

    # Move model back to CPU after training
    model.to('cpu')


# Model evaluation function
def evaluate_model(model, val_loader, loss_fn):
    """Evaluate the model on a loader and print loss and classification metrics.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        val_loader: DataLoader yielding (input_ids, attention_mask, labels).
        loss_fn: Loss applied to ``(logits, labels)`` for every batch.

    Predictions are the sigmoid of the logits rounded to 0/1. The function
    prints the mean batch loss, accuracy, precision, recall and F1 and returns
    nothing. The model is left in eval mode.
    """
    model.eval()

    # Use the device the model actually lives on instead of the notebook-level
    # `device`, so the batches always end up next to the weights (e.g. when
    # train_model() was called with a different device).
    model_device = next(model.parameters()).device

    val_loss = 0
    correct = 0
    total = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to(model_device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            val_loss += loss.item()

            # Apply sigmoid activation and threshold to get binary predictions
            preds = torch.round(torch.sigmoid(logits))

            # Flatten so the metric functions receive plain 1-D sequences
            all_labels.extend(labels.cpu().numpy().ravel())
            all_preds.extend(preds.cpu().numpy().ravel())

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct / total

    # Calculate Precision, Recall, F1
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')


#### Activate the two cells below to activate the model training and save it


In [None]:
# train_model(model, train_loader, test_loader, device)
# torch.cuda.empty_cache()

In [None]:
# Disabled cell: saves the model and tokenizer. The code is kept in a string
# literal so it does not run; remove the surrounding triple quotes to activate
# it. The lines inside use a uniform indentation so the cell runs as-is once
# the quotes are removed.
"""
import os

# Save model and tokenizer
output_dir = 'distilbert_model'
os.makedirs(output_dir, exist_ok=True)

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")
"""

**7. Preparing the test set dataframe for LIME analysis**

Splitting the test_set into eight groups, based on the accuracy and confidence:
* True Positive (TP) with High Confidence.
* True Positive (TP) with Low Confidence.
* True Negative (TN) with High Confidence.
* True Negative (TN) with Low Confidence.
* False Positive (FP) with High Confidence.
* False Positive (FP) with Low Confidence.
* False Negative (FN) with High Confidence.
* False Negative (FN) with Low Confidence.

In [None]:
# Prediction function (using probabilities for confidence)
def predict_with_confidence(model, dataloader):
    """Run the model over a dataloader and collect predictions with probabilities.

    The model is moved to the notebook-level ``device`` for inference and back
    to the CPU afterwards.

    Returns:
        Three 1-D numpy arrays of equal length: the rounded 0/1 predictions,
        the sigmoid probabilities for class 1, and the true labels.
    """
    model.to(device)
    model.eval()

    pred_values = []
    class_1_probs = []
    label_values = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", leave=False):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids, attention_mask=attention_mask).logits

            # One sigmoid probability per sample, flattened to 1-D
            batch_probs = torch.sigmoid(logits).cpu().numpy().flatten()

            pred_values.extend(np.round(batch_probs))
            class_1_probs.extend(batch_probs)
            label_values.extend(labels.cpu().numpy().flatten())

    # Release the GPU once inference is done
    model.to('cpu')
    torch.cuda.empty_cache()

    return np.array(pred_values), np.array(class_1_probs), np.array(label_values)

# Confidence thresholds, both expressed on the class-1 probability scale
high_conf_thresh = 0.8  # p >= 0.8: high confidence for class 1
low_conf_thresh = 0.2   # p <= 0.2: high confidence for class 0

# Make predictions on the test set and get confidence scores
predictions, confidences_class_1, true_labels = predict_with_confidence(model, test_loader)
torch.cuda.empty_cache()

# Create a DataFrame to store predictions, true labels, confidence values, and confidence scores
df_test = pd.DataFrame({
    'body': test_texts,       # Original text of every test sample
    'y_true': true_labels,    # True labels
    'y_pred': predictions,    # Model predictions
    'confidence_class_1': confidences_class_1, # Probability for class 1 (sigmoid output)
    'confidence_in_predicted_class': np.where(predictions == 1, confidences_class_1, 1 - confidences_class_1)  # Confidence in the predicted class
})

# Categorize into True Positive, True Negative, False Positive, False Negative
is_true_positive = (df_test['y_true'] == 1) & (df_test['y_pred'] == 1)
is_true_negative = (df_test['y_true'] == 0) & (df_test['y_pred'] == 0)
is_false_positive = (df_test['y_true'] == 0) & (df_test['y_pred'] == 1)
df_test['group'] = np.select(
    [is_true_positive, is_true_negative, is_false_positive],
    ['True Positive', 'True Negative', 'False Positive'],
    default='False Negative',
)

# Define high and low confidence.
# The thresholds are applied to the class-1 probability. Applying the lower
# threshold to 'confidence_in_predicted_class' can never match, because that
# column is max(p, 1 - p) and therefore never drops below 0.5.
df_test['confidence_level'] = np.where(
    (df_test['confidence_class_1'] >= high_conf_thresh) |
    (df_test['confidence_class_1'] <= low_conf_thresh),
    'High Confidence', 'Low Confidence')

# Combine group (TP, TN, FP, FN) and confidence level
df_test['final_group'] = df_test['group'] + ' - ' + df_test['confidence_level']

# View distribution of the final groups
group_counts = df_test['final_group'].value_counts()
print(group_counts)

# Plot the distribution of the final groups as a bar chart
plt.figure(figsize=(10, 6))
group_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Model Predictions by Accuracy and Confidence')
plt.xlabel('Group')
plt.ylabel('Number of Predictions')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

Stacked bar

In [None]:


# Count occurrences of each group and confidence level.
# reindex() guarantees that both confidence columns exist, in a fixed order,
# even when one level is absent from the data, so the colors and the custom
# legend labels below always refer to the right column.
confidence_levels = ['High Confidence', 'Low Confidence']
group_confidence_counts = (
    df_test.groupby(['group', 'confidence_level'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=confidence_levels, fill_value=0)
)

# Sort groups by total count (sum of high and low confidence)
group_confidence_counts = group_confidence_counts.loc[group_confidence_counts.sum(axis=1).sort_values(ascending=False).index]

# Plot the stacked bar chart. DataFrame.plot opens a figure of its own unless
# it is given an Axes, so the Axes is created first and passed in; otherwise
# the requested figure size is ignored and an empty extra figure is shown.
fig, ax = plt.subplots(figsize=(10, 6))
group_confidence_counts.plot(kind='bar', stacked=True, color=['skyblue', 'salmon'], ax=ax)

# Add a custom legend with confidence ranges (on the class-1 probability scale)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ['High Confidence [0,0.2]∪[0.8,1]', 'Low Confidence (0.2,0.8)'], title='Confidence Level', loc='upper right')

# Add annotations for each segment
for container in ax.containers:
    ax.bar_label(container, label_type='center', fontsize=10)

# Customize plot appearance
plt.title('Distribution of Model Predictions by Accuracy and Confidence')
plt.xlabel('Group')
plt.ylabel('Number of Predictions')
plt.xticks(rotation=45)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


### Column Descriptions for df_test

- **body**: The text content of each sample.
- **y_true**: The actual label (ground truth) for each sample.
- **y_pred**: The predicted label from the model for each sample.
- **confidence_class_1**: Model’s confidence score for classifying the sample as Class 1 (abusive).
- **confidence_in_predicted_class**: Model's confidence score for the predicted label.
- **group**: Category based on the correctness of the prediction (TP, TN, FP, FN).
- **confidence_level**: Confidence level categorization (e.g., Low, High).
- **final_group**: Combined 'group' and 'confidence_level.'

**8. LIME Analysis**
### Watch this video to understand LIME:
* https://www.youtube.com/watch?v=qQvC6FWlc-E

*Note:* From now on, we will conduct various analyses on the model to understand how it makes predictions. This will enable us to improve it with data that reflects our insights.


**8.1. Simple LIME examples on dummy samples**

In [None]:
# Initialize the LIME explainer with fixed class names.
# The order of the names matches the columns returned by predict_fn below:
# column 0 is 1 - p ('Non-Abusive'), column 1 is p ('Abusive').
explainer = LimeTextExplainer(class_names=['Non-Abusive', 'Abusive'])

# Efficient Prediction Function for LIME
def predict_fn(text, model, tokenizer, max_tokens_length):
    """Return class probabilities for the given text(s) in the layout LIME expects.

    Args:
        text: A string or a list of strings to classify.
        model: Classifier producing one logit per sample via ``.logits``.
        tokenizer: Tokenizer used to encode the text.
        max_tokens_length: Inputs are truncated to this many tokens.

    Returns:
        numpy array with two columns per sample: ``1 - p`` and ``p``, where
        ``p`` is the sigmoid of the model's logit.
    """
    batch = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=max_tokens_length,
        return_tensors="pt",
    )
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    with torch.no_grad():
        prob = torch.sigmoid(
            model(input_ids, attention_mask=attention_mask).logits
        ).cpu().numpy()

    # Drop the device tensors and free cached GPU memory before returning
    del input_ids, attention_mask
    torch.cuda.empty_cache()
    gc.collect()

    return np.hstack((1 - prob, prob))

# --- Per-Sample LIME Analysis (Optimized for GPU Memory) ---
def lime_analysis_per_sample(model, tokenizer, categorized_texts, analysis_params):
    """Explain every sample with LIME and display the explanation per category.

    Args:
        model: Classifier passed on to ``predict_fn``.
        tokenizer: Tokenizer passed on to ``predict_fn``.
        categorized_texts: Mapping of category name to a list of texts.
        analysis_params: Dict providing 'max_tokens_length', 'top_k_features'
            and 'num_of_perturbations_in_lime'.

    The model is moved to the notebook-level ``device`` for the analysis and
    is always moved back to the CPU afterwards, also when an explanation
    raises (for example on a CUDA out-of-memory error).
    """
    def classifier_fn(batch_texts):
        # Adapter giving LIME the single-argument callable it expects
        return predict_fn(batch_texts, model, tokenizer, analysis_params['max_tokens_length'])

    model.to(device)
    try:
        for category, texts in categorized_texts.items():
            # Display header for the category with bold effect
            print("\n" + "="*60)
            print(f"\033[1m--- LIME Analysis for '{category}' ---\033[0m")
            print("="*60)

            for text in texts:
                # Perform LIME explanation for each sample
                explanation = explainer.explain_instance(
                    text,
                    classifier_fn,
                    num_features=analysis_params['top_k_features'],
                    num_samples=analysis_params['num_of_perturbations_in_lime']
                )

                # Display explanation
                explanation.show_in_notebook()  # Viewable in notebooks, or use explanation.as_pyplot_figure() in scripts

                # Clear GPU memory after each sample
                torch.cuda.empty_cache()
                gc.collect()
    finally:
        # Move model back to CPU after processing, even if it failed part-way
        model.to('cpu')
        torch.cuda.empty_cache()
        gc.collect()


# Dummy sample categorized texts for LIME analysis.
# Hand-written sentences grouped by the kind of language they contain; the
# keys are used as section headers when the explanations are displayed.
categorized_texts = {
    "Direct Violence": ["He hit him hard.", "She punched the wall.", "They attacked the stranger."],
    "Indirect Violence": ["They were shouting loudly.", "He threw the chair across the room.", "The situation was getting tense."],
    "Non-Violent Conflict": ["They argued for hours.", "He left without saying a word.", "She was upset, but kept calm."],
    "Neutral": ["They went for a walk.", "He smiled and waved.", "She worked at her desk."],
    "Ambiguous": ["He pushed him gently.", "They were playing rough.", "She slapped him jokingly."],
    "Sarcasm/Metaphor": ["She killed it at the presentation.", "He was beating the competition."],
    "Threats": ["I’ll hurt you if you don’t listen.", "She threatened to hit him."]
}

In [None]:
# Settings shared by the LIME analyses below
analysis_params = dict(
    max_samples_per_category=20,       # upper bound on samples analysed per category
    top_k_features=10,                 # number of features LIME reports per sample
    max_tokens_length=200,             # tokenizer truncation length used during LIME
    num_of_perturbations_in_lime=300,  # perturbed samples LIME generates per explanation
)

# Explain the dummy sentences defined above
lime_analysis_per_sample(model, tokenizer, categorized_texts, analysis_params)


**8.2. Low-Confidence Analysis: Identifying Confusing Cases**

This analysis focuses on finding cases where the model is most uncertain (confidence level near 0.5).
* We will identify the most confusing samples within each category: True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN).


* This is a per-sample analysis, aiming to understand the characteristics of samples that the model finds confusing.

In [None]:
# Prepare categorized texts for LIME analysis, selecting only groups that match the condition if specified
def select_extreme_samples(df, analysis_params, condition=None):
    """Pick, per final group, the texts whose confidence is closest to 0.5.

    Args:
        df: DataFrame with 'final_group', 'confidence_in_predicted_class' and
            'body' columns. It is not modified.
        analysis_params: Dict providing 'max_samples_per_category'.
        condition: Optional substring; when given, only groups whose name
            contains it are processed.

    Returns:
        Dict mapping each selected group name to a list of at most
        'max_samples_per_category' texts, ordered by increasing distance of
        their confidence from 0.5.
    """
    categorized_texts = {}

    for group in df['final_group'].unique():
        # Skip groups that do not match the requested condition
        if condition is not None and condition not in group:
            continue

        members = df[df['final_group'] == group]
        distance = (members['confidence_in_predicted_class'] - 0.5).abs()
        ranked = members.assign(**{'distance_from_0.5': distance}).sort_values(by='distance_from_0.5')

        # Slicing past the end is safe, so no explicit length check is needed
        categorized_texts[group] = ranked['body'].tolist()[:analysis_params['max_samples_per_category']]

    return categorized_texts


# Run the low-confidence analysis: keep only the "Low Confidence" groups and
# explain their most uncertain samples one by one
categorized_texts = select_extreme_samples(
    df=df_test,
    analysis_params=analysis_params,
    condition="Low Confidence",
)
lime_analysis_per_sample(
    model=model,
    tokenizer=tokenizer,
    categorized_texts=categorized_texts,
    analysis_params=analysis_params,
)

**8.3. Strong Tokens Analysis: Key Influential Features by Group**

**Objective:** Identify the most impactful tokens that the model relies on in high-confidence and low-confidence cases across all groups (True Positives, False Positives, True Negatives, and False Negatives).

**Purpose:** This analysis helps us uncover patterns in the model’s decision-making by examining words or phrases that strongly influence predictions. Understanding these patterns highlights areas for model improvement.




In [None]:
# LIME Analysis Function with model argument in predict_fn
def lime_strong_tokens_analysis(model, tokenizer, categorized_texts, analysis_params,
                                num_features=8, num_samples=200, top_n=5):
    """Aggregate LIME feature importances per category and keep the strongest tokens.

    Args:
        model: Classifier passed on to ``predict_fn``.
        tokenizer: Tokenizer passed on to ``predict_fn``.
        categorized_texts: Mapping of category name to a list of texts.
        analysis_params: Dict providing 'max_tokens_length' and, optionally,
            'top_k_features' and 'num_of_perturbations_in_lime'.
        num_features: Fallback for 'top_k_features' when the key is absent.
        num_samples: Fallback for 'num_of_perturbations_in_lime' when the key
            is absent.
        top_n: Number of aggregated tokens kept per category.

    Returns:
        Dict mapping each category to a list of ``(word, mean, std)`` tuples
        for the ``top_n`` words with the largest summed absolute importance.

    For "False Positive - Low Confidence" only positive importances are
    aggregated, for "False Negative - Low Confidence" only negative ones; all
    other categories keep every importance. The model is moved to the
    notebook-level ``device`` for the analysis and always moved back to the
    CPU afterwards, also when an explanation raises.
    """
    # Values in analysis_params take precedence; the keyword arguments act as
    # fallbacks, so existing callers that pass a full dict see no change.
    top_k_features = analysis_params.get('top_k_features', num_features)
    n_perturbations = analysis_params.get('num_of_perturbations_in_lime', num_samples)
    max_tokens_length = analysis_params['max_tokens_length']

    # Sign of the importances that is of interest for specific categories
    direction_filters = {
        "False Positive - Low Confidence": lambda importance: importance > 0,
        "False Negative - Low Confidence": lambda importance: importance < 0,
    }

    def keep_all(importance):
        # Default to include all if unspecified
        return True

    def classifier_fn(batch_texts):
        # Adapter giving LIME the single-argument callable it expects
        return predict_fn(batch_texts, model, tokenizer, max_tokens_length=max_tokens_length)

    aggregated_results = {}
    model.to(device)

    try:
        # Process each category in categorized_texts
        for category, texts in tqdm(categorized_texts.items(), desc="Processing Groups"):
            feature_importance = Counter()
            all_importances = defaultdict(list)
            relevant_direction = direction_filters.get(category, keep_all)

            for text in tqdm(texts, desc=f"Analyzing {category}", leave=False, dynamic_ncols=True):
                explanation = explainer.explain_instance(
                    text,
                    classifier_fn,
                    num_features=top_k_features,
                    num_samples=n_perturbations,
                )

                # Collect and aggregate relevant feature importances
                for word, importance in explanation.as_list():
                    if relevant_direction(importance):
                        feature_importance[word] += abs(importance)
                        all_importances[word].append(importance)

                # Clear GPU memory after each text
                torch.cuda.empty_cache()
                gc.collect()

            # Aggregate top features with mean and standard deviation
            aggregated_results[category] = [
                (word, np.mean(all_importances[word]), np.std(all_importances[word]))
                for word, _ in feature_importance.most_common(top_n)
            ]
    finally:
        # Reset model to CPU, even if the analysis failed part-way
        model.to('cpu')
        torch.cuda.empty_cache()
        gc.collect()

    return aggregated_results

# Function to plot LIME analysis results
def plot_lime_results(aggregated_lime_results):
    """Draw one horizontal bar chart per category from aggregated LIME results.

    Args:
        aggregated_lime_results: Dict mapping a category name to a list of
            ``(word, mean_importance, std_importance)`` tuples.

    An empty dict only prints a notice. A category without any features gets
    an annotated empty panel instead of raising while unpacking.
    """
    if not aggregated_lime_results:
        print("No LIME results to plot.")
        return

    n_groups = len(aggregated_lime_results)
    # squeeze=False always returns a 2-D array of axes, so a single category
    # needs no special handling
    fig, axes = plt.subplots(n_groups, 1, figsize=(10, n_groups * 5), squeeze=False)

    for idx, (ax, (group, features)) in enumerate(zip(axes[:, 0], aggregated_lime_results.items())):
        ax.set_title(f'LIME Analysis for "{group}" Category', fontsize=14)
        ax.set_xlabel('Average Importance of Feature (Across Sentences)', fontsize=12)

        if features:
            words, means, std_devs = zip(*features)
            ax.barh(words, means, xerr=std_devs, color='skyblue' if idx % 2 == 0 else 'salmon', alpha=0.7)
            # Strongest feature on top
            ax.invert_yaxis()
        else:
            ax.text(0.5, 0.5, 'No features to display', ha='center', va='center', transform=ax.transAxes)

        ax.grid(True)

    plt.tight_layout()
    plt.show()


# Select the samples closest to 0.5 confidence from every final group
categorized_body_texts = select_extreme_samples(df=df_test, analysis_params=analysis_params)

# Aggregate the LIME importances per group
aggregated_lime_results = lime_strong_tokens_analysis(
    model=model,
    tokenizer=tokenizer,
    categorized_texts=categorized_body_texts,
    analysis_params=analysis_params,
)

# Visualize the strongest tokens of every group
plot_lime_results(aggregated_lime_results=aggregated_lime_results)
