<a href="https://colab.research.google.com/github/casllmproject/bending_effect/blob/main/C2_1_SBERT_simul_text_similarity_Main_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at the path the later cells read their data from
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Block 1: Setup Data

In [None]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mannwhitneyu
from itertools import combinations
import warnings

# Define the file path
file_path = "/content/drive/MyDrive/CYON_Analysis_Materials/Main_Test/Final_Cleaned_Dec18.csv"

# Fixed introductory sentences that are stripped from every generatedBody
# value before the texts are compared.
prefix = (
    "The Trump administration announced the U.S. withdrawal from the Paris Agreement in January 2025. "
    "This decision comes with far-reaching implications."
)

# Read the CSV file into DataFrame df1.
# Only the read itself sits inside the try, so the "loading the file" messages
# are printed for load failures only; problems in the later steps are no
# longer mislabelled as load errors.
try:
    df1 = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please check the file path and try again.")
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
else:
    print("File loaded successfully.")
    print("First 5 rows of your data:")
    print(df1.head())
    print("\nDataFrame Info:")
    df1.info()

    # ------------------------------------------------------------------
    # Remove fixed introductory sentences from generatedBody
    # ------------------------------------------------------------------
    if "generatedBody" not in df1.columns:
        # Report the real cause instead of a generic load error.
        print("Error: Column 'generatedBody' not found; 'ed_generatedBody' was not created.")
        print(f"Available columns: {list(df1.columns)}")
    else:
        # Literal (non-regex) removal of the prefix, then trim the leading
        # whitespace left behind.
        df1["ed_generatedBody"] = (
            df1["generatedBody"]
            .str.replace(prefix, "", regex=False)
            .str.lstrip()
        )

        print("\nText cleaning completed. 'ed_generatedBody' column created.")

Block 2: Generate Embeddings and Similarity Matrix
This is the most computationally expensive step. We encode all texts into SBERT vectors and then compute a single, large $N \times N$ similarity matrix, where $N$ is the total number of texts. We will reuse this matrix for all subsequent analyses.

In [None]:
# Instantiate the pre-trained SBERT encoder used for every text
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Encoding texts... This may take a while.")

# --- IMPORTANT ---
# 'ed_generatedBody' is the cleaned text column created in Block 1.
try:
    texts = df1['ed_generatedBody'].to_list()
except KeyError:
    divider = "="*50
    print(
        divider,
        "ERROR: Could not find column 'ed_generatedBody'.",
        "Please check the column names printed in Block 1 and",
        "update the 'texts = df1['...']' line in Block 2 with your correct text column.",
        divider,
        sep="\n",
    )
    # Re-raise so the cell stops here instead of continuing without texts
    raise

# One embedding vector per text, in the same order as the rows of df1
embeddings = model.encode(texts, show_progress_bar=True)
print("Embeddings generated. Shape:", embeddings.shape)

# Pairwise cosine similarity between every pair of embeddings
print("Calculating full similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)
print("Similarity matrix shape:", similarity_matrix.shape)

Block 3: Define Reusable Analysis Function
This block defines the master function. You only need to run this block once to "teach" Python the function.

In [None]:
def perform_similarity_analysis(df, group_column_name, similarity_matrix):
    """
    Print a 3-part similarity analysis for one grouping column.

    Part 1 reports the mean pairwise similarity inside each group, Part 2 the
    mean similarity between every pair of groups, and Part 3 a one-sided
    Mann-Whitney U test per group (within-group scores 'greater' than the
    scores between that group and all other rows).

    Rows are mapped to the matrix through the DataFrame's index labels: a row
    labelled ``i`` is taken to be row/column ``i`` of ``similarity_matrix``.
    Filtered or re-sorted views of the frame the matrix was built from
    therefore work, as long as their original integer labels are kept.

    NOTE(review): pairwise scores that share a text are not independent
    observations, which the Mann-Whitney U test assumes; treat the p-values
    as descriptive.

    Args:
        df (pd.DataFrame): The dataframe containing the group labels. Rows
            with a null group value are ignored.
        group_column_name (str): The name of the column to group by.
        similarity_matrix (np.array): The pre-computed N x N similarity matrix.

    Returns:
        None. All results are printed. If the column is missing or the index
        cannot be used to address the matrix, an error is printed and the
        analysis is skipped.
    """

    print("\n" + "#"*60)
    print(f"  RUNNING ANALYSIS FOR: '{group_column_name}'")
    print("#"*60)

    # Ensure the column exists
    if group_column_name not in df.columns:
        print(f"ERROR: Column '{group_column_name}' not found in DataFrame.")
        print("Skipping this analysis.")
        return

    # Make sure group values are non-null
    df_analysis = df[[group_column_name]].copy()
    df_analysis = df_analysis.dropna(subset=[group_column_name])

    # The index labels are used as matrix positions below, so reject labels
    # that would raise an opaque IndexError (non-integer, out of range) or
    # silently select the wrong cells (negative, duplicated).
    sim = np.asarray(similarity_matrix)
    row_labels = df_analysis.index
    index_is_valid = (
        sim.ndim == 2
        and pd.api.types.is_integer_dtype(row_labels)
        and row_labels.is_unique
        and (
            row_labels.empty
            or (row_labels.min() >= 0 and row_labels.max() < min(sim.shape))
        )
    )
    if not index_is_valid:
        print("ERROR: DataFrame index labels must be unique integer row positions of the similarity matrix.")
        print("Skipping this analysis.")
        return

    group_values = df_analysis[group_column_name]
    groups = sorted(group_values.unique())
    print(f"Found {len(groups)} unique groups: {groups}")

    # Matrix positions of each group's rows, computed once and shared by all
    # three parts.
    indices_by_group = {
        group: group_values.index[group_values == group].tolist()
        for group in groups
    }

    all_within_scores = {}

    # --- Part 1: Within-Group Similarity ---
    print("\n" + "="*30)
    print("Part 1: Within-Group Similarity")
    print("="*30)

    for group in groups:
        group_indices = indices_by_group[group]

        if len(group_indices) < 2:
            print(f"Group '{group}' has fewer than 2 items, skipping.")
            continue

        sub_matrix = sim[np.ix_(group_indices, group_indices)]
        # Strict upper triangle: each unordered pair once, no self-pairs.
        within_scores = sub_matrix[np.triu_indices_from(sub_matrix, k=1)]

        all_within_scores[group] = within_scores
        mean_similarity = np.mean(within_scores)
        print(f"  - Group '{group}' (N={len(group_indices)}): Avg Similarity = {mean_similarity:.4f}")

    # --- Part 2: Between-Group Similarity ---
    print("\n" + "="*30)
    print("Part 2: Between-Group Similarity (Difference)")
    print("="*30)

    for group1, group2 in combinations(groups, 2):
        # Every group comes from the column's unique values, so both index
        # lists are non-empty.
        sub_matrix = sim[np.ix_(indices_by_group[group1], indices_by_group[group2])]
        mean_similarity = np.mean(sub_matrix)
        pair_name = f"{group1} <-> {group2}"
        print(f"  - Pair '{pair_name}': Avg Similarity = {mean_similarity:.4f}")

    # --- Part 3: Statistical Significance ---
    print("\n" + "="*30)
    print("Part 3: Statistical Significance")
    print("="*30)

    stat_results = []

    # Suppress warnings from Mann-Whitney U test (e.g., ties)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for group in groups:
            within_scores = all_within_scores.get(group)

            # Groups skipped in Part 1 (fewer than 2 items) have no entry.
            if within_scores is None:
                print(f"  - Group '{group}': Skipping (no within-group pairs).")
                continue

            other_indices = group_values.index[group_values != group].tolist()

            if not other_indices:
                print(f"  - Group '{group}': Skipping (no other groups to compare to).")
                continue

            # Similarities between this group's rows and every other row.
            between_scores = sim[np.ix_(indices_by_group[group], other_indices)].flatten()

            try:
                stat, p_value = mannwhitneyu(
                    within_scores,
                    between_scores,
                    alternative='greater'
                )
            except ValueError:
                print(f"  - Group '{group}': Could not run test (e.g., zero variance).")
                continue

            stat_results.append({
                'Group': group,
                'Mean_Within_Sim': np.mean(within_scores),
                'Mean_Between_Sim': np.mean(between_scores),
                'U_Statistic': stat,
                'p_value': p_value,
                'Significant (p<0.05)': p_value < 0.05
            })

    if stat_results:
        df_stats = pd.DataFrame(stat_results)
        df_stats = df_stats.set_index('Group')
        print("\n--- Summary of Statistical Significance ---")
        print(df_stats.to_string(float_format="%.6f"))
    else:
        print("\nNo statistical results to display.")

    print("\n" + "#"*60)
    print(f"  ANALYSIS FOR '{group_column_name}' COMPLETE")
    print("#"*60 + "\n")

print("Reusable analysis function 'perform_similarity_analysis' is defined.")

Block 4: Analysis - Run for "DISP"
This block calls the analysis function defined in Block 3 on the "DISP" column.


In [None]:
# Run the complete three-part analysis, grouping the texts by the 'DISP' column
perform_similarity_analysis(df1, 'DISP', similarity_matrix)

In [None]:
import pandas as pd
import numpy as np
from plotnine import *

# 1. Summary values entered by hand (one row per DISP group)
data = {
    'Group': ['DISP 0', 'DISP 1', 'DISP 2', 'DISP 3'],
    'Mean_Within_Sim': [0.783628, 0.850774, 0.827533, 0.825771],
    'Mean_Between_Sim': [0.753403, 0.777711, 0.766096, 0.703685],
    'p_value': [0.000000, 0.000000, 0.000000, 0.000000]
}
df = pd.DataFrame(data)

# 2. Long format: one row per (group, metric) so the bars can be dodged
metric_labels = {
    'Mean_Within_Sim': 'Within-Group',
    'Mean_Between_Sim': 'Between-Group'
}
df_melted = pd.melt(
    df,
    id_vars=['Group', 'p_value'],
    value_vars=list(metric_labels),
    var_name='Metric',
    value_name='Similarity'
)
df_melted['Metric'] = df_melted['Metric'].map(metric_labels)

# 3. Significance annotations: a fixed '***' label per group, positioned
#    0.05 above the taller of that group's two bars
tallest_bar = df[['Mean_Within_Sim', 'Mean_Between_Sim']].max(axis=1)
df_stars = df.assign(label='***', y_pos=tallest_bar + 0.05)

# 4. Assemble the plot layer by layer
bar_layer = geom_col(position='dodge', width=0.7, color="#333333", size=0.2)
star_layer = geom_text(
    data=df_stars,
    mapping=aes(x='Group', y='y_pos', label='label'),
    inherit_aes=False,
    size=18,
    va='bottom'
)
plot_labels = labs(
    title='Comparative Analysis of Group Similarity Metrics',
    subtitle='Significance: *** p < 0.001 (Mann-Whitney U Test)',
    x='Experimental Groups',
    y='Mean Cosine Similarity',
    fill='Similarity Type'
)
theme_overrides = theme(
    figure_size=(10, 6),
    title=element_text(size=14, weight='bold'),
    legend_position='right',
    axis_line=element_line(size=1, color="black"),
    panel_grid_major_y=element_line(color="lightgrey", linetype="dashed")
)

plot = (
    ggplot(df_melted, aes(x='Group', y='Similarity', fill='Metric'))
    + bar_layer
    + star_layer
    + scale_fill_manual(values=["#BDC3C7", "#2C3E50"])  # light grey / dark navy
    + plot_labels
    + scale_y_continuous(expand=(0, 0, 0.1, 0), limits=(0, 1.0))
    + theme_classic()
    + theme_overrides
)

# Draw the figure; the plot above is only assigned, so this final expression
# is what produces the cell's output
plot.draw()