https://gemini.google.com/share/3e823c0c7f9e

In [None]:
# Mount Google Drive at /content/drive so the CSV read later in this notebook
# (stored under /content/drive/MyDrive/...) is reachable from this runtime.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime; running this notebook elsewhere needs a different way to locate the data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Block 1: Setup Data

In [None]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import mannwhitneyu
from itertools import combinations
import warnings

# Define the file path
file_path = "/content/drive/MyDrive/CYON_Analysis_Materials/integrated_simul_generation_Oct18_PROCESSED.csv"

# Read the CSV file into DataFrame df1.
# On failure we still print a friendly hint, but then re-raise: every later
# cell needs `df1`, so swallowing the error here would only postpone the
# failure to a confusing NameError further down the notebook.
try:
    df1 = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    print("Please check the file path and try again.")
    raise
except Exception as e:
    print(f"An error occurred while loading the file: {e}")
    raise
else:
    # Only reached when the load succeeded, so a problem while displaying the
    # preview is never misreported as a loading error.
    print("File loaded successfully.")
    print("First 5 rows of your data:")
    print(df1.head())
    print("\nDataFrame Info:")
    df1.info()

File loaded successfully.
First 5 rows of your data:
        StartDate         EndDate      Status      IPAddress  Progress  \
0  10/16/25 16:42  10/16/25 16:47  IP Address  75.144.84.222       100   
1  10/16/25 16:47  10/16/25 16:50  IP Address  75.144.84.222       100   
2  10/16/25 16:50  10/16/25 16:53  IP Address  75.144.84.222       100   
3  10/16/25 16:53  10/16/25 16:56  IP Address  75.144.84.222       100   
4  10/16/25 16:56  10/16/25 16:59  IP Address  75.144.84.222       100   

   Duration (in seconds)  Finished    RecordedDate         ResponseId  \
0                    273      True  10/16/25 16:47  R_3ezdUQSu2SW0qnf   
1                    166      True  10/16/25 16:50  R_6UX1jrW01JRDOkV   
2                    195      True  10/16/25 16:53  R_7rkPzvOQWkdwVWB   
3                    175      True  10/16/25 16:56  R_1wLWPzhwoSgp6uG   
4                    152      True  10/16/25 16:59  R_6EfS0bSyhChkDkd   

   RecipientLastName  ...                                      

Block 2: Generate Embeddings and Similarity Matrix
This is the most computationally expensive step. We encode all texts into SBERT vectors and then compute a single, large $N \times N$ similarity matrix, where $N$ is the total number of texts. We will reuse this matrix for all subsequent analyses.

In [None]:
# Load a pre-trained SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

print("Encoding texts... This may take a while.")

# --- IMPORTANT ---
# Make sure 'ed_generatedBody' is the correct column name from Block 1's output!
try:
    texts = df1['ed_generatedBody'].tolist()
except KeyError:
    print("="*50)
    print("ERROR: Could not find column 'ed_generatedBody'.")
    print("Please check the column names printed in Block 1 and")
    print("update the 'texts = df1['...']' line in Block 2 with your correct text column.")
    print("="*50)
    # Stop execution by raising the error again
    raise

# Fail fast on missing / non-text cells (e.g. NaN from an empty CSV field).
# Such values are not valid sentences for the encoder, and the resulting
# tokenizer error does not say which rows are at fault. We deliberately do NOT
# drop rows here: the similarity matrix must stay row-aligned with df1, because
# the analysis function looks rows up by their position in df1.
invalid_positions = [pos for pos, text in enumerate(texts) if not isinstance(text, str)]
if invalid_positions:
    raise ValueError(
        f"Column 'ed_generatedBody' contains {len(invalid_positions)} missing or "
        f"non-text value(s); first row positions: {invalid_positions[:5]}. "
        "Clean or fill these rows in df1 before encoding."
    )

embeddings = model.encode(texts, show_progress_bar=True)
print("Embeddings generated. Shape:", embeddings.shape)

# Calculate the pairwise cosine similarity matrix for all texts
# (row/column i corresponds to row position i of df1).
print("Calculating full similarity matrix...")
similarity_matrix = cosine_similarity(embeddings)
print("Similarity matrix shape:", similarity_matrix.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Encoding texts... This may take a while.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings generated. Shape: (128, 384)
Calculating full similarity matrix...
Similarity matrix shape: (128, 128)


Block 3: Define Reusable Analysis Function
This block defines the master function. You only need to run this block once to "teach" Python the function.

In [None]:
def perform_similarity_analysis(df, group_column_name, similarity_matrix):
    """
    Print a 3-part similarity analysis of `df` for one grouping column.

    Part 1 reports the mean pairwise similarity inside each group, Part 2 the
    mean similarity between every pair of groups, and Part 3 runs a one-sided
    Mann-Whitney U test per group (within-group scores vs. scores between that
    group and all other labelled rows).

    Rows are matched to the matrix by POSITION: row i of `df` (whatever its
    index label is) corresponds to row/column i of `similarity_matrix`. Pass
    the same frame, in the same row order, that the matrix was built from.
    Rows whose group value is null are excluded from every part.

    Args:
        df (pd.DataFrame): The dataframe containing the group labels.
        group_column_name (str): The name of the column to group by.
        similarity_matrix (np.array): The pre-computed N x N similarity matrix,
            with N == len(df).

    Returns:
        None. All results are printed. If the column is missing, an error
        message is printed and the analysis is skipped.

    Raises:
        ValueError: If `similarity_matrix` is not of shape (len(df), len(df)).
    """

    print("\n" + "#"*60)
    print(f"  RUNNING ANALYSIS FOR: '{group_column_name}'")
    print("#"*60)

    # Ensure the column exists
    if group_column_name not in df.columns:
        print(f"ERROR: Column '{group_column_name}' not found in DataFrame.")
        print("Skipping this analysis.")
        return

    # A matrix that does not match the frame cannot be aligned row-by-row;
    # refuse to continue rather than index into the wrong rows.
    similarity_matrix = np.asarray(similarity_matrix)
    n_rows = len(df)
    if similarity_matrix.shape != (n_rows, n_rows):
        raise ValueError(
            f"similarity_matrix has shape {similarity_matrix.shape}, expected "
            f"({n_rows}, {n_rows}) to match the {n_rows} rows of df."
        )

    labels = df[group_column_name]
    # Rows with a null group value take part in no comparison at all.
    has_label = labels.notna().to_numpy()

    # .tolist() yields native Python values, so integer groups print as 0
    # rather than np.int64(0).
    groups = sorted(labels[has_label].unique().tolist())
    print(f"Found {len(groups)} unique groups: {groups}")

    # Boolean membership mask per group, computed once and reused by all three
    # parts. Positions derived from these masks are positional (0..N-1), which
    # is what the matrix needs regardless of the frame's index labels.
    group_masks = {
        group: has_label & (labels == group).fillna(False).to_numpy(dtype=bool)
        for group in groups
    }
    group_positions = {group: np.flatnonzero(mask) for group, mask in group_masks.items()}

    all_within_scores = {}

    # --- Part 1: Within-Group Similarity ---
    print("\n" + "="*30)
    print("Part 1: Within-Group Similarity")
    print("="*30)

    for group in groups:
        positions = group_positions[group]

        if len(positions) < 2:
            print(f"Group '{group}' has fewer than 2 items, skipping.")
            continue

        sub_matrix = similarity_matrix[np.ix_(positions, positions)]
        # Strict upper triangle: each unordered pair once, no self-similarity.
        within_scores = sub_matrix[np.triu_indices_from(sub_matrix, k=1)]

        all_within_scores[group] = within_scores
        mean_similarity = np.mean(within_scores)
        print(f"  - Group '{group}' (N={len(positions)}): Avg Similarity = {mean_similarity:.4f}")

    # --- Part 2: Between-Group Similarity ---
    print("\n" + "="*30)
    print("Part 2: Between-Group Similarity (Difference)")
    print("="*30)

    for group1, group2 in combinations(groups, 2):
        sub_matrix = similarity_matrix[np.ix_(group_positions[group1], group_positions[group2])]
        mean_similarity = np.mean(sub_matrix)
        pair_name = f"{group1} <-> {group2}"
        print(f"  - Pair '{pair_name}': Avg Similarity = {mean_similarity:.4f}")

    # --- Part 3: Statistical Significance ---
    print("\n" + "="*30)
    print("Part 3: Statistical Significance")
    print("="*30)

    # NOTE(review): pairwise similarity scores share documents, so they are not
    # independent observations, and one test is run per group without any
    # multiple-comparison correction. Treat the p-values as descriptive.
    stat_results = []

    # Suppress warnings from Mann-Whitney U test (e.g., ties)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for group in groups:
            within_scores = all_within_scores.get(group)

            if within_scores is None or len(within_scores) == 0:
                print(f"  - Group '{group}': Skipping (no within-group pairs).")
                continue

            # Every labelled row that is NOT in this group.
            other_positions = np.flatnonzero(has_label & ~group_masks[group])

            if len(other_positions) == 0:
                print(f"  - Group '{group}': Skipping (no other groups to compare to).")
                continue

            between_scores = similarity_matrix[
                np.ix_(group_positions[group], other_positions)
            ].flatten()

            try:
                # One-sided: are within-group scores stochastically greater
                # than this group's scores against everyone else?
                stat, p_value = mannwhitneyu(
                    within_scores,
                    between_scores,
                    alternative='greater'
                )

                stat_results.append({
                    'Group': group,
                    'Mean_Within_Sim': np.mean(within_scores),
                    'Mean_Between_Sim': np.mean(between_scores),
                    'U_Statistic': stat,
                    'p_value': p_value,
                    'Significant (p<0.05)': p_value < 0.05
                })

            except ValueError:
                print(f"  - Group '{group}': Could not run test (e.g., zero variance).")

    if stat_results:
        df_stats = pd.DataFrame(stat_results)
        df_stats = df_stats.set_index('Group')
        print("\n--- Summary of Statistical Significance ---")
        print(df_stats.to_string(float_format="%.6f"))
    else:
        print("\nNo statistical results to display.")

    print("\n" + "#"*60)
    print(f"  ANALYSIS FOR '{group_column_name}' COMPLETE")
    print("#"*60 + "\n")

print("Reusable analysis function 'perform_similarity_analysis' is defined.")

Reusable analysis function 'perform_similarity_analysis' is defined.


Block 4: Analysis 1 - Run for "gr_per"
This block calls the function to perform the analysis on your original column.

In [None]:
# Analysis 1: group the texts by the 'gr_per' column and print the full
# within / between / significance report using the shared similarity matrix.
perform_similarity_analysis(df1, 'gr_per', similarity_matrix)


############################################################
  RUNNING ANALYSIS FOR: 'gr_per'
############################################################
Found 8 unique groups: ['0C', '0L', '1C', '1L', '2C', '2L', '3C', '3L']

Part 1: Within-Group Similarity
  - Group '0C' (N=16): Avg Similarity = 0.9043
  - Group '0L' (N=16): Avg Similarity = 0.8854
  - Group '1C' (N=16): Avg Similarity = 0.8762
  - Group '1L' (N=16): Avg Similarity = 0.8898
  - Group '2C' (N=16): Avg Similarity = 0.8724
  - Group '2L' (N=16): Avg Similarity = 0.8800
  - Group '3C' (N=16): Avg Similarity = 0.8598
  - Group '3L' (N=16): Avg Similarity = 0.8710

Part 2: Between-Group Similarity (Difference)
  - Pair '0C <-> 0L': Avg Similarity = 0.7554
  - Pair '0C <-> 1C': Avg Similarity = 0.8207
  - Pair '0C <-> 1L': Avg Similarity = 0.8530
  - Pair '0C <-> 2C': Avg Similarity = 0.8204
  - Pair '0C <-> 2L': Avg Similarity = 0.8346
  - Pair '0C <-> 3C': Avg Similarity = 0.6980
  - Pair '0C <-> 3L': Avg Similarity = 0

Block 5: Analysis 2 - Run for "Group"
This block calls the same function on your new "Group" column.


In [None]:
# Analysis 2: same report, this time grouping the texts by the 'Group' column.
perform_similarity_analysis(df1, 'Group', similarity_matrix)


############################################################
  RUNNING ANALYSIS FOR: 'Group'
############################################################
Found 4 unique groups: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]

Part 1: Within-Group Similarity
  - Group '0' (N=32): Avg Similarity = 0.8229
  - Group '1' (N=32): Avg Similarity = 0.8755
  - Group '2' (N=32): Avg Similarity = 0.8739
  - Group '3' (N=32): Avg Similarity = 0.8607

Part 2: Between-Group Similarity (Difference)
  - Pair '0 <-> 1': Avg Similarity = 0.8332
  - Pair '0 <-> 2': Avg Similarity = 0.8094
  - Pair '0 <-> 3': Avg Similarity = 0.7217
  - Pair '1 <-> 2': Avg Similarity = 0.8372
  - Pair '1 <-> 3': Avg Similarity = 0.7362
  - Pair '2 <-> 3': Avg Similarity = 0.8060

Part 3: Statistical Significance

--- Summary of Statistical Significance ---
       Mean_Within_Sim  Mean_Between_Sim    U_Statistic  p_value  Significant (p<0.05)
Group                                                                      

Block 6: Analysis 3 - Run for "DEM8"
This block calls the same function on your final "DEM8" column.

In [None]:
# Analysis 3: same report, grouping the texts by the 'DEM8' column.
perform_similarity_analysis(df1, 'DEM8', similarity_matrix)


############################################################
  RUNNING ANALYSIS FOR: 'DEM8'
############################################################
Found 2 unique groups: ['Democrat', 'Republican']

Part 1: Within-Group Similarity
  - Group 'Democrat' (N=63): Avg Similarity = 0.8024
  - Group 'Republican' (N=58): Avg Similarity = 0.8234

Part 2: Between-Group Similarity (Difference)
  - Pair 'Democrat <-> Republican': Avg Similarity = 0.8040

Part 3: Statistical Significance

--- Summary of Statistical Significance ---
            Mean_Within_Sim  Mean_Between_Sim    U_Statistic  p_value  Significant (p<0.05)
Group                                                                                      
Democrat           0.802370          0.804013 3617883.500000 0.194478                 False
Republican         0.823425          0.804013 3475652.500000 0.000000                  True

############################################################
  ANALYSIS FOR 'DEM8' COMPLETE
########

In [None]:
# Show the frame's size and a short preview instead of printing the whole
# DataFrame as plain text.
# NOTE(review): the preview includes columns such as IPAddress; clear or redact
# this cell's output before sharing the notebook.
print(df1.shape)
df1.head()

          StartDate         EndDate      Status       IPAddress  Progress  \
0    10/16/25 16:42  10/16/25 16:47  IP Address   75.144.84.222       100   
1    10/16/25 16:47  10/16/25 16:50  IP Address   75.144.84.222       100   
2    10/16/25 16:50  10/16/25 16:53  IP Address   75.144.84.222       100   
3    10/16/25 16:53  10/16/25 16:56  IP Address   75.144.84.222       100   
4    10/16/25 16:56  10/16/25 16:59  IP Address   75.144.84.222       100   
..              ...             ...         ...             ...       ...   
123  10/18/25 16:26  10/18/25 16:28  IP Address  108.71.225.169       100   
124  10/18/25 16:28  10/18/25 16:30  IP Address  108.71.225.169       100   
125  10/18/25 16:30  10/18/25 16:31  IP Address  108.71.225.169       100   
126  10/18/25 16:31  10/18/25 16:33  IP Address  108.71.225.169       100   
127  10/18/25 16:33  10/18/25 16:36  IP Address  108.71.225.169       100   

     Duration (in seconds)  Finished    RecordedDate         ResponseId  \


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# 1. Data Preparation
# Hard-coded summary values, one row per subgroup:
# [label, mean within-group similarity, mean between-group similarity, significant?]
# (R = Republican/Conservative, D = Democrat/Liberal)
stats_data = [
    ["0R", 0.9043, 0.7886, True],
    ["0D", 0.8854, 0.7783, True],
    ["1R", 0.8762, 0.8038, True],
    ["1D", 0.8898, 0.8195, True],
    ["2R", 0.8724, 0.8235, True],
    ["2D", 0.8800, 0.8270, True],
    ["3R", 0.8598, 0.7525, True],
    ["3D", 0.8710, 0.7859, True],
]
df_stats = pd.DataFrame(stats_data, columns=["Group", "Within", "Between", "Significant"])

# Same layout for the overall party split.
pol_data = [
    ["D", 0.8024, 0.8040, False],
    ["R", 0.8234, 0.8040, True],
]
df_pol = pd.DataFrame(pol_data, columns=["Affiliation", "Within", "Between", "Significant"])

# 2. Plotting Setup
plt.rcParams['font.size'] = 12
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 14))
ax_sub, ax_pol = axes  # top panel: subgroups, bottom panel: party totals

# Color Palette
red_color = '#d62728'  # Republican Red
blue_color = '#1f77b4' # Democrat Blue
width = 0.35           # bar width; paired bars sit half a width either side of the tick


def _finish_axis(ax, tick_positions, tick_labels, title, **legend_kwargs):
    """Apply the ticks, labels, y-range, title, legend and grid shared by both panels."""
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels)
    ax.set_ylabel("Similarity Score")
    ax.set_ylim(0, 1.1)
    ax.set_title(title)
    ax.legend(frameon=True, **legend_kwargs)
    ax.grid(axis='y', linestyle='--', alpha=0.5)


def _mark_significant(ax, heights, flags):
    """Draw a star just above the within-group value of every flagged entry."""
    for pos, (height, flagged) in enumerate(zip(heights, flags)):
        if flagged:
            ax.text(pos, height + 0.02, "*", ha='center', fontsize=22, fontweight='bold')


# --- Plot 1: Subgroup Analysis ---
x = np.arange(len(df_stats))

for pos, rec in enumerate(df_stats.itertuples(index=False)):
    # Party is encoded in the subgroup label ("0R", "0D", ...).
    if 'R' in rec.Group:
        party, bar_color = "Republican", red_color
    else:
        party, bar_color = "Democrat", blue_color

    # Only the first two rows (one per party) contribute legend entries,
    # which avoids duplicate legend lines.
    in_legend = pos < 2

    # Solid bar: within-group similarity
    ax_sub.bar(pos - width / 2, rec.Within, width,
               color=bar_color, edgecolor='black', alpha=0.9,
               label=f"{party} (Within)" if in_legend else None)
    # Faded bar: between-group similarity
    ax_sub.bar(pos + width / 2, rec.Between, width,
               color=bar_color, edgecolor='black', alpha=0.3,
               label=f"{party} (Between)" if in_legend else None)

_finish_axis(ax_sub, x, df_stats["Group"],
             "Semantic Similarity by Subgroup (Within vs. Between)", ncol=2)
_mark_significant(ax_sub, df_stats["Within"], df_stats["Significant"])

# --- Plot 2: Political Affiliation (Main Party) ---
x_pol = np.arange(len(df_pol))

for pos, rec in enumerate(df_pol.itertuples(index=False)):
    is_democrat = rec.Affiliation == 'D'

    # Party-coloured bar: within-party similarity
    ax_pol.bar(pos - width / 2, rec.Within, width,
               color=blue_color if is_democrat else red_color,
               edgecolor='black', alpha=0.9,
               label="Democrat (Within)" if is_democrat else "Republican (Within)")
    # Gray bar: between-party baseline (labelled once for the legend)
    ax_pol.bar(pos + width / 2, rec.Between, width,
               color='gray', edgecolor='black', alpha=0.3,
               label="Between-Party Baseline" if pos == 0 else None)

_finish_axis(ax_pol, x_pol, ["Democrat", "Republican"],
             "Overall Political Affiliation Similarity (DEM8)")
_mark_significant(ax_pol, df_pol["Within"], df_pol["Significant"])

plt.tight_layout()
# Save as high-res PNG
plt.savefig('similarity_analysis_plot.png', dpi=300)
plt.show()