In [1]:
import os, statistics, textwrap, glob, json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
import seaborn as sns

In [2]:
output_dir = "plots"
os.makedirs(output_dir, exist_ok=True)

In [3]:
def compute_distinct1_corpus(messages):
    """
    Given an iterable of text strings, returns the global Distinct‑1
    (unique unigrams / total unigrams) over the entire corpus.
    """
    tokens = []
    for text in messages:
        if not text:
            continue
        tokens.extend(text.lower().split())
    total = len(tokens)
    return (len(set(tokens)) / total) if total > 0 else 0

def process_json_folder(folder_path):
    """
    Loops over every .json in `folder_path`, computes and stores each file's
    Distinct‑1 for assistant & user, then returns:
      - file_scores: { filename: {'assistant':…, 'user':…}, … }
      - avg_scores:  {'assistant': avg_assistant, 'user': avg_user}
    """
    file_scores     = {}
    assistant_list  = []
    user_list       = []

    for fname in os.listdir(folder_path):
        if not fname.lower().endswith(".json"):
            continue

        fullpath = os.path.join(folder_path, fname)
        with open(fullpath, encoding="utf-8") as f:
            data = json.load(f)

        # extract texts by role
        assistant_texts = [item["content"]
                           for item in data
                           if item.get("role") == "assistant"]
        user_texts      = [item["content"]
                           for item in data
                           if item.get("role") == "user"]

        a_score = compute_distinct1_corpus(assistant_texts)
        u_score = compute_distinct1_corpus(user_texts)

        file_scores[fname] = {"assistant": a_score, "user": u_score}
        assistant_list.append(a_score)
        user_list.append(u_score)

    avg_scores = {
        "assistant": statistics.fmean(assistant_list) if assistant_list else 0,
        "user":      statistics.fmean(user_list)      if user_list     else 0
    }

    return file_scores, avg_scores

In [4]:
if __name__ == "__main__":
    folder = "25_03_Transcripts"
    per_file, averages = process_json_folder(folder)

    print("Distinct‑1 per file:")
    for fn, scores in per_file.items():
        print(f"  {fn}: assistant={scores['assistant']:.4f}, user={scores['user']:.4f}")

    print("\nAverage Distinct‑1 across all files:")
    print(f"  assistant: {averages['assistant']:.4f}")
    print(f"  user:      {averages['user']:.4f}")

    # Collect scores
    filenames         = sorted(per_file.keys())
    assistant_scores  = [per_file[f]['assistant'] for f in filenames]
    user_scores       = [per_file[f]['user']      for f in filenames]

    # --- Beeswarm plot with median lines ---
    plt.figure(figsize=(6, 6))

    # Jittered x-positions around 0 for user, around 1 for assistant
    jitter_u = np.random.normal(loc=0, scale=0.04, size=len(user_scores))
    jitter_a = np.random.normal(loc=1, scale=0.04, size=len(assistant_scores))

    plt.scatter(jitter_u, user_scores,      alpha=0.7, label='User')
    plt.scatter(jitter_a, assistant_scores, alpha=0.7, label='Assistant')

    # Median indicators
    med_u = np.median(user_scores)
    med_a = np.median(assistant_scores)
    plt.hlines(med_u, -0.2, 0.2, linestyles='--')
    plt.hlines(med_a,  0.8, 1.2, linestyles='--')

    # Formatting
    plt.xlim(-0.5, 1.5)
    plt.xticks([0, 1], ['User', 'Assistant'])
    plt.ylabel('Distinct-1 Score')
    plt.title('Beeswarm of Distinct-1 Scores by Role')
    plt.legend()
    plt.tight_layout()

    # Save & show
    plot_path = os.path.join(output_dir, 'distinct1_beeswarm.png')
    plt.savefig(plot_path)
    print(f"Saved beeswarm plot to {plot_path}")
    plt.show()

    # # Bar positions and width
    # indices = np.arange(len(filenames))
    # bar_width = 0.35

    # # Dynamic figure width
    # fig_width = max(10, len(filenames) * 0.2)
    # plt.figure(figsize=(fig_width, 6))

    # # Plot grouped bars
    # plt.bar(indices, user_scores, bar_width, label='User')
    # plt.bar(indices + bar_width, assistant_scores, bar_width, label='Assistant')

    # # X-axis ticks and labels
    # plt.xticks(indices + bar_width/2, [i + 1 for i in indices], rotation=45, ha='right', fontsize=8)

    # # Labels and title
    # plt.xlabel('Transcript #')
    # plt.ylabel('Distinct-1 Score')
    # plt.title('Distinct-1 per Transcript by Role')
    # plt.legend()
    # plt.tight_layout()

    # # Save and display
    # plot_path = os.path.join(output_dir, 'distinct1_barplot.png')
    # plt.savefig(plot_path)
    # print(f"Saved barplot to {plot_path}")
    # plt.show()

    
    # # Prepare data for violin plot
    # filenames = sorted(per_file.keys())
    # assistant_scores = [per_file[f]['assistant'] for f in filenames]
    # user_scores      = [per_file[f]['user']      for f in filenames]

    # # Build a DataFrame for seaborn
    # df_v = pd.DataFrame({
    #     "Role": ["User"] * len(user_scores) + ["Assistant"] * len(assistant_scores),
    #     "Distinct1": user_scores + assistant_scores
    # })

    # # Plot violin
    # plt.figure(figsize=(8, 6))
    # sns.violinplot(x="Role", y="Distinct1", data=df_v)
    # plt.xlabel("Role")
    # plt.ylabel("Distinct-1 Score")
    # plt.title("Violin Plot of Distinct-1 Scores by Role")
    # plt.tight_layout()

    # # Save & show
    # violin_path = os.path.join(output_dir, "distinct1_violinplot.png")
    # plt.savefig(violin_path)
    # print(f"Saved violin plot to {violin_path}")
    # plt.show()
    
    # # --- Prepare data for the boxplot ---
    # filenames = sorted(per_file.keys())
    # assistant_scores = [per_file[f]['assistant'] for f in filenames]
    # user_scores      = [per_file[f]['user']      for f in filenames]

    # # --- Draw boxplot instead of lineplot ---
    # plt.figure(figsize=(8, 6))
    # plt.boxplot(
    #     [user_scores, assistant_scores],
    #     labels=["User", "Assistant"],
    #     patch_artist=True
    # )
    # plt.ylabel("Distinct-1 Score")
    # plt.title("Distribution of Distinct-1 Scores by Role")

    # # Save & show
    # plot_path = os.path.join(output_dir, 'distinct1_boxplot.png')
    # plt.savefig(plot_path)
    # print(f"Saved boxplot to {plot_path}")
    # plt.show()
    
    # # 2) Plot scores (make sure this is indented under the if-block)
    # filenames = sorted(per_file.keys())
    # assistant_scores = [per_file[f]['assistant'] for f in filenames]
    # user_scores      = [per_file[f]['user']      for f in filenames]
    # x = range(len(filenames))
    # avg_assistant = averages['assistant']
    # avg_user      = averages['user']

    # # Dynamically size figure
    # num_ticks      = len(filenames)
    # fig_width      = max(10, num_ticks * 0.2)
    # plt.figure(figsize=(fig_width, 6))
    
    # plt.plot(x, user_scores,      marker='o', linestyle='-', label='User',      color='blue')
    # plt.plot(x, assistant_scores, marker='o', linestyle='-', label='Assistant', color='red')
    # plt.axhline(avg_user,      linestyle='--', color='blue', label=f'User Avg ({avg_user:.3f})')
    # plt.axhline(avg_assistant, linestyle='--', color='red',  label=f'Assistant Avg ({avg_assistant:.3f})')
    # # Every tick, rotated and readable
    # plt.xticks(
    #     x,
    #     [i+1 for i in x],
    #     fontsize=8
    # )
    # # plt.subplots_adjust(bottom=0.25)
    # plt.xlabel('Transcript #')  
    # plt.ylabel('Distinct-1 Score')
    # plt.title('Distinct-1 per Transcript by Role')
    # plt.legend()
    # plt.tight_layout()

    # plot_path = os.path.join(output_dir, 'distinct1_lineplot.png')
    # plt.savefig(plot_path)
    # print(f"Saved plot to {plot_path}")
    # plt.show()

Distinct‑1 per file:
  DM_20250313-183641_Interview.json: assistant=0.3574, user=0.4245
  DM_20250313-122155_Interview.json: assistant=0.3462, user=0.4350
  DM_20250313-205138_Interview.json: assistant=0.3652, user=0.4671
  DM_20250314-123247_Interview.json: assistant=0.3456, user=0.4359
  DM_20250313-072430_Interview.json: assistant=0.3149, user=0.3953
  DM_20250314-014924_Interview.json: assistant=0.3212, user=0.4206
  DM_20250313-035214_Interview.json: assistant=0.3416, user=0.4198
  DM_20250314-125859_Interview.json: assistant=0.2695, user=0.3525
  DM_20250313-021315_Interview.json: assistant=0.3421, user=0.4356
  DM_20250313-233249_Interview.json: assistant=0.3615, user=0.4202
  DM_20250313-084757_Interview.json: assistant=0.3377, user=0.4246
  DM_20250313-141135_Interview.json: assistant=0.0911, user=0.1004
  DM_20250314-002211_Interview.json: assistant=0.5745, user=0.8039
  DM_20250313-131405_Interview.json: assistant=0.3418, user=0.4584
  DM_20250313-235756_Interview.json: assi

NameError: name 'user_scores' is not defined

<Figure size 600x600 with 0 Axes>