# Import data

You can import data from either:

## ...from CSV

The import code in this section is specific to the CSV automatically generated for each collective dialouge on remesh.


Get the CSV from Remesh: **Export Data > Conversation Data > Download CSV > Sorted by Question**

In [22]:
# Set this to 'True' to Enable importing from CSV. Set to 'False' to disable.
USE_CSV = True

if USE_CSV:
  # use this to import data from .csv
  from google.colab import files
  import pandas as pd
  import csv
  import io
  import warnings
  warnings.filterwarnings('ignore')

  # set PADDING_ROWS to be the number of non-content rows before the initial column headers in the csv
  PADDING_ROWS = 9
  pd.set_option('display.max_colwidth', 0)
  uploaded = files.upload()
  filename = next(iter(uploaded))
  with open(filename,'r') as file:
    csvreader = csv.reader(file)
    r = 1
    data = []
    qdata = []
    for row in csvreader:
      if r>PADDING_ROWS:
        if len(row) == 0 or not row[0].strip():
          data.append(qdata)
          qdata = []
        else:
          qdata.append(row)
      r=r+1
    data.append(qdata)

  #handle blank row
  data = data[1:]

  #percent string to float
  def p2f(x):
    try:
      if x==' - ':
        return float("nan")
      else:
        return float(x.strip('%'))/100
    except:
      return x

  #make numeric
  nq = len(data)
  qs = []
  meta = [["question type","question text"]]
  for i in range(0,nq):
    d = data[i]
    m = [d[1][1],d[1][2]]
    meta.append(m)
    if d[1][1] == 'Poll Single Select':
      for r in range(1,len(d)):
        for c in range(4,len(d[0])):
          d[r][c] = p2f(d[r][c])
    if d[1][1] == 'Ask Opinion':
      for r in range(1,len(d)):
        for c in range(6,len(d[0])-3):
          d[r][c] = p2f(d[r][c])
    df = pd.DataFrame(d[1:],columns = d[0])
    qs.append(df)
  qmeta = pd.DataFrame(meta[1:],columns = meta[0])

Saving GD3_march_2025_aggregate - GD3_march_2025_aggregate.csv to GD3_march_2025_aggregate - GD3_march_2025_aggregate.csv


## ...from json

This imports the aggregated data set from a json file, usually which has response text embeddings appended for doing retrieval stuff.

In [5]:
# Set this to 'True' to Enable importing from JSON. Set to 'False' to disable.
USE_JSON = False
import json
import pandas as pd
import requests

if USE_JSON:
  # use this to import data from .json
  from google.colab import files
  import io
  import warnings
  warnings.filterwarnings('ignore')

  GD1_DOWNLOAD = 'https://drive.usercontent.google.com/download?id=14JfhnqQiDpvMTGH-mJc04Fg4UubED9_7&export=download&authuser=0&confirm=t&uuid=ea5e840c-7ecc-4dd8-a4e3-d8810f13a54e&at=AEz70l4pHJVk7x335P-7ag8ccrdf%3A1742574770488'

  # Use this to download the JSON with text embeddings computed from GDrive
  # OR set to False to upload the JSON file from your own computer
  USE_DIRECT_DOWNLOAD = True



  if USE_DIRECT_DOWNLOAD:
      download_url = GD1_DOWNLOAD
      response = requests.get(download_url)

      if response.status_code != 200:
          raise Exception(f"Failed to download file: Status code {response.status_code}")

      loaded_list = json.loads(response.text)

  else:
    uploaded = files.upload()
    filename = next(iter(uploaded))
    with open(filename,'r') as file:
        loaded_list = json.load(file)

  # Convert the list of dictionaries back to DataFrames
  qs = [pd.DataFrame(df) for df in loaded_list]

## ...from saved progress
Save progress in this notebook at any time with these functions.

Save automatically to Google Drive.

Efficiently reload state from a saved .pkl file.

In [1]:
# Set this to 'True' to automatically load the last-saved .pkl datafile when running this notebook.
# When working iteratively in the notebook and saving updates to the qs data structure,
# this is the fastest way to save & reload state in the notebook.
USE_AUTOLOAD_PROGRESS = True

import pickle
import os
import datetime
from google.colab import files
import io

DEFAULT_GDRIVE_PATH = '/content/drive/MyDrive/saved_colab_data/GD3'

def save_data_to_download(data, filename=None):
    """
    Save the data structure to a pickle file and download it to the user's local machine.

    Args:
        data: The data structure to save
        filename: Optional custom filename (default: auto-generated with timestamp)

    Returns:
        None
    """
    # Generate filename with timestamp if not provided
    if filename is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"qs_data_{timestamp}.pkl"

    # Ensure filename has .pkl extension
    if not filename.endswith('.pkl'):
        filename += '.pkl'

    # Save data using pickle to a temporary file
    with open(filename, 'wb') as f:
        pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

    # Download the file to user's local machine
    files.download(filename)
    print(f"File '{filename}' has been prepared for download. Check your browser's download area.")


def upload_and_load_data():
    """
    Upload a pickle file from the user's local machine and load the data structure.

    Returns:
        The loaded data structure
    """
    print("Please select your saved pickle file (.pkl) to upload...")
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded.")
        return None

    # Get the filename of the uploaded file
    filename = list(uploaded.keys())[0]

    if not filename.endswith('.pkl'):
        print(f"Warning: The uploaded file '{filename}' does not have a .pkl extension.")

    # Load data using pickle
    try:
        with open(filename, 'rb') as f:
            data = pickle.load(f)

        print(f"Data successfully loaded from '{filename}'")
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


def save_data_to_drive(data, filename=None, drive_path=None):
    """
    Save the data structure to a pickle file in Google Drive.

    Args:
        data: The data structure to save
        filename: Optional custom filename (default: auto-generated with timestamp)
        drive_path: Path in Google Drive to save the file (default: root)

    Returns:
        Path to the saved file
    """
    # Make sure Google Drive is mounted
    try:
        from google.colab import drive
        drive_mounted = True
    except:
        print("Warning: Google Drive module not available.")
        return None

    # Mount Google Drive if not already mounted
    import os
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')
        print("Google Drive mounted.")

    # Generate filename with timestamp if not provided
    if filename is None:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"qs_data_{timestamp}.pkl"

    # Ensure filename has .pkl extension
    if not filename.endswith('.pkl'):
        filename += '.pkl'

    # Set the default drive path
    if drive_path is None:
        drive_path = DEFAULT_GDRIVE_PATH

    # Create directory if it doesn't exist
    if not os.path.exists(drive_path):
        os.makedirs(drive_path)

    # Full path to save file
    filepath = os.path.join(drive_path, filename)

    # Save data using pickle
    try:
        with open(filepath, 'wb') as f:
            pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"Data saved to Google Drive at {filepath}")
        return filepath
    except Exception as e:
        print(f"Error saving data to Google Drive: {e}")
        return None


def load_data_from_drive(filepath=None, drive_path=None):
    """
    Load a data structure from a pickle file in Google Drive.

    Args:
        filepath: Path to the pickle file (if you know the exact path)
        drive_path: Directory in Google Drive to list available files (default: root)

    Returns:
        The loaded data structure, or None if loading failed
    """
    # Make sure Google Drive is mounted
    try:
        from google.colab import drive
        drive_mounted = True
    except:
        print("Error: Google Drive module not available.")
        return None

    # Mount Google Drive if not already mounted
    import os
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')
        print("Google Drive mounted.")

    # Set the default drive path
    if drive_path is None and filepath is None:
        drive_path = DEFAULT_GDRIVE_PATH

    # If no specific file is provided, list available files and let user choose
    if filepath is None:
        if not os.path.exists(drive_path):
            print(f"Directory {drive_path} does not exist in Google Drive.")
            return None

        # Get all .pkl files in the directory
        files_list = [f for f in os.listdir(drive_path) if f.endswith('.pkl')]

        if not files_list:
            print(f"No .pkl files found in {drive_path}")
            return None

        # Sort by modification time (newest first)
        files_list.sort(key=lambda x: os.path.getmtime(os.path.join(drive_path, x)), reverse=True)

        # Display available files
        print(f"Available saved data files in {drive_path}:")
        for i, file in enumerate(files_list):
            timestamp = datetime.datetime.fromtimestamp(
                os.path.getmtime(os.path.join(drive_path, file))
            ).strftime("%Y-%m-%d %H:%M:%S")
            print(f"{i+1}. {file} (saved on {timestamp})")

        # Ask user to choose a file
        try:
            choice = input(f"Enter the number of the file to load (1-{len(files_list)}) or press Enter for most recent: ").strip()

            if not choice:  # If no input provided, auto-select first file
                choice = "1"

            choice = int(choice)
            if 1 <= choice <= len(files_list):
                filepath = os.path.join(drive_path, files_list[choice-1])
            else:
                print("Invalid choice.")
                return None
        except ValueError:
            print("Invalid input. Please enter a number or press Enter for most recent.")
            return None

    # Load data using pickle
    try:
        with open(filepath, 'rb') as f:
            data = pickle.load(f)

        print(f"Data loaded from {filepath}")
        return data
    except Exception as e:
        print(f"Error loading data: {e}")
        return None


# Example usage:
"""
# Save data for download to local machine
save_data_to_download(qs) # download as qs_data_YYYYMMDD_HHMMSS.pkl
# OR save_data_to_download(qs, filename="my_analysis_progress.pkl")

# Save data to Google Drive
save_data_to_drive(qs) # saves as qs_data_YYYYMMDD_HHMMSS.pkl at DEFAULT_GDRIVE_PATH
# OR save_data_to_drive(qs, filename="my_analysis_progress.pkl")

# Upload data from local machine (will prompt for file upload)
qs = upload_and_load_data()


# Load data from Google Drive
# (will list available files if filepath not provided and prompt for selection)
qs = load_data_from_drive()

# Or load with explicit path from Google Drive
qs = load_data_from_drive(filepath="/content/drive/MyDrive/my_project/my_analysis_progress.pkl")
"""

if USE_AUTOLOAD_PROGRESS:
  qs = load_data_from_drive()

Mounted at /content/drive
Google Drive mounted.
Available saved data files in /content/drive/MyDrive/saved_colab_data/GD3:
1. qs_data_20250417_201629.pkl (saved on 2025-04-17 20:16:31)
Enter the number of the file to load (1-1) or press Enter for most recent: 
Data loaded from /content/drive/MyDrive/saved_colab_data/GD3/qs_data_20250417_201629.pkl


# Analysis library

### Non-LLM functions

In [2]:
import math
import json
import pandas as pd
pd.set_option('display.max_colwidth', 0)
import matplotlib.pyplot as plt
plt.close("all")

# ----------------------------------
# function to show the questions by ID
def show_questions(qs_):
  questions = meta = [["question type","question text"]]
  for i in range(0,len(qs_)):
    # questions.append([qs_[i]["Question Type"][1],qs_[i]["Question"][1]]) # Previous Andrew code
    # New code: Accessing the first row (index 0) instead of the second (index 1)
    # Also using .iloc to ensure we are accessing the row by position, not by label
    questions.append([qs_[i]["Question Type"].iloc[0],qs_[i]["Question"].iloc[0]])
  return pd.DataFrame(questions[1:],columns = questions[0])

# usage example
#show_questions(qs)
# ----------------------------------


# ----------------------------------
# function to show the segments by ID
def show_segments(qs_):
  segments = []
  q0 = qs_[0]
  if q0["Question Type"][1] == 'Poll Single Select':
    for c in range(4,len(q0.columns)):
      segments.append(q0.columns[c])
  if q0["Question Type"][1] == 'Ask Opinion':
    for c in range(5,len(q0.columns)-3):
      segments.append(q0.columns[c])
  return pd.DataFrame(segments)

# usage example
#show_segments(qs)
# ----------------------------------


# ----------------------------------
# function to plot poll data by segment
def plot_poll(df,segs):
  print(df["Question"][1])
  segs_incl = ['Responses']
  for i in range(0,len(segs)):
    segs_incl.append(df.columns[4+segs[i]])
  dfplt = df[segs_incl]
  dfplt = dfplt.set_index('Responses')
  dfplt.plot.barh()
  return dfplt
# usage example
#qid = 5
#segs = [231,232,233,234,235,236]
#d =plot_poll(qs[qid],segs)
# ----------------------------------


# ----------------------------------
# function to make a results table dataframe look prettier
def make_pretty(styler):
  styler.background_gradient(axis=None, vmin=0, vmax=1, cmap="RdYlGn")
  styler.format(precision=2)
  return styler
# ----------------------------------

# ----------------------------------
# function to generate generate a results table for an opinion ask for a given set of segments
# only works for "Ask Opinion" question types
# df: data set for specific question; usually specificed as qs[qid]
# segs: list of ids corresponding to the segments to include in the table
# n: number of participant responses to include in the table
def table_ask(df,segs,n):
  segs_incl = ['English Responses']
  for i in range(0,len(segs)):
    segs_incl.append(df.columns[7+segs[i]])
  dfplt = df[segs_incl]
  #dfplt = dfplt.set_index('Responses')
  return dfplt.iloc[:n].style.pipe(make_pretty)

# usage example
#qid = 18
#segs = [0,231,232,233,234,235,236]
#print(qs[qid]["Question"][1]) #print the text of the question
#table_ask(qs[qid],segs,10)
# ----------------------------------


# ----------------------------------
# compute max-min bridging metric, used in bridging_ask function
def min_bridge(row,segs_incl,col):
  b = 1
  for s in range(0,len(segs_incl)):
    b_ = row[segs_incl[s]]
    try:
      b_ = float(b_) # Try converting b_ to float
    except ValueError:
      b_ = float('nan') # If conversion fails, set to NaN
    b = min(b,b_)
  return b
# ----------------------------------


# ----------------------------------
# compute max-min polarization metric, used in bridging_ask function
def polarization(row,segs_incl,col):
  mx = 0
  mn = 1
  for s in range(0,len(segs_incl)):
    b_ = row[segs_incl[s]]
    try:
      b_ = float(b_)  # Try converting to float
    except ValueError:
      b_ = float('nan')  # Handle non-numeric values by setting to NaN
    mx = max(mx,b_)
    mn = min(mn,b_)
  return mx-mn
# ----------------------------------


# ----------------------------------
# compute max-min divergence metric, used in bridging_ask function
def symmetric_divergence(row,segs_incl,col):
  mx = 0
  mn = 1
  for s in range(0,len(segs_incl)):
    b_ = row[segs_incl[s]]
    try:
      b_ = float(b_)  # Try converting to float
    except ValueError:
      b_ = float('nan')  # Handle non-numeric values by setting to NaN
    mx = max(mx,b_)
    mn = min(mn,b_)
  mx_div = max(mx-0.5,0)
  mn_div = max(0.5-mn,0)
  return math.sqrt(mx_div*mn_div)
# ----------------------------------


# ----------------------------------
# generate dataframe which includes bridging, polarization, divergence metrics
# only works for "Ask Opinion" question types
def bridging_ask(df,segs):
  segs_incl = ['English Responses']
  for i in range(0,len(segs)):
    segs_incl.append(df.columns[7+segs[i]])
  dfplt = df[segs_incl]
  dfplt["bridge"] = df.apply (lambda row: min_bridge(row,segs_incl[1:],df.columns), axis=1)
  dfplt["polarization"] = df.apply (lambda row: polarization(row,segs_incl[1:],df.columns), axis=1)
  dfplt["divergence"] = df.apply (lambda row: symmetric_divergence(row,segs_incl[1:],df.columns), axis=1)
  return dfplt.sort_values(by=["bridge"],ascending=False)

# usage example
#qid = 15
#segs = [0,231,232,233,234,235]
#print(qs[qid]["Question"][1]) #print the question text
#ba = bridging_ask(qs[qid],segs).iloc[:10] #generate a table of the first 10 responses with the different max-min metrics
#ba.style.pipe(make_pretty) #display the table and make it pretty
# ----------------------------------


# ----------------------------------
#function to get responses whose bridging agreement across a given set of segments is over a specified threshold
def get_bridging_responses(df,segs,thresh):
  bdf = bridging_ask(df,segs)
  return bdf.loc[bdf['bridge']>thresh]

# usage example
#qid = 12
#segs = [0,231,232,233,234,235,236]
#thresh = .50
#print(qs[qid]["Question"][1]) #print question text
#ba = get_bridging_responses(qs[qid],segs,thresh) #get bridging responses
#ba.style.pipe(make_pretty) #display in a pretty table
# ----------------------------------


# ----------------------------------
#function to get the top n responses with highest polarization across a set of segments
def get_polarizing_responses(df,segs,n):
  bdf = bridging_ask(df,segs)
  bdfp = bdf.sort_values(by=["polarization"],ascending=False)
  return bdfp.iloc[:n]
# ----------------------------------


# ----------------------------------
#function to get the top n responses with most divergent responses across a set of segments
def get_divergent_responses(df,segs,n):
  bdf = bridging_ask(df,segs)
  bdfp = bdf.sort_values(by=["divergence"],ascending=False)
  return bdfp.iloc[:n]
# ----------------------------------


# ----------------------------------
#generate a text summary of the n responses with the highest polarization
def polarization_summary(df,segs,n):
  pa = get_polarizing_responses(df,segs,n)

  # Exclude first column and last two columns for min/max calculations
  temp_pa = pa.iloc[:, 1:-2]

  # Find the column names of min and max values
  min_col = temp_pa.idxmin(axis=1)
  max_col = temp_pa.idxmax(axis=1)

  # Find the min and max values
  min_val = temp_pa.min(axis=1)
  max_val = temp_pa.max(axis=1)

  for idx in pa.index:
      first_col_text = pa.loc[idx, pa.columns[0]]
      min_col_name = min_col.loc[idx]
      max_col_name = max_col.loc[idx]
      min_value = pa.loc[idx, min_col_name]
      max_value = pa.loc[idx, max_col_name]
      print(first_col_text)
      print("Low : " + str(int(min_value*100)) +"% -- " + min_col_name )
      print("High : " + str(int(max_value*100)) +"% -- " + max_col_name )
      print(" ")

# usage example
#qid = 16
#segs = [0,231,232,233,234,235]
#n = 10
#print(qs[qid]["Question"][1])
#polarization_summary(qs[qid],segs,n)
# ----------------------------------


# ----------------------------------
#generate a text summary of the n responses with the highest (symetric) divergence
def divergence_summary(df,segs,n):
  pa = get_divergent_responses(df,segs,n)

  # Exclude first column and last two columns for min/max calculations
  temp_pa = pa.iloc[:, 1:-2]

  # Find the column names of min and max values
  min_col = temp_pa.idxmin(axis=1)
  max_col = temp_pa.idxmax(axis=1)

  # Find the min and max values
  min_val = temp_pa.min(axis=1)
  max_val = temp_pa.max(axis=1)

  for idx in pa.index:
      first_col_text = pa.loc[idx, pa.columns[0]]
      min_col_name = min_col.loc[idx]
      max_col_name = max_col.loc[idx]
      min_value = pa.loc[idx, min_col_name]
      max_value = pa.loc[idx, max_col_name]
      print(first_col_text)
      print("Low : " + str(int(min_value*100)) +"% -- " + min_col_name )
      print("High : " + str(int(max_value*100)) +"% -- " + max_col_name )
      print(" ")

# usage example
#qid = 16
#segs = [0,231,232,233,234,235]
#n = 10
#print(qs[qid]["Question"][1])
#divergence_summary(qs[qid],segs,n)
#polarization_summary(qs[qid],segs,n)
# ----------------------------------


# ----------------------------------
# function to save qs as a json object that can be reloaded via json import above
# current use is to save a version of qs with embeddings
# can be used to save a qs with other custom stuff done to it tho
def save_qs_to_json(qs,filename):
  qx = []
  for i in range(0,len(qs)):
    qx.append(qs[i].to_dict())

  # Save the list of dictionaries to a JSON file
  with open(filename, 'w') as f:
      json.dump(qx, f)
# example usage
#save_qs_to_json(qs,"global_inputs_v2.json")
# ----------------------------------

### LLM-based functions


In [8]:
# install stuff and set api key (takes about 1 min to run)
!pip install langchain
!pip install openai
!pip install -U sentence-transformers
!pip install -U langchain-openai
import os

# use this import if running notebook in Google Colab to get the secret key
from google.colab import userdata

os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY') # OR "your-api-key-here"




Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [9]:
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from openai import OpenAI
client = OpenAI()

# ----------------------------------
# function to generate generate 1024 dimension embeddinggs using text-embedding-3-small model
def get_embedding(text, model="text-embedding-3-small",dimensions=1024):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model,dimensions=dimensions).data[0].embedding
# ----------------------------------


# ----------------------------------
# function to add embeddings to a row qs data frame
def embed_response(row):
  text = row["English Responses"]
  #print(text)
  return get_embedding(text)
# ----------------------------------


# ----------------------------------
# function to add embeddings for all responses in a qs data frame
# used to enable "relevence" ranking
def embed_responses(df):
  #print("begin")
  df["embedding"] = df.apply (lambda row: embed_response(row), axis=1)
  return df
# ----------------------------------


# ---------------------------------
# function to generate embeddings for all open end resposnes in qs
# does not need to be run if you imported a dataset from json that already has embeddings
# !!**WARNING**!!  takes ~2m per 1000 resposnes to run; ie. 45 min for the global inputs data set
def add_embeddings_for_all_responses(qs):
  if USE_DIRECT_DOWNLOAD:
        raise SystemExit("You already have the JSON with text embeddings!\n"
                        "You do not need to run this embedding generation function.")
  questions = show_questions(qs)
  oe_ids = []
  for i in range(0,len(questions)):
    if "Ask" in questions.iloc[i][0]:
      oe_ids.append(i)
      qs[i] = embed_responses(qs[i])
      print(i)
  return qs
# usage example
#qs = add_embeddings_for_all_responses(qs)
# ----------------------------------


# ----------------------------------
# function to rank responses in a qs data frame by simiarlity to a given text
# if data not loaded via json w embeddings, then qs[i] = embed_responses(qs[i]) must be run before this works
def rank_by_similarity(qs, text):
    new_embedding = get_embedding(text)
    qs_copy = qs.copy()

    # Replace NaN embeddings with zero vectors (assuming all embeddings have the same length)
    embedding_dim = len(new_embedding)  # Assuming all embeddings have the same size as the new embedding
    embeddings = np.array([
        np.zeros(embedding_dim) if isinstance(embedding, float) and np.isnan(embedding) else embedding
        for embedding in qs_copy['embedding'].values
    ])

    similarities = cosine_similarity([new_embedding], embeddings)[0]

    # Add the similarities as a new column in the copied dataframe
    qs_copy['cosine_similarity'] = similarities

    # Sort the copied dataframe by cosine similarity in descending order
    qs_sorted = qs_copy.sort_values(by='cosine_similarity', ascending=False)

    return qs_sorted
# ----------------------------------


# ----------------------------------
# Create the components for the prompt chain that powers the main LLM synthesis function

#load LLM
llm = ChatOpenAI(temperature=0.2,model_name='gpt-4-0125-preview')


#summarization prompt
rerankPrompt = PromptTemplate(
    input_variables=["question","responses","query"],
    template="""
    Participants in a research study were asked '{question}'.

    These are their responses:
    {responses}

    Filter the responses, keeping only the responses that are atleast somewhat related to or helpful in anwsering: {query}
    Rank the filtered resposnes starting with the most related to or helpful in answering: {query}
    Output this set of responses formated exactly in the way they are given above, with a newline spereate each response.
    If there are no related response, output "there are no relevant responses"
    """
)
#add to chain
rerankChain = LLMChain(llm=llm, prompt=rerankPrompt,output_key="reranked_responses")


#summarization prompt
summaryPrompt = PromptTemplate(
    input_variables=["question","responses","focus"],
    template="""
    Participants in a research study were asked '{question}'.

    These are their responses:
    {responses}

    Create a hierarchical taxonomy of the unique ideas and themes within these responses using very short bullet points. Avoid dupliate ideas. If there are no responses to analyze, do not provide a taxonomy. Do not include anything in the taxonomy not included in the responses.
    {focus}
    """
    #Summarize all of the unique ideas within these responses into very short bullet points.
)
#add to chain
summaryChain = LLMChain(llm=llm, prompt=summaryPrompt,output_key="summary")


#summarization prompt
bulletsPrompt = PromptTemplate(
    input_variables=["question","summary","focus"],
    template="""
    Participants in a research study were asked '{question}'.

    The TAXONOMY of ideas in participants responses are:
    {summary}

    Summarize the TAXONOMY into 1-15 concise bullet points, with each bullet point starting with a single theme and then overviewing the ideas within that theme. Be direct and specific, DO NOT say things like "Particpants said" or "responses" or "there was a desire" or "this theme" or "focusing on". Just say the ideas. NEVER repeat a theme or idea. A theme name should not include "and". Each bullet should be very short. Each bullets must ONLY contain ideas from the taxonomy. If there is no taxonomy, just output "no ideas to synthesize"

    {focus}

    Example bullet points:
    - Creaive outlets: music, drawing, painting, writing stories, designing houses, creating new receipes, home remodeling.
    - Health improvement: Increased exercise, better diets, access to high-quality healthcare, and reduction of chronic diseases.
    """
)
#add to chain
bulletsChain = LLMChain(llm=llm, prompt=bulletsPrompt,output_key="bullets")

#outcome prompt
outcomePrompt = PromptTemplate(
    input_variables=["question","summary","responses"],
    template="""
    Participants in a research study were asked '{question}'.

    These are their responses:
    {responses}

    The main ideas from these responses are:
    {summary}

    We define an 'outcome' to be a single specific concrete result that can be observed and measured in the world. An 'outcome' should NOT include an explination of how it is acheived (ie. an 'outcome' should NOT include the words "due to" or "as a result of" or "through" or "by" etc.). An 'outcome' MUST be specific enough to be objectively observed or measured.

    Write a list of the 'outcomes' present in the main ideas from the responses summarized above. DO NOT REPEAT ANY IDEAS.

    Here are some examples of 'outcomes':

    - Earths climate remains below 15C
    - The number of gun deaths decreases to below 100 per day globally
    - The fraction of people who cannot afford or access healthcare decreases
    - No nuclear devices are detonated within 100 miles of a human
    - More people report being happy with their life

    """
)
#add to chain
outcomeChain = LLMChain(llm=llm, prompt=outcomePrompt,output_key="outcomes")


#values prompt
valuePrompt = PromptTemplate(
    input_variables=["question","summary","responses"],
    template="""
    Participants in a research study were asked '{question}'.

    These are their responses:
    {responses}

    The main ideas from these responses are:
    {summary}

    We define an 'value' to be a deontilogical property that can be reflected on how an AI behaves, reguardless of the result of that behavior. We do not consider a specific AI behavior to be a 'value'. For example "non-judegment: the AI's behavior does not imply a value judgement about the users feelings or experience" IS a 'value' , but "the AI does not say 'I am judgeing you'" is NOT a 'value'.

    Write a list of the unique 'values' present in the  main ideas from the responses summarized above.

    Here are some example 'values':

    - Empathy: Showing understanding and compassion to make the user feel heard and supported.
    - Respect: Honoring the user's feelings and experiences without minimizing their pain or struggles.
    - Non-judgment: Providing support without criticism or bias to create a safe space for the user to express themselves.

    DO NOT COPY THE EXAMPLE 'values' ABOVE VERBATIM. Construct them based on the responses and summarized ideas above.

    """
)
#add to chain
valueChain = LLMChain(llm=llm, prompt=valuePrompt,output_key="values")

#generate all the sequential chains
genSummaryChain = SequentialChain(
    chains=[summaryChain],
    input_variables=["question", "responses","focus"],
    output_variables=["summary"],
    verbose=False)

genOutcomeChain = SequentialChain(
    chains=[summaryChain, outcomeChain],
    input_variables=["question", "responses","focus"],
    output_variables=["summary","outcomes"],
    verbose=False)

genValuesChain = SequentialChain(
    chains=[summaryChain, valueChain],
    input_variables=["question", "responses","focus"],
    output_variables=["summary","values"],
    verbose=False)

genBulletsChain = SequentialChain(
    chains=[summaryChain, bulletsChain],
    input_variables=["question", "responses","focus"],
    output_variables=["summary","bullets"],
    verbose=False)

rerankOnlyChain = SequentialChain(
    chains=[rerankChain],
    input_variables=["question", "responses","query"],
    output_variables=["reranked_responses"],
    verbose=False)
# ----------------------------------



# ----------------------------------
# function to run the main LLM synthesis pipeline
# qs: main question data object
# qid: = quesiton id
# segs: list with segment ids to be used in the analysis
# synth_type: type of synthesis to generate, can be set to..
#                          "summary" = taxonomy of unique ideas in selected responses
#                          "bullets" = 10ish bullets based on the taxonomy of unique ideas in selected responses
#                          "outcomes" = a list of specific outcomes derived from the selected responses
#                          "values" = a list of values derived from the selected responses
# rank_type: the way that responses will be ranked and selected for use in synthesis, can be set to...
#                          "bridging" = smallest agreement value within any of the given segments (max-min bridging), ranks by decending,  thresh filters by greater than
#                          "polarization" = difference between the segments with largest and smallest agreement
#                          "divergence" = geometric mean of the deveations from 50% of the agreements from the segments with largest and smallest agreement
#                          "low_agreement" = smallest agreement value within any of the given segments (max-min bridging), ranks by assending, thresh filters by less than
#                          "relevance" = cosine_similarity to query_text, can only be used if responses have embeddings (ie. data loaded from a json with embeddings pre-appended, or after uses [this function] to append embeddings)
#                          "sample" = randomly samples n_max responses
# thresh: the threshold value used to filter and select responses for the given rank_type
# n_max: maximium number of response to include in the set of selected responses
# query_text: used to rank and filter resposnes if rank_type = "relevance", else if *not blank* is used to filter responses and focus synthesis

def synthesize(qs,qid,segs,synth_type,rank_type,thresh,n_max,query_text = ""):

  #build string of responses
  if rank_type == "bridging":
    print("ranking by bridging")
    ba_ = get_bridging_responses(qs[qid],segs,thresh)
    ba = ba_.iloc[:n_max]
  elif rank_type == "polarization":
    print("ranking by polarization")
    ba = get_polarizing_responses(qs[qid],segs,n_max)
  elif rank_type == "divergence":
    print("ranking by divergence")
    ba = get_divergent_responses(qs[qid],segs,n_max)
  elif rank_type == "low_agreement":
    print("ranking by low agreement")
    ba_ = bridging_ask(qs[qid],segs)
    ba_ = ba_.sort_values(by=["bridge"],ascending=True)
    ba_ = ba_.loc[ba_["bridge"]<thresh]
    ba = ba_.iloc[:n_max]
  elif rank_type == "relevance":
    ba_ = rank_by_similarity(qs[qid], query_text)
    ba_ = ba_.loc[ba_["cosine_similarity"]>thresh]
    ba = ba_.iloc[:n_max]
  else:
    print("random sampling")
    ba_ = get_bridging_responses(qs[qid],segs,0)
    ba = ba_.sample(n=n_max)

  responses_str = ''
  for ind in ba.index:
    rsp = ba["English Responses"][ind]
     # Convert rsp to string before concatenation to handle potential floats
    responses_str+="-"
    responses_str+=str(rsp)
    responses_str+="\n \n "

  #get quesiton text
  df = qs[qid]
  question_str = df["Question"].iloc[0]

  #filter responses for relevance if there is a query term
  if query_text != "":
    focus = "only include topics and themes that are reasonably relevant to: " + query_text
    prelim_responses = rerankOnlyChain({
        "question":question_str,
        "responses":responses_str,
        "query":query_text
    })
    responses_str = prelim_responses["reranked_responses"]
  else:
    focus = ""

  #select prompt chain to run based on synth_type
  if synth_type == "outcomes":
    out = genOutcomeChain({
        "question":question_str,
        "responses":responses_str,
        "focus":focus
    })
  if synth_type == "values":
    out = genValuesChain({
        "question":question_str,
        "responses":responses_str,
        "focus":focus
    })
  if synth_type == "bullets":
    out = genBulletsChain({
        "question":question_str,
        "responses":responses_str,
        "focus":focus
    })
  if synth_type == "summary":
    out = genSummaryChain({
        "question":question_str,
        "responses":responses_str,
        "focus":focus
    })
  out["data"]=ba
  return out

# usage example [TODO]
#qid = 10
#segs = [0,231,232,233,234,235]
#synth_type = "bullets"      #synthesize results as bullet points
#rank_type = "bridging"      #ranking responses to synthesize by bridgage agreement across [segs]
#thresh = .50                #only keep responses where 50%+ participants in all [segs] agree
#n_max = 100                 #only keep the top 100 responses withi highest bridging agreement
#query_text = "enviroment"   #focus on resposnes and sythesized results related to "enviroment"

#out = synthesize(qs,qid,segs,synth_type,rank_type,thresh,n_max,query_text = query_text) #run the sytthesis
#print(qs[qid]["Question"][1]) #print quesiton text
#print(out[synth_type]) #print synthesis results
#dat = out["data"] #get the data set used for results sythesis

  rerankChain = LLMChain(llm=llm, prompt=rerankPrompt,output_key="reranked_responses")


In [10]:
# prompt: download qs as json file. qs is a list of dataframes, formed using the python code below:
# uploaded = files.upload()
#     filename = next(iter(uploaded))
#     with open(filename,'r') as file:
#         loaded_list = json.load(file)
#   # Convert the list of dictionaries back to DataFrames
#   qs = [pd.DataFrame(df) for df in loaded_list]

import json
from google.colab import files

def save_qs_as_json(qs, filename="qs_data.json"):
  """Saves the list of dataframes (qs) as a JSON file.

  Args:
    qs: A list of pandas DataFrames.
    filename: The name of the JSON file to be saved.
  """
  try:
    # Convert each DataFrame to a list of dictionaries
    qs_list = [df.to_dict(orient='records') for df in qs]

    # Save the list of dictionaries as a JSON file
    with open(filename, 'w') as f:
      json.dump(qs_list, f, indent=4)

    files.download(filename)  # Download the file
    print(f"Data saved to {filename} and downloaded.")

  except Exception as e:
    print(f"An error occurred: {e}")

# Example usage (assuming 'qs' is defined as in your provided code):
# Replace with your actual 'qs' variable.
# This example assumes you have a 'qs' variable already in your colab environment.
# If you do not have it, load it first as in your original code.
# For this example, we'll just create some dummy data.
#import pandas as pd
#qs = [pd.DataFrame({'A': [1, 2], 'B': [3, 4]}), pd.DataFrame({'C': [5, 6], 'D': [7, 8]})]

#save_qs_as_json(qs) #Save and download the JSON file


### LLM-based functions for identifying Themes.

1. Identifies similar clusters of responses based on embeddings. (Uses UMAP to reduce dimensionality & HDBSCAN to cluster)
2. Prompts LLM to label clusters given a sample of responses from the cluster.
3. Prompts LLM to merge any similar clusters, and re-applies merged labels.

In [11]:
!pip install -U umap-learn
!pip install hdbscan



In [12]:
# Import UMAP from umap.umap_ instead of just importing umap
import umap.umap_ as umap
import hdbscan
from concurrent.futures import ThreadPoolExecutor

client = OpenAI()


def prepare_embeddings_for_clustering(qs, qid):
    """
    Prepare embeddings from qs[qid] for clustering.
    Returns numpy array of embeddings and list of corresponding response texts.
    """
    embeddings = []
    texts = []

    # Get embeddings and corresponding texts
    # Reset index to align with range(len(qs[qid]['embedding']))
    df = qs[qid].reset_index(drop=True)
    for idx in range(len(qs[qid]['embedding'])):
        if qs[qid]['embedding'][idx] is not None:  # Skip None embeddings
            embeddings.append(qs[qid]['embedding'][idx])
            texts.append(qs[qid]['English Responses'][idx])

    return np.array(embeddings), texts

def discover_themes(qs, qid, n_neighbors=15, min_cluster_size=5, min_samples=5):
    """
    Perform theme discovery using UMAP and HDBSCAN.
    Returns cluster labels and reduced embeddings.
    """
    # Prepare data
    embeddings, texts = prepare_embeddings_for_clustering(qs, qid)

    # Reduce dimensionality with UMAP
    reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings)

    # Cluster with HDBSCAN
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
        cluster_selection_method='eom'
    )
    cluster_labels = clusterer.fit_predict(reduced_embeddings)

    return cluster_labels, reduced_embeddings, texts

def label_clusters_with_llm(cluster_labels, texts, model="gpt-4-turbo-preview", word_limit=4):
    """
    Use LLM to label clusters based on their content.
    Returns dictionary mapping cluster IDs to labels.
    """

    MAX_SAMPLE_OF_RESPONSES_TO_USE_FOR_LABELING = 50

    from openai import OpenAI
    client = OpenAI()

    # Group texts by cluster
    clusters = {}
    for label, text in zip(cluster_labels, texts):
        if label not in clusters:
            clusters[label] = []
        clusters[label].append(text)

    # Generate labels for each cluster
    cluster_labels = {}
    for cluster_id, cluster_texts in clusters.items():
        if cluster_id == -1:  # Skip noise points
            cluster_labels[cluster_id] = "Noise/Unclustered"
            continue

        # Prepare prompt with sample of texts
        sample_texts = cluster_texts[:MAX_SAMPLE_OF_RESPONSES_TO_USE_FOR_LABELING]
        prompt = f"""Analyze these responses and provide a concise 1-{word_limit} word phrase that either summarizes their shared meaning or, if more generality is needed, captures their common theme:

        Responses:
        {chr(10).join(sample_texts)}

        Shared meaning or Theme:"""

        # Get LLM response
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )

        cluster_labels[cluster_id] = response.choices[0].message.content.strip()

    return cluster_labels


def merge_similar_clusters(cluster_labels, reduced_embeddings, cluster_names, distance_threshold=0.5, similarity_threshold=0.8):
    """
    Merges similar clusters based on LLM evaluation and cluster distance,
    merging iteratively until no further merges are possible.
    """
    merged = True  # Flag to track if any merges occurred in the current iteration

    # Convert cluster_labels to a numpy array if it's not already
    cluster_labels = np.array(cluster_labels)

    while merged:
        merged = False  # Reset the flag for the current iteration

        # Get unique cluster IDs (excluding noise cluster -1)
        unique_clusters = np.unique(cluster_labels)
        unique_clusters = unique_clusters[unique_clusters != -1]

        if len(unique_clusters) <= 1:
            break  # No more clusters to merge

        # 1. Calculate Cluster Distances
        cluster_distances = calculate_cluster_distances(reduced_embeddings, cluster_labels)

        # print the average, median, min, and max cluster distances
        print(f"Average cluster distance: {np.mean(cluster_distances):.4f}")
        print(f"Median cluster distance: {np.median(cluster_distances):.4f}")
        print(f"Min cluster distance: {np.min(cluster_distances):.4f}")
        print(f"Max cluster distance: {np.max(cluster_distances):.4f}")

        # 2. Prepare Prompts for LLM
        prompts = []
        merge_candidates = []

        for i_idx, i in enumerate(unique_clusters):
            for j_idx, j in enumerate(unique_clusters[i_idx+1:], i_idx+1):
                # If the distance is within the threshold, proceed with LLM evaluation
                if cluster_distances[i_idx, j_idx] <= distance_threshold:
                    print(f"Cluster distance of {cluster_distances[i_idx, j_idx]:.4f} <= distance threshold {distance_threshold:.2f}")
                    print(f"Prompting LLM to compare clusters {cluster_names[i]} and {cluster_names[j]}")
                    distance = cluster_distances[i_idx, j_idx]
                    prompt = f"""Evaluate if these two theme labels should be merged into a single theme.
                    Themes should be specific enough for meaningful differentiation,
                    but broad enough for grouping similar responses under the same theme.

                    Theme 1: "{cluster_names[i]}"
                    Theme 2: "{cluster_names[j]}"
                    Distance measure: {distance:.4f} (lower means more similar)

                    Should these themes be merged? Answer with only 'yes' or 'no' and a brief explanation.


                    Examples:
                    Theme 1: "Anti-War"
                    Theme 2: "Promote Peace"
                    Response: no, because while both share a common goal, promoting peace can involve a wider range of actions beyond simply opposing war, such as conflict resolution, diplomacy, and building understanding between different groups.

                    Theme 1: "Anti-War"
                    Theme 2: "End conflict"
                    Response: yes, because being opposed to war is synonymous with wanting an end to conflict.
                    """

                    prompts.append(prompt)
                    merge_candidates.append((i, j))

        if not prompts:
            break  # No candidates to merge

        # 3. LLM Evaluation
        llm_responses = get_llm_responses(prompts)

        # 4. Process merges
        cluster_mapping = {label: label for label in unique_clusters}  # Initially, each cluster maps to itself

        for (i, j), llm_response in zip(merge_candidates, llm_responses):
            # Fixed: use 'in' operator instead of 'contains' method
            if "yes" in llm_response.lower():
                # Merge cluster j into cluster i
                cluster_mapping[j] = cluster_mapping[i]
                # Update the cluster name
                cluster_names[i] = f"{cluster_names[i]}/{cluster_names[j]}"
                merged = True  # Set the flag to indicate a merge occurred

        # Update cluster labels based on the mapping
        for idx, label in enumerate(cluster_labels):
            if label in cluster_mapping:
                cluster_labels[idx] = cluster_mapping[label]

        # Update cluster_names to remove merged clusters
        for old_label, new_label in cluster_mapping.items():
            if old_label != new_label:
                cluster_names[old_label] = None

        # Clean up cluster_names
        cluster_names = {k: v for k, v in cluster_names.items() if v is not None}

    return cluster_labels, cluster_names, cluster_distances


# Helper functions:

def calculate_cluster_distances(reduced_embeddings, cluster_labels):
  """
  Calculates the average Euclidean distance between clusters.

  Args:
    reduced_embeddings: The reduced embeddings of the data points.
    cluster_labels: The cluster labels for each data point.

  Returns:
    A distance matrix where element (i, j) represents the average distance
    between cluster i and cluster j.
  """
  unique_labels = np.unique(cluster_labels)
  num_clusters = len(unique_labels)
  distances = np.zeros((num_clusters, num_clusters))

  for i in range(num_clusters):
    for j in range(i + 1, num_clusters):
      cluster_i_points = reduced_embeddings[cluster_labels == unique_labels[i]]
      cluster_j_points = reduced_embeddings[cluster_labels == unique_labels[j]]

      # Calculate average distance between points in cluster i and cluster j
      avg_distance = np.mean([np.linalg.norm(p1 - p2)
                             for p1 in cluster_i_points
                             for p2 in cluster_j_points])
      distances[i, j] = avg_distance
      distances[j, i] = avg_distance  # Distance matrix is symmetric

  return distances





def get_llm_responses(prompts, model="gpt-4-turbo-preview", max_workers=10):
    """
    Calls the OpenAI API in parallel to get LLM responses for the given prompts.
    Returns:
        A list of LLM responses corresponding to the input prompts.
    """

    def get_response(prompt):
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.3
        )
        return response.choices[0].message.content.strip()

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        responses = list(executor.map(get_response, prompts))

    return responses


# Alternative approach: Instead of using cluster distance, just ask an LLM to
# merge similar themes after getting an initial list of cluster themes
def merge_themes_with_llm(qs, qid, cluster_names, model="gpt-4-turbo-preview"):
    """
    Prompts an LLM with a list of cluster names to identify themes to merge.
    Returns:
        1. A dictionary mapping original cluster IDs to merged theme names
        2. A dictionary containing information about which clusters were merged

    Args:
        cluster_names: Dictionary mapping cluster IDs to theme names
        model: The OpenAI model to use
    """
    # Skip if there are too few clusters to merge
    if len(cluster_names) <= 1:
        return cluster_names, {}

    # Create a list of themes with their original IDs for the prompt
    themes_with_ids = {i+1: (cluster_id, name) for i, (cluster_id, name) in enumerate(cluster_names.items())}

    # Create reverse mapping for parsing the response
    id_to_cluster_id = {i+1: cluster_id for i, (cluster_id, _) in enumerate(cluster_names.items())}

    # Format the themes list for the prompt
    prompt_list = [f"{i}. {name}" for i, (_, name) in themes_with_ids.items()]
    prompt_text = "\n".join(prompt_list)

    # get question text
    df = qs[qid]
    question_str = df["Question"][1]

    # Construct the prompt using JSON format for structured output
    prompt = f"""You are given a list of themes derived from clustering responses to the following survey question.
    Your task is to identify any themes that should be merged due to significant overlap or redundancy.

    Question:
    "{question_str}"

    Themes:
    {prompt_text}

    Please identify any themes that should be merged because they represent the same or very similar concepts.
    Respond with JSON in the following format:

    ```json
    {{
      "merges": [
        {{
          "merged_name": "New Theme Name",
          "original_indices": [1, 3, 5]
        }},
        {{
          "merged_name": "Another Theme Name",
          "original_indices": [2, 4]
        }}
      ]
    }}
    ```

    If no merges are recommended, respond with:
    ```json
    {{
      "merges": []
    }}
    ```

    The "original_indices" should contain the numbers of the themes from the list above that should be merged.
    The "merged_name" should be a concise, descriptive name that encompasses all merged themes.
    """

    # Call the LLM API
    import json

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        response_format={"type": "json_object"}  # Ensure JSON response
    )

    llm_output = response.choices[0].message.content.strip()

    # Parse the JSON response
    try:
        # Extract the JSON part from the response
        if "```json" in llm_output:
            json_str = llm_output.split("```json")[1].split("```")[0].strip()
        elif "```" in llm_output:
            json_str = llm_output.split("```")[1].strip()
        else:
            json_str = llm_output

        merge_data = json.loads(json_str)

        # Initialize output dictionaries
        merged_mapping = {cluster_id: name for cluster_id, name in cluster_names.items()}
        merge_info = {}

        # Process each merge
        for merge in merge_data.get("merges", []):
            merged_name = merge["merged_name"]
            original_indices = merge["original_indices"]

            # Get the original cluster IDs from the indices
            original_cluster_ids = [id_to_cluster_id[idx] for idx in original_indices]

            # Create merge information entry
            merge_info[merged_name] = {
                "original_names": [cluster_names[cluster_id] for cluster_id in original_cluster_ids],
                "cluster_ids": original_cluster_ids
            }

            # Update the mapping
            for cluster_id in original_cluster_ids:
                merged_mapping[cluster_id] = merged_name

        return merged_mapping, merge_info

    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error parsing LLM response: {e}")
        print(f"Raw response: {llm_output}")
        # Return the original mapping if there's an error
        return cluster_names, {}


def apply_merged_themes(cluster_labels, merged_mapping):
    """
    Applies the merged theme mapping to original cluster labels.

    Args:
        cluster_labels: Original numeric cluster labels from HDBSCAN
        merged_mapping: Dictionary mapping original cluster IDs to merged theme names

    Returns:
        numeric_mapping: Dictionary mapping original cluster IDs to new consolidated cluster IDs
        new_cluster_names: Dictionary mapping new cluster IDs to theme names
    """
    # Create a mapping of theme names to new cluster IDs
    unique_themes = list(set(merged_mapping.values()))
    theme_to_id = {theme: i for i, theme in enumerate(unique_themes)}

    # Create mapping from original cluster IDs to new cluster IDs
    numeric_mapping = {}
    for original_id, theme_name in merged_mapping.items():
        numeric_mapping[original_id] = theme_to_id[theme_name]

    # Create new dictionary mapping new cluster IDs to theme names
    new_cluster_names = {theme_to_id[theme]: theme for theme in unique_themes}

    # Make sure noise cluster (-1) is preserved
    if -1 in cluster_labels:
        numeric_mapping[-1] = -1
        new_cluster_names[-1] = "Noise/Unclustered"

    return numeric_mapping, new_cluster_names


def apply_numeric_mapping(cluster_labels, numeric_mapping):
    """
    Applies the numeric mapping to cluster labels, preserving their numpy array format.

    Args:
        cluster_labels: Original numeric cluster labels from HDBSCAN
        numeric_mapping: Dictionary mapping original cluster IDs to new cluster IDs

    Returns:
        numpy array of new cluster labels
    """
    import numpy as np

    # Convert to numpy array if it's not already
    cluster_labels = np.array(cluster_labels)

    # Create new array with mapped values
    new_labels = np.array([numeric_mapping.get(label, -1) for label in cluster_labels])

    return new_labels


def save_themes_to_data(qs, qid, cluster_labels, texts, cluster_names, merge_info=None):
    """
    Saves theme information to the qs data structure for the given question ID.

    Args:
        qs: The data structure containing question data (list of pandas DataFrames)
        qid: The question ID to save themes for
        cluster_labels: The final cluster labels for each response
        texts: The response texts that were clustered
        cluster_names: Dictionary mapping cluster IDs to theme names
        merge_info: Optional dictionary with information about merged themes

    Returns:
        Updated qs data structure
    """
    import pandas as pd
    import numpy as np

    # Create the metadata dictionary for question-level themes info
    themes_metadata = {
        'cluster_names': cluster_names,
        'merge_info': merge_info if merge_info else None
    }

    # Store question-level metadata in a column called 'question_themes'
    if 'question_themes' not in qs[qid].columns:
        qs[qid]['question_themes'] = None

    # Check if index 0 exists, if not use the first available index
    if 0 not in qs[qid].index:
        first_idx = qs[qid].index[0]
        qs[qid].at[first_idx, 'question_themes'] = themes_metadata
    else:
        qs[qid].at[0, 'question_themes'] = themes_metadata

    # Create mapping from response text to theme
    response_to_theme = {}
    for text, label in zip(texts, cluster_labels):
        theme_name = cluster_names.get(label, "Unclustered")
        response_to_theme[text] = theme_name

    # Add or update 'response_theme' column for individual response themes
    if 'response_theme' not in qs[qid].columns:
        qs[qid]['response_theme'] = np.nan

    # Map each response to its theme
    for i, response in enumerate(qs[qid]['English Responses']):
        if i < len(qs[qid]) and response in response_to_theme:
            idx = qs[qid].index[i]
            qs[qid].at[idx, 'response_theme'] = response_to_theme[response]

    # Also add a 'cluster_id' column to store the numeric label
    if 'cluster_id' not in qs[qid].columns:
        qs[qid]['cluster_id'] = np.nan

    # Create mapping from text to cluster ID
    text_to_cluster = {text: label for text, label in zip(texts, cluster_labels)}

    # Assign cluster IDs to each response
    for i, response in enumerate(qs[qid]['English Responses']):
        if i < len(qs[qid]) and response in text_to_cluster:
            idx = qs[qid].index[i]
            qs[qid].at[idx, 'cluster_id'] = int(text_to_cluster[response])

    return qs


def reclassify_unclustered_responses(qs, qid, batch_size=50, model="gpt-4-turbo-preview"):
    """
    Uses an LLM to attempt to classify unclustered responses into existing themes.

    Args:
        qs: The data structure containing question data
        qid: The question ID to process
        batch_size: Number of responses to process in each batch (to avoid API limits)
        model: The OpenAI model to use

    Returns:
        Updated qs data structure with reclassified responses
    """
    import pandas as pd
    import numpy as np

    # Get the DataFrame for the question
    df = qs[qid]

    # Get question text - check if it exists
    question_str = None
    if 'Question' in df.columns and len(df) > 0:
        # Get from the first available row
        question_str = df.iloc[0]['Question']
        if pd.isna(question_str) and len(df) > 1:
            question_str = df.iloc[1]['Question']

    if not question_str:
        question_str = f"Question {qid}"  # Fallback if question text not found

    # Check if required columns exist
    if 'response_theme' not in df.columns:
        raise ValueError("Column 'response_theme' not found. Run save_themes_to_data first.")

    if 'question_themes' not in df.columns:
        raise ValueError("Column 'question_themes' not found. Run save_themes_to_data first.")

    # Get theme metadata from the first row that has it
    themes_metadata = None
    for idx in df.index:
        if not pd.isna(df.at[idx, 'question_themes']):
            themes_metadata = df.at[idx, 'question_themes']
            break

    if not themes_metadata or 'cluster_names' not in themes_metadata:
        raise ValueError("Theme metadata not found or invalid.")

    # Get available themes (excluding "Noise/Unclustered")
    available_themes = [theme for theme_id, theme in themes_metadata['cluster_names'].items()
                       if theme != "Noise/Unclustered"]

    if not available_themes:
        print("No valid themes found for reclassification.")
        return qs

    # Create a formatted string of available themes for the prompt
    themes_formatted = "\n".join([f"{i+1}. {theme}" for i, theme in enumerate(available_themes)])

    # Find unclustered responses
    unclustered_mask = df['response_theme'] == "Noise/Unclustered"
    unclustered_indices = df[unclustered_mask].index.tolist()

    if not unclustered_indices:
        print("No unclustered responses found.")
        return qs

    print(f"Found {len(unclustered_indices)} unclustered responses. Attempting reclassification...")

    # Create prompts for each unclustered response
    prompts = []
    batch_indices = []

    for idx in unclustered_indices:
        response_text = df.at[idx, 'English Responses']

        # Skip empty responses
        if pd.isna(response_text) or response_text.strip() == "":
            continue

        prompt = f"""Evaluate if the following response to the question below can be classified into one of the existing themes.

Question:
"{question_str}"

Response:
"{response_text}"

Available themes:
{themes_formatted}

Steps:
1. Analyze the response's core meaning or intent
2. Compare it to the available themes
3. If the response clearly matches a theme, select that theme
4. If the response doesn't clearly match any theme, select "None"

Return ONLY the theme number or "None" with no explanation or additional text.
"""

        prompts.append(prompt)
        batch_indices.append(idx)

    # Process responses in batches
    all_classifications = []

    for i in range(0, len(prompts), batch_size):
        batch_prompts = prompts[i:i+batch_size]
        batch_idx = batch_indices[i:i+batch_size]

        print(f"Processing batch {i//batch_size + 1}/{(len(prompts) + batch_size - 1)//batch_size}...")

        # Get LLM responses using the provided function
        batch_responses = get_llm_responses(batch_prompts, model=model)

        # Process the responses
        for idx, response in zip(batch_idx, batch_responses):
            response = response.strip()

            # Try to parse as a number
            try:
                theme_num = int(response)
                if 1 <= theme_num <= len(available_themes):
                    selected_theme = available_themes[theme_num - 1]
                    df.at[idx, 'response_theme'] = selected_theme
                    all_classifications.append((idx, selected_theme))
                else:
                    # Number out of range
                    pass
            except ValueError:
                # Not a number, check if it's a theme name directly
                if response in available_themes:
                    df.at[idx, 'response_theme'] = response
                    all_classifications.append((idx, response))
                # Otherwise, leave as "Noise/Unclustered"

    # Print results
    reclassified_count = len(all_classifications)
    print(f"Reclassification complete. {reclassified_count} of {len(unclustered_indices)} responses were reclassified.")

    return qs

# Example usage:
"""
# After running save_themes_to_data:
qs = reclassify_unclustered_responses(qs, qid)

# To visualize the result:
fig, axes = plot_theme_distribution(qs, qid)
plt.show()
"""



def plot_themes(reduced_embeddings, cluster_labels, cluster_names):
    """
    Plot the clustered embeddings with their labels.
    """
    import matplotlib.pyplot as plt

    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        reduced_embeddings[:, 0],
        reduced_embeddings[:, 1],
        c=cluster_labels,
        cmap='tab20'
    )

    # Add legend
    legend_elements = [
        plt.Line2D([0], [0], marker='o', color='w',
                  label=cluster_names[label],
                  markerfacecolor=scatter.cmap(scatter.norm(label)))
        for label in set(cluster_labels) if label != -1
    ]
    plt.legend(handles=legend_elements, title="Themes", bbox_to_anchor=(1.05, 1))

    plt.title("Question Response Themes")
    plt.xlabel("UMAP Dimension 1")
    plt.ylabel("UMAP Dimension 2")
    plt.tight_layout()
    plt.show()


def plot_clusters(qs, qid, figsize=(12, 8), title=None, marker_size=50, show_legend=True,
                  legend_cols=1, highlight_cluster=None, show_unclustered=True):
    """
    Plot the clustered embeddings with their labels, reading directly from the qs structure.
    The function re-runs the initial clustering steps to ensure proper visualization.

    Args:
        qs: The data structure containing question data
        qid: The question ID to visualize
        figsize: Size of the figure as a tuple (width, height)
        title: Custom title for the plot (default: auto-generated based on qid)
        marker_size: Size of the scatter plot markers
        show_legend: Whether to show the legend
        legend_cols: Number of columns in the legend
        highlight_cluster: Cluster ID or name to highlight (others will be faded)
        show_unclustered: Whether to show unclustered points (noise)

    Returns:
        Matplotlib figure and axes objects
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from matplotlib.lines import Line2D

    # Get the DataFrame for the question
    df = qs[qid]

    # Check if we have the necessary columns
    if 'cluster_id' not in df.columns or 'response_theme' not in df.columns:
        raise ValueError("Missing required columns 'cluster_id' or 'response_theme'. Run save_themes_to_data first.")

    # Get theme metadata
    themes_metadata = None
    for idx in df.index:
        if 'question_themes' in df.columns and not pd.isna(df.at[idx, 'question_themes']):
            themes_metadata = df.at[idx, 'question_themes']
            break

    if not themes_metadata or 'cluster_names' not in themes_metadata:
        raise ValueError("Theme metadata not found or invalid. Run save_themes_to_data first.")

    # Simply re-run the discover_themes function to get embeddings and cluster labels
    print("Running dimensionality reduction and clustering...")

    # Use the discover_themes function which correctly handles your data format
    from umap.umap_ import UMAP
    import hdbscan

    # Prepare embeddings for clustering (directly without using the function)
    embeddings = []
    texts = []

    # Get embeddings and corresponding texts safely
    for idx in range(len(df['embedding'])):
        try:
            embedding_val = df['embedding'].iloc[idx]
            if embedding_val is not None and isinstance(embedding_val, (list, np.ndarray)) and len(embedding_val) > 0:
                embeddings.append(embedding_val)
                texts.append(df['English Responses'].iloc[idx])
        except (TypeError, ValueError):
            continue

    if not embeddings:
        raise ValueError("No valid embeddings found for visualization.")

    embeddings_array = np.array(embeddings)

    # Run dimensionality reduction with UMAP
    reducer = UMAP(n_neighbors=15, random_state=42)
    reduced_embeddings = reducer.fit_transform(embeddings_array)

    # Get the existing cluster IDs for these texts
    text_to_cluster_id = {}
    text_to_theme = {}

    for idx in df.index:
        if pd.notna(df.at[idx, 'response_theme']) and pd.notna(df.at[idx, 'cluster_id']):
            text = df.at[idx, 'English Responses']
            text_to_cluster_id[text] = int(df.at[idx, 'cluster_id'])
            text_to_theme[text] = df.at[idx, 'response_theme']

    # Map reduced embeddings to their cluster IDs and themes
    cluster_ids = []
    themes = []

    for text in texts:
        cluster_id = text_to_cluster_id.get(text, -1)  # Default to noise cluster
        theme = text_to_theme.get(text, "Noise/Unclustered")
        cluster_ids.append(cluster_id)
        themes.append(theme)

    # Convert to numpy arrays for filtering
    cluster_ids = np.array(cluster_ids)
    themes = np.array(themes)

    # Create figure and axis
    fig, ax = plt.subplots(figsize=figsize)

    # Get coordinates for plotting
    x_coords = reduced_embeddings[:, 0]
    y_coords = reduced_embeddings[:, 1]

    # Filter out unclustered points if needed
    if not show_unclustered:
        mask = cluster_ids != -1
        x_coords = x_coords[mask]
        y_coords = y_coords[mask]
        cluster_ids = cluster_ids[mask]
        themes = themes[mask]

    # Apply highlighting if specified
    highlight_mask = None
    if highlight_cluster is not None:
        if isinstance(highlight_cluster, str):
            # Highlight by theme name
            highlight_mask = themes == highlight_cluster
        else:
            # Highlight by cluster ID
            highlight_mask = cluster_ids == highlight_cluster

    # Create the scatter plot
    scatter = ax.scatter(
        x_coords,
        y_coords,
        c=cluster_ids,
        cmap='tab20',
        s=marker_size,
        alpha=0.7 if highlight_mask is None else np.where(highlight_mask, 1.0, 0.2)
    )

    # Create legend
    if show_legend:
        # Get unique cluster IDs (excluding noise if not showing it)
        unique_ids = sorted(set(cluster_ids))
        if not show_unclustered and -1 in unique_ids:
            unique_ids.remove(-1)

        # Create legend elements
        legend_elements = []
        for label in unique_ids:
            if label == -1:
                name = "Noise/Unclustered"
            else:
                name = themes_metadata['cluster_names'].get(label, f"Cluster {label}")

            legend_elements.append(
                Line2D([0], [0], marker='o', color='w', markerfacecolor=scatter.cmap(scatter.norm(label)),
                      markersize=10, label=name)
            )

        # Add the legend
        ax.legend(
            handles=legend_elements,
            title="Themes",
            loc='center left',
            bbox_to_anchor=(1, 0.5),
            ncol=legend_cols
        )

    # Set title and labels
    if title is None:
        # Try to get the question text
        question_text = None
        if 'Question' in df.columns:
            for idx in df.index:
                if pd.notna(df.at[idx, 'Question']):
                    question_text = df.at[idx, 'Question']
                    break

        if question_text:
            # Truncate long questions
            if len(question_text) > 70:
                question_text = question_text[:67] + "..."
            title = f"Question {qid}: {question_text}"
        else:
            title = f"Question {qid} Response Themes"

    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel("UMAP Dimension 1")
    ax.set_ylabel("UMAP Dimension 2")

    # Add summary information
    summary_text = (
        f"Total points: {len(x_coords)}\n"
        f"Number of clusters: {len([l for l in unique_ids if l != -1])}"
    )

    plt.figtext(0.02, 0.02, summary_text, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))

    # Adjust layout
    plt.tight_layout()

    return fig, ax



def plot_theme_distribution(qs, qid, min_percentage=1, sort_by='count', figsize=(12, 8)):
    """
    Creates a visualization of theme distribution for a specific question.

    Args:
        qs: The data structure containing question data
        qid: The question ID to visualize
        min_percentage: Minimum percentage for a theme to be shown separately (smaller will be grouped as 'Other')
        sort_by: How to sort the bars - 'count' (default), 'alphabetical', or 'percentage'
        figsize: Size of the figure as a tuple (width, height)

    Returns:
        Matplotlib figure and axes objects
    """
    import matplotlib.pyplot as plt
    import pandas as pd
    import numpy as np

    # Get the DataFrame for the question
    df = qs[qid]

    # Check if response_theme column exists
    if 'response_theme' not in df.columns:
        raise ValueError("Column 'response_theme' not found. Run save_themes_to_data first.")

    # Count occurrences of each theme
    theme_counts = df['response_theme'].value_counts()
    total_responses = len(df)
    theme_percentages = (theme_counts / total_responses) * 100

    # Combine themes with percentage less than min_percentage into 'Other'
    small_themes = theme_percentages[theme_percentages < min_percentage].index
    if not small_themes.empty:
        # Create a copy to avoid modifying the original DataFrame
        df_copy = df.copy()
        df_copy.loc[df_copy['response_theme'].isin(small_themes), 'response_theme'] = 'Other'

        # Recalculate counts and percentages
        theme_counts = df_copy['response_theme'].value_counts()
        theme_percentages = (theme_counts / total_responses) * 100

    # Create a DataFrame for plotting
    plot_data = pd.DataFrame({
        'Count': theme_counts,
        'Percentage': theme_percentages
    })

    # Sort the data
    if sort_by == 'count':
        plot_data = plot_data.sort_values('Count', ascending=False)
    elif sort_by == 'alphabetical':
        plot_data = plot_data.sort_index()
    elif sort_by == 'percentage':
        plot_data = plot_data.sort_values('Percentage', ascending=False)

    # Create figure and axis
    fig, ax1 = plt.subplots(figsize=figsize)

    # Plot counts as bars
    bars = ax1.bar(plot_data.index, plot_data['Count'], color='steelblue')
    ax1.set_xlabel('Theme')
    ax1.set_ylabel('Count', color='steelblue')
    ax1.tick_params(axis='y', labelcolor='steelblue')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha='right')

    # Create a second y-axis for percentages
    ax2 = ax1.twinx()
    ax2.set_ylabel('Percentage (%)', color='red')
    ax2.plot(plot_data.index, plot_data['Percentage'], 'ro-')
    ax2.tick_params(axis='y', labelcolor='red')

    # Add percentage labels above each bar
    for i, bar in enumerate(bars):
        percentage = plot_data['Percentage'].iloc[i]
        count = plot_data['Count'].iloc[i]
        ax1.text(
            bar.get_x() + bar.get_width()/2,
            bar.get_height() + 0.5,
            f'{percentage:.1f}%\n({count})',
            ha='center',
            va='bottom',
            fontweight='bold'
        )

    # Add title
    plt.title(f'Theme Distribution for Question {qid}', fontsize=14, fontweight='bold')

    # Add summary information
    total_themes = len(plot_data)
    primary_theme = plot_data.index[0] if not plot_data.empty else "None"
    primary_percentage = plot_data['Percentage'].iloc[0] if not plot_data.empty else 0

    summary_text = (
        f'Total Responses: {total_responses}\n'
        f'Number of Themes: {total_themes}\n'
        f'Primary Theme: {primary_theme} ({primary_percentage:.1f}%)'
    )

    plt.figtext(0.02, 0.02, summary_text, fontsize=10, bbox=dict(facecolor='white', alpha=0.8))

    # Adjust layout
    plt.tight_layout()

    # Return figure and axes for further customization if needed
    return fig, (ax1, ax2)

# Example usage:
"""
# After running save_themes_to_data:
fig, axes = plot_theme_distribution(qs, qid)
plt.show()

# With customization:
fig, axes = plot_theme_distribution(
    qs,
    qid,
    min_percentage=2,  # Group themes with <2% representation as 'Other'
    sort_by='alphabetical',  # Sort themes alphabetically
    figsize=(14, 10)  # Larger figure
)
plt.show()
"""

def get_responses_by_theme(qs, qid, theme_name, max_responses=None, return_dataframe=False):
    """
    Retrieves all responses tagged with a specific theme.

    Args:
        qs: The data structure containing question data
        qid: The question ID to analyze
        theme_name: The theme name to filter by
        max_responses: Maximum number of responses to return (None for all)
        return_dataframe: If True, returns a DataFrame; if False, returns a list of responses

    Returns:
        Either a list of responses or a DataFrame, depending on return_dataframe parameter
    """
    import pandas as pd

    # Get the DataFrame for the question
    df = qs[qid]

    # Check if required column exists
    if 'response_theme' not in df.columns:
        raise ValueError("Column 'response_theme' not found. Run save_themes_to_data first.")

    # Filter responses with the specified theme
    filtered_df = df[df['response_theme'] == theme_name].copy()

    # Sort by cluster_id if available
    if 'cluster_id' in filtered_df.columns:
        filtered_df = filtered_df.sort_values('cluster_id')

    # Limit the number of responses if specified
    if max_responses is not None and max_responses > 0:
        filtered_df = filtered_df.head(max_responses)

    if return_dataframe:
        return filtered_df
    else:
        # Return just the responses as a list
        responses = filtered_df['English Responses'].tolist()
        return responses

# Example usage:
"""
# Get all responses for a specific theme
responses = get_responses_by_theme(qs, qid, "Peace and Unity")

# Print the first 5 responses
for i, response in enumerate(responses[:5], 1):
    print(f"{i}. {response}")
    print("-" * 40)

# Or get the entire DataFrame with all columns
theme_df = get_responses_by_theme(qs, qid, "Peace and Unity", return_dataframe=True)
"""


# Example usage workflow:
"""
# Step 1: Discover themes
cluster_labels, reduced_embeddings, texts = discover_themes(qs, qid)

# Step 2: Label clusters
cluster_names = label_clusters_with_llm(cluster_labels, texts)

# Step 3: Merge similar clusters by LLM recognition of theme similarity
merged_mapping, merge_info = merge_themes_with_llm(qs, qid, cluster_names)

# Step 4: Apply merged themes to get new numeric labels and theme names
numeric_mapping, new_cluster_names = apply_merged_themes(cluster_labels, merged_mapping)
new_cluster_labels = apply_numeric_mapping(cluster_labels, numeric_mapping)

# Step 5: Save themes to the data structure
qs = save_themes_to_data(qs, qid, new_cluster_labels, texts, new_cluster_names, merge_info)

# Step 6: Reclassify unclustered responses
qs = reclassify_unclustered_responses(qs, qid)

# Step 7: Visualize the results
plot_theme_distribution(qs, qid)
fig, ax = plot_clusters(qs, qid, show_unclustered=False)
plt.show()

# Step 8: Look at some individual responses by theme
responses = get_responses_by_theme(qs, qid, "Peace and Unity", max_responses=5)
for i, response in enumerate(responses, 1):
    print(f"{i}. {response}")
"""

'\n# Step 1: Discover themes\ncluster_labels, reduced_embeddings, texts = discover_themes(qs, qid)\n\n# Step 2: Label clusters\ncluster_names = label_clusters_with_llm(cluster_labels, texts)\n\n# Step 3: Merge similar clusters by LLM recognition of theme similarity\nmerged_mapping, merge_info = merge_themes_with_llm(qs, qid, cluster_names)\n\n# Step 4: Apply merged themes to get new numeric labels and theme names\nnumeric_mapping, new_cluster_names = apply_merged_themes(cluster_labels, merged_mapping)\nnew_cluster_labels = apply_numeric_mapping(cluster_labels, numeric_mapping)\n\n# Step 5: Save themes to the data structure\nqs = save_themes_to_data(qs, qid, new_cluster_labels, texts, new_cluster_names, merge_info)\n\n# Step 6: Reclassify unclustered responses\nqs = reclassify_unclustered_responses(qs, qid)\n\n# Step 7: Visualize the results\nplot_theme_distribution(qs, qid)\nfig, ax = plot_clusters(qs, qid, show_unclustered=False)\nplt.show()\n\n# Step 8: Look at some individual respon

# Analysis library usage examples

### List questions and segments by ID for reference

In [3]:
# show questions by type and ID
# this ID aka "qid" is what is used to reference the question in the other analysis functons
show_questions(qs)

Unnamed: 0,question type,question text
0,Poll Single Select,Please select your preferred language:
1,Poll Single Select,How old are you?
2,Poll Single Select,What is your gender?
3,Poll Single Select,What best describes where you live?
4,Poll Single Select,"Overall, would you say the increased use of artificial intelligence (AI) in daily life makes you feel…"
...,...,...
61,Poll Single Select,Do you agree or disagree?
62,Ask Opinion,Please explain whether you agree or disagree and why.
63,Poll Single Select,"Do you feel you were able to fully express your views on your culture, your values, and what you want for the future in this conversation? (through what you responded and how you voted)"
64,Poll Single Select,Do you feel like you understand yourself better after participating in this conversation?


In [None]:
# list the sgements by their ID
# these IDs are what are used to reference segments in other analysis functions
# common usage of segments IDs si: segs = [0,1] to do analysis comparing segments 0 and 1
show_segments(qs)

### Plot poll results

In [None]:
#choose question and segments
qid = 25

segs = [0]

#plot
d = plot_poll(qs[qid],segs)

### Ranking and displaying responses to collective response prompts

#### Basic
First we'll create a simple visualization of the results of a **collective response** question (aka "ask opinion" on Remesh) where users respond with natural language and then vote on the responses submitted by others. The visualization is generated for a selected *question* and *set of segments*. In the visualization each row corresponds to a response, columns correspond to the selected set of segments, and values correspond to the fraction of each segment which  agrees* with each response.

*this agreement fraction is computed on Remesh using [elicitation inference](https://openreview.net/pdf?id=tkxnRPkb_H). We sample around 10-30 votes per person, then infer the rest. Accuracy of individual vote inferences is 75-80%, and the aggregated agreement fraction values for each segment have a 1 stdv confidence range of around +/- 1-3% relative to the participant sample*

In [None]:
#choose question and segments
qid = 21
segs = segs = [231,232,233,234,235,236,0]

#print the question text
print(qs[qid]["Question"][1])

#visualize table of results with the first 5 responses
table_ask(qs[qid],segs,15)

#### Rank by bridging, polarization, and divergence

Now we'll create the same table, but add metrics for bridging, polarizaiton, and divergence. We'll use the "bridging_ask" function that automatically ranks by the bridging metric.

- **The bridging metric** is meant to capture the degree to which there is agreement for a response across ALL specified population segments; even those which typically disagree. To capture this we use the segment-level analouge of a Max-Min social wellfare function. If a_ij is the fraction of the j^th segment which agrees with i^th response, then we compute the bridging metric for that response as b_i = MIN(a_i1,a_i2,...,a_iN)

- **The polarization metric** is meant to capture the degree to which there is polariation between specified segments about a response. To capture this we compute the difference in agreement fraction for the segments which most agree with the response and least agree with the response. ie p_i = MAX(a_i1,a_i2,...,a_iN) - MIN(a_i1,a_i2,...,a_iN)

- **The divergence metric** is meant to capture the degree to which there is a minority vs majority split between specified segments about a response. ie. where the majority of one segment agrees with a response but a majority of another segment disagrees with it.  To capture this we compute the geometric mean of how far away the highest and lowest agreement segments are from 50% agreement. ie d_i = SQRT(MAX(dmax_i-0.5,0)*MAX((0.5-dmin)_i,0)), where dmax_i = MAX(a_i1,a_i2,...,a_iN) and dmin_i = MIN(a_i1,a_i2,...,a_iN)

In [None]:
#choose question and segments
qid = 21
segs = segs = [231,232,233,234,235,236,0]

#print the question
print(qs[qid]["Question"][1])

#generate a table with bridging, polarization, and consensus metrics; keeping the 5 responses with the hightest bridging agreement
ba = bridging_ask(qs[qid],segs).iloc[:5]

#display a pretty version fo the table
ba.style.pipe(make_pretty)

We can show responses with the highest divergence in a similar manner using the *get_divergent_responses* funciton.

In [None]:
#choose question and segments
qid = 21
segs = segs = [231,232,233,234,235,236,0]

#print the question
print(qs[qid]["Question"][1])

#generate a table with bridging, polarization, and consensus metrics; keeping the 5 responses with the hightest polarization
ba = get_divergent_responses(qs[qid],segs,5)

#display a pretty version fo the table
ba.style.pipe(make_pretty)

#### Rank by relevence

We use the *rank_by_similarity* function to rank responses by relevance to a given term, topic, phrase, or question

In [None]:
#choose question and segments
qid = 24
segs = segs = [231,232,233,234,235,236,0]

#text we want responses to be relevant to
text = "the next generation of children"

#print the question
print(qs[qid]["Question"][1])

#generate a table with responsees ranked by relatedness to the search text
ba = rank_by_similarity(qs[qid], text)

#display a pretty version of the table the top 5 most relevant responses and agreement data for the given segments
table_ask(ba,segs,5)

### LLM result synthesis

#### Generate a **bullet point summary** of ideas in bridging responses

In [None]:
#select question and segments
qid = 24
segs = segs = [0]

#set the params for the
synth_type = "bullets"      #synthesize results as bullet points
rank_type = "bridging"      #ranking responses to synthesize by bridgage agreement across [segs]
thresh = .50                #only keep responses where 50%+ participants in all [segs] agree
n_max = 50                  #only keep the top 50 responses withi highest bridging agreement
query_text = ""             #set blank to not focus on any particular topic

#run the sytthesis
out = synthesize(qs,qid,segs,synth_type,rank_type,thresh,n_max,query_text = query_text)

#print quesiton text
print(qs[qid]["Question"].iloc[0])

#print synthesis results
print(out[synth_type])

#### Generate a **taxonomy summarizing ideas** from bridging responses related to *healthcare*

In [None]:
#select question and segments
qid = 27
segs = segs = [0]

#set the params for the
synth_type = "summary"      #synthesize results as taxonomic summary
rank_type = "bridging"      #ranking responses to synthesize by bridgage agreement across [segs]
thresh = .50                #only keep responses where 50%+ participants in all [segs] agree
n_max = 1000                #only keep the top 100 responses withi highest bridging agreement
query_text = ""   #focus on resposnes and sythesized results related to "healthcare"

#run the sytthesis
out = synthesize(qs,qid,segs,synth_type,rank_type,thresh,n_max,query_text = query_text)

#print quesiton text
print(qs[qid]["Question"].iloc[0])

#print synthesis results
print(out[synth_type])

#### Generate a set of **outcomes** related to *healthcare*


In [None]:
#select question and segments
qid = 21
segs = segs = [231,232,233,234,235]

#set the params for the
synth_type = "outcomes"      #synthesize result into outcomes
rank_type = "relevance"      #ranking responses to synthesize by bridgage agreement across [segs]
thresh = .33                #only keep responses where cosine similarity with query_text is >0.33
n_max = 100                 #only keep the top 100 responses withi highest bridging agreement
query_text = "human-AI relationships"  #focus on resposnes and sythesized results related to "climate"

#run the sytthesis
out = synthesize(qs,qid,segs,synth_type,rank_type,thresh,n_max,query_text = query_text)

#print quesiton text
print(qs[qid]["Question"][1])

#print synthesis results
print(out[synth_type])

#show the top 10 most relevant responses used in the synthesis along with their agreement across contintents
dat = out["data"]
table_ask(dat,segs,10)

## Get Themes of Feedback on the Survey

In [None]:
qid = 27 # Feedback question
#print quesiton text
print(qs[qid]["Question"][1])

# Step 1: Discover themes
cluster_labels, reduced_embeddings, texts = discover_themes(qs, qid)

# Step 2: Label clusters
cluster_names = label_clusters_with_llm(cluster_labels, texts)

# Step 3: Merge similar clusters by LLM recognition of theme similarity
merged_mapping, merge_info = merge_themes_with_llm(qs, qid, cluster_names)

# Step 4: Apply merged themes to get new numeric labels and theme names
numeric_mapping, new_cluster_names = apply_merged_themes(cluster_labels, merged_mapping)
new_cluster_labels = apply_numeric_mapping(cluster_labels, numeric_mapping)

# Step 5: Save themes to the data structure
qs = save_themes_to_data(qs, qid, new_cluster_labels, texts, new_cluster_names, merge_info)

# Step 6: Reclassify unclustered responses
qs = reclassify_unclustered_responses(qs, qid)

# Step 7: Visualize the results
plot_theme_distribution(qs, qid)
fig, ax = plot_clusters(qs, qid, show_unclustered=False)
plt.show()

In [None]:
responses = get_responses_by_theme(qs, qid, "General Satisfaction/Positive Feedback")
for i, response in enumerate(responses, 1):
    print(f"{i}. {response}")

In [None]:
save_data_to_drive(qs)

# Global Dialogues Cadence Analysis

## GD Indicators

In [14]:
pd.set_option('display.max_rows', None)
show_questions(qs)
indicator_questions = [4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28]
indicator_questions = {
    'Risks':[]
                       }

In [None]:
qs[]

In [None]:
# prompt: I want to output a dataframe that shows the comma-delimited-joined "Responses" for each dataframe at every index in qs where qs[i]["Question Type"] == "Poll Single Select"
# For example, the output of qs[35]["Responses] is a df:
# Responses
# 0	Profoundly Worse
# 1	Noticeably Worse
# 2	No Major Change
# 3	Noticeably Better
# 4	Profoundly Better
# and the output of qs[35]["Question Type"] is a dataframe:
# 	Question Type
# 0	Poll Single Select
# 1	Poll Single Select
# 2	Poll Single Select
# 3	Poll Single Select
# 4	Poll Single Select
# qs is a list of dataframes. Make this as simple and straightforward using standard pandas pd dataframe practices/tricks to do this like a regular data scientist in a python notebook would do

import pandas as pd

def get_poll_responses(qs):
  """
  Returns a DataFrame with comma-delimited "Responses" for each DataFrame
  where "Question Type" is "Poll Single Select".
  """

  poll_data = []
  for i in range(len(qs)):
    if "Question Type" in qs[i].columns and all(qs[i]["Question Type"] == "Poll Single Select"):
      responses = ", ".join(qs[i]["Responses"].astype(str))
      poll_data.append({"Index": i, "Responses": responses})
  return pd.DataFrame(poll_data)

# Example usage (assuming 'qs' is defined as in your provided code):
result_df = get_poll_responses(qs)
result_df


In [None]:
qs[35]["Responses"]
get_poll_responses(qs)

In [20]:
show_questions(qs)

Unnamed: 0,question type,question text
0,Poll Single Select,Please select your preferred language:
1,Poll Single Select,How old are you?
2,Poll Single Select,What is your gender?
3,Poll Single Select,What best describes where you live?
4,Poll Single Select,"Overall, would you say the increased use of artificial intelligence (AI) in daily life makes you feel…"
5,Poll Single Select,What religious group or faith do you most identify with?
6,Poll Single Select,What country or region do you most identify with?
7,Poll Single Select,"Thinking about the last three months, how often, if at all, have you noticed AI systems in your daily life?"
8,Poll Single Select,"Thinking about the last three months, how often, if at all, have you noticed human interactions which have been replaced with automated systems?"
9,Poll Single Select,"Thinking about the last three months, how often, if at all, have you been expected to use an AI system at work?"


## Convergence Analysis

#### Identify Segments and Prepare Data
First, we need a reliable way to get the names of all segment columns used for agreement percentages. We can inspect the columns of the first 'Ask Opinion' DataFrame. We also need to filter qs to only include 'Ask Opinion' questions.

In [45]:
# Import necessary libraries (if not already imported)
import pandas as pd
import numpy as np
from scipy.stats import variation # Keep this if using CV later

# Assuming 'qs' is the list of DataFrames loaded previously (e.g., from JSON)

# --- Direct Filtering of qs ---
# Filter qs directly for 'Ask Opinion' questions
# This checks the 'Question Type' column within each DataFrame
qs_opinion = [
    df for df in qs
    if 'Question Type' in df.columns and not df.empty and df['Question Type'].iloc[0] == 'Ask Opinion'
]

# --- Identify Segment Columns ---
# (This part remains similar, using the first identified 'Ask Opinion' DF)
if qs_opinion:
    first_opinion_df = qs_opinion[0]
    # Determine segment columns - **Double-check the starting index based on your JSON structure**
    # If loaded from the same source CSV logic, index 6 should be correct.
    # Inspect first_opinion_df.columns if unsure.
    segment_start_index = 6
    segment_end_offset = -3 # Exclude last 3 columns (Submitted By, Language, etc.)
    segment_cols = first_opinion_df.columns[segment_start_index:segment_end_offset].tolist()

    print(f"Identified {len(segment_cols)} segment columns from the first 'Ask Opinion' DataFrame.")
    # print(segment_cols) # Optionally print all segment names

    # Define a helper function for safe numeric conversion (can be defined elsewhere too)
    def safe_to_numeric(series):
        return pd.to_numeric(series, errors='coerce')

    # --- Pre-process Filtered DataFrames ---
    qs_opinion_processed = []
    # Now iterate through the correctly filtered qs_opinion list
    for df in qs_opinion:
        df_copy = df.copy()
        for col in segment_cols:
            if col in df_copy.columns:
                df_copy[col] = safe_to_numeric(df_copy[col])
            else:
                df_copy[col] = np.nan # Handle potentially missing segment columns
        qs_opinion_processed.append(df_copy)

    print(f"\\nSuccessfully processed {len(qs_opinion_processed)} 'Ask Opinion' questions for numeric analysis.")

else:
    print("No 'Ask Opinion' questions found in the 'qs' list based on internal 'Question Type'.")
    segment_cols = []
    qs_opinion_processed = []

# --- Retrieve Question Text ---
# If you need the question text later, you can get it from the DataFrame directly
# Example: Get text for the first processed opinion question
if qs_opinion_processed:
    first_question_text = qs_opinion_processed[0]['Question'].iloc[0]
    print(f"\\nExample: Text of first opinion question: '{first_question_text[:100]}...'") # Print first 100 chars
else:
     print("\\nNo processed opinion questions to retrieve text from.")


Identified 259 segment columns from the first 'Ask Opinion' DataFrame.
\nSuccessfully processed 15 'Ask Opinion' questions for numeric analysis.
\nExample: Text of first opinion question: 'Can you explain why you gave that trust score to your AI chatbot?...'


In [46]:
qs_opinion[0]

Unnamed: 0,Question ID,Question Type,Question,Star,English Responses,Original Responses,Sentiment,All(967),O1: French (13),O1: Chinese (China) (91),...,O7: Venezuela (0),O7: Vietnam (12),O7: Yemen (0),O7: Zambia (0),O7: Zimbabwe (1),Submitted By,Language,Sample ID,Participant ID,embedding
0,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"While they are made to be helpful on the surface, there are deeper issues (such as data collection etc.).","While they are made to be helpful on the surface, there are deeper issues (such as data collection etc.).",Neutral,0.56,0.23,0.57,...,,0.5,,,0.0,"Europe, O1: English, O2: 18-25, O3: Female, O4: Urban, O5: Equally concerned and excited, O6: Other religious group, O7: Germany, Western Europe",English,,eaffc3e9-065d-4fd2-b370-6278744bf38b,"[-0.0014181445585563779, 0.0442076213657856, 0.08159715682268143, 0.05447141453623772, 0.019189627841114998, 0.010437910445034504, 0.017961638048291206, -0.012536490336060524, 0.007372519001364708, -0.002866072580218315, 0.028683636337518692, -0.02225043624639511, -0.03738953173160553, -0.034237079322338104, 0.009548992849886417, 0.030461471527814865, 0.006025396287441254, -0.0108136385679245, 0.01240819226950407, 0.05285853520035744, 0.06462524086236954, 0.06704456359148026, 0.027565615251660347, -0.009475680068135262, -0.004675982519984245, -0.018823063001036644, -0.006616480648517609, 0.06407539546489716, 0.044940751045942307, 0.0024559791199862957, 0.023496754467487335, -0.04098185896873474, -0.05711067467927933, -0.016055503860116005, 0.018062442541122437, 0.027217378839850426, 0.028573665767908096, -0.041348423808813095, 0.029490076005458832, 0.04156836122274399, 0.018878048285841942, 0.027070753276348114, -0.052052091807127, -0.013462063856422901, -0.012014136649668217, -0.008669239468872547, -0.06099625676870346, -0.009988869540393353, -0.016101324930787086, 0.037077952176332474, -0.009778095409274101, 0.07573212683200836, -0.0033311506267637014, -0.038709163665771484, -0.007432085927575827, -0.03456698730587959, -0.031616147607564926, -0.009040385484695435, -0.014515935443341732, -0.0035808724351227283, 0.07800482958555222, -0.0015338413650169969, -0.011940822936594486, 0.028628651052713394, -0.03082803450524807, -0.006506511475890875, -0.011785034090280533, 0.006483601406216621, -0.08782874047756195, 0.031487852334976196, 0.03544674068689346, -0.05065914988517761, 0.0003084292693529278, -0.00013481250789482147, 0.009943049401044846, -0.03348562493920326, 0.08313672244548798, 0.023570068180561066, -0.011198530904948711, 0.022543687373399734, 0.011620080098509789, -0.006634809076786041, -0.06873075664043427, -0.03645479306578636, -0.0007852488779462874, 0.006799762602895498, -0.09559989720582962, -0.0038328850641846657, -0.08284347504377365, -0.012059956789016724, -0.057917118072509766, -0.0027011188212782145, -0.01732015050947666, 0.06015315651893616, 0.03892910107970238, 0.0009839953854680061, -0.012362372130155563, 0.03266085684299469, -0.04582050442695618, 0.0030447724275290966, ...]"
1,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"I think ""trust"" and ""distrust"" require an element of agency- and I believe that chatbots, while occassionally inaccurate, are tools and can't be assessed as either trustworthy or untrustworthy.","I think ""trust"" and ""distrust"" require an element of agency- and I believe that chatbots, while occassionally inaccurate, are tools and can't be assessed as either trustworthy or untrustworthy.",Negative,0.56,0.38,0.44,...,,0.67,,,0.0,"North America, Northern America, O1: English, O2: 36-45, O3: Male, O4: Suburban, O5: More excited than concerned, O6: I do not identify with any religious group or faith, O7: Canada",English,,a98c329f-cca1-473c-b74e-823bf2601949,"[0.013131069019436836, 0.05228687822818756, -0.010141340084373951, 0.04106982797384262, 0.09383150190114975, -0.01073483470827341, 0.014748342335224152, -0.004428953863680363, 0.023962346836924553, 0.027909085154533386, 0.013605864718556404, -0.04519461840391159, 0.0232798270881176, 0.0025520268827676773, -0.007210960146039724, -0.023235315456986427, -0.0634445771574974, -0.02777555026113987, -0.01816093549132347, -0.0188582930713892, -0.001222228049300611, -0.003088026773184538, -0.0007752524106763303, -0.06546246260404587, -0.031692612916231155, -0.04178202152252197, 0.015683095902204514, 0.0037686910945922136, 0.014904133975505829, -0.005237590055912733, 0.026944657787680626, -0.030950745567679405, -0.008531485684216022, 0.00260395766235888, 0.015163788571953773, 0.0679551362991333, -0.01535667385905981, 0.02632148750126362, 0.0576283298432827, -0.0028932865243405104, -0.03439301624894142, -0.0025019508320838213, 0.02369527332484722, -0.0009435637621209025, -0.007759942673146725, -0.035075534135103226, -0.05246492847800255, -0.05035801976919174, -0.0110686756670475, 0.045432016253471375, -0.11585015803575516, 0.03694504126906395, 0.025638969615101814, -0.013821006752550602, -0.026395674794912338, 0.010081990621984005, -0.015697933733463287, 0.00874662771821022, -0.03807268291711807, 0.02473388984799385, -0.015505047515034676, -0.005526918917894363, 0.04163365066051483, 0.08035917580127716, -0.025980228558182716, 0.005556593649089336, -0.03216740861535072, -0.03741983696818352, -0.05068444460630417, -0.004258323926478624, -0.00645054504275322, 0.03267187997698784, -0.03519423305988312, 0.03985316678881645, 0.0005341451615095139, 0.006802932359278202, 0.02001560665667057, 0.03041660040616989, -0.008991443552076817, -0.029496684670448303, -0.020816825330257416, -0.018947316333651543, -0.055610448122024536, -0.05196045711636543, -0.04397795349359512, 0.010742252692580223, -0.03041660040616989, -0.021781254559755325, -0.03237513452768326, -0.03477878496050835, -0.042790964245796204, 0.05098119005560875, 0.02691498212516308, 0.06009133160114288, 0.05012062191963196, -0.05246492847800255, -0.014703829772770405, 0.01455545611679554, 0.019095690920948982, -0.025312546640634537, ...]"
2,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"I'm asking the question to Chat to benefit myself and my respect, so I believe he will answer according to what I asked without interfering in anything, giving the exact answer I want...","To fazendo a pergunta para o Chat para me beneficiar e ao meu respeito, então creio que ele vá responder conforme o que eu perguntei sem interferir em nada, dando a resposta exata que eu quero...",Positive,0.56,0.31,0.48,...,,0.58,,,1.0,"44+, 55+, O1: Portuguese (Portugal), O2: 56-65, O3: Female, O4: Urban, O5: Equally concerned and excited, O6: Christianity, O7: Brazil, South America",PortuguesePortugal,,6ff4b00f-1d8f-464d-8714-6fa31037ebfa,"[0.020496023818850517, -0.03051287680864334, 0.039944130927324295, 0.04049890860915184, 0.023686006665229797, -0.044875502586364746, -0.03627641871571541, 0.018230672925710678, -0.014262458309531212, -0.016967007890343666, 0.02419455349445343, 0.021667225286364555, 0.006672765593975782, 0.00913845282047987, -0.030250897631049156, -0.03359498456120491, -0.015171680599451065, -0.008837946690618992, 0.018708400428295135, -0.010086201131343842, 0.01984878070652485, 0.02923380210995674, -0.03347169980406761, 0.02199084684252739, -0.02262267842888832, -0.03760172799229622, -0.03596821054816246, 0.028679022565484047, 0.010956896468997002, 0.01372308935970068, -0.009338789619505405, -0.02692222036421299, -0.018646758049726486, -0.04210160672664642, 0.005925354082137346, -0.006025522481650114, 0.021281961351633072, -0.024055859073996544, 0.025381166487932205, -0.01565711200237274, -0.018076566979289055, -0.034026481211185455, -0.04114615172147751, 0.0441974401473999, -0.005439921747893095, -0.01821526326239109, -0.06435443460941315, -0.018954968079924583, 0.0035752460826188326, 0.00617962796241045, -0.035598356276750565, 0.012605824507772923, 0.026768114417791367, 0.11126412451267242, 0.03926606476306915, 0.012413193471729755, 0.028108831495046616, 0.02650613524019718, -0.01665879786014557, 0.049991805106401443, -0.01880086399614811, 0.018292315304279327, -0.0030782560352236032, -0.01722898706793785, -0.013676857575774193, 0.018246084451675415, -0.017444735392928123, -0.017799178138375282, -0.019401874393224716, -0.013330120593309402, -0.03639970347285271, 0.013307004235684872, -0.0372626967728138, -0.01903202198445797, -0.02379387989640236, -0.03569081798195839, 0.059207309037446976, -0.0013609436573460698, 0.030774855986237526, -0.03473536670207977, 0.024302426725625992, 0.03476618602871895, -0.049745235592126846, -0.062166132032871246, -0.014578374102711678, 0.004634721204638481, -0.03427305072546005, 0.010702623054385185, -0.01067180186510086, -0.059207309037446976, -0.1192159652709961, 0.008853357285261154, -0.05261159688234329, 0.01716734655201435, 0.08346350491046906, -0.04321116581559181, 0.018600527197122574, -0.009508305229246616, -0.004257162567228079, -0.019879601895809174, ...]"
3,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,Because they properly understand and lead us to the right answer from various information.,Because they properly understand and lead us to the right answer from various information.,Positive,0.56,0.23,0.46,...,,0.67,,,0.0,"44+, Asia, Eastern Asia, O1: English, O2: 46-55, O3: Female, O4: Urban, O5: Equally concerned and excited, O6: I do not identify with any religious group or faith, O7: Japan",English,,c9a758a9-5068-4c7d-a138-e2f6bfd35133,"[0.021484559401869774, -0.025409622117877007, 0.0026892588939517736, 0.045802246779203415, 0.057046227157115936, -0.018031682819128036, -0.026914721354842186, 0.04046062007546425, -0.018282532691955566, 0.03269902616739273, 0.048045139759778976, 0.013900037854909897, -0.021971503272652626, -0.022163329645991325, 0.05164557322859764, -0.027091791853308678, -0.008410850539803505, -0.004478408955037594, 0.006496275309473276, 0.01913837343454361, 0.0482812337577343, 0.010926728136837482, 0.03284658491611481, -0.018931791186332703, -0.0365060418844223, 0.0005155334947630763, -0.0440315417945385, 0.016924992203712463, -0.002641302300617099, 0.0027040147688239813, 0.013442604802548885, -0.022620761767029762, -0.009620832279324532, -0.007798481732606888, -0.0049985540099442005, 0.06988383829593658, -0.008432984352111816, -0.029157616198062897, -0.014121375977993011, 0.015169043093919754, 0.003611501306295395, 0.015028862282633781, -0.024745607748627663, 0.0085952989757061, 0.07466474175453186, -0.04196571931242943, -0.06947067379951477, 0.0011860036756843328, 0.027490202337503433, 0.030175771564245224, -0.015552695840597153, 0.003012043656781316, 0.03520752489566803, 0.007960796356201172, 0.0019662207923829556, 0.02039262466132641, 0.027534469962120056, 0.03712578862905502, 0.013236022554337978, 0.010963617824018002, 0.00953967496752739, -0.019713854417204857, 0.03384998440742493, 0.024406222626566887, -0.008418228477239609, -0.04819269850850105, -0.02701801247894764, -0.005337939132004976, 0.012328536249697208, 0.02113041840493679, 0.06138445436954498, 0.028242751955986023, 0.012166221626102924, -0.0016498916083946824, 0.018990814685821533, -0.02939371019601822, -0.0012311936588957906, 0.004850994795560837, -0.009893816895782948, -0.033790960907936096, 0.026117904111742973, -0.00227424968034029, -0.01165714394301176, -0.08056709915399551, -0.016556095331907272, -0.0008152622613124549, 0.02989540994167328, 0.020894324406981468, -0.07307110726833344, -0.07253989577293396, -0.034115590155124664, 0.04450372979044914, -0.010469296015799046, 0.023373311385512352, -0.028803475201129913, -0.01919739693403244, 0.000343996420269832, 0.01125135738402605, 0.042231325060129166, 0.0013612298062071204, ...]"
4,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"To me an AI chatbot is a resource to get information that puts you in the ""ballpark"". However, it should never be relied on completely. What it stats as facts still needs to be checked regularly.","To me an AI chatbot is a resource to get information that puts you in the ""ballpark"". However, it should never be relied on completely. What it stats as facts still needs to be checked regularly.",Neutral,0.56,0.31,0.48,...,,0.5,,,0.0,"North America, Northern America, O1: English, O2: 36-45, O3: Male, O4: Urban, O5: Equally concerned and excited, O6: Christianity, O7: Canada",English,,2d45fe1e-b751-44ce-be41-9cef434ed76c,"[0.009808898903429508, 0.0029124883003532887, 0.0009478791616857052, 0.02221338264644146, 0.03166010603308678, -0.055774904787540436, -0.011627317406237125, 0.006104153115302324, 0.034104786813259125, 0.06295803934335709, 0.005715569946914911, -0.019723432138562202, 0.005783477332442999, 0.0018693497404456139, -0.009695718996226788, 0.010103165172040462, -0.0903022289276123, 0.015422606840729713, -0.0032369366381317377, -0.03076975978910923, 0.011597136035561562, 0.04038248211145401, -0.026167122647166252, -0.022379379719495773, -0.015218883752822876, -0.05375276505947113, 0.015007615089416504, 0.001405313378199935, -0.024144981056451797, 0.015543331392109394, 0.017550382763147354, -0.021851208060979843, -0.04089556261897087, -0.0035972248297184706, 0.02938142418861389, 0.03848106414079666, 0.037545446306467056, -0.004832768812775612, 0.06579507142305374, -0.025835130363702774, 0.010465340688824654, 0.03259572386741638, 0.006817184388637543, 0.04472857713699341, -0.04970847815275192, -0.05182116478681564, 0.00648896349593997, -0.0057382057420909405, -0.007379309739917517, 0.017444748431444168, -0.135091170668602, 0.042917702347040176, -0.002276796381920576, 0.02085522748529911, 0.024522246792912483, 0.03510076552629471, -0.0029124883003532887, 0.0023597946856170893, 0.025518227368593216, 0.054446931928396225, -0.009529721923172474, 0.01715802773833275, 0.027646003291010857, 0.02483914978802204, -0.0868012085556984, -0.0065266904421150684, 0.011974401772022247, -0.017459839582443237, -0.049346305429935455, -0.04152936860918999, 0.028551440685987473, 0.05106663703918457, -0.036277834326028824, -0.010865241289138794, -0.008601649664342403, -0.029894504696130753, -0.03446695953607559, 0.06416528671979904, -0.010072984732687473, -0.011589591391384602, -0.02595585398375988, 0.028958886861801147, -0.0979079008102417, -0.06627797335386276, -0.0005055355723015964, -0.007341583259403706, -0.03585529699921608, 0.007632077671587467, -0.06241477653384209, -0.018274731934070587, -0.07201240956783295, -0.0013572120806202292, -0.0060022915713489056, 0.0014175744727253914, 0.050432831048965454, -0.007318947464227676, -0.022394470870494843, 0.03724363446235657, 0.012140397913753986, -0.006787003483623266, ...]"
5,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"Taking ChatGBT as an example, it has access to massive online information such as research papers, books journals, government data and many more and the AI usually base it's answers on these resources. However, not all online information is correct and sometimes the AI might not accurately distinguish correct and misinformed resources","Taking ChatGBT as an example, it has access to massive online information such as research papers, books journals, government data and many more and the AI usually base it's answers on these resources. However, not all online information is correct and sometimes the AI might not accurately distinguish correct and misinformed resources",Neutral,0.56,0.23,0.49,...,,0.67,,,0.0,"Africa, Eastern Africa, O1: English, O2: 26-35, O3: Female, O4: Urban, O5: More excited than concerned, O6: Christianity, O7: Kenya",English,,feb1806c-338e-4fc7-a1ca-78979d41e3c9,"[0.012442115694284439, 0.022574301809072495, 0.049468472599983215, 0.07145778834819794, 0.012224622070789337, -0.08345741778612137, -0.0032792736310511827, 0.027584148570895195, -0.049648467451334, 0.015629516914486885, 0.03230900317430496, -0.021269341930747032, 0.02441924624145031, 0.017339464277029037, -0.04040875285863876, -0.0033917701803147793, -0.0475185327231884, -0.0023774264845997095, -0.020354371517896652, -0.01231461949646473, 0.002534921746701002, 0.020699361339211464, 0.006746041588485241, 0.02188432402908802, -0.061738092452287674, -0.05414832755923271, 0.014054565690457821, -0.019139409065246582, -0.014279558323323727, 0.04463862255215645, 0.03551890328526497, -0.03440893813967705, -0.0035980138927698135, -0.01202962826937437, -0.0033392717596143484, 0.04955846816301346, -0.007473519071936607, 0.010544674471020699, 0.05933816730976105, -0.030539056286215782, 0.003751759184524417, 0.014249559491872787, -0.0554082877933979, 0.019214406609535217, -0.04706854745745659, -0.05786821246147156, -0.02852911874651909, -0.025514211505651474, -0.012764605693519115, 0.010912163183093071, -0.07043782621622086, 0.0020474367775022984, -0.07319773733615875, -0.02365426905453205, 0.013139594346284866, 0.004308616742491722, -0.047998517751693726, 0.025004226714372635, -0.0054185823537409306, 0.016784481704235077, 0.0010809041559696198, 0.03851880878210068, 0.014954538084566593, -0.007529767230153084, -0.045628588646650314, -0.027179161086678505, 0.011812134645879269, -0.0129970982670784, -0.07091780751943588, -0.043138667941093445, -0.0015458897687494755, -0.003097404260188341, -0.03008907102048397, -0.027014166116714478, -0.009164717048406601, 0.021314341574907303, 0.03347896412014961, 0.059848152101039886, 0.051748402416706085, -0.021059349179267883, 0.021269341930747032, 0.04868849739432335, -0.037918828427791595, -0.07601764798164368, -0.003620513016358018, -0.0048336004838347435, -0.01552452053874731, -0.02096935175359249, -0.0888572558760643, 0.014294558204710484, -0.02345927432179451, 0.008617233484983444, -0.024794233962893486, 0.04187870770692825, 0.03479892387986183, -0.0009201278444379568, 0.01016218587756157, 0.0046311067417263985, 0.08255744725465775, 0.04649856314063072, ...]"
6,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"Because AI is designed to meet human needs, the advice it provides tends to be helpful and reliable.","Debido a que la IA está diseñada para satisfacer las necesidades humanas, el asesoramiento que brinda tiende a ser útil y confiable.",Positive,0.56,0.23,0.45,...,,0.58,,,0.0,"O1: Spanish, O2: 18-25, O3: Female, O4: Suburban, O5: More excited than concerned, O6: I do not identify with any religious group or faith, O7: Australia, Oceania",Spanish,,b871b020-b6b0-459b-9a83-ac451c4db731,"[0.018713217228651047, 0.025152010843157768, 0.017371172085404396, 0.03438044711947441, 0.046473920345306396, -0.058688025921583176, -0.009318910539150238, 0.04737866669893265, -0.007441557012498379, 0.05289763584733009, -0.009394305758178234, -0.02754959464073181, -0.001596504240296781, -0.03929624706506729, -0.021668728440999985, -0.03029399923980236, -0.013684322126209736, -0.0327518992125988, -0.0015870798379182816, -0.024880586192011833, 0.006857240106910467, 0.07165608555078506, 0.02219649776816368, -0.0177783090621233, 0.03006781078875065, -0.029841624200344086, 0.033053480088710785, 0.04517711326479912, -0.025046456605196, -0.025423435494303703, 0.03808991611003876, -0.02620755136013031, -0.0011724033392965794, -0.050123073160648346, 0.029660673812031746, 0.048494525253772736, 0.016496581956744194, 0.0029894402250647545, 0.028831321746110916, 0.007467945571988821, -0.005877096205949783, 0.006171139422804117, 0.018155287951231003, -0.015410884283483028, -0.05983404070138931, -0.02610199712216854, -0.015772784128785133, -0.02396075800061226, -0.004293785896152258, 0.04674534499645233, -0.09602398425340652, 0.014317646622657776, -0.008715745061635971, -0.013774797320365906, -0.006913787219673395, 0.03748675063252449, 0.044513631612062454, -0.009115342050790787, 0.019210828468203545, 0.04406125470995903, 0.041950177401304245, 0.030882084742188454, 0.020884612575173378, 0.055038873106241226, -0.050153229385614395, -0.01999494433403015, -0.021563174203038216, -0.023523462936282158, -0.011867285706102848, -0.05075639486312866, 0.013043458573520184, -0.011120867915451527, 0.0043955701403319836, -0.0019150511361658573, -0.03564709424972534, -0.04204064980149269, 0.003585066180676222, 0.03661216050386429, -0.0003937070141546428, 0.020703664049506187, -0.03570741042494774, -0.00010425813525216654, -0.014106538146734238, -0.09059549123048782, -0.02703690342605114, -0.03347569704055786, -0.006491571199148893, -0.010035169310867786, -0.06689108163118362, -0.039024822413921356, -0.051208771765232086, 0.00894947163760662, 0.01847195066511631, -0.023342514410614967, -0.017039431259036064, 0.011776810511946678, 0.0042183902114629745, 0.030444789677858353, 0.025649622082710266, 0.0491580069065094, ...]"
7,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,Reliance on crowd data,依赖大众数据,Neutral,0.56,0.31,0.49,...,,0.42,,,0.0,"Europe, Norther Europe, O1: Chinese (China), O2: 26-35, O3: Male, O4: Urban, O5: Equally concerned and excited, O6: I do not identify with any religious group or faith, O7: Sweden",ChineseSimplified,,c108ed2c-de31-447f-a228-627e84aa5d1e,"[0.03771648928523064, 0.026784682646393776, 0.04379752650856972, 0.023216906934976578, 0.006063461769372225, -0.00975426472723484, -0.0567329116165638, 0.005544991698116064, -0.01690739206969738, 0.041336990892887115, -0.0012434490490704775, -0.020703645423054695, 0.026819832623004913, 0.02666165679693222, -0.0027900710701942444, -0.04225090518593788, 0.004622290842235088, -0.00941154733300209, -0.036486219614744186, 0.015536521561443806, -0.015571672469377518, 0.053042106330394745, 0.013638394884765148, -0.048191338777542114, -0.04963250830769539, -0.002763708122074604, -0.009815777651965618, 0.05174154043197632, 0.025466538965702057, -0.028647659346461296, 0.026890134438872337, -0.03929826244711876, -0.010114557109773159, -0.05332331359386444, -0.03856009989976883, 0.01910429820418358, -0.005294544156640768, 0.010923018679022789, 0.018981270492076874, 0.009525786153972149, 0.04914040118455887, -0.05595960095524788, 0.014596246182918549, 0.05929889529943466, -0.029245218262076378, -0.034271739423274994, -0.021933913230895996, -0.034377191215753555, 0.014684122055768967, 0.049421604722738266, -0.05768197402358055, -0.09195371717214584, -0.014121714048087597, -0.0180849339812994, -0.00850642193108797, -0.031248796731233597, -0.020650919526815414, 0.06661020219326019, 0.00989486649632454, -0.00798795185983181, 0.00208266731351614, -0.02650347910821438, 0.04899980127811432, -0.018682491034269333, -0.010369397699832916, 0.030563361942768097, -0.022373294457793236, -0.006863135378807783, -0.010993319563567638, 0.04189939796924591, -0.009174280799925327, 0.0031481669284403324, 0.01234661415219307, 0.0019969879649579525, 0.01278599537909031, -0.010870292782783508, -0.014613821171224117, 0.014077776111662388, -0.0372595340013504, 0.004189500585198402, 0.02710103802382946, -0.0028449937235563993, -0.045695651322603226, -0.05226879566907883, -0.032760266214609146, -0.05718986690044403, -0.04615261033177376, -0.03943886235356331, -0.03609956428408623, 0.01926247589290142, -0.016046203672885895, 0.04204000160098076, -0.01691618002951145, 0.02523806132376194, 0.0010314475512132049, -0.010369397699832916, 0.022847825661301613, 0.015255318023264408, 0.05472933128476143, 0.04566050320863724, ...]"
8,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,"I think AI can analyze various factors to be as close to you as possible, and they are also tailored to meet each person's needs.","Creo que la IA puede analizar varios factores para ser lo más apegada a ti, además de que están hechos para satisfacer las necesidades de cada persona.",Positive,0.55,0.15,0.44,...,,0.33,,,0.0,"Central America, North America, O1: Spanish, O2: 18-25, O3: Male, O4: Urban, O5: More excited than concerned, O6: Christianity, O7: Mexico",Spanish,,5eb3c73e-bb43-4bb0-981a-3f98e9ddfd87,"[0.011274497024714947, 0.008073716424405575, 0.008698084391653538, 0.030170582234859467, 0.05970961973071098, -0.08721049129962921, 0.017467934638261795, 0.029797397553920746, -0.0011276290751993656, 0.047021325677633286, -0.0354812927544117, -0.04214121401309967, 0.0056157186627388, -0.029165852814912796, -0.016290966421365738, -0.045385051518678665, 0.004216274246573448, -0.010642952285706997, 0.033873725682497025, -0.03743333742022514, 0.02052518166601658, 0.03011316806077957, -0.010843898169696331, -0.03820841386914253, 0.024443626403808594, -0.05571940913796425, 0.03605542331933975, 0.06946984678506851, -0.0013563843676820397, -0.005590600427240133, 0.02625213749706745, -0.024443626403808594, -0.03304123505949974, -0.05457114800810814, -0.0059027839452028275, 0.04733709618449211, -0.0131404222920537, 0.046648140996694565, 0.03149108216166496, -0.011403676122426987, -0.007901477627456188, 0.04148096218705177, 0.0208696611225605, 0.015300589613616467, -0.0183721911162138, -0.04839923977851868, 0.013872439041733742, 0.014841285534203053, -0.043720073997974396, 0.06212097033858299, -0.06952726095914841, 0.022333694621920586, -0.05141342803835869, 0.01581730879843235, 0.0051492368802428246, 0.03932797163724899, -0.019534805789589882, -0.006950573064386845, 0.06992915272712708, 0.05253298208117485, -0.004603812471032143, 0.013693023473024368, 0.050064221024513245, 0.06906795501708984, -0.0004294768732506782, -0.044609975069761276, 0.005271239671856165, -0.01644885167479515, -0.015027877874672413, -0.05884842202067375, 0.02480245754122734, 0.0183721911162138, 0.003392755053937435, 0.0052030617371201515, -0.01227204967290163, -0.03137625753879547, 0.0029998342506587505, 0.04317464679479599, -0.008561727590858936, -0.0030608356464654207, 0.02929503284394741, -0.0039040904957801104, -0.03622766211628914, -0.02368290163576603, 0.010119058191776276, -0.03407467156648636, -0.0017143191071227193, 0.012839003466069698, -0.07779474556446075, 0.0037749111652374268, -0.010707542300224304, 0.026854975149035454, -0.02866348810493946, -0.0011365999234840274, 0.044236790388822556, 0.019549159333109856, 0.029567744582891464, 0.0134418411180377, -0.012853357009589672, 0.02295088581740856, ...]"
9,9cc4f2fc-dee9-4196-b1c7-b437544668c8,Ask Opinion,Can you explain why you gave that trust score to your AI chatbot?,,The AI has the capacity to store info,The AI has the capacity to store info,Neutral,0.55,0.46,0.54,...,,0.58,,,0.0,"North America, Northern America, O1: English, O2: 18-25, O3: Male, O4: Suburban, O5: More excited than concerned, O6: Other religious group, O7: United States",English,,e6dd50c5-4a9d-44e7-ac6a-60e6ff4dd62d,"[0.00714302621781826, 0.014757540076971054, 0.005752140190452337, 0.008282452821731567, 0.048028796911239624, -0.033632732927799225, 0.028886429965496063, 0.02607322484254837, -0.061073269695043564, -0.01620343327522278, 0.045577067881822586, -0.040013521909713745, 0.004966328386217356, -0.03338127210736275, 0.012714429758489132, -0.058998726308345795, -0.05252363905310631, -0.01096206996589899, 0.028194915503263474, -0.046520039439201355, 0.021452654153108597, 0.03129101172089577, 0.010176259092986584, -0.013971728272736073, 0.020336801186203957, -0.024092979729175568, 0.036398787051439285, 0.036147329956293106, -0.027519118040800095, -0.003852440742775798, 0.06651108711957932, -0.04384827986359596, -0.015834100544452667, -0.01636059582233429, -0.015488344244658947, 0.03281548619270325, -0.0034870384261012077, 0.01269085519015789, -0.00704087084159255, -0.024218710139393806, -0.00284070847555995, 0.010883488692343235, -0.039604898542165756, 0.021546950563788414, 0.0060193161480128765, -0.0031962881330400705, -0.0022042011842131615, 0.008282452821731567, -0.017916502431035042, 0.07003151625394821, -0.04689722880721092, 0.05173783004283905, -0.06443654000759125, -0.01928381435573101, 0.00557926157489419, 0.025208832696080208, -0.02162553183734417, 0.02608894184231758, 0.0568927526473999, -0.005237433593720198, 0.0225842222571373, 0.03759322315454483, 0.04117652028799057, -0.0030234097503125668, -0.023920102044939995, -0.03495289385318756, 0.000902209838386625, 0.10020668059587479, -0.03158962354063988, -0.00031530685373581946, -0.015928398817777634, 0.004027283750474453, -0.024344440549612045, -0.02109117992222309, 0.0017061932012438774, -0.01185003761202097, -0.011779313907027245, 0.03671311214566231, 0.043628253042697906, -0.01720927096903324, -0.02373150736093521, 0.04061073809862137, 0.005905373487621546, -0.11108230799436569, 0.047431580722332, 0.005646055564284325, -0.06104183569550514, -0.027157645672559738, -0.07260898500680923, -0.01593625731766224, -0.017224987968802452, 0.04199376702308655, -0.0006276669446378946, -0.01981816627085209, 0.049034636467695236, -0.006078251637518406, 0.051360636949539185, -0.04853171855211258, -0.00603110296651721, 0.05638983100652695, ...]"


In [34]:
    # --- DEBUG CELL for Step 1 ---
    question_types_in_processed_list = []
    if qs_opinion_processed:
        for df in qs_opinion_processed:
            # Accessing the question type from the DataFrame itself for verification
            # This assumes the structure [Question ID, Question Type, Question, ...] is consistent
            if 'Question Type' in df.columns and len(df) > 0:
                 # Use iloc[0] assuming type is consistent for all rows in the df
                question_types_in_processed_list.append(df['Question Type'].iloc[0])
            else:
                question_types_in_processed_list.append("Unknown Structure or Empty DF")
        print("\\nQuestion Types found in qs_opinion_processed:")
        print(pd.Series(question_types_in_processed_list).value_counts())
    else:
        print("\\nqs_opinion_processed is empty.")
    # --- END DEBUG CELL ---

\nQuestion Types found in qs_opinion_processed:
Ask Opinion           9
Poll Single Select    6
Name: count, dtype: int64


In [29]:
# --- Response Convergence ---
# Calculate convergence for each response across all segments

response_convergence_data = []

if qs_opinion_processed:
    for i, df in enumerate(qs_opinion_processed):
        question_text = qmeta_opinion.iloc[i]['question text']
        question_id = df['Question ID'].iloc[0] # Assuming Question ID is constant per df

        # Select only the segment columns that actually exist in this df and are numeric
        valid_segment_cols = [col for col in segment_cols if col in df.columns and pd.api.types.is_numeric_dtype(df[col])]

        if not valid_segment_cols:
            continue # Skip if no valid segment columns

        # Calculate Standard Deviation across valid segments for each response
        # Use axis=1 to calculate row-wise
        df['agreement_sd'] = df[valid_segment_cols].std(axis=1, skipna=True)
        df['agreement_mean'] = df[valid_segment_cols].mean(axis=1, skipna=True)
        # Coefficient of Variation (SD/mean), handles scale differences
        # Add small epsilon to mean to avoid division by zero
        df['agreement_cv'] = df['agreement_sd'] / (df['agreement_mean'] + 1e-9)


        for index, row in df.iterrows():
            response_convergence_data.append({
                'Question ID': question_id,
                'Question': question_text,
                'Response': row['English Responses'], # Or 'Original Responses'
                'Mean Agreement': row['agreement_mean'],
                'SD Agreement': row['agreement_sd'],
                'CV Agreement': row['agreement_cv'],
                'Response Index (within Q)': index # Keep track of original row
            })

    # Create a DataFrame for easier sorting and viewing
    response_convergence_df = pd.DataFrame(response_convergence_data)

    # Sort by Standard Deviation (ascending) to find most convergent responses
    most_convergent_responses = response_convergence_df.sort_values(by='SD Agreement', ascending=True).reset_index(drop=True)

    # Sort by Coefficient of Variation (ascending) - alternative convergence measure
    most_convergent_responses_cv = response_convergence_df.sort_values(by='CV Agreement', ascending=True).reset_index(drop=True)


    print("\\n--- Most Convergent Responses (Lowest SD across all segments) ---")
    print(most_convergent_responses[['Question', 'Response', 'Mean Agreement', 'SD Agreement']].head(10)) # Show top 10

    print("\\n--- Most Convergent Responses (Lowest CV across all segments) ---")
    print(most_convergent_responses_cv[['Question', 'Response', 'Mean Agreement', 'SD Agreement', 'CV Agreement']].head(10)) # Show top 10

else:
    print("Cannot calculate response convergence: No processed 'Ask Opinion' data available.")


KeyError: 'English Responses'

In [5]:
show_questions(qs)

Unnamed: 0,question type,question text
0,Poll Single Select,Please select your preferred language:
1,Poll Single Select,How old are you?
2,Poll Single Select,What is your gender?
3,Poll Single Select,What best describes where you live?
4,Poll Single Select,"Overall, would you say the increased use of artificial intelligence (AI) in daily life makes you feel…"
...,...,...
61,Poll Single Select,Do you agree or disagree?
62,Ask Opinion,Please explain whether you agree or disagree and why.
63,Poll Single Select,"Do you feel you were able to fully express your views on your culture, your values, and what you want for the future in this conversation? (through what you responded and how you voted)"
64,Poll Single Select,Do you feel like you understand yourself better after participating in this conversation?


In [None]:
segments_df = show_segments(qs)

In [3]:
# prompt: segments_df is a dataframe with Index (starting at 0) and "Segment" column where each value is text formatted like this:
# All(986)
# O1: French (14)
# O1: Chinese (China) (91)
# O1: Hindi (5)
# O1: Portuguese (Portugal) (25)
# O1: English (742)
# O1: Russian (27)
# O1: Spanish (64)
# O1: Arabic (18)
# O2: &lt;18 (0)
# O2: 18-25 (288)
# O2: 26-35 (400)
# etc
# extract the integer within (...) in the text of the Segment. create a dict, key is the integer Index value, value is a dict {"segment": <segment_text>, "count": <integer_count_from_parenthesis>}
# Ensure that the integer extraction from the (..) can also handle these kinds of cases:
# O1: Chinese (China) (91)
# O1: Portuguese (Portugal) (25)

import re

def extract_segment_data(segments_df):
    segment_data = {}
    for index, row in segments_df.iterrows():
        # Access the segment text using the column name returned by show_segments
        # show_segments returns a DataFrame with a single column named 0.
        segment_text = row[0]  # Updated column name
        match = re.search(r'\((\d+)\)', segment_text)
        if match:
            count = int(match.group(1))
            segment_data[index] = {"segment": segment_text, "count": count}
        else:
            # Handle cases where no integer is found within parentheses
            segment_data[index] = {"segment": segment_text, "count": None}
    return segment_data

# Example usage (assuming segments_df is defined)
# Assuming segments_df is already created as shown in the prompt
# Replace with your actual segments_df
# Example segments_df (replace this with your actual data)
#data = {'Segment': ['All(986)', 'O1: French (14)', 'O1: Chinese (China) (91)', 'O1: Hindi (5)', 'O1: Portuguese (Portugal) (25)']}
segments_df = show_segments(qs)

segment_info = extract_segment_data(segments_df)
segment_info


{0: {'segment': 'All(986)', 'count': 986},
 1: {'segment': 'O1: French (14)', 'count': 14},
 2: {'segment': 'O1: Chinese (China) (91)', 'count': 91},
 3: {'segment': 'O1: Hindi (5)', 'count': 5},
 4: {'segment': 'O1: Portuguese (Portugal) (25)', 'count': 25},
 5: {'segment': 'O1: English (742)', 'count': 742},
 6: {'segment': 'O1: Russian (27)', 'count': 27},
 7: {'segment': 'O1: Spanish (64)', 'count': 64},
 8: {'segment': 'O1: Arabic (18)', 'count': 18},
 9: {'segment': 'O2: &lt;18 (0)', 'count': 0},
 10: {'segment': 'O2: 18-25 (288)', 'count': 288},
 11: {'segment': 'O2: 26-35 (400)', 'count': 400},
 12: {'segment': 'O2: 36-45 (184)', 'count': 184},
 13: {'segment': 'O2: 46-55 (85)', 'count': 85},
 14: {'segment': 'O2: 56-65 (21)', 'count': 21},
 15: {'segment': 'O2: 65+ (8)', 'count': 8},
 16: {'segment': 'O3: Male (488)', 'count': 488},
 17: {'segment': 'O3: Female (483)', 'count': 483},
 18: {'segment': 'O3: Non-binary (8)', 'count': 8},
 19: {'segment': 'O3: Other / prefer not t

In [4]:
def filter_segments_by_count(segment_info, min_count):
  filtered_segments = {}
  for segment_id, info in segment_info.items():
    if info["count"] is not None and info["count"] >= min_count:
      filtered_segments[segment_id] = info
  return filtered_segments


In [5]:
# set minimum segment size threshold for considering divergence
MIN_SEG_SIZE = 30 # minimum participants in segment for us to consider checking it in our divergence search

# filter segments to use by threshold
filtered_segments = filter_segments_by_count(segment_info, MIN_SEG_SIZE)

# show count of segments that will be analyzed
print(f"Number of segments to analyze: {len(filtered_segments)}")



Number of segments to analyze: 44


In [19]:
qid = 33
#segs = list(filtered_segments.keys())
segs = [223,224,225,226,227,228]
ba = get_divergent_responses(qs[qid],segs,15)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["bridge"] = df.apply (lambda row: min_bridge(row,segs_incl[1:],df.columns), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["polarization"] = df.apply (lambda row: polarization(row,segs_incl[1:],df.columns), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["divergence

In [21]:
ba.style.pipe(make_pretty)

Unnamed: 0,English Responses,Africa (192),Asia (474),Europe (125),North America (95),South America (69),Oceania (11),bridge,polarization,divergence
303,"Till now I haven't read any article about how AI is using my data. Of course, they are using my data. I am sharing everything with them, but still, I am not sure about the consequences. Till now I trust AI; that's why I asked personal questions as well. If I didn't trust AI, then I wouldn't ask personal questions at all. AI knows all about my family, my occupation, etc. It's scary, but still I am using it. I feel it is a necessity.",0.53,0.54,0.56,0.52,0.39,0.82,0.39,0.43,0.19
150,"Sometimes Chatgpt will hallucinate, which may be a technical limitation. Therefore he may give wrong answers, which is not in my best interest. Supervision and suspicion are needed.",0.51,0.56,0.54,0.45,0.62,0.27,0.27,0.35,0.17
288,"The reason for giving this trust score to the AI ​​chatbot is that even though it has deep knowledge on most of the topics, it also makes errors and I have experienced this on many occasions. Instead of blindly trusting the information given by it, one must check it at his level once.",0.49,0.55,0.55,0.49,0.61,0.27,0.27,0.34,0.16
46,"In my opinion, AI Chatbot is unbiased in most of its responses. Therefore, the trust score automatically gets high when the results are not varied in accordance to personal values and thoughts.",0.49,0.57,0.54,0.52,0.61,0.27,0.27,0.34,0.16
805,"The Chat GPT quotes ""Chat gpt can make mistakes"" meaning that it cannot be trusted 100% so there is some improvements that need to be done for it to be trusted wholly.",0.51,0.52,0.5,0.42,0.61,0.27,0.27,0.34,0.16
69,Because AI chat bots are helping us on a daily basis. It makes life easier and I trust they will do the best sometimes. Privacy is a concern. So that's the main reason for low ratings.,0.52,0.56,0.54,0.52,0.61,0.27,0.27,0.34,0.16
49,"I assigned that trust score based on my experiences with AI chatbot's performance dependability, and transparency. While they are often useful for providing quick solutions and assisting with everyday chores, their limits become apparent when dealing with difficult questions. Furthermore, issues such as data privacy, information veracity, and feedback response are big impact on my trust level.",0.51,0.57,0.55,0.48,0.61,0.27,0.27,0.34,0.16
960,Because it is literally impossible for me to have ALL the variables necessary to act in my best interest.,0.43,0.45,0.46,0.48,0.55,0.82,0.43,0.39,0.15
594,"I have used chatgpt to prep for some recent interviews. It enabled me to research the most important topics. When I attended the interviews, I felt like I was reasonable prepared because they went well.",0.52,0.54,0.51,0.45,0.57,0.18,0.18,0.39,0.15
955,few sources of data,0.41,0.49,0.46,0.48,0.51,0.73,0.41,0.32,0.14


In [20]:
ba[['English Responses','divergence']]

Unnamed: 0,English Responses,divergence
303,"Till now I haven't read any article about how AI is using my data. Of course, they are using my data. I am sharing everything with them, but still, I am not sure about the consequences. Till now I trust AI; that's why I asked personal questions as well. If I didn't trust AI, then I wouldn't ask personal questions at all. AI knows all about my family, my occupation, etc. It's scary, but still I am using it. I feel it is a necessity.",0.187617
150,"Sometimes Chatgpt will hallucinate, which may be a technical limitation. Therefore he may give wrong answers, which is not in my best interest. Supervision and suspicion are needed.",0.166132
288,"The reason for giving this trust score to the AI ​​chatbot is that even though it has deep knowledge on most of the topics, it also makes errors and I have experienced this on many occasions. Instead of blindly trusting the information given by it, one must check it at his level once.",0.15906
46,"In my opinion, AI Chatbot is unbiased in most of its responses. Therefore, the trust score automatically gets high when the results are not varied in accordance to personal values and thoughts.",0.15906
805,"The Chat GPT quotes ""Chat gpt can make mistakes"" meaning that it cannot be trusted 100% so there is some improvements that need to be done for it to be trusted wholly.",0.15906
69,Because AI chat bots are helping us on a daily basis. It makes life easier and I trust they will do the best sometimes. Privacy is a concern. So that's the main reason for low ratings.,0.15906
49,"I assigned that trust score based on my experiences with AI chatbot's performance dependability, and transparency. While they are often useful for providing quick solutions and assisting with everyday chores, their limits become apparent when dealing with difficult questions. Furthermore, issues such as data privacy, information veracity, and feedback response are big impact on my trust level.",0.15906
960,Because it is literally impossible for me to have ALL the variables necessary to act in my best interest.,0.149666
594,"I have used chatgpt to prep for some recent interviews. It enabled me to research the most important topics. When I attended the interviews, I felt like I was reasonable prepared because they went well.",0.149666
955,few sources of data,0.143875


In [9]:

# pick "Ask Opinion" question ID
qid = 33

# iterate through each possible pair of filtered_segments to calculate divergence via segs = [seg_id1, seg_id2]; ba = get_divergent_responses(qs[qid],segs,3)
# get the ba['divergence'][0] result (the highest divergence value) and save it in a dict of key: set(seg_id1, seg_id2), value: <divergence float>

# segs = ....

In [10]:
#choose question and segments
qid = 33



segs = segs = [231,232,233,234,235,236,0]

#print the question
print(qs[qid]["Question"][1])

#generate a table with bridging, divergence, and consensus metrics; keeping the 3 responses with the hightest divergence
ba = get_divergent_responses(qs[qid],segs,3)

#display a pretty version fo the table
ba.style.pipe(make_pretty)

Can you explain why you gave that trust score to your AI chatbot?


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["bridge"] = df.apply (lambda row: min_bridge(row,segs_incl[1:],df.columns), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["polarization"] = df.apply (lambda row: polarization(row,segs_incl[1:],df.columns), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfplt["divergence

Unnamed: 0,English Responses,Northern Africa (33),Eastern Africa (151),Middle Africa (0),Southern Africa (8),Western Africa (0),Central Asia (16),All(967),bridge,polarization,divergence
608,"AI often gives incorrect information. For serious issues, such as writing reports, research, programming code, AI cannot be trusted completely, you always need to double-check the information. Personally, for example, AI often refers to false sources.",0.58,0.48,,0.13,,0.75,0.52,0.13,0.62,0.3
894,"I'm not a fan of AI at all, but I feel at this point in time they aren't as big of a problem to really bring trust into it. They just spit out information, they don't have much in the way of deciding or acting in my best interest or not.",0.55,0.5,,0.13,,0.75,0.5,0.13,0.62,0.3
772,It has good functions but depends on how people use it,0.61,0.51,,0.13,,0.75,0.51,0.13,0.62,0.3


In [22]:
ba['divergence']

Unnamed: 0,divergence
608,0.304138
894,0.304138
772,0.304138
777,0.304138
854,0.304138
