In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Flag to control whether to skip experiments without interventions
skip_no_interventions = True

# Helper function to convert time strings to seconds
def time_to_seconds(time_str):
    h, m, s = map(float, time_str.split(':'))
    return int(h * 3600 + m * 60 + s)

# Function to determine if two time intervals overlap
def check_overlap(row, intervals):
    for start, end in intervals:
        if row['Start'] <= end and row['End'] >= start:
            return True
    return False

# Load the CSV files
file_path_original = 'reduct-highlights-export.csv'
file_path_researcher = 'reduct-highlights-export (1).csv'

original_df = pd.read_csv(file_path_original)
researcher_df = pd.read_csv(file_path_researcher)

# Add Start and End columns for the original dataframe
original_df['Start'] = original_df['Timestamp'].apply(time_to_seconds)
original_df['Duration'] = original_df['Duration'].apply(time_to_seconds)
original_df['End'] = original_df['Start'] + original_df['Duration']

# Add Start and End columns for the researcher dataframe
researcher_df['Start'] = researcher_df['Timestamp'].apply(time_to_seconds)
researcher_df['Duration'] = researcher_df['Duration'].apply(time_to_seconds)
researcher_df['End'] = researcher_df['Start'] + researcher_df['Duration']

# Loop through each unique experiment
for recording in original_df['Recording'].unique():
    # Filter data for the current experiment
    experiment_original = original_df[original_df['Recording'] == recording].copy()
    experiment_researcher = researcher_df[researcher_df['Recording'] == recording].copy()

    # Filter the researcher dataframe for rows with Tags == "#green"
    researcher_filtered = experiment_researcher[experiment_researcher['Tags'] == '#green'].copy()

    # Skip experiments with no interventions if the flag is set to True
    if skip_no_interventions and researcher_filtered.empty:
        print(f"Skipping experiment: {recording} (no researcher interventions).\n")
        continue

    # Categorize participant's data using .loc to avoid SettingWithCopyWarning
    experiment_original.loc[:, 'Category'] = experiment_original['Tags'].apply(
        lambda x: 'Required Help' if x == 'RH' else 'Confused'
    )

    # Check overlaps for each category in the original data with researcher interventions
    researcher_intervals = list(zip(researcher_filtered['Start'], researcher_filtered['End']))
    experiment_original.loc[:, 'Overlaps'] = experiment_original.apply(check_overlap, intervals=researcher_intervals, axis=1)

    # Summary print statements
    total_help_count = experiment_original[experiment_original['Category'] == 'Required Help'].shape[0]
    total_help_duration = experiment_original[experiment_original['Category'] == 'Required Help']['Duration'].sum()

    total_confused_count = experiment_original[experiment_original['Category'] == 'Confused'].shape[0]
    total_confused_duration = experiment_original[experiment_original['Category'] == 'Confused']['Duration'].sum()

    total_help_or_confused_count = total_help_count + total_confused_count
    total_help_or_confused_duration = total_help_duration + total_confused_duration

    total_interventions = researcher_filtered.shape[0]

    print(f"Summary for Recording: {recording}")
    print(f"Participant thought they required help {total_help_count} times for a total of {total_help_duration} seconds.")
    print(f"Participant was confused {total_confused_count} times for a total of {total_confused_duration} seconds.")
    print(f"Participant thought they required help or was confused {total_help_or_confused_count} times for a total of {total_help_or_confused_duration} seconds.")
    print(f"Researcher decided to intervene {total_interventions} times.\n")

    # Count overlaps by category
    overlap_counts = experiment_original[experiment_original['Overlaps']].groupby('Category').size()

    # Create a list of intervals from the participant's data for all highlights
    participant_intervals = list(zip(experiment_original['Start'], experiment_original['End']))

    # Check overlaps for researcher interventions with participant data
    researcher_filtered.loc[:, 'Overlaps'] = researcher_filtered.apply(
        check_overlap, intervals=participant_intervals, axis=1
    )

    # Calculate totals and overlaps
    total_help = experiment_original[experiment_original['Category'] == 'Required Help'].shape[0]
    total_confused = experiment_original[experiment_original['Category'] == 'Confused'].shape[0]
    total_help_or_confused = total_help + total_confused

    help_overlaps = overlap_counts.get('Required Help', 0)
    confused_overlaps = overlap_counts.get('Confused', 0)
    help_or_confused_overlaps = help_overlaps + confused_overlaps

    # Total researcher interventions
    total_interventions = researcher_filtered.shape[0]

    # Calculate overlaps from researcher's perspective
    researcher_help_overlaps = experiment_original[
        (experiment_original['Category'] == 'Required Help') & (experiment_original['Overlaps'])
    ].shape[0]
    researcher_confused_overlaps = experiment_original[
        (experiment_original['Category'] == 'Confused') & (experiment_original['Overlaps'])
    ].shape[0]
    researcher_help_or_confused_overlaps = researcher_help_overlaps + researcher_confused_overlaps

    # Convert to percentages
    help_percentage = (help_overlaps / total_help) * 100 if total_help > 0 else 0
    confused_percentage = (confused_overlaps / total_confused) * 100 if total_confused > 0 else 0
    help_or_confused_percentage = (
        (help_or_confused_overlaps / total_help_or_confused) * 100
        if total_help_or_confused > 0
        else 0
    )

    researcher_help_percentage = (researcher_help_overlaps / total_interventions) * 100 if total_interventions > 0 else 0
    researcher_confused_percentage = (researcher_confused_overlaps / total_interventions) * 100 if total_interventions > 0 else 0
    researcher_help_or_confused_percentage = (
        (researcher_help_or_confused_overlaps / total_interventions) * 100
        if total_interventions > 0
        else 0
    )

    # Results
    results = {
        "Participant Required Help": f"{help_overlaps} out of {total_help} ({help_percentage:.2f}%)",
        "Participant Confused": f"{confused_overlaps} out of {total_confused} ({confused_percentage:.2f}%)",
        "Participant Required Help or Confused": f"{help_or_confused_overlaps} out of {total_help_or_confused} ({help_or_confused_percentage:.2f}%)",
        "Researcher Intervened on Required Help": f"{researcher_help_overlaps} out of {total_interventions} ({researcher_help_percentage:.2f}%)",
        "Researcher Intervened on Confusion": f"{researcher_confused_overlaps} out of {total_interventions} ({researcher_confused_percentage:.2f}%)",
        "Researcher Intervened on Required Help or Confusion": f"{researcher_help_or_confused_overlaps} out of {total_interventions} ({researcher_help_or_confused_percentage:.2f}%)",
    }

    # Print results for the current experiment
    print(f"Results for Recording: {recording}")
    print("The researcher would successfully intervene:\n")
    print(f"When the participant required help: {results['Participant Required Help']}.")
    print(f"When the participant was confused: {results['Participant Confused']}.")
    print(f"When the participant required help or was confused: {results['Participant Required Help or Confused']}.\n")
    print("The participant required help or was confused when the researcher intervened:\n")
    print(f"Required Help: {results['Researcher Intervened on Required Help']}.")
    print(f"Confused: {results['Researcher Intervened on Confusion']}.")
    print(f"Required Help or Confused: {results['Researcher Intervened on Required Help or Confusion']}.\n")

    # Visualization - Bar Plot for Percentages
    categories = ["Required Help", "Confused", "Help or Confused"]
    participant_percentages = [help_percentage, confused_percentage, help_or_confused_percentage]
    researcher_percentages = [researcher_help_percentage, researcher_confused_percentage, researcher_help_or_confused_percentage]

    plt.figure(figsize=(10, 6))
    x = range(len(categories))
    plt.bar(x, participant_percentages, width=0.4, color='blue', align='center', label="Participant Overlaps (%)")
    plt.bar([p + 0.4 for p in x], researcher_percentages, width=0.4, color='orange', align='center', label="Researcher Interventions (%)")
    plt.xticks([p + 0.2 for p in x], categories)
    plt.title(f"Percentage Comparison: Participant Overlaps vs Researcher Interventions - {recording}")
    plt.ylabel("Percentage")
    plt.legend()
    plt.show()

    # Visualization - Time Series
    # Prepare data for the time series plot
    confused_intervals = experiment_original[experiment_original['Category'] == 'Confused'][['Start', 'End']].values
    help_intervals = experiment_original[experiment_original['Category'] == 'Required Help'][['Start', 'End']].values
    researcher_intervention_starts = researcher_filtered['Start'].values

    # Determine the common x-axis range based on maximum End time
    max_time = max(experiment_original['End'].max(), researcher_filtered['End'].max() if not researcher_filtered.empty else 0)

    # Create the plot
    fig, ax = plt.subplots(figsize=(14, 6))

    # Label flags to ensure labels are added only once
    confused_label_added = False
    help_label_added = False
    intervention_label_added = False

    # Plot participant confusion intervals (blue regions)
    if len(confused_intervals) > 0:
        for start, end in confused_intervals:
            ax.axvspan(start, end, color='blue', alpha=0.3, label='Confused' if not confused_label_added else "")
            confused_label_added = True

    # Plot participant required help intervals (red regions)
    if len(help_intervals) > 0:
        for start, end in help_intervals:
            ax.axvspan(start, end, color='red', alpha=0.3, label='Required Help' if not help_label_added else "")
            help_label_added = True

    # Mark researcher intervention start times (green dots)
    if len(researcher_intervention_starts) > 0:
        ax.scatter(researcher_intervention_starts, [0.5] * len(researcher_intervention_starts), color='green', label='Researcher Intervention' if not intervention_label_added else "")
        intervention_label_added = True

    # Customize the plot
    ax.set_xlim(0, max_time)
    ax.set_title(f"Participant Confusion, Help, and Researcher Interventions Over Time - {recording}")
    ax.set_xlabel("Time (seconds)")
    ax.set_yticks([])
    ax.legend()
    plt.show()


In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Margin of error in seconds
margin_of_error = 5

# Helper function to convert time strings to seconds
def time_to_seconds(time_str):
    h, m, s = map(float, time_str.split(':'))
    return int(h * 3600 + m * 60 + s)

# Load the CSV files
file_path_original = 'reduct-highlights-export.csv'
file_path_researcher = 'reduct-highlights-export (1).csv'
file_path_additional = 'reduct-highlights-export (2).csv'

original_df = pd.read_csv(file_path_original)
researcher_df = pd.read_csv(file_path_researcher)
additional_df = pd.read_csv(file_path_additional)

# Add Start and End columns for all dataframes
for df in [original_df, researcher_df, additional_df]:
    df['Start'] = df['Timestamp'].apply(time_to_seconds)
    df['Duration'] = df['Duration'].apply(time_to_seconds)
    df['End'] = df['Start'] + df['Duration']

# Apply the margin of error to the original dataframe
original_df['Start'] = original_df['Start'] - margin_of_error
original_df['End'] = original_df['End'] + margin_of_error

# Ensure no negative start times (adjust for edge cases)
original_df['Start'] = original_df['Start'].apply(lambda x: max(0, x))

# Prepare the subplot layout
num_experiments = len(original_df['Recording'].unique())
cols = 2
rows = (num_experiments + 1) // 2  # Calculate rows based on columns
fig = make_subplots(rows=rows, cols=cols, subplot_titles=original_df['Recording'].unique())

# Loop through each unique experiment and add to subplots
for idx, recording in enumerate(original_df['Recording'].unique()):
    # Determine subplot row and column
    row = idx // cols + 1
    col = idx % cols + 1

    # Filter data for the current experiment
    experiment_original = original_df[original_df['Recording'] == recording].copy()
    experiment_researcher = researcher_df[researcher_df['Recording'] == recording].copy()
    experiment_additional = additional_df[additional_df['Recording'] == recording].copy()

    # Filter the researcher dataframe for rows with Tags == "#green"
    researcher_filtered = experiment_researcher[experiment_researcher['Tags'] == '#green'].copy()

    # Categorize participant's data
    experiment_original['Category'] = experiment_original['Tags'].apply(
        lambda x: 'Required Help' if x == 'RH' else 'Confused'
    )

    # Add participant confusion intervals (blue regions)
    for _, row_data in experiment_original[experiment_original['Category'] == 'Confused'].iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row_data['Start'], row_data['End'], row_data['End'], row_data['Start'], row_data['Start']],
                y=[0, 0, 1, 1, 0],
                fill="toself",
                fillcolor="rgba(0, 0, 255, 0.2)",  # Blue with transparency
                line=dict(color="rgba(0, 0, 255, 0)"),
                name="Confused",
                hoverinfo="skip",
                showlegend=(idx == 0),
            ),
            row=row,
            col=col,
        )

    # Add participant required help intervals (red regions)
    for _, row_data in experiment_original[experiment_original['Category'] == 'Required Help'].iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row_data['Start'], row_data['End'], row_data['End'], row_data['Start'], row_data['Start']],
                y=[0, 0, 1, 1, 0],
                fill="toself",
                fillcolor="rgba(255, 0, 0, 0.2)",  # Red with transparency
                line=dict(color="rgba(255, 0, 0, 0)"),
                name="Required Help",
                hoverinfo="skip",
                showlegend=(idx == 0),
            ),
            row=row,
            col=col,
        )

    # Add researcher intervention points (green dots)
    fig.add_trace(
        go.Scatter(
            x=researcher_filtered['Start'],
            y=[0.5] * len(researcher_filtered),
            mode="markers",
            marker=dict(color="green", size=10),
            name="Researcher Intervention",
            showlegend=(idx == 0),
        ),
        row=row,
        col=col,
    )

    # Add highlights from the third CSV as black regions with hover tooltips
    for _, row_data in experiment_additional.iterrows():
        fig.add_trace(
            go.Scatter(
                x=[row_data['Start'], row_data['End'], row_data['End'], row_data['Start'], row_data['Start']],
                y=[0, 0, 1, 1, 0],
                fill="toself",
                fillcolor="rgba(0, 0, 0, 0.5)",  # Black with transparency
                line=dict(color="rgba(0, 0, 0, 0)"),
                name="Additional Highlights",
                hoverinfo="text",
                text=f"Tags: {row_data['Tags']}",
                showlegend=(idx == 0),
            ),
            row=row,
            col=col,
        )

# Customize layout
fig.update_layout(
    title="Combined Interactive Plot for All Experiments",
    xaxis_title="Time (seconds)",
    yaxis_title="",
    yaxis=dict(showticklabels=False),
    showlegend=False,
    template="plotly_white",
    height=300 * rows,  # Adjust height based on number of rows
)

# Save the combined plot as an interactive HTML file
output_file = "combined_interactive_plot.html"
fig.write_html(output_file)
print(f"Saved combined interactive plot to {output_file}")


In [None]:
from transformers import pipeline
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Define possible categories
CATEGORIES = [
    "FACIAL MOVEMENT", "GAZE", "GESTURE (hand, head movement)", "LIP MOVEMENT",
    "BROW MOVEMENT", "MANUAL ACTIONS", "HIGH LEVEL TASK ACTIONS", "GENERAL POSTURE",
    "SELF-VOCALIZATIONS", "CHIN MOVEMENT", "TASK SPECIFIC ACTIONS / FIDGETING",
    "FACE FIDGETING", "PUNCTUATIONS", "TAKING OF BREATH", "EMOTION EXPRESSION",
    "MOMENTARY RESET", "SOLVED BEHAVIORS", "OTHER"
]

# Load Zero-Shot Classification model
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Load the CSV file with tags
file_path_additional = 'reduct-highlights-export (2).csv'
additional_df = pd.read_csv(file_path_additional)

# Classify tags into categories
def classify_tag(tag):
    result = classifier(tag, CATEGORIES)
    return result['labels'][0]  # Return the top category



config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

TypeError: object of type 'float' has no len()

In [17]:
additional_df['Tags'] = additional_df['Tags'].fillna("OTHER")

additional_df['Category'] = additional_df['Tags'].apply(classify_tag)

# Visualize clusters
def visualize_clusters(data, column, title):
    categories = data[column].unique()
    category_map = {category: idx for idx, category in enumerate(categories)}
    data['Category_ID'] = data[column].map(category_map)
    
    embeddings = TSNE(n_components=2, random_state=42).fit_transform(
        [[category_map[cat]] for cat in data['Category']]
    )
    
    plt.figure(figsize=(10, 8))
    for cat in categories:
        subset = embeddings[data['Category'] == cat]
        plt.scatter(subset[:, 0], subset[:, 1], label=cat, alpha=0.7)
    
    plt.legend()
    plt.title(title)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()

# Save classified tags
additional_df.to_csv('classified_tags_zero_shot.csv', index=False)
visualize_clusters(additional_df, 'Category', "Zero-Shot Classification Clusters")


AttributeError: 'list' object has no attribute 'shape'

In [None]:
from collections import Counter
import pandas as pd

# Load the clustered tags from Zero-Shot Learning
clustered_df = pd.read_csv('classified_tags_zero_shot.csv')

# Load the original and researcher CSVs
file_path_original = 'reduct-highlights-export.csv'
file_path_researcher = 'reduct-highlights-export (1).csv'
original_df = pd.read_csv(file_path_original)
researcher_df = pd.read_csv(file_path_researcher)

# Add Start and End columns for original_df and researcher_df
for df in [original_df, researcher_df]:
    df['Start'] = df['Timestamp'].apply(lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(":")))))
    df['End'] = df['Start'] + df['Duration']

# Apply a margin of error (5 seconds) to original_df
margin_of_error = 5
original_df['Start'] -= margin_of_error
original_df['End'] += margin_of_error

# Ensure no negative start times
original_df['Start'] = original_df['Start'].clip(lower=0)

# Categorize participant's data
original_df['Category'] = original_df['Tags'].apply(lambda x: 'Required Help' if x == 'RH' else 'Confused')

# Analyze overlaps
def analyze_overlaps_with_margin(original_df, clustered_df, category):
    """
    Analyze overlaps of clustered tags with the given category in original_df,
    applying a margin of error of 5 seconds.
    """
    overlapping_categories = []
    for _, row in original_df[original_df['Category'] == category].iterrows():
        overlaps = clustered_df[
            (clustered_df['Start'] <= row['End']) & (clustered_df['End'] >= row['Start'])
        ]
        overlapping_categories.extend(overlaps['Category'].tolist())
    return Counter(overlapping_categories)

# Perform analysis for each category
results = {}
for category in ['Confused', 'Required Help']:
    results[category] = analyze_overlaps_with_margin(original_df, clustered_df, category)

# Analyze overlaps for researcher interventions
researcher_intervals = researcher_df[researcher_df['Tags'] == '#green']
results['Intervention'] = analyze_overlaps_with_margin(researcher_intervals, clustered_df, 'Confused')  # Assuming interventions overlap with confused

# Display results
print("Category Analysis for Overlapping Confused, Required Help, and Interventions:")
for category, counts in results.items():
    print(f"\n{category} Overlap Analysis:")
    for tag_category, count in counts.most_common():
        print(f"{tag_category}: {count} occurrences")


In [16]:
import openai
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Set your OpenAI API Key
openai.api_key = "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"

# Define possible categories
CATEGORIES = [
    "FACIAL MOVEMENT", "GAZE", "GESTURE (hand, head movement)", "LIP MOVEMENT",
    "BROW MOVEMENT", "MANUAL ACTIONS", "HIGH LEVEL TASK ACTIONS", "GENERAL POSTURE",
    "SELF-VOCALIZATIONS", "CHIN MOVEMENT", "TASK SPECIFIC ACTIONS / FIDGETING",
    "FACE FIDGETING", "PUNCTUATIONS", "TAKING OF BREATH", "EMOTION EXPRESSION",
    "MOMENTARY RESET", "SOLVED BEHAVIORS", "OTHER"
]

# Load the CSV file with tags
file_path_additional = 'reduct-highlights-export (2).csv'
additional_df = pd.read_csv(file_path_additional)

# Classify tags using GPT model
def classify_tag_gpt(tag):
    messages = [
        {"role": "system", "content": "You are an assistant that classifies user behaviors into predefined categories."},
        {"role": "user", "content": f"""
Classify the following behavior into one of these categories: {', '.join(CATEGORIES)}.

Behavior: {tag}

Response:"""}
    ]
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=10,
    )
    return response['choices'][0]['message']['content'].strip()

# Apply classification to all tags
additional_df['Category'] = additional_df['Tags'].apply(classify_tag_gpt)

# Visualize clusters
def visualize_clusters(data, column, title):
    categories = data[column].unique()
    category_map = {category: idx for idx, category in enumerate(categories)}
    data['Category_ID'] = data[column].map(category_map)
    
    embeddings = TSNE(n_components=2, random_state=42).fit_transform(
        [[category_map[cat]] for cat in data['Category']]
    )
    
    plt.figure(figsize=(10, 8))
    for cat in categories:
        subset = embeddings[data['Category'] == cat]
        plt.scatter(subset[:, 0], subset[:, 1], label=cat, alpha=0.7)
    
    plt.legend()
    plt.title(title)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.show()

# Save classified tags
additional_df.to_csv('classified_tags_gpt.csv', index=False)
visualize_clusters(additional_df, 'Category', "ChatGPT gpt-4o Classification Clusters")


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
# Load the classified tags from both methods
zero_shot_df = pd.read_csv('classified_tags_zero_shot.csv')
gpt_df = pd.read_csv('classified_tags_gpt.csv')

# Combine data for comparison
combined_df = zero_shot_df.copy()
combined_df['GPT_Category'] = gpt_df['Category']

# Visualize both methods
def compare_visualizations(data, column1, column2):
    categories1 = data[column1].unique()
    categories2 = data[column2].unique()

    category_map1 = {category: idx for idx, category in enumerate(categories1)}
    category_map2 = {category: idx for idx, category in enumerate(categories2)}

    data['Category1_ID'] = data[column1].map(category_map1)
    data['Category2_ID'] = data[column2].map(category_map2)

    embeddings1 = TSNE(n_components=2, random_state=42).fit_transform(
        [[category_map1[cat]] for cat in data[column1]]
    )
    embeddings2 = TSNE(n_components=2, random_state=42).fit_transform(
        [[category_map2[cat]] for cat in data[column2]]
    )

    plt.figure(figsize=(14, 8))

    # Zero-shot clusters
    plt.subplot(1, 2, 1)
    for cat in categories1:
        subset = embeddings1[data[column1] == cat]
        plt.scatter(subset[:, 0], subset[:, 1], label=cat, alpha=0.7)
    plt.title("Zero-Shot Classification Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.legend()

    # GPT-3/4 clusters
    plt.subplot(1, 2, 2)
    for cat in categories2:
        subset = embeddings2[data[column2] == cat]
        plt.scatter(subset[:, 0], subset[:, 1], label=cat, alpha=0.7)
    plt.title("GPT-3/4 Classification Clusters")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.legend()

    plt.tight_layout()
    plt.show()

# Compare both methods
compare_visualizations(combined_df, 'Category', 'GPT_Category')
