In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

print(os.getcwd())
# os.chdir('../')
print(os.getcwd())

/Users/bb320/Library/CloudStorage/GoogleDrive-burint@bnmanalytics.com/My Drive/Imperial/01_Projects/TeamofRivals/Analysis/Con2vec-1
/Users/bb320/Library/CloudStorage/GoogleDrive-burint@bnmanalytics.com/My Drive/Imperial/01_Projects/TeamofRivals/Analysis/Con2vec-1


In [None]:
# merged_df = pd.read_csv('./Output/super_May22/Merged_data.csv')
# linfeat = pd.read_csv('./Output/super_May22/may_super_turn_level_bb.csv')
# merged_df = pd.concat([merged_df, linfeat], axis = 1)
# print(list(merged_df))


['Pair_Speaker_turn', 'PairID', 'PersonID', 'Speaker', 'Speaker_original', 'Turn', 'Speaker_turn', 'Turn_Boundary', 'Turn Start', 'Turn End', 'PairID_text', 'PersonID_text', 'Speaker_text', 'Speaker_original_text', 'Turn_text', 'Word', 'Start Time', 'End Time', 'Backchannel', 'Overlap', 'Contested', 'Duration', 'Sentiment', 'word_count', 'PairID_vocal', 'PersonID_vocal', 'Turn Start_vocal', 'Turn End_vocal', 'Rms', 'Pitch', 'Pulse', 'ZCR', 'Spectral_Centroid', 'Spectral_Bandwidth', 'positive_bert', 'negative_bert', 'neutral_bert', 'info_exchange_zscore_chats', 'discrepancies_lexical_wordcount', 'hear_lexical_wordcount', 'home_lexical_wordcount', 'conjunction_lexical_wordcount', 'certainty_lexical_wordcount', 'inclusive_lexical_wordcount', 'bio_lexical_wordcount', 'achievement_lexical_wordcount', 'adverbs_lexical_wordcount', 'anxiety_lexical_wordcount', 'third_person_lexical_wordcount', 'negation_lexical_wordcount', 'swear_lexical_wordcount', 'death_lexical_wordcount', 'health_lexical_w

In [None]:
# Step 0: Subset Data to Relevant Variables
def subset_columns(df, audio_df):
    # Select columns containing specific keywords from the main dataset
    relevant_columns = [
        col for col in df.columns 
        if any(keyword in col for keyword in ['lexical_wordcount', 'convokit', 'receptiveness'])
    ]
    
    # Rename columns to remove specific substrings in their names
    rename_mapping = {
        col: col.replace('_lexical_wordcount', '').replace('_politeness_convokit', '').replace('_receptiveness_yeomans', '')
        for col in relevant_columns
    }
    df = df.rename(columns=rename_mapping)

    # Include specific audio variables from the audio dataset
    audio_variables = ['Rms', 'Pitch', 'Pulse', 'ZCR', 'Spectral_Centroid', 'Spectral_Bandwidth']
    audio_variable_mapping = {
        'Rms': 'Loudness_Energy',
        'Pitch': 'Intonation_Patterns',
        'Pulse': 'Rhythm_Strength',
        'ZCR': 'Voiced_Unvoiced_Distinction',
        'Spectral_Centroid': 'Spectral_Center',
        'Spectral_Bandwidth': 'Spectral_Width'
    }
    
    # Rename audio variables for better interpretability
    audio_df = audio_df[audio_variables].rename(columns=audio_variable_mapping)

    # Combine the two datasets
    combined_df = pd.concat([df[list(rename_mapping.values())], audio_df], axis=1)
    
    return combined_df

In [7]:
# Step 1: Preprocess Data
def preprocess_data(df, variables_to_include):
    scaler = StandardScaler()
    numeric_data = df[variables_to_include]
    scaled_data = scaler.fit_transform(numeric_data)
    return scaled_data


In [8]:
# Step 2: Perform PCA and Clustering on Entire Dataset
def perform_pca_clustering(df, variables_to_include, n_clusters=5):
    scaled_data = preprocess_data(df, variables_to_include)
    
    # PCA
    pca = PCA()
    pca_data = pca.fit_transform(scaled_data)
    n_components = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.8) + 1
    pca = PCA(n_components=n_components)
    pca_data_reduced = pca.fit_transform(scaled_data)
    
    # Clustering
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(pca.components_.T)
    
    # Assign clusters to variables
    clusters = kmeans.labels_
    clustered_variables = pd.DataFrame({
        'Variable': variables_to_include,
        'Cluster': clusters
    })
    
    # Evaluate clustering
    silhouette_avg = silhouette_score(pca.components_.T, clusters)
    print(f"Silhouette Score for entire dataset: {silhouette_avg:.2f}")
    
    return clustered_variables, silhouette_avg, pca

In [9]:
# Step 3: Rank Clusters by Variability Across StageLabels and Generate Intuitive Names
def rank_clusters_by_stage_label(clustered_variables, df, stage_label_col):
    cluster_summary = []

    for cluster_id in clustered_variables['Cluster'].unique():
        cluster_vars = clustered_variables[clustered_variables['Cluster'] == cluster_id]['Variable']

        # Generate descriptive name for the cluster based on variable composition
        cluster_name = ", ".join(cluster_vars.head(3))  # Use top 3 variables as representation
        if len(cluster_vars) > 3:
            cluster_name += ", ..."  # Indicate more variables if applicable

        # Calculate variability of each cluster across StageLabel groups
        cluster_variability = cluster_vars.apply(
            lambda var: df.groupby(stage_label_col)[var].var().mean()
        ).mean()

        cluster_summary.append({
            'Cluster': cluster_id,
            'ClusterName': cluster_name,
            'MeanVariability': cluster_variability
        })

    # Convert to DataFrame and rank by MeanVariability
    cluster_summary_df = pd.DataFrame(cluster_summary)
    cluster_summary_df = cluster_summary_df.sort_values(by='MeanVariability', ascending=False)

    # Create rankings for each StageLabel
    stage_label_rankings = {}
    for label, segment_df in df.groupby(stage_label_col):
        stage_ranking = []
        for _, row in cluster_summary_df.iterrows():
            cluster_id = row['Cluster']
            cluster_vars = clustered_variables[clustered_variables['Cluster'] == cluster_id]['Variable']
            stage_variability = cluster_vars.apply(
                lambda var: segment_df[var].var()
            ).mean()
            stage_ranking.append((row['ClusterName'], stage_variability))
        stage_label_rankings[label] = sorted(stage_ranking, key=lambda x: x[1], reverse=True)

    return cluster_summary_df, stage_label_rankings

Main

In [26]:
# Load Dataset
merged_df = pd.read_csv('./Output/super_May22/Merged_data.csv')
audio_df = pd.read_csv('./Output/super_May22/Vocal_agg.csv')

# Load StageLabel from external file
stages_df = pd.read_csv('./Output/super_May22/Segmented_Conversations_With_Conflicts.csv')
stage_label_col = 'Stage'


# 'Merge' StageLabel into the main dataset
merged_df[stage_label_col] = stages_df[stage_label_col]

# Step 0: Subset columns
merged_df = subset_columns(merged_df, audio_df)
print(list(merged_df))
# Define variables to include
variables_to_include = merged_df.columns.tolist()

# Perform PCA and clustering on the entire dataset
clustered_variables, silhouette_avg, pca = perform_pca_clustering(merged_df, variables_to_include, n_clusters=5)

# Rank clusters by variability across StageLabels
cluster_summary, stage_label_rankings = rank_clusters_by_stage_label(clustered_variables, merged_df, stage_label_col)

# Output results
print("Cluster Summary:")
print(cluster_summary)

print("\nStageLabel Rankings:")
for stage, ranking in stage_label_rankings.items():
    print(f"Stage {stage} Rankings:")
    for cluster_name, variability in ranking:
        print(f"  Cluster: {cluster_name}, Variability: {variability}")

      Loudness_Energy  Intonation_Patterns  Rhythm_Strength  \
0            0.047715           252.661747         0.857723   
1            0.173928           892.135910         1.639412   
2            0.072644           298.460373         1.338938   
3            0.032812           460.263498         1.472092   
4            0.043951           497.194370         1.085941   
...               ...                  ...              ...   
4769         0.045932          1169.090052         2.090178   
4770         0.031944           625.098561         0.679354   
4771         0.077407           734.287699         2.546044   
4772         0.061625           540.509055         1.148260   
4773         0.037241           963.820644         0.853649   

      Voiced_Unvoiced_Distinction  Spectral_Center  Spectral_Width  
0                        0.019842      1351.088601     1721.346359  
1                        0.043669      1330.734780     1150.572495  
2                        0.035137   

KeyError: 'Stage'