In [34]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from IPython.display import display, HTML
import plotly.express as px
import umap
from collections import defaultdict
import textwrap

## Load the posts with frames, and (separately) the frame-clusters

In [35]:
# def get_results_from_hyperparams(hyperparams, base_dir=os.path.join('/','zfs','disinfo','narratives','blm','clustering_calibration_results')):
#     # Format each hyperparameter correctly before joining
#     formatted_hyperparams = {key: f'{int(value)}' if key != 'cluster_selection_epsilon' else f'{value}' for key, value in hyperparams.items()}
#     filename = f"blm_data_clustered_{'_'.join([f'{key}{value}' for key, value in formatted_hyperparams.items()])}.csv"
#     full_path = os.path.join(base_dir, filename)
#     df_cluster_labels = pd.read_csv(full_path)
#     return df_cluster_labels

In [36]:
# # Get the cluster labels provided by each set of hyperparams, and add the dfs as elements inside df_hyperparams
# dfs = df_hyperparams.apply(lambda x: get_results_from_hyperparams(x.to_dict()), axis=1)

In [37]:
# Filepath where frame data is stored
frames_filepath = os.path.join('.', 'frame_extraction_results.csv')

df_posts_with_frames = pd.read_csv(frames_filepath)
df_posts_with_frames.head()

Unnamed: 0,id,text,is_needle,frames
0,746779150057365504,RT @PaulaAtlantaGA: Call and response at GA th...,0,['Black lives matter']
1,746779141777690624,RT @LivingOnChi: 4/7/16 Bill Clinton insulted ...,0,['Bill Clinton insulted Black Lives Matter']
2,746779085121036288,https://t.co/aihG9teRYP #BlackLivesMatter(too)...,0,['Black lives matter']
3,746778854505734144,#BlackLivesMatter https://t.co/eN2Ee264TG,0,['Black lives matter']
4,746778837585788929,Yooooooo 😂😂😂😂 #God #GodInMeAsMe #BlackLivesMat...,0,"['God is in everyone', 'Black lives matter', '..."


In [38]:
# Filepath where cluster label data is stored
clusters_filepath = os.path.join('.', 'frame_cluster_results.csv')

df_frames_with_cluster_labels = pd.read_csv(clusters_filepath)
df_frames_with_cluster_labels.head()

Unnamed: 0,frames,embeddings,id,cluster_labels
0,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746779150057365504,0
1,Bill Clinton insulted Black Lives Matter,"[-0.044787591145812065, -0.02040063124356746, ...",746779141777690624,-1
2,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746779085121036288,0
3,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746778854505734144,0
4,God is in everyone,"[-0.0009141607887425742, 0.0023581552895802584...",746778837585788929,-1


## Gather sentiment analysis for each post (optional)

In [39]:
gather_sentiment = True

if gather_sentiment:
    # Use a pipeline as a high-level helper
    from transformers import pipeline
    import torch

    # Check if CUDA is available and set the device accordingly
    device = 0 if torch.cuda.is_available() else -1  # -1 indicates CPU

    pipe = pipeline("text-classification", 
                    model="cardiffnlp/twitter-roberta-base-sentiment-latest", 
                    return_all_scores=True,
                    device=device
                   )

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
sentiments = pipe(df_posts_with_frames.text.to_list())

In [41]:
# Process each element in 'sentiments' to create a list of dictionaries
sentiments_processed = []
for sentiment_group in sentiments:
    sentiment_dict = {d['label']: d['score'] for d in sentiment_group}
    sentiments_processed.append(sentiment_dict)

# Convert the processed list into a DataFrame
df_sentiments = pd.DataFrame(sentiments_processed)
df_sentiments.head()

Unnamed: 0,negative,neutral,positive
0,0.181578,0.768259,0.050163
1,0.651048,0.339297,0.009655
2,0.003891,0.014501,0.981608
3,0.266028,0.658572,0.075399
4,0.008426,0.130322,0.861251


In [42]:
# Concatenate this new DataFrame with existing df_tweets
df_posts_with_frames = pd.concat([df_posts_with_frames, df_sentiments], axis=1)
df_posts_with_frames.head()

Unnamed: 0,id,text,is_needle,frames,negative,neutral,positive
0,746779150057365504,RT @PaulaAtlantaGA: Call and response at GA th...,0,['Black lives matter'],0.181578,0.768259,0.050163
1,746779141777690624,RT @LivingOnChi: 4/7/16 Bill Clinton insulted ...,0,['Bill Clinton insulted Black Lives Matter'],0.651048,0.339297,0.009655
2,746779085121036288,https://t.co/aihG9teRYP #BlackLivesMatter(too)...,0,['Black lives matter'],0.003891,0.014501,0.981608
3,746778854505734144,#BlackLivesMatter https://t.co/eN2Ee264TG,0,['Black lives matter'],0.266028,0.658572,0.075399
4,746778837585788929,Yooooooo 😂😂😂😂 #God #GodInMeAsMe #BlackLivesMat...,0,"['God is in everyone', 'Black lives matter', '...",0.008426,0.130322,0.861251


## Combine clustering calibration results with post text, get dim-reduced embeddings (for visualization)

In [43]:
# For each df in df_hyperparams.df, add to it columns with text and is_needle
# Define col you're interested in analyzing (binary indicators)
col_of_interest= 'is_needle'

# Apply this function to each dataframe that will be in the 'df' column of df_hyperparams
df = pd.merge(df_frames_with_cluster_labels, 
              df_posts_with_frames[[col_of_interest] + ['id', 'text', 'negative', 'neutral', 'positive']], 
              on='id', how='left')

In [44]:
df.head()

Unnamed: 0,frames,embeddings,id,cluster_labels,is_needle,text,negative,neutral,positive
0,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746779150057365504,0,0,RT @PaulaAtlantaGA: Call and response at GA th...,0.181578,0.768259,0.050163
1,Bill Clinton insulted Black Lives Matter,"[-0.044787591145812065, -0.02040063124356746, ...",746779141777690624,-1,0,RT @LivingOnChi: 4/7/16 Bill Clinton insulted ...,0.651048,0.339297,0.009655
2,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746779085121036288,0,0,https://t.co/aihG9teRYP #BlackLivesMatter(too)...,0.003891,0.014501,0.981608
3,Black lives matter,"[-0.023681530033542852, -0.02645277340519284, ...",746778854505734144,0,0,#BlackLivesMatter https://t.co/eN2Ee264TG,0.266028,0.658572,0.075399
4,God is in everyone,"[-0.0009141607887425742, 0.0023581552895802584...",746778837585788929,-1,0,Yooooooo 😂😂😂😂 #God #GodInMeAsMe #BlackLivesMat...,0.008426,0.130322,0.861251


In [45]:
# Get dim-reduced version of embeddings

def string_to_float_list(s):
    # Remove the brackets and split the string into a list of strings
    float_strings = s.strip('[]').split(',')

    # Convert each string to a float
    return [float(item) for item in float_strings]

# Apply the conversion function to the 'embeddings' column
embeddings = df.embeddings.apply(string_to_float_list)

# Step 1: Identify and extract unique embeddings
unique_embeddings = defaultdict(list)
for idx, emb in enumerate(embeddings):
    unique_embeddings[tuple(emb)].append(idx) # Use tuple because numpy arrays are not hashable

unique_emb_list = list(unique_embeddings.keys())

umap_model = umap.UMAP(n_components=2, random_state=355)

# Step 2: Apply UMAP only to the unique embeddings
reduced_unique_embeddings = umap_model.fit_transform(unique_emb_list)

# Step 3: Map the non-unique embeddings to their UMAP-reduced counterparts
reduced_embeddings = [reduced_unique_embeddings[unique_emb_list.index(tuple(emb))] for emb in embeddings]


In [46]:
# Unpack each numpy array in reduced_embeddings outside the loop
umap_1, umap_2 = zip(*reduced_embeddings)

# Replace high-dimensional embeddings with 2-d ones in the DataFrame
# Add the UMAP dimensions as new columns
df['UMAP_1'] = umap_1
df['UMAP_2'] = umap_2

# Drop the existing 'embeddings' column
if 'embeddings' in df.columns:
    df.drop(columns='embeddings', inplace=True)

In [2]:
# Save/load

save_path = os.path.join('.', 'data', 'combined_results.json')
# df.to_json(save_path)
df = pd.read_json(save_path)

## Check correlation of sentiment with is_needle, both globally and within each cluster

In [47]:
# Step 1: Calculate and print the overall correlation
overall_corr = df[[col_of_interest, 'negative', 'neutral', 'positive']].corr()[col_of_interest]
print(f"Overall Correlation with {col_of_interest}:\n", overall_corr, "\n")

# Step 2: Calculate correlation within each cluster_label
cluster_corrs = df.groupby('cluster_labels')[[col_of_interest, 'negative', 'neutral', 'positive']].corr().loc[:, col_of_interest]

# Filter out the correlation of 'is_needle' with itself
cluster_corrs = cluster_corrs[cluster_corrs.index.get_level_values(1) != col_of_interest]

# Step 3: Find the five clusters with the highest magnitude of correlation
top_clusters = cluster_corrs.abs().groupby(level=0).mean().nlargest(5)
print("Top 5 Clusters with Highest Magnitude of Correlation:\n", top_clusters)

Overall Correlation with is_needle:
 is_needle   NaN
negative    NaN
neutral     NaN
positive    NaN
Name: is_needle, dtype: float64 

Top 5 Clusters with Highest Magnitude of Correlation:
 cluster_labels
-1   NaN
 0   NaN
 1   NaN
 2   NaN
 3   NaN
Name: is_needle, dtype: float64


## Visualize results

In [48]:
def plot_dim_red_interactive(embeddings, title=""):
    """
    Plots 2D dimension-reduced representation of embeddings colored by labels interactively.

    Parameters:
    - embeddings (list): The embeddings to be plotted.
    - labels (list): Cluster labels for each embedding.
    - theories (list): Theories corresponding to each embedding.
    - title (str): Title for the plot.
    """
    
    # Create a dataframe for plotting
    df_plot = embeddings[['UMAP_1', 'UMAP_2']].copy()
    df_plot['labels'] = embeddings['cluster_labels'].astype(str)  # Convert labels to string
    df_plot['text'] = embeddings['text']
    df_plot['frame'] = embeddings['theories']
    
    # Apply word wrapping to the 'text' and 'frame' columns using a lambda function
    df_plot['text'] = df_plot['text'].apply(lambda x: textwrap.fill(x, width=50).replace('\n', '<br>'))
    df_plot['frame'] = df_plot['frame'].fillna('MISSING').apply(lambda x: textwrap.fill(x, width=50).replace('\n', '<br>'))
    
    # Sort by 'labels'
    df_plot = df_plot.sort_values(by='labels')
    
    # Create an interactive plot
    fig = px.scatter(df_plot,
                    x='UMAP_1',
                    y='UMAP_2',
                    color='labels',
                    hover_name='text',
                    hover_data=['frame'],  # Add 'frame' to hover data
                    title=title,
                    category_orders={"labels": sorted(df_plot['labels'].unique())})  # Ensure the labels are treated as discrete values
    
    fig.update_layout(height=800, width=800)
    fig.update_traces(marker=dict(size=3))
    
    fig.show()


In [49]:
def get_and_plot_sentiment_corrs_with_cluster_label(df, n=10):
    # Compute the correlations
    cluster_corrs = df.groupby('cluster_labels')[[col_of_interest, 'negative', 'neutral', 'positive']].corr().loc[:, col_of_interest]
    cluster_corrs = cluster_corrs[cluster_corrs.index.get_level_values(1) != col_of_interest]

    # Find the top n clusters with the highest magnitude of correlation
    top_clusters = cluster_corrs.abs().groupby(level=0).mean().nlargest(n).reset_index()

    # Sorting the data by correlation strength in descending order
    top_clusters.columns = ['Cluster Label', 'Correlation Strength']
    sorted_top_clusters = top_clusters.sort_values('Correlation Strength', ascending=False)

    # Plotting with seaborn
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Cluster Label', y='Correlation Strength', data=sorted_top_clusters, palette='viridis', order=sorted_top_clusters['Cluster Label'])
    plt.title('Top {} Correlations of Sentiment with Cluster Labels'.format(n))
    plt.xlabel('Cluster Labels')
    plt.ylabel('Correlation Strength')
    plt.show()
    
    # Display correlations in a table format
    correlation_table = pd.DataFrame()

    for cluster_label in sorted_top_clusters['Cluster Label']:
        correlations = cluster_corrs.loc[cluster_label]
        correlation_table[cluster_label] = correlations

    # Transpose the table for better readability
    correlation_table = correlation_table.T
    print("\nCorrelations of cluster labels with sentiment:")
    print(correlation_table)

In [50]:
def run_analysis(df, n_bars, n_heat):
    # Compute the mean of 'is_needle' for each 'cluster_label'
    mean_needliness = df.groupby('cluster_labels')[col_of_interest].mean()

    # Display number of clusters
    print(f'Total clusters: {df.cluster_labels.nunique()}')

    # Reset index to make 'cluster_label' a column (for easier plotting)
    mean_needliness = mean_needliness.reset_index()

    # Sorting by 'is_needle' in descending order
    sorted_mean_needliness = pd.concat([mean_needliness.sort_values(col_of_interest, ascending=False).head(n_bars//2),
                                        mean_needliness.sort_values(col_of_interest, ascending=False).tail(n_bars//2)])

    # Convert 'cluster_labels' to a categorical with the order defined by 'is_needle' values
    sorted_mean_needliness['cluster_labels'] = pd.Categorical(
        sorted_mean_needliness['cluster_labels'], 
        categories=sorted_mean_needliness['cluster_labels'].unique(), 
        ordered=False
    )

    # Create a contingency table
    contingency_table = pd.crosstab(df[col_of_interest], df['cluster_labels'])

    # Perform the chi-squared test
    chi2, p, dof, expected = chi2_contingency(contingency_table)

    # Normalize the contingency table by the total counts of each 'is_needle' category
    normalized_table = contingency_table.div(contingency_table.sum(axis=1), axis=0)

    # Calculate the difference between the proportions of the two 'is_needle' groups
    difference = normalized_table.diff().iloc[1]  # Difference between the second row and the first row

    # Identify the top n categories with the largest magnitude differences
    top_n_categories = difference.abs().nlargest(n_heat).index

    # Filter the contingency table and the normalized table for the top n categories
    filtered_contingency_table = contingency_table[top_n_categories]
    filtered_normalized_table = normalized_table[top_n_categories]
    difference_filtered = difference[top_n_categories]

    # Create a DataFrame from the filtered difference series
    difference_df_filtered = pd.DataFrame(difference_filtered).T
    difference_df_filtered.index = ['Difference']

    # Round the values to two significant figures
    difference_df_filtered = difference_df_filtered.round(2)

    # Concatenate the filtered difference row to both the filtered contingency table and the normalized table
    filtered_contingency_table_with_diff = pd.concat([filtered_contingency_table, difference_df_filtered])
    filtered_normalized_table_with_diff = pd.concat([filtered_normalized_table, difference_df_filtered])

    # Create a figure with two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))  # Adjust the figure size as needed

    # Plotting the bar chart in the first subplot
    sns.barplot(x='cluster_labels', y=col_of_interest, data=sorted_mean_needliness, ax=ax1)
    ax1.set_title(f'Mean of is_needle for each cluster label\nTop {n_bars//2} and bottom {n_bars//2} clusters')
    ax1.set_xlabel('Cluster Label')
    ax1.set_ylabel(f'Average of {col_of_interest}')
    ax1.tick_params(axis='x', rotation=45)

    # Plotting the heatmap in the second subplot
    sns.heatmap(filtered_normalized_table_with_diff, annot=filtered_contingency_table_with_diff, cmap="YlGnBu", fmt='g', annot_kws={"rotation": 45}, ax=ax2)
    ax2.set_title(f'Heatmap of Contingency Table\nwith Proportion Difference w/r/t {col_of_interest}\nfor Top {n_heat} Categories')
    ax2.set_xlabel('Cluster Labels')
    ax2.set_ylabel(f'{col_of_interest} and Difference')

    plt.tight_layout()  # Adjusts the subplots to fit into the figure area.
    plt.show()

    # Display the statistical results
    results_html = f"""
    Null hypothesis: `{col_of_interest}` is not associated with cluster label. Chi-squared test says:
    <table>
        <tr><th>Statistic</th><th>Value</th></tr>
        <tr><td>Chi-Squared Statistic</td><td>{chi2:.2f}</td></tr>
        <tr><td>P-Value</td><td>{p:.4f}</td></tr>
        <tr><td>Degrees of Freedom</td><td>{dof}</td></tr>
    </table>
    """
    display(HTML(results_html))
    
    get_and_plot_sentiment_corrs_with_cluster_label(df)

    plot_dim_red_interactive(df)
    
    


In [51]:
# Determine max number of clusters to display in barchart
n_bars=20
# Select how many clusters to show in heatmap
n_heat=10
# Select which of the four clusterings to use
selected_idx=0

run_analysis(df, n_bars=n_bars, n_heat=n_heat)

Total clusters: 5


IndexError: single positional indexer is out-of-bounds