# Calculate alignment measures

In this notebook, there is code to:
* Calculate all alignment metrics for all debates
* Look at the descriptive statistics of every metric on the full dataset
* Calculate the metrics' correlation with frequency
* Calculate the intercorrelation between metrics
* Look at results for the running example in the paper





In [1]:
import seaborn as sns
sns.set_style("whitegrid")
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from scipy.stats import spearmanr
import pickle
from scipy.spatial.distance import cosine
from utils import load_representations, load_similarities, load_cluster_info, load_tfidf
from metrics_utils import *
from scipy.stats import shapiro, mannwhitneyu
import pdb

mask_types = ['no-mask','one-mask','multi-mask']


In [2]:
# The representations and the similarity data are both loaded from the same
# location string.
representations_dir = "bert_representations"
all_reps = load_representations(representations_dir)
all_data = load_similarities(representations_dir)

# Cluster information; the cells below index it by debate id.
cluster_data = load_cluster_info("debates_full_chains")

# Calculate all alignment metrics + dialign measures

In [3]:

def calculate_all_metrics_for_one_debate(debate_id, all_reps, cluster_data):
    """Compute the per-cluster alignment measures of one debate.

    Parameters
    ----------
    debate_id
        Key used to index both ``all_reps`` and ``cluster_data``.
    all_reps
        ``all_reps[debate_id]`` is a sequence of clusters; each cluster is
        indexed as ``cl[mask][side][half]`` and yields items carrying a
        ``'representation'`` entry.
    cluster_data
        ``cluster_data[debate_id]`` provides ``'clusters'`` (one entry per
        cluster with a ``'type'`` and, for ``"coref"`` clusters, ``'mentions'``
        and ``'speaker_types'``) and ``'document'``.

    Returns
    -------
    list of dict
        One dict per cluster, keyed by mask. Each mask maps measure names to
        values: ``'SV'`` (coref clusters only, as ``{'value', 'freq'}``) and,
        when representations exist for both sides and both halves,
        ``TUSS``/``TUOS``/``TASS``/``sApp``/``asApp``/``DS`` in their
        ``_cos`` and ``_eucl`` variants. A mask without usable
        representations keeps only what was stored before the check
        (possibly nothing).
    """
    masks = ['no-mask', 'one-mask', 'multi-mask']
    halves = ['first-half', 'second-half']
    sides = ['for', 'against']
    all_measures_for_debate = []

    for clnum, cl in enumerate(all_reps[debate_id]):
        cluster_info = cluster_data[debate_id]['clusters'][clnum]

        # SV (Shared vocabulary of a concept) only for clusters of type "coref".
        # It is derived from the mention strings alone, so it does not depend
        # on the masking strategy: compute it once and store it under each mask.
        is_coref = cluster_info["type"] == "coref"
        if is_coref:
            mentions = []
            sides_by_mention = []
            for j, (m_st, m_end) in enumerate(cluster_info['mentions']):
                mentions.append(" ".join(cluster_data[debate_id]['document'][m_st:m_end+1]))
                sides_by_mention.append(cluster_info['speaker_types'][j])
            overlap, total_freq = shared_vocabulary_of_concept(mentions, sides_by_mention)

        measures_cluster = dict()
        for mask in masks:
            measures_cluster[mask] = dict()

            # Gather the representations per side (both halves together) and
            # the average representation per side and per (side, half).
            all_reps_per_side = dict()
            avg_reps_per_side = dict()
            avg_reps_per_side_and_half = dict()
            for side in cl[mask]:
                avg_reps_per_side_and_half[side] = dict()
                all_reps_per_side[side] = []
                for half in halves:
                    all_reps_this_side_and_half = [rep['representation'] for rep in cl[mask][side][half]]
                    all_reps_per_side[side].extend(all_reps_this_side_and_half)
                    # Averaging an empty list yields NaN (with a NumPy warning);
                    # that NaN is what the availability check below looks for.
                    avg_reps_per_side_and_half[side][half] = np.average(all_reps_this_side_and_half, axis=0)
                avg_reps_per_side[side] = np.average(all_reps_per_side[side], axis=0)

            #####################################
            ####### TIME-UNAWARE MEASURES #######

            if is_coref:
                measures_cluster[mask]['SV'] = {'value': overlap, 'freq': total_freq}

            ### For the other metrics, which will be used for normal words only,
            # we want to have at least one instance per word and per side.
            # Calculate measures for this cluster only if vectors are available.
            vecs_not_available = any(
                np.isnan(avg_reps_per_side[side]).any()
                or np.isnan(avg_reps_per_side_and_half[side]['first-half']).any()
                or np.isnan(avg_reps_per_side_and_half[side]['second-half']).any()
                for side in sides
            )
            if vecs_not_available:
                continue

            # Time-unaware self-similarity (SS_{TU})
            ss_tu = SS_TU(all_reps_per_side)
            measures_cluster[mask]['TUSS_cos'] = ss_tu['cos']
            measures_cluster[mask]['TUSS_eucl'] = ss_tu['eucl']

            # Time-unaware other-similarity (OS_{TU})
            os_tu = OS_TU(all_reps_per_side)
            measures_cluster[mask]['TUOS_cos'] = os_tu['cos']
            measures_cluster[mask]['TUOS_eucl'] = os_tu['eucl']

            #####################################
            ######## TIME AWARE MEASURES ########

            # Time-aware self-similarity (SS_{TA})
            ss_ta = SS_TA(cl, mask)
            measures_cluster[mask]['TASS_cos'] = ss_ta['cos']
            measures_cluster[mask]['TASS_eucl'] = ss_ta['eucl']

            # sApp
            sapp = sApp(cl, mask)
            measures_cluster[mask]['sApp_cos'] = sapp['cos']
            measures_cluster[mask]['sApp_eucl'] = sapp['eucl']

            # asApp
            asapp = asApp(cl, mask)
            measures_cluster[mask]['asApp_cos'] = asapp['cos']
            measures_cluster[mask]['asApp_eucl'] = asapp['eucl']

            # DS: derived from the two sides' asApp values, once per distance.
            for distance in ('cos', 'eucl'):
                asapp_by_side = measures_cluster[mask]['asApp_' + distance]
                ds_by_side = dict()
                ds_by_side['for'], ds_by_side['against'] = DS(asapp_by_side['for'], asapp_by_side['against'])
                measures_cluster[mask]['DS_' + distance] = ds_by_side

        all_measures_for_debate.append(measures_cluster)

    return all_measures_for_debate


In [4]:

def aggregate_metrics(debate_id, measures_by_word_one_debate, clusters, dialign_measures_this_debate):
    '''Aggregate the values obtained for every metric in one debate according to different vocabulary definitions.

    Parameters
    ----------
    debate_id
        Passed to ``load_tfidf`` to fetch this debate's tf-idf scores.
    measures_by_word_one_debate
        Per-cluster measures, indexed as ``[clnum][mask][measure]``.
        Asymmetric measures are further indexed by side; ``'SV'`` holds
        ``{'value', 'freq'}``.
    clusters
        Sequence of cluster dicts with at least ``'type'`` and
        ``'cluster_name'``, aligned with ``measures_by_word_one_debate``.
    dialign_measures_this_debate
        Mapping of Dialign measure name to value; copied unchanged into every
        (vocabulary, mask) entry of the result.

    Returns
    -------
    dict
        ``result[vocabulary_option][mask][measure]``. Symmetric measures are
        plain averages, asymmetric measures are ``{'for', 'against'}`` dicts
        of averages, and ``'SV'`` (only for the vocabularies that contain
        coref clusters, ``"all"`` and ``"tfidf200_C"``) is a
        frequency-weighted average. An average over no values is NaN.
    '''
    tfidfs = load_tfidf("tfidf_data/", debate_id)
    highest_tfidf_words = [w for w, val in sorted(tfidfs.items(), key=lambda x: x[1], reverse=True)]
    # Build the top-200 lookup once instead of re-slicing the list per cluster.
    top_tfidf_words = set(highest_tfidf_words[:200])

    aggregated_measures = dict()
    masks = ['no-mask', 'one-mask', 'multi-mask']
    sides = ['for', 'against']
    vocabulary_options = ["all", "all_words", "tfidf200", "tfidf200_C"]
    # Vocabularies that include coref clusters, the only clusters carrying SV.
    vocabularies_with_coref = ["all", "tfidf200_C"]

    # SV is symmetric as well, but it is aggregated separately (weighted).
    symmetric_measure_names = ["TUOS_cos", "TUOS_eucl", 'sApp_cos', 'sApp_eucl']
    asymmetric_measure_names = ["TUSS_cos", "TUSS_eucl", 'TASS_cos', 'TASS_eucl', 'DS_cos', 'DS_eucl', 'asApp_cos', 'asApp_eucl']

    def average_or_nan(values):
        # Same result as np.average on an empty list, without the warnings.
        return np.average(values) if values else np.nan

    # aggregate metrics according to the vocabulary chosen
    for vocabulary_option in vocabulary_options:
        aggregated_measures[vocabulary_option] = dict()

        included_clnums = []
        for clnum, cl in enumerate(clusters):
            included = False
            if cl['type'] == "word" and vocabulary_option in ["all", "all_words"]:
                included = True
            elif cl['type'] == "coref" and vocabulary_option in vocabularies_with_coref:
                included = True
            elif cl['type'] == "word" and vocabulary_option in ["tfidf200", "tfidf200_C"]:
                included = cl['cluster_name'] in top_tfidf_words
            if included:
                included_clnums.append(clnum)

        for mask in masks:
            aggregated_this_mask = aggregated_measures[vocabulary_option][mask] = dict()

            for measure in symmetric_measure_names:
                all_values = []
                for clnum in included_clnums:
                    cluster_measures = measures_by_word_one_debate[clnum][mask]
                    if measure in cluster_measures:
                        if np.isnan(cluster_measures[measure]):
                            print("omitting one nan, cluster", clnum)
                            continue
                        all_values.append(cluster_measures[measure])
                aggregated_this_mask[measure] = average_or_nan(all_values)

            for measure in asymmetric_measure_names:
                aggregated_this_mask[measure] = dict()
                for side in sides:
                    all_values = []
                    for clnum in included_clnums:
                        cluster_measures = measures_by_word_one_debate[clnum][mask]
                        if measure in cluster_measures:
                            if np.isnan(cluster_measures[measure][side]):
                                print("omitting one nan, cluster", clnum)
                                continue
                            all_values.append(cluster_measures[measure][side])
                    aggregated_this_mask[measure][side] = average_or_nan(all_values)

            # Now, SV: average of the per-cluster overlaps weighted by frequency.
            # The gate previously tested "all_tfidf200_C", which is not a
            # vocabulary option, so SV was silently skipped for "tfidf200_C".
            if vocabulary_option in vocabularies_with_coref:
                sv_entries = []
                for clnum in included_clnums:
                    cluster_measures = measures_by_word_one_debate[clnum][mask]
                    if "SV" in cluster_measures and not np.isnan(cluster_measures['SV']['value']):
                        sv_entries.append((cluster_measures['SV']['value'], cluster_measures['SV']['freq']))
                sum_freqs_for_overlap = sum(fr for _, fr in sv_entries)
                normalized_overlaps = [ov * fr / sum_freqs_for_overlap for ov, fr in sv_entries]
                aggregated_this_mask["SV"] = np.sum(normalized_overlaps)

    # now finally, at the debate level, include the Dialign measures
    # they are the same regardless of the vocab option or the masking strategy
    for vocab_option in aggregated_measures:
        for mask in aggregated_measures[vocab_option]:
            for measure in dialign_measures_this_debate:
                aggregated_measures[vocab_option][mask][measure] = dialign_measures_this_debate[measure]

    return aggregated_measures


In [5]:
def convert_sdep_measure_name(measure_name):
    """Map a speaker-dependent Dialign column name to its speaker-free name.

    Rules are tried in a fixed order and the first match wins. Returns
    ``None`` when no rule matches.
    """
    # Names that are kept as-is once the speaker prefix/suffix is ignored.
    for fragment in ("Initiated Expression", "Expression Repetition", "tokens (%)"):
        if fragment in measure_name:
            return fragment

    # The bare name is kept; any decorated variant is renamed.
    if "Voc. Overlap" in measure_name:
        if measure_name == "Voc. Overlap":
            return "Voc. Overlap"
        return "SR/Voc. Overlap"

    # Self-repetition measures identified by a path-like marker.
    marker_to_name = (
        ("/ELS", "SR/ELS"),
        ("/EV", "SR/EV"),
        ("/ER", "SR/ER"),
        ("/ENTR", "SR/ENTR"),
    )
    for marker, simple_name in marker_to_name:
        if marker in measure_name:
            return simple_name

    if measure_name.endswith("/L"):
        return "SR/L"
    if "/LMAX" in measure_name:
        return "SR/LMAX"
    return None
    
def load_dialign_info(dialign_dir="dialign-output/"):
    """Read the two Dialign metric tables and index their values by debate id.

    ``dialign_dir`` is concatenated directly with the file names, so it must
    end with a path separator.

    Returns a dict ``{debate_id: measures}`` where the debate id is the part
    of the ``ID`` column before the first underscore. Speaker-dependent
    measures are stored as ``measures[simple_name][side]`` with side
    ``'for'``/``'against'``; speaker-independent measures are stored as
    ``measures[name]``. A debate that appears only in the
    speaker-independent table raises ``KeyError``.
    """
    speaker_dependent_dialign_measure_names = ["S1/Initiated Expression (IE_S1)", "S1/Expression Repetition (ER_S1)", "S1/tokens (%)", "S2/Initiated Expression (IE_S2)", "S2/Expression Repetition (ER_S2)", "S2/tokens (%)", "Voc. Overlap S1", "Voc. Overlap S2", "SR/S1/ELS", "SR/S1/EV", "SR/S1/ER", "SR/S1/ENTR", "SR/S1/L", "SR/S1/LMAX", "SR/S2/ELS", "SR/S2/EV", "SR/S2/ER", "SR/S2/ENTR", "SR/S2/L", "SR/S2/LMAX"]
    speaker_independent_dialign_measure_names = ["Num. utterances", "Num. tokens", "Expression Lexicon Size (ELS)", "Expression Variety (EV)", "Expression Repetition (ER)", "Voc. Overlap", "ENTR", "L", "LMAX"]

    sdep = pd.read_csv(dialign_dir + "metrics-speaker-dependent.tsv", sep="\t")
    sindep = pd.read_csv(dialign_dir + "metrics-speaker-independent.tsv", sep="\t")

    # Resolve, once, the simplified name and the debate side of every
    # speaker-dependent column.
    # S1 is against and S2 is for (as we can see in the metrics-speaker-dependent.tsv file);
    # every column name listed above contains exactly one of the two tags.
    sdep_columns = []
    for column in speaker_dependent_dialign_measure_names:
        speaker = "against" if "S1" in column else "for"
        sdep_columns.append((column, convert_sdep_measure_name(column), speaker))

    dialign_measures = dict()
    for _, row in sdep.iterrows():
        debate_id = row['ID'].split("_")[0]
        measures_this_debate = dict()
        dialign_measures[debate_id] = measures_this_debate
        for column, simple_name, speaker in sdep_columns:
            measures_this_debate.setdefault(simple_name, dict())[speaker] = row[column]

    for _, row in sindep.iterrows():
        debate_id = row['ID'].split("_")[0]
        for column in speaker_independent_dialign_measure_names:
            dialign_measures[debate_id][column] = row[column]

    return dialign_measures


In [6]:
measures_all_debates_by_word = dict()
measures_all_debates = dict()

# first, load the dialign measures
dialign_measures = load_dialign_info()

for debate_id in cluster_data:
    print(debate_id)
    measures_all_debates_by_word[debate_id] = calculate_all_metrics_for_one_debate(debate_id, all_reps, cluster_data)
    measures_all_debates[debate_id] = aggregate_metrics(debate_id, measures_all_debates_by_word[debate_id], cluster_data[debate_id]['clusters'], dialign_measures[debate_id])

    # Save everything computed so far after each debate. The files are opened
    # with context managers so each handle is flushed and closed before the
    # next iteration rewrites the same path.
    with open("measures_all_debates_by_word.pkl", "wb") as by_word_file:
        pickle.dump(measures_all_debates_by_word, by_word_file)
    with open("measures_all_debates.pkl", "wb") as aggregated_file:
        pickle.dump(measures_all_debates, aggregated_file)

    
   

0
6860


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


24565
22130
8444
25403
19956
11767
24036
8958
4934
5180
2627
13878
21641
671
21166
10924
19135
23135
25160
4134
23602
2457
5347
23945
23707
3416
20577
14990
7859
22879
12524
25836
9973
15450
18423
11136
8115
2228
5703
356
12277
0
25635
1897
6389
19373
21535
20418
2807
14427
4626
19607
14199
9437
3735
16250
21906
6660
6098
24784
16500
22618
20148
15726
2960
5488
1177
10612
14787
18181
897
8797
26330
9739
4395
10331
17634
13265
26062
3141
15975
24367
9204
11368
7457
13452
3948
16966


  DS_a = asapp_a / denominator
  DS_b = asapp_b / denominator
  DS_a = asapp_a / denominator
  DS_b = asapp_b / denominator


omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
omitting one nan, cluster 21
17957
22390
20897
5977
15258
1406
23312
13608
12797
16749
18864
17262
7204
18672
24929
11978
20803
11620
1595


# Get statistics of each measure

In [6]:
# Load the aggregated measures; the context manager closes the file handle.
with open("measures_all_debates.pkl", "rb") as aggregated_file:
    measures_all_debates = pickle.load(aggregated_file)

mask_type = "no-mask"

halves = ['first-half', 'second-half']

# accumulating_stats[name] collects one value per debate for symmetric
# measures and two (for, against) per debate for asymmetric ones; the
# "-for"/"-against" variants keep the two sides apart.
accumulating_stats = dict()
almost_all_measures = ['TUOS', 'sApp', 'TUSS', 'TASS', 'asApp', 'DS']
for measure in almost_all_measures:
    accumulating_stats[measure + "_cos"] = []
    accumulating_stats[measure + "_eucl"] = []
accumulating_stats["SV"] = []

for debate_id in measures_all_debates:
    measures_tfidf200 = measures_all_debates[debate_id]['tfidf200'][mask_type]
    for measure, measure_data in measures_tfidf200.items():
        pooled_values = accumulating_stats.setdefault(measure, [])
        if isinstance(measure_data, dict):
            # Asymmetric measure: pool both sides and also keep them per side.
            pooled_values.append(measure_data['for'])
            pooled_values.append(measure_data['against'])
            accumulating_stats.setdefault(measure + "-for", []).append(measure_data['for'])
            accumulating_stats.setdefault(measure + "-against", []).append(measure_data['against'])
        else:
            pooled_values.append(measure_data)

    # SV is read from the "all" vocabulary rather than from "tfidf200".
    measure_data = measures_all_debates[debate_id]['all'][mask_type]['SV']
    accumulating_stats['SV'].append(measure_data)
    
        
        
        

In [7]:
# Print mean / min / max / standard deviation of every accumulated measure.
summary_functions = (("mean", np.mean), ("min", np.min), ("max", np.max), ("std", np.std))
for measure, d in accumulating_stats.items():
    print(measure)
    for label, summarize in summary_functions:
        print(label, summarize(d))
    print("\n")


TUOS_cos
mean 0.6886966178368714
min 0.618149715610368
max 0.7249803863561425
std 0.0189999419941408


TUOS_eucl
mean 15.519588
min 14.334387
max 17.519615
std 0.5648696


sApp_cos
mean 0.00795657766178588
min -0.05223535190185884
max 0.06445713948689255
std 0.01769623973653361


sApp_eucl
mean 0.1669543
min -1.5076066
max 1.5792781
std 0.44769102


TUSS_cos
mean 0.7084713156007209
min 0.6261646579191129
max 0.7525489972119453
std 0.0204737220659174


TUSS_eucl
mean 14.922905
min 13.291168
max 17.365372
std 0.63624185


TASS_cos
mean 0.6968545302155168
min 0.6079818733851632
max 0.7480730904981894
std 0.0223999595513355


TASS_eucl
mean 15.29295
min 13.735255
max 17.829676
std 0.66253674


asApp_cos
mean 0.0011667664488276667
min -0.04432927819687995
max 0.052375381222946264
std 0.012434878674773818


asApp_eucl
mean 0.004654603
min -1.1618063
max 1.0265634
std 0.31641713


DS_cos
mean 0.021289792539728083
min -0.25173833963621783
max 0.3383622891513086
std 0.1147307076325762


DS_eucl

## Correlation with frequency

In [8]:
# Reload the per-cluster measures; the context manager closes the file handle.
with open("measures_all_debates_by_word.pkl", "rb") as by_word_file:
    measures_all_debates_by_word = pickle.load(by_word_file)

In [9]:
mask_type = "no-mask"

halves = ['first-half', 'second-half']
symmetric_cos_measures = ["TUOS_cos", "sApp_cos"]
asymmetric_cos_measures = ["TUSS_cos", "TASS_cos", "asApp_cos", "DS_cos"]

# for each measure: accumulate, for each word, the value of the measure and the number of instances that were used
accumulating = dict()
for measure in symmetric_cos_measures + asymmetric_cos_measures:
    accumulating[measure] = {'value': [], 'freq': []}

for debate_id in measures_all_debates_by_word:
    # The paired `all_data` entry is not read here; the zip is kept so the
    # iteration covers exactly the same clusters as before.
    for clnum, (clmeasures, cl) in enumerate(zip(measures_all_debates_by_word[debate_id], all_data[debate_id])):
        # first, get the number of instances per side (both halves together);
        # only the counts are needed, so the vectors are not copied
        reps_by_side = all_reps[debate_id][clnum][mask_type]
        instances_per_side = {
            side: sum(1 for half in halves for _ in reps_by_side[side][half])
            for side in reps_by_side
        }
        measures_this_mask = clmeasures[mask_type]

        # symmetric measures: one value per cluster, frequency = both sides;
        # NaN values are skipped, as is already done for asymmetric measures
        for sym_m in symmetric_cos_measures:
            if sym_m in measures_this_mask and not np.isnan(measures_this_mask[sym_m]):
                accumulating[sym_m]['value'].append(measures_this_mask[sym_m])
                accumulating[sym_m]['freq'].append(instances_per_side['for'] + instances_per_side['against'])

        # asymmetric measures: one value per cluster and side
        for asym_m in asymmetric_cos_measures:
            if asym_m in measures_this_mask:
                for side in ('for', 'against'):
                    if not np.isnan(measures_this_mask[asym_m][side]):
                        accumulating[asym_m]['value'].append(measures_this_mask[asym_m][side])
                        accumulating[asym_m]['freq'].append(instances_per_side[side])

    

In [10]:
# Spearman correlation between each measure's values and the number of
# instances they were computed from (spearmanr is imported in the first cell).
for measure, collected in accumulating.items():
    rho, p = spearmanr(collected['freq'], collected['value'])
    print(measure, "rho:", rho.round(3), "p-value:", p.round(3))

TUOS_cos rho: 0.006 p-value: 0.577
sApp_cos rho: 0.006 p-value: 0.572
TUSS_cos rho: -0.034 p-value: 0.0
TASS_cos rho: -0.013 p-value: 0.076
asApp_cos rho: 0.015 p-value: 0.047
DS_cos rho: 0.015 p-value: 0.046


# Intercorrelations between measures

In [11]:
# Check how different TU other-similarity is from TA self-similarity and TU self-similarity

# Step 1: Shapiro-Wilk normality check on each of the three distributions
for measure in ('TUOS_cos', 'TUSS_cos', 'TASS_cos'):
    print(shapiro(accumulating_stats[measure]))


ShapiroResult(statistic=0.9708045721054077, pvalue=0.01771170273423195)
ShapiroResult(statistic=0.9834761619567871, pvalue=0.012576954439282417)
ShapiroResult(statistic=0.9760144948959351, pvalue=0.0009683974785730243)


In [12]:

# not normal -> mann whitney
comparisons = (
    ("time-unaware other-similarity vs time-unaware self-similarity:", 'TUOS_cos', 'TUSS_cos'),
    ("time-unaware other-similarity vs time-aware self-similarity:", 'TUOS_cos', 'TASS_cos'),
    ("time-unaware self-similarity vs time-aware self-similarity:", 'TUSS_cos', 'TASS_cos'),
)
for title, first_measure, second_measure in comparisons:
    print(title)
    print(mannwhitneyu(accumulating_stats[first_measure], accumulating_stats[second_measure]))

time-unaware other-similarity vs time-unaware self-similarity:
MannwhitneyuResult(statistic=5419.0, pvalue=3.962297598859403e-15)
time-unaware other-similarity vs time-aware self-similarity:
MannwhitneyuResult(statistic=8894.0, pvalue=0.0004934849880349604)
time-unaware self-similarity vs time-aware self-similarity:
MannwhitneyuResult(statistic=30328.0, pvalue=6.867307628487382e-08)


In [13]:
##### symmetric measures

sms = ['TUOS_cos', 'sApp_cos']

# if we want to include Dialign symmetric measures:
sms.extend(["Num. utterances", "Num. tokens", "Expression Lexicon Size (ELS)", "Voc. Overlap", "Expression Variety (EV)", "Expression Repetition (ER)", "ENTR", "L", "LMAX"])

# Spearman correlation for every unordered pair of measures, in list order.
for first_idx in range(len(sms)):
    sm = sms[first_idx]
    for second_idx in range(first_idx + 1, len(sms)):
        othersm = sms[second_idx]
        rho, p = spearmanr(accumulating_stats[sm], accumulating_stats[othersm])
        print(sm, othersm, "rho:", rho.round(3), "p-value:", p.round(3))

TUOS_cos sApp_cos rho: -0.07 p-value: 0.474
TUOS_cos Num. utterances rho: 0.033 p-value: 0.738
TUOS_cos Num. tokens rho: -0.166 p-value: 0.086
TUOS_cos Expression Lexicon Size (ELS) rho: -0.161 p-value: 0.095
TUOS_cos Voc. Overlap rho: 0.226 p-value: 0.019
TUOS_cos Expression Variety (EV) rho: 0.179 p-value: 0.064
TUOS_cos Expression Repetition (ER) rho: 0.04 p-value: 0.68
TUOS_cos ENTR rho: 0.011 p-value: 0.911
TUOS_cos L rho: -0.013 p-value: 0.893
TUOS_cos LMAX rho: 0.056 p-value: 0.566
sApp_cos Num. utterances rho: 0.125 p-value: 0.198
sApp_cos Num. tokens rho: 0.031 p-value: 0.751
sApp_cos Expression Lexicon Size (ELS) rho: -0.024 p-value: 0.803
sApp_cos Voc. Overlap rho: -0.121 p-value: 0.212
sApp_cos Expression Variety (EV) rho: -0.101 p-value: 0.299
sApp_cos Expression Repetition (ER) rho: -0.161 p-value: 0.095
sApp_cos ENTR rho: -0.002 p-value: 0.987
sApp_cos L rho: -0.012 p-value: 0.906
sApp_cos LMAX rho: -0.002 p-value: 0.987
Num. utterances Num. tokens rho: 0.419 p-value: 0.

In [14]:
# Asymmetric measures

asms = ['TUSS_cos', 'TASS_cos', 'asApp_cos', 'DS_cos']

# if we want to include Dialign asymmetric measures:
asms.extend(["Initiated Expression", "Expression Repetition", "tokens (%)", "SR/ELS", "SR/EV", "SR/ER", "SR/ENTR", "SR/L", "SR/LMAX"])

# Spearman correlation for every unordered pair of measures, in list order.
for first_idx in range(len(asms)):
    asm = asms[first_idx]
    for second_idx in range(first_idx + 1, len(asms)):
        otherasm = asms[second_idx]
        rho, p = spearmanr(accumulating_stats[asm], accumulating_stats[otherasm])
        print(asm, otherasm, "rho:", rho.round(3), "p-value:", p.round(3))
        


TUSS_cos TASS_cos rho: 0.934 p-value: 0.0
TUSS_cos asApp_cos rho: -0.158 p-value: 0.02
TUSS_cos DS_cos rho: -0.185 p-value: 0.006
TUSS_cos Initiated Expression rho: 0.063 p-value: 0.358
TUSS_cos Expression Repetition rho: -0.05 p-value: 0.465
TUSS_cos tokens (%) rho: -0.014 p-value: 0.837
TUSS_cos SR/ELS rho: -0.057 p-value: 0.402
TUSS_cos SR/EV rho: 0.174 p-value: 0.01
TUSS_cos SR/ER rho: 0.263 p-value: 0.0
TUSS_cos SR/ENTR rho: 0.196 p-value: 0.004
TUSS_cos SR/L rho: 0.166 p-value: 0.015
TUSS_cos SR/LMAX rho: 0.072 p-value: 0.295
TASS_cos asApp_cos rho: -0.091 p-value: 0.182
TASS_cos DS_cos rho: -0.134 p-value: 0.048
TASS_cos Initiated Expression rho: 0.032 p-value: 0.644
TASS_cos Expression Repetition rho: -0.029 p-value: 0.667
TASS_cos tokens (%) rho: -0.043 p-value: 0.534
TASS_cos SR/ELS rho: -0.074 p-value: 0.277
TASS_cos SR/EV rho: 0.306 p-value: 0.0
TASS_cos SR/ER rho: 0.198 p-value: 0.003
TASS_cos SR/ENTR rho: 0.123 p-value: 0.071
TASS_cos SR/L rho: 0.04 p-value: 0.558
TASS_co

# Looking at results for a running example

In [15]:
chosen_debate = '8444'
mask_type = "no-mask"

# Words of the chosen debate ordered from highest to lowest tf-idf score.
tfidf = load_tfidf("tfidf_data/", chosen_debate)
highest_tfidf_words = [w for w, val in sorted(tfidf.items(), key=lambda x: x[1], reverse=True)]

# Reload the per-cluster measures; the context manager closes the file handle.
with open("measures_all_debates_by_word.pkl", "rb") as by_word_file:
    measures_all_debates_by_word = pickle.load(by_word_file)

sides = ['for', 'against']

In [16]:
print("Time-unaware other-similarity")

# Word clusters of the chosen debate that have a TUOS value, sorted ascending.
top_tfidf_words = highest_tfidf_words[:200]
sims = sorted(
    (
        [cl['cluster_name'], clmeasures[mask_type]['TUOS_cos']]
        for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
        if cl['type'] == "word" and 'TUOS_cos' in clmeasures[mask_type]
    ),
    key=lambda entry: entry[1],
)

# Show only the words among the 200 highest tf-idf words.
print([(w, s) for w, s in sims if w in top_tfidf_words])


Time-unaware other-similarity
[('life_NOUN', 0.4962246185541153), ('attack_NOUN', 0.5296459520856539), ('grow_VERB', 0.5799237829115655), ('die_VERB', 0.6015136301517486), ('study_NOUN', 0.653983896665084), ('cow_NOUN', 0.6578269640604655), ('health_NOUN', 0.6587509968451091), ('kill_VERB', 0.677951312901681), ('fat_NOUN', 0.680682917435964), ('heart_NOUN', 0.7008274895804268), ('eat_VERB', 0.7012726261600879), ('farm_NOUN', 0.7052367757473673), ('anything_NOUN', 0.7076885513961315), ('plant_NOUN', 0.7094313116300673), ('diet_NOUN', 0.712994020515018), ('soil_NOUN', 0.7235075639826911), ('food_NOUN', 0.7275728983756823), ('corn_NOUN', 0.7345363242285592), ('vegetable_NOUN', 0.7383664300044378), ('animal_NOUN', 0.74377097497518), ('vegan_NOUN', 0.7611156387461556), ('vegetarian_NOUN', 0.7614651247859001), ('human_NOUN', 0.7784551461537679), ('cancer_NOUN', 0.7810741012942963), ('farming_NOUN', 0.7826763954427507), ('meat_NOUN', 0.7911718515714923), ('factory_NOUN', 0.7913300457100073), 

In [17]:
print("sApp")

# Word clusters of the chosen debate that have an sApp value, sorted ascending.
top_tfidf_words = highest_tfidf_words[:200]
sims = sorted(
    (
        [cl['cluster_name'], clmeasures[mask_type]['sApp_cos']]
        for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
        if cl['type'] == "word" and 'sApp_cos' in clmeasures[mask_type]
    ),
    key=lambda entry: entry[1],
)

# Show only the words among the 200 highest tf-idf words.
print([(w, s) for w, s in sims if w in top_tfidf_words])


sApp
[('study_NOUN', -0.10353074556764941), ('fat_NOUN', -0.0723167359828949), ('vegetarian_NOUN', -0.066703466574351), ('health_NOUN', -0.06241555412610378), ('soil_NOUN', -0.025006538850289806), ('plant_NOUN', -0.024028644202247595), ('food_NOUN', -0.012455905228853204), ('farming_NOUN', -0.0051993876695632935), ('farm_NOUN', -0.0029923066496849726), ('vegetable_NOUN', 0.00337044894695282), ('diet_NOUN', 0.0047436047823001815), ('factory_NOUN', 0.017411991528102355), ('kill_VERB', 0.02145541674560969), ('human_NOUN', 0.02726541956265771), ('vegan_NOUN', 0.027536392211914062), ('animal_NOUN', 0.027579561992505996), ('cancer_NOUN', 0.03358164954753151), ('heart_NOUN', 0.042348772287368774), ('eat_VERB', 0.05339107786278863), ('meat_NOUN', 0.05380304044459283), ('corn_NOUN', 0.07488928834597275), ('die_VERB', 0.09598531041826519), ('life_NOUN', 0.09731096277634305), ('face_NOUN', 0.19236883922265124), ('anything_NOUN', 0.2578541029776845), ('attack_NOUN', 0.2705521285533905), ('grow_VER

In [18]:
print("Time-unaware self-similarity")

top_tfidf_words = highest_tfidf_words[:200]
for side in sides:
    print(side)
    # Word clusters with a TUSS value for this side, sorted ascending.
    sims = sorted(
        (
            [cl['cluster_name'], clmeasures[mask_type]['TUSS_cos'][side]]
            for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
            if cl['type'] == "word" and 'TUSS_cos' in clmeasures[mask_type]
        ),
        key=lambda entry: entry[1],
    )
    # Show only the words among the 200 highest tf-idf words.
    print([(w, s) for w, s in sims if w in top_tfidf_words])
    


Time-unaware self-similarity
for
[('life_NOUN', 0.472505646944046), ('grow_VERB', 0.5247118292432843), ('cow_NOUN', 0.5716675619284312), ('die_VERB', 0.608405898917805), ('study_NOUN', 0.6701231104987008), ('kill_VERB', 0.6769836443906639), ('health_NOUN', 0.6777468979358673), ('fat_NOUN', 0.7122666835784912), ('plant_NOUN', 0.7189967825299217), ('farm_NOUN', 0.721231592237634), ('eat_VERB', 0.7213188296431428), ('diet_NOUN', 0.722347134635562), ('corn_NOUN', 0.7306215763092041), ('food_NOUN', 0.7326622009277344), ('vegetable_NOUN', 0.7354978720347086), ('heart_NOUN', 0.772115979875837), ('soil_NOUN', 0.7749007741610209), ('face_NOUN', 0.7776305585661355), ('animal_NOUN', 0.7811320628426067), ('human_NOUN', 0.7933973471323649), ('vegetarian_NOUN', 0.7934302091598511), ('meat_NOUN', 0.7943228853127313), ('cancer_NOUN', 0.7986196738929145), ('attack_NOUN', 0.8130845824877421), ('factory_NOUN', 0.8202720302523989), ('vegan_NOUN', 0.8390943924585978), ('farming_NOUN', 0.8852119843165079), 

In [19]:
print('Time-aware self-similarity')

top_tfidf_words = highest_tfidf_words[:200]
for side in sides:
    print(side)
    # Word clusters with a TASS value for this side, sorted ascending.
    sims = sorted(
        (
            [cl['cluster_name'], clmeasures[mask_type]['TASS_cos'][side]]
            for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
            if cl['type'] == "word" and 'TASS_cos' in clmeasures[mask_type]
        ),
        key=lambda entry: entry[1],
    )
    # Show only the words among the 200 highest tf-idf words.
    print([(w, s) for w, s in sims if w in top_tfidf_words])
    print("\n")



Time-aware self-similarity
for
[('grow_VERB', 0.36258082538843156), ('cow_NOUN', 0.36976204812526703), ('life_NOUN', 0.4519957850376765), ('die_VERB', 0.5695393204689025), ('kill_VERB', 0.6465546071529389), ('study_NOUN', 0.6603089548074282), ('health_NOUN', 0.6638030707836151), ('face_NOUN', 0.6641981481359556), ('corn_NOUN', 0.6731112798055013), ('vegetable_NOUN', 0.6748226881027222), ('fat_NOUN', 0.6761690179506937), ('farm_NOUN', 0.714560283968846), ('diet_NOUN', 0.7153155318012944), ('eat_VERB', 0.7159358366524301), ('plant_NOUN', 0.7174114540771201), ('food_NOUN', 0.7190590369701385), ('cancer_NOUN', 0.7698458078361693), ('animal_NOUN', 0.7809303918159131), ('vegetarian_NOUN', 0.7894826730092367), ('meat_NOUN', 0.7942012401060624), ('attack_NOUN', 0.7955980896949768), ('human_NOUN', 0.8038281202316284), ('heart_NOUN', 0.8065096040566763), ('soil_NOUN', 0.8124979734420776), ('factory_NOUN', 0.8205675278391157), ('vegan_NOUN', 0.8470671847462654), ('farming_NOUN', 0.877090752124786

In [20]:
# List the 'asApp_cos' measure of the chosen debate, one sorted listing per side.
print('asApp')
side_idcs = {'for': 0, 'against': 1}  # not read in this cell
for side in sides:
    print(side)
    # Keep only word-type clusters that carry the measure under the selected mask type.
    sims = [
        [cl['cluster_name'], clmeasures[mask_type]['asApp_cos'][side]]
        for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
        if cl['type'] == "word" and 'asApp_cos' in clmeasures[mask_type]
    ]
    # Ascending by value.
    sims.sort(key=lambda pair: pair[1])
    # Only report names found among the first 200 entries of highest_tfidf_words.
    print([(name, score) for name, score in sims if name in highest_tfidf_words[:200]])
    print("\n")



asApp
for
[('vegetarian_NOUN', -0.07820104757944746), ('fat_NOUN', -0.07737517356872559), ('study_NOUN', -0.047620566750344984), ('vegan_NOUN', -0.02243252843618393), ('farming_NOUN', -0.0216200053691864), ('farm_NOUN', -0.012796506782372874), ('plant_NOUN', -0.01197079107874921), ('health_NOUN', -0.004005060151771267), ('eat_VERB', -0.00044260350669311777), ('animal_NOUN', 3.9223688443335014e-05), ('vegetable_NOUN', 0.0009212444225946692), ('cancer_NOUN', 0.003389846569015864), ('soil_NOUN', 0.006523732785825409), ('heart_NOUN', 0.007257193326950073), ('diet_NOUN', 0.008224873970716473), ('factory_NOUN', 0.010246568066733186), ('meat_NOUN', 0.014662886919913354), ('anything_NOUN', 0.015271116580281907), ('corn_NOUN', 0.015841225783030155), ('food_NOUN', 0.017951076850295067), ('kill_VERB', 0.01878183053599458), ('die_VERB', 0.02298169944967543), ('life_NOUN', 0.03330994832019013), ('attack_NOUN', 0.06580100953578949), ('human_NOUN', 0.08351597189903259), ('face_NOUN', 0.22390310466289

In [21]:
# List the 'DS_cos' measure of the chosen debate, one sorted listing per side.
print('DS')
side_idcs = {'for': 0, 'against': 1}  # not read in this cell
for side in sides:
    print(side)
    # Keep only word-type clusters that carry the measure under the selected mask type.
    sims = [
        [cl['cluster_name'], clmeasures[mask_type]['DS_cos'][side]]
        for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate])
        if cl['type'] == "word" and 'DS_cos' in clmeasures[mask_type]
    ]
    # Ascending by value.
    sims.sort(key=lambda pair: pair[1])

    # Only report names found among the first 200 entries of highest_tfidf_words.
    print([(name, score) for name, score in sims if name in highest_tfidf_words[:200]])
    print("\n")



DS
for
[('vegan_NOUN', -0.904938619660704), ('fat_NOUN', -0.8776229934060302), ('vegetarian_NOUN', -0.7164694864586839), ('farming_NOUN', -0.713286885181427), ('farm_NOUN', -0.5036910364604185), ('plant_NOUN', -0.31195121641830575), ('study_NOUN', -0.25351503247235874), ('health_NOUN', -0.02962895197304177), ('eat_VERB', -0.008918008615810973), ('animal_NOUN', 0.001362899732334417), ('vegetable_NOUN', 0.009861694036038437), ('anything_NOUN', 0.05928999338034457), ('cancer_NOUN', 0.09237859646593513), ('heart_NOUN', 0.16305741270042684), ('soil_NOUN', 0.24134317564299634), ('meat_NOUN', 0.279248632359523), ('attack_NOUN', 0.31830921074051177), ('corn_NOUN', 0.3828848091617086), ('food_NOUN', 0.39648253784764087), ('diet_NOUN', 0.47633619988239706), ('factory_NOUN', 0.4794943578349211), ('human_NOUN', 0.8140345403847936), ('die_VERB', 0.8573944338984882), ('face_NOUN', 0.9217015392500288), ('kill_VERB', 0.9519268716882943), ('life_NOUN', 0.981083412566036), ('grow_VERB', 0.99011768066524

In [22]:
# Group clusters of the chosen debate by the sign of their 'DS_cos' value on
# each side: both positive, both negative, or opposite signs.
print('DS (grouping by type of behavior)')
side_idcs = {'for': 0, 'against': 1}  # not read in this cell
sims = dict()

# Map cluster name -> {side: DS_cos value}, restricted to word-type clusters
# that carry the measure and whose name is among the first 200 entries of
# highest_tfidf_words.
for clmeasures, cl in zip(measures_all_debates_by_word[chosen_debate], all_data[chosen_debate]):
    sims_here = {
        side: clmeasures[mask_type]['DS_cos'][side]
        for side in sides
        if cl['type'] == "word" and 'DS_cos' in clmeasures[mask_type]
    }
    if sims_here and cl['cluster_name'] in highest_tfidf_words[:200]:
        sims[cl["cluster_name"]] = sims_here

# Both sides strictly positive; last column is the absolute difference.
print("common approaching:")
for word, by_side in sims.items():
    if by_side['for'] > 0 and by_side['against'] > 0:
        gap = abs(by_side['for'] - by_side['against'])
        print(word, by_side['for'].round(4), by_side['against'].round(4), gap.round(4) )

# Both sides strictly negative; values are printed unrounded.
print("\ncommon distancing:")
for word, by_side in sims.items():
    if by_side['for'] < 0 and by_side['against'] < 0:
        print(word, by_side['for'], by_side['against'])


# Strictly opposite signs; last column is the difference of the magnitudes.
print("\nopposite behavior:")
for word, by_side in sims.items():
    if (by_side['for'] < 0 and by_side['against'] > 0) or (by_side['for'] > 0 and by_side['against'] < 0):
        gap = abs(abs(by_side['for']) - abs(by_side['against']))
        print(word, by_side['for'].round(4), by_side['against'].round(4), gap.round(4) )

DS (grouping by type of behavior)
common approaching:
meat_NOUN 0.2792 0.7208 0.4415
die_VERB 0.8574 0.1426 0.7148
cancer_NOUN 0.0924 0.9076 0.8152
heart_NOUN 0.1631 0.8369 0.6739
life_NOUN 0.9811 0.0189 0.9622
corn_NOUN 0.3829 0.6171 0.2342
attack_NOUN 0.3183 0.6817 0.3634
animal_NOUN 0.0014 0.9986 0.9973
cow_NOUN 0.9957 0.0043 0.9913
face_NOUN 0.9217 0.0783 0.8434
vegetable_NOUN 0.0099 0.9901 0.9803
anything_NOUN 0.0593 0.9407 0.8814
factory_NOUN 0.4795 0.5205 0.041

common distancing:
health_NOUN -0.02962895197304177 -0.9703710480269583
study_NOUN -0.25351503247235874 -0.7464849675276413
plant_NOUN -0.31195121641830575 -0.6880487835816942

opposite behavior:
farm_NOUN -0.5037 0.4963 0.0074
eat_VERB -0.0089 0.9911 0.9822
grow_VERB 0.9901 -0.0099 0.9802
food_NOUN 0.3965 -0.6035 0.207
diet_NOUN 0.4763 -0.5237 0.0473
fat_NOUN -0.8776 0.1224 0.7552
vegetarian_NOUN -0.7165 0.2835 0.4329
human_NOUN 0.814 -0.186 0.6281
kill_VERB 0.9519 -0.0481 0.9039
soil_NOUN 0.2413 -0.7587 0.5173
vegan_NO

### SV

In [23]:
# Collect the 'SV' value and frequency of every cluster that has one, normalise
# the values by the total frequency, and print them in ascending order.
mask = "no-mask"  # mask type is irrelevant
overlap_results, freqs, clnums = [], [], []
for clnum, cl in enumerate(measures_all_debates_by_word[chosen_debate]):
    if 'SV' not in cl[mask]:
        continue
    overlap_results.append(cl[mask]['SV']['value'])
    freqs.append(cl[mask]['SV']['freq'])
    clnums.append(clnum)

# NaN frequencies are left out of the total.
sum_freqs = sum(f for f in freqs if not np.isnan(f))

# NaN values stay NaN; everything else is divided by the total frequency.
normalized_overlaps = [np.nan if np.isnan(ov) else ov / sum_freqs for ov in overlap_results]

# Pair each normalised value with the name of the cluster at the same index in cluster_data.
normalized_overlaps_and_titles = list(zip(
    normalized_overlaps,
    [cluster_data[chosen_debate]['clusters'][idx]['cluster_name'] for idx in clnums],
))

# NaNs make sorting unreliable, so they are swapped for -1000 purely to get a usable ordering.
sorted_normalized_overlaps_and_titles = sorted(
    ((-1000 if np.isnan(value) else value, title) for value, title in normalized_overlaps_and_titles),
    key=lambda pair: pair[0],
)

for n in sorted_normalized_overlaps_and_titles:
    print(n)


(-1000, 'us // we')
(-1000, 'a massive research study // The NIH-AARP study // it')
(-1000, 'we')
(0.0006321112515802781, 'vitamin B12 // B12')
(0.0011061946902654867, "the nation 's // the country")
(0.0029498525073746312, "this motion // the motion 's // the motion")
(0.0029498525073746312, 'humans // human beings')
(0.003982300884955752, 'animals // animals raised for meat')
(0.004424778761061947, 'meat')
(0.004424778761061947, 'cancer')
(0.004424778761061947, 'vegetarians // the vegetarians')
(0.004424778761061947, 'faces // a face')
(0.004424778761061947, 'the globe // this world ’ s // the world')
(0.004424778761061947, 'plants // Plants')
(0.004424778761061947, 'corn // that corn')
(0.004424778761061947, 'fish')


With the code below, we can inspect every mention of a cluster, labelled by the half of the debate it occurs in and by speaker type.

In [24]:
# Print every mention of one cluster with some surrounding context, labelled
# with the half of the document it starts in and the type of its speaker.
word_of_interest = "meat_NOUN"

doclen = len(cluster_data[chosen_debate]['document'])
midpoint = doclen // 2

print("HALF\tSPEAKER TYPE\tMENTION")
for clnum, cl in enumerate(cluster_data[chosen_debate]['clusters']):
    if cl['cluster_name'] == word_of_interest:
        for j, (ms, me) in enumerate(cl['mentions']):
            # A mention is assigned to the half in which it starts.
            h = 'first' if ms < midpoint else 'second'
            # Clamp the left edge of the context window at 0: with a negative
            # start the slice would count from the end of the document and
            # return an empty or unrelated context for mentions that begin
            # within the first 10 elements.
            print(h + "\t" + cl['speaker_types'][j] + "\t\t" + " ".join(cluster_data[chosen_debate]['document'][max(0, ms-10):me+10]))
        # Only the first cluster with a matching name is reported.
        break

HALF	SPEAKER TYPE	MENTION
first	mod		that meat-eating is just nature 's way , and that meat in the diet makes a lot of sense .
first	mod		take the vegan or the vegetarian 's view that eating meat is just wrong for your health , for your
first	mod		effects of diet on health . You do not eat meat now , but you come from a family of
first	for		of doubt you have as you 're getting away from meat are quickly replaced by being very , very glad
first	mod		, whereas you are a guy who not only eats meat , but you also like to eat the bones
first	for		for 10 years . Some of them did n't eat meat or did n't eat very much . Some ate
first	for		eat very much . Some ate quite a lot of meat . And what they showed was that among those
first	for		what they showed was that among those eating the most meat , the risk of dying of cancer was increased
first	for		the same thing : if you ate a lot of meat , your risk of dying of heart disease was
first	for		studies have clearly shown that people who do n't e