## Imports

In [1]:
import sqlite3
import glob
import time
import itertools 

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


_____

## 1. Data Ingestion 

Read the data from the database.

In [2]:
annotated_dbs = glob.glob(f'./dataset/fang_system/*.json')

print("Found the following dbs : " , annotated_dbs)

synchronous_annotators = ['group_1' , 'group_2'] 
asynchronous_annotators = ['async_1' , 'async_2' , 'async_3'] 
annotators = synchronous_annotators + asynchronous_annotators
annotator2df = {}

for db_path in  annotated_dbs : 
    merged_df = pd.read_json(db_path)

    for key in annotators : 
        if key in db_path : 
            annotator2df[key] = merged_df

Found the following dbs :  ['./dataset/fang_system/async_1.json', './dataset/fang_system/group_1.json', './dataset/fang_system/group_2.json', './dataset/fang_system/async_2.json', './dataset/fang_system/async_3.json']


### 1.a. Random Sampling rows for manual annotations

Pull 200 samples from both synchronous and asynchronous groups to manually review label quality. Pulling from top 25th percentile is done later.

In [3]:
synchronous_random_samples = []
asynchronous_random_samples = []

for annotator , df in annotator2df.items(): 
    if annotator in synchronous_annotators: 
        synchronous_random_samples.append(df.sample(n=100))
    if annotator in asynchronous_annotators: 
        asynchronous_random_samples.append(df.sample(n=67))

res_df = pd.concat(synchronous_random_samples)
async_df = pd.concat(asynchronous_random_samples)

res_df = res_df[['text' , 'name']].sample(frac=1)
async_df = async_df[['text' , 'name']].sample(frac=1)

res_df.to_csv('./dataset/generated_samples/fang_sync_sample_all.csv' , index=False, sep='\t')
async_df.to_csv('./dataset/generated_samples/fang_async_sample_all.csv' , index=False, sep='\t')

_____

## 2. Jaccard Similarity 

Jaccard similarity for two themes is calculated by the union of their documents divided by the intersection of their documents.

In [4]:
results = []

for anno_1 , anno_2 in itertools.permutations(annotators , 2): 
    anno_1_themes = annotator2df[anno_1]['name'].unique()
    anno_2_themes = annotator2df[anno_2]['name'].unique()
    for anno_1_theme , anno_2_theme in itertools.product(anno_1_themes , anno_2_themes): 
        result = {'anno_1' : anno_1 , 
                  'anno_2' : anno_2 , 
                  'anno_1_theme' : anno_1_theme ,
                  'anno_2_theme' : anno_2_theme}
        anno_1_tweet_ids = set(annotator2df[anno_1][annotator2df[anno_1]['name']==anno_1_theme]['tweet_id'])
        anno_2_tweet_ids = set(annotator2df[anno_2][annotator2df[anno_2]['name']==anno_2_theme]['tweet_id'])
        intersection = anno_1_tweet_ids.intersection(anno_2_tweet_ids)
        union = anno_1_tweet_ids.union(anno_2_tweet_ids)
        jaccard_sim = len(intersection) / len(union)
        result['jaccard_sim'] = jaccard_sim
        results.append(result)

jaccard_df = pd.DataFrame(results)

### 2.a. Getting max jaccard similarity for synchronous experiments

In [5]:
max_jacc_sims = []
filtered_df = jaccard_df[(jaccard_df['anno_1'].isin(synchronous_annotators)) 
                         &(jaccard_df['anno_2'].isin(synchronous_annotators))]

for anno_1_theme in filtered_df['anno_1_theme'].unique(): 
    if ('kmeans' not in anno_1_theme.lower()) and  ('Unknown' not in anno_1_theme.strip()):
        theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
        max_jacc_sims.append(theme_filtered_df.loc[(theme_filtered_df['jaccard_sim'].idxmax())].to_dict())
res_df = pd.DataFrame(max_jacc_sims)

print("Asynchronous Jaccard Similarity")
print(f"Average Max Jaccard Similarity: {res_df['jaccard_sim'].mean():.2f}")
print(f"Standard Deviation of Jaccard Similarity: {res_df['jaccard_sim'].std():.2f}")
print(res_df.count())

Asynchronous Jaccard Similarity
Average Max Jaccard Similarity: 0.56
Standard Deviation of Jaccard Similarity: 0.23
anno_1          31
anno_2          31
anno_1_theme    31
anno_2_theme    31
jaccard_sim     31
dtype: int64


In [6]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,jaccard_sim
0,group_1,group_2,trump supporters saying biden is wrongfully ta...,criticism of governments' response to vaccine,0.366044
1,group_1,group_2,side effects of vaccine,immediate personal side effects,0.830508
2,group_1,group_2,announcing vaccine eligibility,state vaccine eligibility,0.71203
3,group_1,group_2,wholesome reactions to getting the vaccine,family covid vaccine experience,0.680976
4,group_1,group_2,inefficacy of vaccine,non-covid vaccine comparision,0.670501
5,group_1,group_2,corelation between trauma and covid,institutional adverse childhood experiences,0.499351
6,group_1,group_2,halting vaccination due to adverse effects,Error,0.424947
7,group_1,group_2,proof of vaccine,vaccine certification,0.70015
8,group_1,group_2,appeal to get vaccinated,anti-vaccine conspiracy outcomes,0.004349
9,group_1,group_2,covid related deaths,anti-vaccine conspiracy outcomes,0.567097


### 2.b. Getting max jaccard similarity for asynchronous experiments

In [7]:
max_jacc_sims = []
filtered_df = jaccard_df[(jaccard_df['anno_1'].isin(asynchronous_annotators)) 
                         &(jaccard_df['anno_2'].isin(asynchronous_annotators))]
for anno_1_theme in filtered_df['anno_1_theme'].unique():
    if ('kmeans' not in anno_1_theme.lower()) and  ('Unknown' not in anno_1_theme.strip()):
        theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
        max_jacc_sims.append(theme_filtered_df.loc[(theme_filtered_df['jaccard_sim'].idxmax())].to_dict())
res_df = pd.DataFrame(max_jacc_sims)

print("Synchronous Jaccard Similarity")
print(f"Average Max Jaccard Similarity: {res_df['jaccard_sim'].mean():.2f}")
print(f"Standard Deviation of Jaccard Similarity: {res_df['jaccard_sim'].std():.2f}")

Synchronous Jaccard Similarity
Average Max Jaccard Similarity: 0.30
Standard Deviation of Jaccard Similarity: 0.17


In [8]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,jaccard_sim
0,async_1,async_3,Pro Trump and against Biden/Obama,Anti-Biden and Pro-Trump Sentiment,0.211271
1,async_1,async_2,Dispelling misinformation regarding death,VaccineTestScience,0.204602
2,async_1,async_2,Vaccine availability,VaccineRolloutNews,0.557431
3,async_1,async_2,2nd dose side effects,VaccineSideEffects,0.626506
4,async_1,async_3,Comparisons to other virus vaccines,Anger towards COVID Deniers/COVID Deniers Pass...,0.351531
5,async_1,async_3,Anger over mask mandates,Proof of Vaccination(vaccine card/passport),0.292924
6,async_1,async_3,Reminders of the 2nd vaccine,Family Members Receiving the COVID Vaccine,0.282486
7,async_1,async_3,Origin of the vaccine,Adverse Childhood Experience's being casued by...,0.308894
8,async_1,async_2,Discussing vaccines in schools,VaccinePassportNews,0.055118
9,async_1,async_3,Positive outcomes of vaccine,Family Members Receiving the COVID Vaccine,0.256587


______

## 3. Centroid Cosine Similarity 

### 3.a. Loading SBERT Vectors 

In [9]:
sbert_vectors = np.load('./dataset/sbert.npy')

### 3.b. Calculating centroids for sync + async experiments

In [10]:
results = []

for annotator, df in annotator2df.items(): 
    cosine_sims = []
    themes = []
    for i , theme in enumerate(df['name'].unique()): 
        if 'kmeans' not in theme.lower(): 
            result = {'annotator' : annotator}
            ids = df[df['name'] == theme]['tweet_id'].tolist()
            result['theme'] = theme
            theme_vectors = sbert_vectors[ids]
            theme_centroid = np.average(theme_vectors, axis=0, keepdims=True)

            result['theme_centroid'] = theme_centroid
            result['theme_vectors'] = theme_vectors

            dot_prod = np.dot(theme_centroid , theme_vectors.T).squeeze(0)
            dot_prod = np.expand_dims(dot_prod , axis=-1)

            norm = np.linalg.norm(theme_vectors , axis=1, keepdims=True)
            cosine_sim = (dot_prod/norm)
            result['cosine_sim']  = cosine_sim 
            results.append(result)


### 3.c. Calculating synchronous centroid cosine similarity

In [11]:
cosine_sim_results = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    anno_1_results = [r for r in results if r['annotator']==anno_1]
    anno_2_results = [r for r in results if r['annotator']==anno_2]
    for anno_1_result in anno_1_results: 
        for anno_2_result in anno_2_results:
            if ('kmeans' not in anno_1_result['theme'].lower()) and  ('kmeans' not in anno_2_result['theme'].lower()): 
                cosine_sim = cosine_similarity(anno_1_result['theme_centroid'] , anno_2_result['theme_centroid'])
                cosine_sim_result = {'anno_1' : anno_1 , 
                                    'anno_2' : anno_2 , 
                                    'anno_1_theme' : anno_1_result['theme'] , 
                                    'anno_2_theme' : anno_2_result['theme'] , 
                                    'cosine_sim' : cosine_sim.squeeze()}
                cosine_sim_results.append(cosine_sim_result)
                
cosine_sim_df = pd.DataFrame(cosine_sim_results)

In [12]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    filtered_df = cosine_sim_df[(cosine_sim_df['anno_1']== anno_1) & (cosine_sim_df['anno_2']== anno_2)]
    for anno_1_theme in filtered_df['anno_1_theme'].unique():
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['cosine_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)


print("Synchronous Centroid Cosine Similarity")
print(f"Average Max Centroid Cosine Similarity: {res_df['cosine_sim'].mean():.2f}")
print(f"Standard Deviation of Centroid Cosine Similarity: {res_df['cosine_sim'].std():.2f}")
print(res_df.count())

Synchronous Centroid Cosine Similarity
Average Max Centroid Cosine Similarity: 0.98
Standard Deviation of Centroid Cosine Similarity: 0.05
anno_1          31
anno_2          31
anno_1_theme    31
anno_2_theme    31
cosine_sim      31
dtype: int64


In [13]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,cosine_sim
0,group_1,group_2,trump supporters saying biden is wrongfully ta...,criticism of governments' response to vaccine,0.9865336
1,group_1,group_2,side effects of vaccine,immediate personal side effects,0.99987227
2,group_1,group_2,announcing vaccine eligibility,state vaccine eligibility,0.9990996
3,group_1,group_2,wholesome reactions to getting the vaccine,family covid vaccine experience,0.99775624
4,group_1,group_2,inefficacy of vaccine,non-covid vaccine comparision,0.9993112
5,group_1,group_2,corelation between trauma and covid,institutional adverse childhood experiences,0.99786264
6,group_1,group_2,halting vaccination due to adverse effects,Error,0.9898355
7,group_1,group_2,proof of vaccine,vaccine certification,0.99753714
8,group_1,group_2,appeal to get vaccinated,anti-vaccine conspiracy outcomes,0.93612516
9,group_1,group_2,covid related deaths,anti-vaccine conspiracy outcomes,0.99549055


### 3.d. Calculating asynchronous centroid cosine similarity

In [14]:
cosine_sim_results = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    anno_1_results = [r for r in results if r['annotator']==anno_1]
    anno_2_results = [r for r in results if r['annotator']==anno_2]
    for anno_1_result in anno_1_results : 
        for anno_2_result in anno_2_results :
            if ('kmeans' not in anno_1_result['theme'].lower()) and  ('kmeans' not in anno_2_result['theme'].lower()): 
                cosine_sim = cosine_similarity(anno_1_result['theme_centroid'] , anno_2_result['theme_centroid'])
                cosine_sim_result = {'anno_1' : anno_1 , 
                                    'anno_2' : anno_2 , 
                                    'anno_1_theme' : anno_1_result['theme'] , 
                                    'anno_2_theme' : anno_2_result['theme'] , 
                                    'cosine_sim' : cosine_sim.squeeze()}
                cosine_sim_results.append(cosine_sim_result)
                
cosine_sim_df = pd.DataFrame(cosine_sim_results)

In [15]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    filtered_df = cosine_sim_df[(cosine_sim_df['anno_1']== anno_1) & (cosine_sim_df['anno_2']== anno_2)]
    for anno_1_theme in filtered_df['anno_1_theme'].unique():
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['cosine_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)

print("Asynchronous Centroid Cosine Similarity")
print(f"Average Max Centroid Cosine Similarity: {res_df['cosine_sim'].mean():.2f}")
print(f"Standard Deviation of Centroid Cosine Similarity: {res_df['cosine_sim'].std():.2f}")
print(res_df.count())

Asynchronous Centroid Cosine Similarity
Average Max Centroid Cosine Similarity: 0.96
Standard Deviation of Centroid Cosine Similarity: 0.05
anno_1          102
anno_2          102
anno_1_theme    102
anno_2_theme    102
cosine_sim      102
dtype: int64


In [16]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,cosine_sim
0,async_1,async_2,Pro Trump and against Biden/Obama,AntiDemocrat,0.9638065
1,async_1,async_2,Dispelling misinformation regarding death,VaccineTestScience,0.9884341
2,async_1,async_2,Vaccine availability,VaccineRolloutNews,0.9930198
3,async_1,async_2,2nd dose side effects,VaccineSideEffects,0.99872065
4,async_1,async_2,Comparisons to other virus vaccines,AntiTrump,0.98132145
...,...,...,...,...,...
97,async_3,async_2,Become a Pharmacy Technician Alerts,VaccineRolloutNews,0.9550743
98,async_3,async_2,COVID FDA Approval Status,FDANews&Discourse,0.99019384
99,async_3,async_2,Anger Towards Republicans Lying about COVID,AntiTrump,0.90378076
100,async_3,async_2,Giving Info about COVID and the Vaccine to Min...,POCVaccineAwareness,0.9945967


____

## 4. Group Average Cosine Similarity

In [17]:
results = {}
annotators = []

for annotator, df in annotator2df.items():
  results[annotator] = {}
  annotators.append(annotator)
  for i , theme in enumerate(df['name'].unique()):
      if ('kmeans' not in theme.lower()) and ('unknown' not in theme.lower()):
          ids = df[df['name'] == theme]['tweet_id'].tolist()
          theme_vectors = sbert_vectors[ids]
          results[annotator][theme] = theme_vectors

### 4.a. Calculating Group Average Similarities

In [18]:
global_average_sims = []
total_combos = len(list(itertools.combinations_with_replacement(annotators, 2)))

for ctr , (anno1 , anno2) in enumerate(itertools.combinations_with_replacement(annotators, 2)):
  anno1_themes = results[anno1]
  anno2_themes = results[anno2]
  for anno1_theme, anno1_theme_vectors in anno1_themes.items():
    for anno2_theme, anno2_theme_vectors in anno2_themes.items():
      s_time = time.time()
      cosine_sims = cosine_similarity(anno1_theme_vectors , anno2_theme_vectors)
      average_sim = np.average(cosine_sims)
      std_deviation = np.std(cosine_sims)
      global_average_sims.append({'anno1' : anno1 ,
                                  'anno2' : anno2 ,
                                  'anno1_theme' : anno1_theme ,
                                  'anno2_theme' : anno2_theme ,
                                  'average_sim' : average_sim ,
                                  'std_deviation' : std_deviation
                                  })
      print(f"Completed {ctr+1}/{total_combos} combinations.\nTotal time to calc : {time.time() - s_time} seconds. \n{global_average_sims[-1]}\n--------\n")
      # save for reference since this calculation runs much slower than for pacheco data
      pd.DataFrame(global_average_sims).to_csv('./dataset/fang_system/std_dev_aggregated_sims.csv', index=False)

Completed 1/15 combinations.
Total time to calc : 1.6969120502471924 seconds. 
{'anno1': 'async_1', 'anno2': 'async_1', 'anno1_theme': 'Pro Trump and against Biden/Obama', 'anno2_theme': 'Pro Trump and against Biden/Obama', 'average_sim': 0.38501024, 'std_deviation': 0.1547931}
--------

Completed 1/15 combinations.
Total time to calc : 0.9480109214782715 seconds. 
{'anno1': 'async_1', 'anno2': 'async_1', 'anno1_theme': 'Pro Trump and against Biden/Obama', 'anno2_theme': 'Dispelling misinformation regarding death', 'average_sim': 0.3898052, 'std_deviation': 0.14515024}
--------

Completed 1/15 combinations.
Total time to calc : 1.361253023147583 seconds. 
{'anno1': 'async_1', 'anno2': 'async_1', 'anno1_theme': 'Pro Trump and against Biden/Obama', 'anno2_theme': 'Vaccine availability', 'average_sim': 0.37891957, 'std_deviation': 0.14455523}
--------

Completed 1/15 combinations.
Total time to calc : 0.11123299598693848 seconds. 
{'anno1': 'async_1', 'anno2': 'async_1', 'anno1_theme': 'P

In [None]:
# Read from csv
# global_average_sims = pd.read_csv('./dataset/fang_system/std_dev_aggregated_sims.csv')

In [19]:
global_average_sims

[{'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Pro Trump and against Biden/Obama',
  'anno2_theme': 'Pro Trump and against Biden/Obama',
  'average_sim': 0.38501024,
  'std_deviation': 0.1547931},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Pro Trump and against Biden/Obama',
  'anno2_theme': 'Dispelling misinformation regarding death',
  'average_sim': 0.3898052,
  'std_deviation': 0.14515024},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Pro Trump and against Biden/Obama',
  'anno2_theme': 'Vaccine availability',
  'average_sim': 0.37891957,
  'std_deviation': 0.14455523},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Pro Trump and against Biden/Obama',
  'anno2_theme': '2nd dose side effects',
  'average_sim': 0.3363732,
  'std_deviation': 0.14243443},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Pro Trump and against Biden/Obama',
  'anno2_theme': 'Comparisons to other virus vaccines',
  'average_sim

In [20]:
global_average_df = pd.DataFrame(global_average_sims)

### 4.b. Calculating Synchronous Group Average Cosine Similarities

In [21]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    filtered_df = global_average_df[(global_average_df['anno1'] == anno_1)&
                                    (global_average_df['anno2'] == anno_2)]
    for anno_1_theme in filtered_df['anno1_theme'].unique(): 
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['average_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)

print("Synchronous Group Average Cosine Similarity")
print(f"Average Max Group Cosine Similarity: {res_df['average_sim'].mean():.2f}")
print(f"Standard Deviation of Group Cosine Similarity: { res_df['average_sim'].std():.2f}")

print(res_df.count())

Synchronous Group Average Cosine Similarity
Average Max Group Cosine Similarity: 0.51
Standard Deviation of Group Cosine Similarity: 0.10
anno1            16
anno2            16
anno1_theme      16
anno2_theme      16
average_sim      16
std_deviation    16
dtype: int64


### 4.c. Calculating Asynchronous Group Average Cosine Similarities

In [22]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    filtered_df = global_average_df[(global_average_df['anno1'] == anno_1)&
                                    (global_average_df['anno2'] == anno_2)]
    for anno_1_theme in filtered_df['anno1_theme'].unique(): 
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['average_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)


print("Asynchronous Group Average Cosine Similarity")
print(f"Average Max Group Cosine Similarity: {res_df['average_sim'].mean():.2f}")
print(f"Standard Deviation of Group Cosine Similarity: { res_df['average_sim'].std():.2f}")

print(res_df.count())

Asynchronous Group Average Cosine Similarity
Average Max Group Cosine Similarity: 0.50
Standard Deviation of Group Cosine Similarity: 0.09
anno1            56
anno2            56
anno1_theme      56
anno2_theme      56
average_sim      56
std_deviation    56
dtype: int64


____

## 5. Getting top 25th percentile of highest weights

In [28]:
results = []

for annotator, df in annotator2df.items(): 
    cosine_sims = []
    themes = []
    for i , theme in enumerate(df['name'].unique()): 
        if ('kmeans' not in theme.lower()) and ('unknown' not in theme.lower()): 
            result = {'annotator' : annotator}
            ids = df[df['name'] == theme]['tweet_id'].tolist()
            result['theme'] = theme
            result['tweets'] = df[df['name'] == theme]['text'].tolist()
            result['weights'] = df[df['name'] == theme]['weight'].tolist()

            theme_vectors = sbert_vectors[ids]
            theme_centroid = np.average(theme_vectors, axis=0, keepdims=True)

            result['theme_centroid'] = theme_centroid
            result['theme_vectors'] = theme_vectors

            dot_prod = np.dot(theme_centroid , theme_vectors.T).squeeze(0)
            dot_prod = np.expand_dims(dot_prod , axis=-1)

            norm = np.linalg.norm(theme_vectors , axis=1, keepdims=True)
            cosine_sim = (dot_prod/norm)
            result['cosine_sim']  = cosine_sim.squeeze() 
            results.append(result)

In [29]:
top_25_results = {'top_25_tweets':[],
                  'top_25_vectors' : [], 
                  'theme' : [], 
                  'annotator' : []}

for result in results: 

    m = np.percentile(result['weights'] , 75)
    top_25_indices = np.where(result['weights']>m)
    top_25_vectors = result['theme_vectors'][top_25_indices]
    top_25_tweets = np.array(result['tweets'])[top_25_indices]
    annotator=result['annotator']
    theme=result['theme']

    top_25_results['top_25_tweets'].extend(top_25_tweets)
    top_25_results['top_25_vectors'].extend(top_25_vectors )
    top_25_results['theme'].extend([theme]*top_25_tweets.shape[0])
    top_25_results['annotator'].extend([annotator]*top_25_tweets.shape[0])


top_25_df = pd.DataFrame(top_25_results, columns=['theme' , 'annotator' , 'top_25_tweets'])

### 4.a. Random Sampling rows for manual annotations

In [30]:
sync_top_25 = top_25_df[top_25_df['annotator'].isin(synchronous_annotators)].sample(n=200)[['theme' , 'top_25_tweets']]
async_top_25 = top_25_df[top_25_df['annotator'].isin(asynchronous_annotators)].sample(n=200)[['theme' , 'top_25_tweets']]

sync_top_25.to_csv('./dataset/generated_samples/fang_sync_sample_top25.csv' , index=False, sep='\t')
async_top_25.to_csv('./dataset/generated_samples/fang_async_sample_top25.csv' , index=False, sep='\t')

____

## 5. Calculating Intra- and Inter-cluster Similarity

In [32]:
top_25_results = {}

for result in results: 

    top_25_results[result['theme']] = {}

    m = np.percentile(result['weights'] , 75)
    top_25_indices = np.where(result['weights']>m)
    top_25_vectors = result['theme_vectors'][top_25_indices]
    top_25_tweets = np.array(result['tweets'])[top_25_indices]
    annotator=result['annotator']
    theme=result['theme']

    top_25_results[theme]['top_25_tweets'] = (top_25_tweets)
    top_25_results[theme]['top_25_vectors'] = (top_25_vectors )
    top_25_results[theme]['annotator'] = (annotator)

### 5.a. Calculating similarities for top 25th percentile clusters

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

top_25_average_sims = []

for anno_1_theme , anno1_top_25 in tqdm(top_25_results.items()): 
    for anno_2_theme , anno2_top_25 in top_25_results.items(): 

        anno1_top_25_vectors = anno1_top_25['top_25_vectors']
        anno2_top_25_vectors = anno2_top_25['top_25_vectors']

        cosine_sims = cosine_similarity(anno1_top_25_vectors , anno2_top_25_vectors)
        average_sim = np.average(cosine_sims)
        std_deviation = np.std(cosine_sims)

        top_25_average_sims.append({'anno_1' : anno1_top_25['annotator'] , 
                                   'anno_2' : anno2_top_25['annotator'], 
                                   'anno_1_theme' : anno_1_theme,
                                   'anno_2_theme' : anno_2_theme,
                                   'average_sim' : average_sim , 
                                   'std_dev_sim' : std_deviation})


  0%|          | 0/82 [00:00<?, ?it/s]

In [34]:
top_25_average_df = pd.DataFrame(top_25_average_sims)
top_25_average_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,average_sim,std_dev_sim
0,async_1,async_1,Pro Trump and against Biden/Obama,Pro Trump and against Biden/Obama,0.436329,0.130776
1,async_1,async_1,Pro Trump and against Biden/Obama,Dispelling misinformation regarding death,0.382954,0.124816
2,async_1,async_1,Pro Trump and against Biden/Obama,Vaccine availability,0.340331,0.121193
3,async_1,async_1,Pro Trump and against Biden/Obama,2nd dose side effects,0.297146,0.105271
4,async_1,async_1,Pro Trump and against Biden/Obama,Comparisons to other virus vaccines,0.393644,0.123845
...,...,...,...,...,...,...
6719,async_3,async_3,Walgreens Vaccine Appointment Updates,Become a Pharmacy Technician Alerts,0.508613,0.086481
6720,async_3,async_3,Walgreens Vaccine Appointment Updates,COVID FDA Approval Status,0.422161,0.078194
6721,async_3,async_3,Walgreens Vaccine Appointment Updates,Anger Towards Republicans Lying about COVID,0.273490,0.080248
6722,async_3,async_3,Walgreens Vaccine Appointment Updates,Giving Info about COVID and the Vaccine to Min...,0.407098,0.075590


In [35]:
top_25_intra_cluster_df = top_25_average_df[(top_25_average_df['anno_1']==top_25_average_df['anno_2']) & (top_25_average_df['anno_1_theme']==top_25_average_df['anno_2_theme'])] 

### 5.b. Intra- and Inter-theme similarities for top 25th percentile subsets in synchronous and asynchronous experiments

In [36]:
print(f"Synchronous top 25% intra theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous top 25% intra theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous top 25% intra theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous top 25% intra theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous top 25% intra theme similarity average: 0.56
Synchronous top 25% intra theme similarity standard deviation: 0.11
Asynchronous top 25% intra theme similarity average: 0.56
Asynchronous top 25% intra theme similarity standard deviation: 0.11


In [37]:
top_25_intra_cluster_df = top_25_average_df[(top_25_average_df['anno_1']==top_25_average_df['anno_2']) & (top_25_average_df['anno_1_theme']!=top_25_average_df['anno_2_theme'])] 

In [38]:
print(f"Synchronous top 25% inter theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous top 25% inter theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous top 25% inter theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous top 25% inter theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous top 25% inter theme similarity average: 0.39
Synchronous top 25% inter theme similarity standard deviation: 0.05
Asynchronous top 25% inter theme similarity average: 0.39
Asynchronous top 25% inter theme similarity standard deviation: 0.05


### 5.c. Intra- and Inter-theme similarities for whole set in synchronous and asynchronous experiments

In [39]:
global_average_df = pd.DataFrame(global_average_sims)

intra_global_average_df = global_average_df[(global_average_df['anno1']==global_average_df['anno2']) & (global_average_df['anno1_theme']==global_average_df['anno2_theme'])] 


print(f"Synchronous global intra theme similarity average: {intra_global_average_df[intra_global_average_df['anno1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous global intra theme similarity standard deviation: {intra_global_average_df[intra_global_average_df['anno1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous global intra theme similarity average: {intra_global_average_df[intra_global_average_df['anno1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous global intra theme similarity standard deviation: {intra_global_average_df[intra_global_average_df['anno1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous global intra theme similarity average: 0.52
Synchronous global intra theme similarity standard deviation: 0.10
Asynchronous global intra theme similarity average: 0.52
Asynchronous global intra theme similarity standard deviation: 0.10


In [40]:
inter_global_avg_df = global_average_df[(global_average_df['anno1']==global_average_df['anno2']) & (global_average_df['anno1_theme']!=global_average_df['anno2_theme'])] 

print(f"Synchronous global inter theme similarity average: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous global inter theme similarity standard deviation: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous global inter theme similarity average: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous global inter theme similarity standard deviation: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous global inter theme similarity average: 0.40
Synchronous global inter theme similarity standard deviation: 0.04
Asynchronous global inter theme similarity average: 0.40
Asynchronous global inter theme similarity standard deviation: 0.04
