## Imports

In [1]:
import sqlite3
import glob
import time
import itertools 

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


_____

## 1. Data Ingestion 

Read the data from the database.

In [2]:
annotated_dbs = glob.glob(f'./dataset/pacheco_system/*.db')

print("Found the following dbs : " , annotated_dbs)

synchronous_annotators = ['group_1' , 'group_2'] 
asynchronous_annotators = ['async_1' , 'async_2' , 'async_3'] 
annotators = synchronous_annotators + asynchronous_annotators
annotator2df = {}

for db_path in  annotated_dbs : 

    con = sqlite3.connect(db_path)
    tweet_df = pd.read_sql_query('SELECT * FROM tweet;' , con) 
    theme_df = pd.read_sql_query('SELECT * FROM theme;' , con) 
    theme_df = theme_df.rename(columns={'id' : 'theme_id'})
    merged_df = tweet_df.merge(theme_df , how='left' , on='theme_id')

    for key in annotators : 
        if key in db_path : 
            annotator2df[key] = merged_df

Found the following dbs :  ['./dataset/pacheco_system/async_1.db', './dataset/pacheco_system/group_1.db', './dataset/pacheco_system/async_3.db', './dataset/pacheco_system/group_2.db', './dataset/pacheco_system/async_2.db']


### 1.a. Random Sampling rows for manual annotations

Pull 200 samples from both synchronous and asynchronous groups to manually review label quality. Pulling from top 25th percentile is done later.

In [3]:
synchronous_random_samples = []
asynchronous_random_samples = []

for annotator , df in annotator2df.items(): 
    if annotator in synchronous_annotators: 
        synchronous_random_samples.append(df.sample(n=100))
    if annotator in asynchronous_annotators: 
        asynchronous_random_samples.append(df.sample(n=67))

res_df = pd.concat(synchronous_random_samples)
async_df = pd.concat(asynchronous_random_samples)

res_df = res_df[['text' , 'name']].sample(frac=1)
async_df = async_df[['text' , 'name']].sample(frac=1)

res_df.to_csv('./dataset/generated_samples/pacheco_sync_sample_all.csv' , index=False, sep='\t')
async_df.to_csv('./dataset/generated_samples/pacheco_async_sample_all.csv' , index=False, sep='\t')

_____

## 2. Jaccard Similarity 

Jaccard similarity for two themes is calculated by the union of their documents divided by the intersection of their documents.

In [4]:
results = []

for anno_1 , anno_2 in itertools.permutations(annotators , 2): 
    anno_1_themes = annotator2df[anno_1]['name'].unique()
    anno_2_themes = annotator2df[anno_2]['name'].unique()
    for anno_1_theme , anno_2_theme in itertools.product(anno_1_themes , anno_2_themes): 
        result = {'anno_1' : anno_1 , 
                  'anno_2' : anno_2 , 
                  'anno_1_theme' : anno_1_theme ,
                  'anno_2_theme' : anno_2_theme}
        anno_1_tweet_ids = set(annotator2df[anno_1][annotator2df[anno_1]['name']==anno_1_theme]['tweet_id'])
        anno_2_tweet_ids = set(annotator2df[anno_2][annotator2df[anno_2]['name']==anno_2_theme]['tweet_id'])
        intersection = anno_1_tweet_ids.intersection(anno_2_tweet_ids)
        union = anno_1_tweet_ids.union(anno_2_tweet_ids)
        jaccard_sim = len(intersection) / len(union)
        result['jaccard_sim'] = jaccard_sim
        results.append(result)

jaccard_df = pd.DataFrame(results)

### 2.a. Getting max jaccard similarity for synchronous experiments

In [5]:
max_jacc_sims = []
filtered_df = jaccard_df[(jaccard_df['anno_1'].isin(synchronous_annotators)) 
                         &(jaccard_df['anno_2'].isin(synchronous_annotators))]

for anno_1_theme in filtered_df['anno_1_theme'].unique(): 
    if ('kmeans' not in anno_1_theme.lower()) and  ('Unknown' not in anno_1_theme.strip()):
        theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
        max_jacc_sims.append(theme_filtered_df.loc[(theme_filtered_df['jaccard_sim'].idxmax())].to_dict())
res_df = pd.DataFrame(max_jacc_sims)

print("Asynchronous Jaccard Similarity")
print(f"Average Max Jaccard Similarity: {res_df['jaccard_sim'].mean():.2f}")
print(f"Standard Deviation of Jaccard Similarity: {res_df['jaccard_sim'].std():.2f}")
print(res_df.count())

Asynchronous Jaccard Similarity
Average Max Jaccard Similarity: 0.36
Standard Deviation of Jaccard Similarity: 0.20
anno_1          16
anno_2          16
anno_1_theme    16
anno_2_theme    16
jaccard_sim     16
dtype: int64


In [6]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,jaccard_sim
0,group_1,group_2,VaxSymptoms,PostVaxSymptoms,0.650243
1,group_1,group_2,GovGoodPolicies,VaxDistributionIssueDueToLocalPolicy,0.188921
2,group_1,group_2,VaxAppointmentInfo,VaxAvailabilityInfo,0.568418
3,group_1,group_2,VaxApprovalInfo,FDAapproval,0.381999
4,group_1,group_2,VaxDoesntWork,VaxDoesMoreHarmThanGood,0.155343
5,group_1,group_2,UnjustifiedFearOfVax,VaxDoesMoreHarmThanGood,0.058224
6,group_1,group_2,IGotTheVax,#IGotMyVaccine,0.487331
7,group_1,group_2,GovBadPolicies,ReasonsForUSLaggingOnVaccines,0.208917
8,group_1,group_2,VaxLessensSymptoms,VaxLessensSymptoms,0.417115
9,group_2,group_1,PostVaxSymptoms,VaxSymptoms,0.650243


### 2.b. Getting max jaccard similarity for asynchronous experiments

In [7]:
max_jacc_sims = []
filtered_df = jaccard_df[(jaccard_df['anno_1'].isin(asynchronous_annotators)) 
                         &(jaccard_df['anno_2'].isin(asynchronous_annotators))]
for anno_1_theme in filtered_df['anno_1_theme'].unique():
    if ('kmeans' not in anno_1_theme.lower()) and  ('Unknown' not in anno_1_theme.strip()):
        theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) & 
                                            ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
        max_jacc_sims.append(theme_filtered_df.loc[(theme_filtered_df['jaccard_sim'].idxmax())].to_dict())
res_df = pd.DataFrame(max_jacc_sims)

print("Synchronous Jaccard Similarity")
print(f"Average Max Jaccard Similarity: {res_df['jaccard_sim'].mean():.2f}")
print(f"Standard Deviation of Jaccard Similarity: {res_df['jaccard_sim'].std():.2f}")

Synchronous Jaccard Similarity
Average Max Jaccard Similarity: 0.30
Standard Deviation of Jaccard Similarity: 0.23


In [8]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,jaccard_sim
0,async_1,async_3,Receiving_first_dose_of_covid_vaccine,ReceivedfirstdoseofCOVIDvaccine,0.621659
1,async_1,async_3,Criticising_political_figures,RepublicansdownplayingCOVID,0.272509
2,async_1,async_3,Promoting_covid_related_news_articles,NewsarticlesaboutCOVIDvaccineprogress,0.290772
3,async_1,async_3,Vaccine_accessibility_and_distribution,ProCOVIDvaccine,0.221095
4,async_1,async_3,Information_about_covid_vaccine_availability_a...,GettingtheCOVIDvaccine,0.716777
5,async_1,async_3,Encouraging_people_to_get_the_covid_vaccine,GetyourCOVIDvaccine,0.395771
6,async_1,async_2,The_vaccine_does_not_work_because_you_can_stil...,VaccineEfficacyDenial,0.059856
7,async_1,async_3,Skepticism_over_the_covid_vaccine,AntiCovidVaccine,0.433612
8,async_1,async_3,Experiences_of_vaccine_side_effects,Covidvaccineaftereffects,0.775336
9,async_1,async_3,Praising_frontline_healthcare_workers,ProCOVIDvaccine,0.093837


______

## 3. Centroid Cosine Similarity 

### 3.a. Loading SBERT Vectors 

In [9]:
sbert_vectors = np.load('./dataset/sbert.npy')

### 3.b. Calculating centroids for sync + async experiments

In [10]:
results = []

for annotator, df in annotator2df.items(): 
    cosine_sims = []
    themes = []
    for i , theme in enumerate(df['name'].unique()): 
        if 'kmeans' not in theme.lower(): 
            result = {'annotator' : annotator}
            ids = df[df['name'] == theme]['tweet_id'].tolist()
            result['theme'] = theme
            theme_vectors = sbert_vectors[ids]
            theme_centroid = np.average(theme_vectors, axis=0, keepdims=True)

            result['theme_centroid'] = theme_centroid
            result['theme_vectors'] = theme_vectors

            dot_prod = np.dot(theme_centroid , theme_vectors.T).squeeze(0)
            dot_prod = np.expand_dims(dot_prod , axis=-1)

            norm = np.linalg.norm(theme_vectors , axis=1, keepdims=True)
            cosine_sim = (dot_prod/norm)
            result['cosine_sim']  = cosine_sim 
            results.append(result)


### 3.c. Calculating synchronous centroid cosine similarity

In [11]:
cosine_sim_results = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    anno_1_results = [r for r in results if r['annotator']==anno_1]
    anno_2_results = [r for r in results if r['annotator']==anno_2]
    for anno_1_result in anno_1_results: 
        for anno_2_result in anno_2_results:
            if ('kmeans' not in anno_1_result['theme'].lower()) and  ('kmeans' not in anno_2_result['theme'].lower()): 
                cosine_sim = cosine_similarity(anno_1_result['theme_centroid'] , anno_2_result['theme_centroid'])
                cosine_sim_result = {'anno_1' : anno_1 , 
                                    'anno_2' : anno_2 , 
                                    'anno_1_theme' : anno_1_result['theme'] , 
                                    'anno_2_theme' : anno_2_result['theme'] , 
                                    'cosine_sim' : cosine_sim.squeeze()}
                cosine_sim_results.append(cosine_sim_result)
                
cosine_sim_df = pd.DataFrame(cosine_sim_results)

In [12]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    filtered_df = cosine_sim_df[(cosine_sim_df['anno_1']== anno_1) & (cosine_sim_df['anno_2']== anno_2)]
    for anno_1_theme in filtered_df['anno_1_theme'].unique():
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['cosine_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)


print("Synchronous Centroid Cosine Similarity")
print(f"Average Max Centroid Cosine Similarity: {res_df['cosine_sim'].mean():.2f}")
print(f"Standard Deviation of Centroid Cosine Similarity: {res_df['cosine_sim'].std():.2f}")
print(res_df.count())

Synchronous Centroid Cosine Similarity
Average Max Centroid Cosine Similarity: 0.99
Standard Deviation of Centroid Cosine Similarity: 0.01
anno_1          17
anno_2          17
anno_1_theme    17
anno_2_theme    17
cosine_sim      17
dtype: int64


In [13]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,cosine_sim
0,group_1,group_2,VaxSymptoms,PostVaxSymptoms,0.99895835
1,group_1,group_2,GovGoodPolicies,VaxDistributionIssueDueToLocalPolicy,0.96528137
2,group_1,group_2,VaxAppointmentInfo,VaxAvailabilityInfo,0.99799
3,group_1,group_2,VaxApprovalInfo,FDAapproval,0.9812027
4,group_1,group_2,VaxDoesntWork,VaxDoesMoreHarmThanGood,0.97540057
5,group_1,group_2,UnjustifiedFearOfVax,VaxDoesMoreHarmThanGood,0.9572651
6,group_1,group_2,IGotTheVax,#IGotMyVaccine,0.995323
7,group_1,group_2,GovBadPolicies,ReasonsForUSLaggingOnVaccines,0.98667836
8,group_1,group_2,VaxLessensSymptoms,VaxLessensSymptoms,0.99518436
9,group_2,group_1,PostVaxSymptoms,VaxSymptoms,0.99895835


### 3.d. Calculating asynchronous centroid cosine similarity

In [14]:
cosine_sim_results = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    anno_1_results = [r for r in results if r['annotator']==anno_1]
    anno_2_results = [r for r in results if r['annotator']==anno_2]
    for anno_1_result in anno_1_results : 
        for anno_2_result in anno_2_results :
            if ('kmeans' not in anno_1_result['theme'].lower()) and  ('kmeans' not in anno_2_result['theme'].lower()): 
                cosine_sim = cosine_similarity(anno_1_result['theme_centroid'] , anno_2_result['theme_centroid'])
                cosine_sim_result = {'anno_1' : anno_1 , 
                                    'anno_2' : anno_2 , 
                                    'anno_1_theme' : anno_1_result['theme'] , 
                                    'anno_2_theme' : anno_2_result['theme'] , 
                                    'cosine_sim' : cosine_sim.squeeze()}
                cosine_sim_results.append(cosine_sim_result)
                
cosine_sim_df = pd.DataFrame(cosine_sim_results)

In [15]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    filtered_df = cosine_sim_df[(cosine_sim_df['anno_1']== anno_1) & (cosine_sim_df['anno_2']== anno_2)]
    for anno_1_theme in filtered_df['anno_1_theme'].unique():
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno_1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno_2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['cosine_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)

print("Asynchronous Centroid Cosine Similarity")
print(f"Average Max Centroid Cosine Similarity: {res_df['cosine_sim'].mean():.2f}")
print(f"Standard Deviation of Centroid Cosine Similarity: {res_df['cosine_sim'].std():.2f}")
print(res_df.count())

Asynchronous Centroid Cosine Similarity
Average Max Centroid Cosine Similarity: 0.95
Standard Deviation of Centroid Cosine Similarity: 0.08
anno_1          94
anno_2          94
anno_1_theme    94
anno_2_theme    94
cosine_sim      94
dtype: int64


In [16]:
# To view individual similarities
res_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,cosine_sim
0,async_1,async_2,Receiving_first_dose_of_covid_vaccine,GotVaccinated,0.993977
1,async_1,async_2,Criticising_political_figures,AntiRepublican,0.98237693
2,async_1,async_2,Promoting_covid_related_news_articles,AdvocateForVaccine,0.95669174
3,async_1,async_2,Vaccine_accessibility_and_distribution,WhereToGetVaccine,0.9413523
4,async_1,async_2,Information_about_covid_vaccine_availability_a...,WhereToGetVaccine,0.982838
...,...,...,...,...,...
89,async_3,async_2,HealthcareWorkers,BlameFauci,0.71381944
90,async_3,async_2,AntiBiden,AntiDemocrat,0.9753418
91,async_3,async_2,NegativeviewonTrump,BlameFauci,0.72613084
92,async_3,async_2,RepublicansresponsibleforCOVIDdeaths,BlameFauci,0.7145619


____

## 4. Group Average Cosine Similarity

In [17]:
results = {}
annotators = []

for annotator, df in annotator2df.items():
  results[annotator] = {}
  annotators.append(annotator)
  for i , theme in enumerate(df['name'].unique()):
      if ('kmeans' not in theme.lower()) and ('unknown' not in theme.lower()):
          ids = df[df['name'] == theme]['tweet_id'].tolist()
          theme_vectors = sbert_vectors[ids]
          results[annotator][theme] = theme_vectors

### 4.a. Calculating Group Average Similarities

In [18]:
global_average_sims = []

for (anno1 , anno2) in tqdm(itertools.combinations_with_replacement(annotators, 2)):
  anno1_themes = results[anno1]
  anno2_themes = results[anno2]
  for anno1_theme, anno1_theme_vectors in anno1_themes.items():
    for anno2_theme, anno2_theme_vectors in anno2_themes.items():
      s_time = time.time()
      cosine_sims = cosine_similarity(anno1_theme_vectors , anno2_theme_vectors)
      average_sim = np.average(cosine_sims)
      std_deviation = np.std(cosine_sims)
      global_average_sims.append({'anno1' : anno1 ,
                                  'anno2' : anno2 ,
                                  'anno1_theme' : anno1_theme ,
                                  'anno2_theme' : anno2_theme ,
                                  'average_sim' : average_sim ,
                                  'std_deviation' : std_deviation
                                  })

0it [00:00, ?it/s]

In [19]:
global_average_sims

[{'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Receiving_first_dose_of_covid_vaccine',
  'anno2_theme': 'Receiving_first_dose_of_covid_vaccine',
  'average_sim': 0.5871065,
  'std_deviation': 0.13545135},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Receiving_first_dose_of_covid_vaccine',
  'anno2_theme': 'Criticising_political_figures',
  'average_sim': 0.3610672,
  'std_deviation': 0.11115836},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Receiving_first_dose_of_covid_vaccine',
  'anno2_theme': 'Promoting_covid_related_news_articles',
  'average_sim': 0.43287352,
  'std_deviation': 0.13017313},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Receiving_first_dose_of_covid_vaccine',
  'anno2_theme': 'Vaccine_accessibility_and_distribution',
  'average_sim': 0.41418797,
  'std_deviation': 0.113564715},
 {'anno1': 'async_1',
  'anno2': 'async_1',
  'anno1_theme': 'Receiving_first_dose_of_covid_vaccine',
  'anno2_theme': 'Info

In [20]:
global_average_df = pd.DataFrame(global_average_sims)

### 4.b. Calculating Synchronous Group Average Cosine Similarities

In [21]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(synchronous_annotators , 2):
    filtered_df = global_average_df[(global_average_df['anno1'] == anno_1)&
                                    (global_average_df['anno2'] == anno_2)]
    for anno_1_theme in filtered_df['anno1_theme'].unique(): 
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['average_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)

print("Synchronous Group Average Cosine Similarity")
print(f"Average Max Group Cosine Similarity: {res_df['average_sim'].mean():.2f}")
print(f"Standard Deviation of Group Cosine Similarity: { res_df['average_sim'].std():.2f}")

print(res_df.count())

Synchronous Group Average Cosine Similarity
Average Max Group Cosine Similarity: 0.52
Standard Deviation of Group Cosine Similarity: 0.08
anno1            9
anno2            9
anno1_theme      9
anno2_theme      9
average_sim      9
std_deviation    9
dtype: int64


### 4.c. Calculating Asynchronous Group Average Cosine Similarities

In [22]:
max_cosine_sims = []

for anno_1 , anno_2 in itertools.permutations(asynchronous_annotators , 2):
    filtered_df = global_average_df[(global_average_df['anno1'] == anno_1)&
                                    (global_average_df['anno2'] == anno_2)]
    for anno_1_theme in filtered_df['anno1_theme'].unique(): 
        if ('kmeans' not in anno_1_theme.lower()) and ('unknown' not in anno_1_theme.lower()):
            theme_filtered_df  = filtered_df[(filtered_df['anno1_theme'] == anno_1_theme) &
                                              ~(filtered_df['anno2_theme'].str.contains('Kmeans')) &
                                              ~(filtered_df['anno2_theme'].str.contains('Unknown'))]
            max_cosine_sims.append(theme_filtered_df.loc[(theme_filtered_df['average_sim'].idxmax())].to_dict())
 
res_df = pd.DataFrame(max_cosine_sims)


print("Asynchronous Group Average Cosine Similarity")
print(f"Average Max Group Cosine Similarity: {res_df['average_sim'].mean():.2f}")
print(f"Standard Deviation of Group Cosine Similarity: { res_df['average_sim'].std():.2f}")

print(res_df.count())

Asynchronous Group Average Cosine Similarity
Average Max Group Cosine Similarity: 0.45
Standard Deviation of Group Cosine Similarity: 0.10
anno1            56
anno2            56
anno1_theme      56
anno2_theme      56
average_sim      56
std_deviation    56
dtype: int64


____

## 5. Getting top 25th percentile of closest vectors to each centroid

In [23]:
results = []

for annotator, df in annotator2df.items(): 
    cosine_sims = []
    themes = []
    for i , theme in enumerate(df['name'].unique()): 
        if ('kmeans' not in theme.lower()) and ('unknown' not in theme.lower()): 
            result = {'annotator' : annotator}
            ids = df[df['name'] == theme]['tweet_id'].tolist()
            result['theme'] = theme
            result['tweets'] = df[df['name'] == theme]['text'].tolist()

            theme_vectors = sbert_vectors[ids]
            theme_centroid = np.average(theme_vectors, axis=0, keepdims=True)

            result['theme_centroid'] = theme_centroid
            result['theme_vectors'] = theme_vectors

            dot_prod = np.dot(theme_centroid , theme_vectors.T).squeeze(0)
            dot_prod = np.expand_dims(dot_prod , axis=-1)

            norm = np.linalg.norm(theme_vectors , axis=1, keepdims=True)
            cosine_sim = (dot_prod/norm)
            result['cosine_sim']  = cosine_sim.squeeze() 
            results.append(result)

In [24]:
top_25_results = {'top_25_tweets':[],
                  'top_25_vectors' : [], 
                  'theme' : [], 
                  'annotator' : []}

for result in results: 

    m = np.percentile(result['cosine_sim'] , 75)
    top_25_indices = np.where(result['cosine_sim']>m)
    top_25_vectors = result['theme_vectors'][top_25_indices]
    top_25_tweets = np.array(result['tweets'])[top_25_indices]
    annotator=result['annotator']
    theme=result['theme']

    top_25_results['top_25_tweets'].extend(top_25_tweets)
    top_25_results['top_25_vectors'].extend(top_25_vectors )
    top_25_results['theme'].extend([theme]*top_25_tweets.shape[0])
    top_25_results['annotator'].extend([annotator]*top_25_tweets.shape[0])


top_25_df = pd.DataFrame(top_25_results, columns=['theme' , 'annotator' , 'top_25_tweets'])

### 4.a. Random Sampling rows for manual annotations

In [25]:
sync_top_25 = top_25_df[top_25_df['annotator'].isin(synchronous_annotators)].sample(n=200)[['theme' , 'top_25_tweets']]
async_top_25 = top_25_df[top_25_df['annotator'].isin(asynchronous_annotators)].sample(n=200)[['theme' , 'top_25_tweets']]

sync_top_25.to_csv('./dataset/generated_samples/pacheco_sync_sample_top25.csv' , index=False, sep='\t')
async_top_25.to_csv('./dataset/generated_samples/pacheco_async_sample_top25.csv' , index=False, sep='\t')

____

## 5. Calculating Intra- and Inter-cluster Similarity

In [26]:
top_25_results = {}

for result in results: 

    top_25_results[result['theme']] = {}

    m = np.percentile(result['cosine_sim'] , 75)
    top_25_indices = np.where(result['cosine_sim']>m)
    top_25_vectors = result['theme_vectors'][top_25_indices]
    top_25_tweets = np.array(result['tweets'])[top_25_indices]
    annotator=result['annotator']
    theme=result['theme']

    top_25_results[theme]['top_25_tweets'] = (top_25_tweets)
    top_25_results[theme]['top_25_vectors'] = (top_25_vectors )
    top_25_results[theme]['annotator'] = (annotator)

### 5.a. Calculating similarities for top 25th percentile clusters

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

top_25_average_sims = []

for anno_1_theme , anno1_top_25 in tqdm(top_25_results.items()): 
    for anno_2_theme , anno2_top_25 in top_25_results.items(): 

        anno1_top_25_vectors = anno1_top_25['top_25_vectors']
        anno2_top_25_vectors = anno2_top_25['top_25_vectors']

        cosine_sims = cosine_similarity(anno1_top_25_vectors , anno2_top_25_vectors)
        average_sim = np.average(cosine_sims)
        std_deviation = np.std(cosine_sims)

        top_25_average_sims.append({'anno_1' : anno1_top_25['annotator'] , 
                                   'anno_2' : anno2_top_25['annotator'], 
                                   'anno_1_theme' : anno_1_theme,
                                   'anno_2_theme' : anno_2_theme,
                                   'average_sim' : average_sim , 
                                   'std_dev_sim' : std_deviation})


  0%|          | 0/63 [00:00<?, ?it/s]

In [28]:
top_25_average_df = pd.DataFrame(top_25_average_sims)
top_25_average_df

Unnamed: 0,anno_1,anno_2,anno_1_theme,anno_2_theme,average_sim,std_dev_sim
0,async_1,async_1,Receiving_first_dose_of_covid_vaccine,Receiving_first_dose_of_covid_vaccine,0.782726,0.056958
1,async_1,async_1,Receiving_first_dose_of_covid_vaccine,Criticising_political_figures,0.435086,0.076976
2,async_1,async_1,Receiving_first_dose_of_covid_vaccine,Promoting_covid_related_news_articles,0.564126,0.083993
3,async_1,async_1,Receiving_first_dose_of_covid_vaccine,Vaccine_accessibility_and_distribution,0.508773,0.074464
4,async_1,async_1,Receiving_first_dose_of_covid_vaccine,Information_about_covid_vaccine_availability_a...,0.491469,0.071028
...,...,...,...,...,...,...
3964,async_2,async_2,BlameFauci,VaccineEfficacyDenial,0.484998,0.094103
3965,async_2,async_2,BlameFauci,VaccineRefusal,0.475791,0.086528
3966,async_2,async_2,BlameFauci,VaccineKills,0.469780,0.077280
3967,async_2,async_2,BlameFauci,AntiDemocrat,0.511179,0.070955


In [29]:
top_25_intra_cluster_df = top_25_average_df[(top_25_average_df['anno_1']==top_25_average_df['anno_2']) & (top_25_average_df['anno_1_theme']==top_25_average_df['anno_2_theme'])] 

### 5.b. Intra- and Inter-theme similarities for top 25th percentile subsets in synchronous and asynchronous experiments

In [30]:
print(f"Synchronous top 25% intra theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous top 25% intra theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous top 25% intra theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous top 25% intra theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous top 25% intra theme similarity average: 0.70
Synchronous top 25% intra theme similarity standard deviation: 0.09
Asynchronous top 25% intra theme similarity average: 0.64
Asynchronous top 25% intra theme similarity standard deviation: 0.09


In [31]:
top_25_intra_cluster_df = top_25_average_df[(top_25_average_df['anno_1']==top_25_average_df['anno_2']) & (top_25_average_df['anno_1_theme']!=top_25_average_df['anno_2_theme'])] 

In [32]:
print(f"Synchronous top 25% inter theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous top 25% inter theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous top 25% inter theme similarity average: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous top 25% inter theme similarity standard deviation: {top_25_intra_cluster_df[top_25_intra_cluster_df['anno_1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous top 25% inter theme similarity average: 0.52
Synchronous top 25% inter theme similarity standard deviation: 0.07
Asynchronous top 25% inter theme similarity average: 0.46
Asynchronous top 25% inter theme similarity standard deviation: 0.13


### 5.c. Intra- and Inter-theme similarities for whole set in synchronous and asynchronous experiments

In [33]:
global_average_df = pd.DataFrame(global_average_sims)

intra_global_average_df = global_average_df[(global_average_df['anno1']==global_average_df['anno2']) & (global_average_df['anno1_theme']==global_average_df['anno2_theme'])] 


print(f"Synchronous global intra theme similarity average: {intra_global_average_df[intra_global_average_df['anno1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous global intra theme similarity standard deviation: {intra_global_average_df[intra_global_average_df['anno1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous global intra theme similarity average: {intra_global_average_df[intra_global_average_df['anno1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous global intra theme similarity standard deviation: {intra_global_average_df[intra_global_average_df['anno1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous global intra theme similarity average: 0.51
Synchronous global intra theme similarity standard deviation: 0.08
Asynchronous global intra theme similarity average: 0.45
Asynchronous global intra theme similarity standard deviation: 0.10


In [34]:
inter_global_avg_df = global_average_df[(global_average_df['anno1']==global_average_df['anno2']) & (global_average_df['anno1_theme']!=global_average_df['anno2_theme'])] 

print(f"Synchronous global inter theme similarity average: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(synchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Synchronous global inter theme similarity standard deviation: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(synchronous_annotators)]['average_sim'].std():.2f}")


print(f"Asynchronous global inter theme similarity average: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(asynchronous_annotators)]['average_sim'].mean():.2f}")
print(f"Asynchronous global inter theme similarity standard deviation: {inter_global_avg_df[inter_global_avg_df['anno1'].isin(asynchronous_annotators)]['average_sim'].std():.2f}")

Synchronous global inter theme similarity average: 0.42
Synchronous global inter theme similarity standard deviation: 0.05
Asynchronous global inter theme similarity average: 0.34
Asynchronous global inter theme similarity standard deviation: 0.11
