In [2]:
import pandas as pd
import json
from pathlib import Path 

In [3]:
# Directory holding one span-annotation CSV per target group.
ANN_PATH = Path('../data/span_annotations_v2')

# Only CSV files whose stem is one of these names are loaded.
targets = ['Alevi', 'Arab', 'Armenian', 'Greek', 'Immigrant-Refugee', 'Jew', 'Kurdish', 'LGBTI+']

# Read every matching file and tag its rows with the file stem in a `target` column.
# The directory listing is sorted because `iterdir()` yields entries in
# arbitrary order; sorting keeps the row order of `df` stable across machines.
dfs = []
for file in sorted(ANN_PATH.iterdir()):
    if file.stem in targets and file.suffix == '.csv':
        dfs.append(pd.read_csv(file).assign(target=file.stem))

# Fail with a readable message instead of pandas' "No objects to concatenate".
if not dfs:
    raise FileNotFoundError(f'No target CSV files found in {ANN_PATH}')

# Each file's own index restarts at 0, so a plain concat would leave duplicate
# index labels in `df`; `ignore_index=True` gives one clean 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

In [4]:
# Sanity check: (rows, columns) of the frame concatenated from the per-target CSVs.
df.shape

(3530, 6)

In [27]:
import warnings

import numpy as np
import pandas as pd

# --- Configuration -----------------------------------------------------------
num_annotators = 6
batch_size = 50
# Expected upper bound on one annotator's workload. The pairing loop does not
# enforce it; it is only checked (with a warning) after all pairs are drawn.
batches_per_annotator = 24
# Fixed seed: re-running this cell overwrites the batch files, so the shuffle
# and the pairing have to come out the same every time.
seed = 42

# Annotator names
annotator_names = ['Pelin', 'Elif', 'Burak', 'İrem', 'Murat', 'Didem']
assert len(annotator_names) == num_annotators, 'num_annotators must match annotator_names'

output_dir = ANN_PATH / 'batches'
# `to_csv` does not create missing directories, so make sure the target exists.
output_dir.mkdir(parents=True, exist_ok=True)

rng = np.random.default_rng(seed)

# Shuffle the rows, then cut the shuffled frame into consecutive batches of
# `batch_size` rows (the last batch may be shorter).
df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
batches = [df_shuffled[i:i + batch_size] for i in range(0, len(df_shuffled), batch_size)]
num_batches = len(batches)

# Pair two distinct annotators with every batch. Annotators are drawn in
# rounds: nobody is picked a second time until fewer than two unused
# annotators remain, which keeps the workload spread evenly.
assignments = []
used_this_round = set()

for i, batch in enumerate(batches, 1):
    # Built from the ordered list (not a set) so that the candidate order, and
    # therefore the seeded draw, is identical across kernel restarts.
    available_annotators = [name for name in annotator_names if name not in used_this_round]
    if len(available_annotators) < 2:
        # Round exhausted: everybody becomes eligible again.
        used_this_round = set()
        available_annotators = list(annotator_names)

    # `replace=False` guarantees the two annotators of a batch differ.
    annotator_1, annotator_2 = (
        str(name) for name in rng.choice(available_annotators, size=2, replace=False)
    )

    # Save the batch
    batch_filename = output_dir / f'batch_{i}.csv'
    batch.to_csv(batch_filename, index=False)

    # Record who annotates it
    assignments.append({
        'batch_number': i,
        'annotator_1': annotator_1,
        'annotator_2': annotator_2,
        'filename': batch_filename.name
    })

    used_this_round.update((annotator_1, annotator_2))

# Save assignments to a CSV file. Explicit columns keep the header intact even
# when there are no batches at all.
assignments_df = pd.DataFrame(
    assignments,
    columns=['batch_number', 'annotator_1', 'annotator_2', 'filename'],
)
assignments_df.to_csv(output_dir / 'assignments.csv', index=False)

# Verify the workload limit that the configuration promises.
workload = pd.concat([assignments_df['annotator_1'], assignments_df['annotator_2']]).value_counts()
if not workload.empty and workload.max() > batches_per_annotator:
    warnings.warn(
        f'{workload.idxmax()} was assigned {workload.max()} of {num_batches} batches, '
        f'more than batches_per_annotator={batches_per_annotator}'
    )

In [5]:
# Load the 'Immigrant-Refugee_Arabic' annotation file from ANN_PATH and add a
# `target` column holding the file stem.
target = 'Immigrant-Refugee_Arabic'
file = ANN_PATH.joinpath(target + '.csv')
arabic_df = pd.read_csv(file)
arabic_df = arabic_df.assign(target=file.stem)

In [6]:
# Sanity check: (rows, columns) of the frame read from the 'Immigrant-Refugee_Arabic' CSV.
arabic_df.shape

(245, 5)

In [8]:
import warnings

import numpy as np
import pandas as pd

# TODO: this cell repeats the batching procedure used for ANN_PATH / 'batches';
# consider extracting a shared function parameterised by frame, names and directory.

# --- Configuration -----------------------------------------------------------
num_annotators = 2
batch_size = 50
# Expected upper bound on one annotator's workload. The pairing loop does not
# enforce it; it is only checked (with a warning) after all pairs are drawn.
batches_per_annotator = 5
# Fixed seed: re-running this cell overwrites the batch files, so the shuffle
# and the pairing have to come out the same every time.
seed = 42

# Annotator names
annotator_names = ['Neda', 'Majd']
assert len(annotator_names) == num_annotators, 'num_annotators must match annotator_names'

output_dir = ANN_PATH / 'batches_ar'
output_dir.mkdir(parents=True, exist_ok=True)

rng = np.random.default_rng(seed)

# Shuffle the rows, then cut the shuffled frame into consecutive batches of
# `batch_size` rows (the last batch may be shorter).
df_shuffled = arabic_df.sample(frac=1, random_state=seed).reset_index(drop=True)
batches = [df_shuffled[i:i + batch_size] for i in range(0, len(df_shuffled), batch_size)]
num_batches = len(batches)

# Pair two distinct annotators with every batch. Annotators are drawn in
# rounds: nobody is picked a second time until fewer than two unused
# annotators remain. With exactly two annotators every batch gets both of
# them; only the annotator_1 / annotator_2 order is random.
assignments = []
used_this_round = set()

for i, batch in enumerate(batches, 1):
    # Built from the ordered list (not a set) so that the candidate order, and
    # therefore the seeded draw, is identical across kernel restarts.
    available_annotators = [name for name in annotator_names if name not in used_this_round]
    if len(available_annotators) < 2:
        # Round exhausted: everybody becomes eligible again.
        used_this_round = set()
        available_annotators = list(annotator_names)

    # `replace=False` guarantees the two annotators of a batch differ.
    annotator_1, annotator_2 = (
        str(name) for name in rng.choice(available_annotators, size=2, replace=False)
    )

    # Save the batch
    batch_filename = output_dir / f'batch_{i}.csv'
    batch.to_csv(batch_filename, index=False)

    # Record who annotates it
    assignments.append({
        'batch_number': i,
        'annotator_1': annotator_1,
        'annotator_2': annotator_2,
        'filename': batch_filename.name
    })

    used_this_round.update((annotator_1, annotator_2))

# Save assignments to a CSV file. Explicit columns keep the header intact even
# when there are no batches at all.
assignments_df = pd.DataFrame(
    assignments,
    columns=['batch_number', 'annotator_1', 'annotator_2', 'filename'],
)
assignments_df.to_csv(output_dir / 'assignments.csv', index=False)

# Verify the workload limit that the configuration promises.
workload = pd.concat([assignments_df['annotator_1'], assignments_df['annotator_2']]).value_counts()
if not workload.empty and workload.max() > batches_per_annotator:
    warnings.warn(
        f'{workload.idxmax()} was assigned {workload.max()} of {num_batches} batches, '
        f'more than batches_per_annotator={batches_per_annotator}'
    )