In [1]:
import random


def swap_entities(example):
    """Return a copy of ``example`` with recognised candidate entities renamed.

    Each candidate found in one of the built-in pools (male names, female
    names, organizations, locations) is replaced by a different, randomly
    chosen member of the same pool. Candidates that match no pool are kept
    as they are.

    Args:
        example: dict with at least ``'text'`` (str) and ``'candidates'``
            (list of str). If an integer ``'pronoun_position'`` is present it
            is shifted to follow the pronoun in the rewritten text.

    Returns:
        A shallow copy of ``example`` with new ``'text'`` and ``'candidates'``
        values (and ``'pronoun_position'`` when it was tracked). The
        candidate list keeps its length and order, so
        ``'correct_candidate_idx'`` remains valid. The input is not mutated.
    """
    # Function-scope import so this cell does not depend on a later cell
    # having imported ``re`` first.
    import re

    text = example['text']
    candidates = list(example['candidates'])

    # Replacement pools, keyed by entity type.
    pools = {
        "male": ["John", "Michael", "David", "Robert", "James", "Thomas", "William", "Richard",
                 "Mohamed", "Ahmed", "Omar", "Osama", "Amr", "Magdy", "Waleed", "Ayman"],
        "female": ["Mary", "Jennifer", "Linda", "Patricia", "Elizabeth", "Susan", "Jessica", "Sarah"],
        "organization": ["Microsoft", "Google", "Amazon", "Apple", "IBM", "Tesla", "Facebook", "Twitter"],
        "location": ["New York", "London", "Paris", "Tokyo", "Berlin", "Sydney", "Rome", "Moscow"],
    }

    # Names that must not be handed out: every existing candidate plus every
    # replacement already assigned. This keeps distinct candidates distinct.
    taken = set(candidates)
    substitutions = {}
    for candidate in candidates:
        if candidate in substitutions:
            continue  # duplicate candidate: reuse the mapping chosen earlier
        pool = next((names for names in pools.values() if candidate in names), None)
        options = []
        if pool is not None:
            # Also skip names already mentioned in the text, so a swap never
            # merges the candidate with an unrelated existing mention.
            options = [name for name in pool if name not in taken and name not in text]
        if not options:
            substitutions[candidate] = candidate  # unknown type or nothing free
            continue
        replacement = random.choice(options)
        taken.add(replacement)
        substitutions[candidate] = replacement

    renames = {old: new for old, new in substitutions.items() if old != new}

    pronoun_position = example.get('pronoun_position')
    track_position = isinstance(pronoun_position, int)
    new_position = pronoun_position
    new_text = text

    if renames:
        # One pass over the text with a single alternation: a replacement can
        # never be rewritten again by a later rule (John->Michael followed by
        # Michael->David), and whole-word matching leaves words that merely
        # contain a name (e.g. "Amritsar") untouched.
        alternation = '|'.join(
            re.escape(name) for name in sorted(renames, key=len, reverse=True)
        )
        pattern = re.compile(r'(?<!\w)(?:' + alternation + r')(?!\w)')

        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            replacement = renames[match.group(0)]
            pieces.append(text[cursor:match.start()])
            pieces.append(replacement)
            cursor = match.end()
            # Every rename located before the pronoun moves it by the
            # difference in length between the new and the old name.
            if track_position and match.end() <= pronoun_position:
                new_position += len(replacement) - (match.end() - match.start())
        pieces.append(text[cursor:])
        new_text = ''.join(pieces)

    # Create augmented example
    augmented = example.copy()
    augmented['text'] = new_text
    augmented['candidates'] = [substitutions[candidate] for candidate in candidates]
    if track_position:
        augmented['pronoun_position'] = new_position
    return augmented

In [2]:
def vary_pronouns(example):
    """Build copies of ``example`` in which the pronoun is swapped for an alternative.

    The pronoun is looked up (case-insensitively) in the subject, object and
    possessive tables, in that order; the first table containing it supplies
    the alternatives. Because of that order, ``"her"`` is always resolved
    through the object table.

    Args:
        example: dict with ``'text'``, ``'pronoun'`` and ``'pronoun_position'``
            (the index in ``'text'`` at which the pronoun starts).

    Returns:
        A list with one shallow copy of ``example`` per alternative pronoun,
        each with updated ``'text'`` and ``'pronoun'``. If the pronoun is in
        none of the tables, the list contains the original ``example`` object.
    """
    source_text = example['text']
    original = example['pronoun']
    start = example['pronoun_position']

    # Lookup tables in priority order: subject, object, possessive.
    lookup_tables = (
        {"he": ["he", "they"], "she": ["she", "they"], "they": ["they", "he", "she"]},
        {"him": ["him", "them"], "her": ["her", "them"], "them": ["them", "him", "her"]},
        {"his": ["his", "their"], "her": ["her", "their"], "their": ["their", "his", "her"]},
    )

    key = original.lower()
    options = next((table[key] for table in lookup_tables if key in table), None)
    if options is None:
        # Unrecognised pronoun: hand back the example untouched.
        return [example]

    # The text around the pronoun is the same for every variation.
    head = source_text[:start]
    tail = source_text[start + len(original):]
    starts_upper = original[0].isupper()

    results = []
    for option in options:
        if option == key:
            continue  # the pronoun itself is not a variation
        substitute = option.capitalize() if starts_upper else option

        variant = example.copy()
        variant['text'] = head + substitute + tail
        variant['pronoun'] = substitute
        results.append(variant)

    return results

In [3]:
import re


def modify_sentence_structure(example):
    """Create a variation of ``example`` by swapping the order of its two sentences.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.

    Args:
        example: dict with ``'text'`` and, for two-sentence texts,
            ``'pronoun_position'`` (index in ``'text'`` where the pronoun
            starts).

    Returns:
        * ``[example]`` (the original object) when the text has fewer than
          two sentences;
        * a one-element list holding a shallow copy with the sentences
          reversed and ``'pronoun_position'`` recomputed, when the text has
          exactly two sentences;
        * an empty list for longer texts, which are not handled.
    """
    text = example['text']

    # Split into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)
    if len(sentences) < 2:
        return [example]  # Can't reorder if only one sentence

    variations = []

    # Simple sentence order reversal
    if len(sentences) == 2:
        first, second = sentences
        new_text = second + " " + first

        # The separator matched by ``\s+`` may be longer than one character
        # (double space, newline, ...), so derive where the second sentence
        # starts from the text itself instead of assuming a single space.
        second_start = len(text) - len(second)

        original_pronoun_pos = example['pronoun_position']
        if original_pronoun_pos < len(first):
            # Pronoun was in the first sentence, which now follows the second
            # sentence and the single joining space.
            new_pronoun_pos = len(second) + 1 + original_pronoun_pos
        else:
            # Pronoun was in the second sentence, which now opens the text.
            new_pronoun_pos = original_pronoun_pos - second_start

        new_example = example.copy()
        new_example['text'] = new_text
        new_example['pronoun_position'] = new_pronoun_pos
        variations.append(new_example)

    # Longer texts would need more sophisticated NLP to reorder sentences
    # while preserving coreference, so no variation is produced for them.

    return variations

In [4]:
def add_distractors(example):
    """Insert one or two distractor sentences and add them as extra candidates.

    Distractors are chosen from a fixed pool that depends on the pronoun
    (he/him/his, she/her/hers, they/them/their, it/its). Pool entries that are
    already candidates or already occur in the text are skipped so the result
    never contains a duplicate candidate.

    Args:
        example: dict with ``'text'``, ``'pronoun'``, ``'candidates'`` and
            ``'correct_candidate_idx'``. If an integer ``'pronoun_position'``
            is present it is recomputed for the new text.

    Returns:
        A shallow copy of ``example`` with the distractor sentences inserted
        and appended to ``'candidates'``. The original ``example`` object is
        returned unchanged when the pronoun is not recognised, no unused
        distractor is available, or the text has fewer than two sentences.
    """
    text = example['text']
    pronoun = example['pronoun']
    candidates = list(example['candidates'])
    correct_idx = example['correct_candidate_idx']

    # Distractor pools keyed by the pronoun forms they are compatible with.
    distractor_pools = (
        (("he", "him", "his"), ["James", "Robert", "Michael", "William", "David"]),
        (("she", "her", "hers"), ["Mary", "Patricia", "Jennifer", "Linda", "Elizabeth"]),
        (("they", "them", "their"),
         ["The team", "The group", "The committee", "The family", "The couple"]),
        (("it", "its"), ["The car", "The book", "The phone", "The computer", "The house"]),
    )
    pronoun_lower = pronoun.lower()
    pool = next((names for forms, names in distractor_pools if pronoun_lower in forms), [])

    # A distractor that is already a candidate (or already mentioned) would
    # create a duplicate, ambiguous candidate entry.
    distractors = [name for name in pool if name not in candidates and name not in text]

    # If no appropriate distractors were found, return the original example
    if not distractors:
        return example

    # Add 1-2 distractors to the text, but no more than available
    num_distractors = min(random.randint(1, 2), len(distractors))
    selected_distractors = random.sample(distractors, num_distractors)

    # Split into sentences while remembering where each one starts in the
    # original text; the starts are needed to relocate the pronoun afterwards.
    sentences = []
    starts = []
    cursor = 0
    for boundary in re.finditer(r'(?<=[.!?])\s+', text):
        sentences.append(text[cursor:boundary.start()])
        starts.append(cursor)
        cursor = boundary.end()
    sentences.append(text[cursor:])
    starts.append(cursor)

    if len(sentences) < 2:
        return example

    # Distractor sentences to place before / after each original sentence.
    before = [[] for _ in sentences]
    after = [[] for _ in sentences]
    actions = ["said", "mentioned", "noted", "explained", "suggested"]
    for distractor in selected_distractors:
        insert_idx = random.randint(0, len(sentences) - 1)
        action = random.choice(actions)
        # No padding spaces here: the chunks are joined with single spaces.
        distractor_phrase = f"{distractor} {action} something important."
        if random.choice([True, False]):
            before[insert_idx].insert(0, distractor_phrase)
        else:
            after[insert_idx].append(distractor_phrase)

    pronoun_position = example.get('pronoun_position')
    track_position = isinstance(pronoun_position, int)
    new_position = pronoun_position

    # Assemble the new text chunk by chunk, tracking the running offset so
    # the pronoun can be re-anchored inside the sentence that contains it.
    chunks = []
    offset = 0
    for idx, sentence in enumerate(sentences):
        for phrase in before[idx]:
            chunks.append(phrase)
            offset += len(phrase) + 1
        if track_position and starts[idx] <= pronoun_position < starts[idx] + len(sentence):
            new_position = offset + (pronoun_position - starts[idx])
        chunks.append(sentence)
        offset += len(sentence) + 1
        for phrase in after[idx]:
            chunks.append(phrase)
            offset += len(phrase) + 1
    new_text = " ".join(chunks)

    # Create new example
    new_example = example.copy()
    new_example['text'] = new_text
    # Distractors are appended after the existing candidates, so the index of
    # the correct candidate does not change.
    new_example['candidates'] = candidates + selected_distractors
    new_example['correct_candidate_idx'] = correct_idx
    if track_position:
        new_example['pronoun_position'] = new_position

    return new_example

In [5]:
def generate_cross_domain(examples, domain_type):
    """Rewrite examples into a domain-flavoured framing (news, academic, ...).

    For each example the text up to its first ``.`` is embedded into a
    randomly chosen pattern of the requested domain, introduced by the
    correct candidate; any remaining text is appended after it.

    Args:
        examples: iterable of dicts with ``'text'``, ``'pronoun'``,
            ``'candidates'`` and ``'correct_candidate_idx'``. An integer
            ``'pronoun_position'`` is used, when present, to follow the
            pronoun into the new text.
        domain_type: one of ``"news"``, ``"academic"``, ``"fiction"``,
            ``"conversation"``.

    Returns:
        A list of shallow copies with new ``'text'`` and ``'pronoun_position'``.
        Examples whose pronoun cannot be located in the new text are left out
        rather than given a made-up position. If ``domain_type`` is unknown,
        ``examples`` is returned unchanged.
    """
    # Domain-specific patterns and vocabulary. The "verbs" lists are kept as
    # reference vocabulary; this function does not consume them.
    domains = {
        "news": {
            "patterns": [
                "{candidate} stated that {rest}",
                "According to {candidate}, {rest}",
                "Sources close to {candidate} revealed that {rest}"
            ],
            "verbs": ["reported", "announced", "confirmed", "stated", "claimed"],
        },
        "academic": {
            "patterns": [
                "{candidate}'s research suggests that {rest}",
                "In the study conducted by {candidate}, {rest}",
                "As {candidate} theorized, {rest}"
            ],
            "verbs": ["analyzed", "hypothesized", "concluded", "investigated", "examined"],
        },
        "fiction": {
            "patterns": [
                "{candidate} gazed out the window as {rest}",
                "With a sigh, {candidate} realized that {rest}",
                "Walking slowly, {candidate} thought about how {rest}"
            ],
            "verbs": ["whispered", "thought", "wondered", "felt", "dreamed"],
        },
        "conversation": {
            "patterns": [
                "{candidate} was like, '{rest}'",
                "'Hey,' {candidate} said, '{rest}'",
                "{candidate} told me that {rest}"
            ],
            "verbs": ["said", "mentioned", "talked about", "brought up", "discussed"],
        }
    }

    if domain_type not in domains:
        return examples

    patterns = domains[domain_type]["patterns"]
    domain_examples = []

    for example in examples:
        candidates = example['candidates']
        correct_candidate = candidates[example['correct_candidate_idx']]

        # Choose a random pattern
        pattern = random.choice(patterns)

        # Split at the first period: ``head`` feeds the pattern, ``tail`` is
        # appended afterwards. The offsets of both parts in the original text
        # are kept so the pronoun position can be mapped across.
        text = example['text']
        head, dot, tail = text.partition('.')
        if dot:
            main_text = head.strip()
            main_start = len(head) - len(head.lstrip())
            rest_text = tail.strip()
            rest_start = len(head) + 1 + (len(tail) - len(tail.lstrip()))
        else:
            main_text = text
            main_start = 0
            rest_text = ""
            rest_start = len(text)

        # Text the pattern places in front of ``{rest}``.
        prefix = pattern[:pattern.index("{rest}")].format(candidate=correct_candidate)

        # Generate domain-specific text
        new_text = pattern.format(candidate=correct_candidate, rest=main_text)
        if dot:
            new_text += "."  # restore the period removed by the split
        rest_offset = len(new_text) + 1
        if rest_text:
            new_text += " " + rest_text

        # Map the labelled pronoun position into the new text.
        pronoun = example['pronoun'].lower()
        old_pos = example.get('pronoun_position')
        pronoun_pos = None
        if isinstance(old_pos, int):
            if main_start <= old_pos < main_start + len(main_text):
                pronoun_pos = len(prefix) + (old_pos - main_start)
            elif rest_start <= old_pos < rest_start + len(rest_text):
                pronoun_pos = rest_offset + (old_pos - rest_start)

        mapped_ok = (
            pronoun_pos is not None
            and new_text[pronoun_pos:pronoun_pos + len(pronoun)].lower() == pronoun
        )
        if not mapped_ok:
            # Fall back to the first whole-word occurrence; a plain substring
            # search would match "he" inside "the" or "that".
            found = re.search(
                r'(?<!\w)' + re.escape(pronoun) + r'(?!\w)', new_text, re.IGNORECASE
            )
            if found is None:
                continue  # no usable pronoun position: skip this example
            pronoun_pos = found.start()

        # Create new example
        new_example = example.copy()
        new_example['text'] = new_text
        new_example['pronoun_position'] = pronoun_pos

        domain_examples.append(new_example)

    return domain_examples

In [6]:
def augment_dataset(original_data, augmentation_factor=2):
    """Grow a dataset by repeatedly applying random augmentation techniques.

    Starting from a copy of ``original_data``, a random example and a random
    technique (entity swap, pronoun variation, sentence reordering, distractor
    insertion, cross-domain rewriting) are drawn until the target size is
    reached. The result is then balanced by pronoun, shuffled and trimmed.

    Args:
        original_data: sequence of example dicts.
        augmentation_factor: target size as a multiple of
            ``len(original_data)``.

    Returns:
        A list of at most ``int(len(original_data) * augmentation_factor)``
        examples. An empty input yields an empty list.
    """
    if not original_data:
        return []  # nothing to sample from, and balancing needs data

    augmented_data = list(original_data)

    # Track original example count; int() keeps the final slice valid when
    # a non-integer factor is passed.
    original_count = len(original_data)
    target_count = int(original_count * augmentation_factor)

    techniques = [
        "entity_swap",
        "pronoun_variation",
        "sentence_structure",
        "add_distractors",
        "cross_domain",
    ]
    domain_names = ["news", "academic", "fiction", "conversation"]

    # Some techniques can yield nothing new for a given example; bound the
    # number of draws so such data cannot keep the loop running forever.
    max_attempts = 50 * max(target_count, 1)
    attempts = 0

    # Apply different augmentation techniques
    while len(augmented_data) < target_count and attempts < max_attempts:
        attempts += 1

        # Randomly select an original example and a technique
        example = random.choice(original_data)
        technique = random.choice(techniques)

        if technique == "entity_swap":
            produced = [swap_entities(example)]
        elif technique == "pronoun_variation":
            produced = vary_pronouns(example)
        elif technique == "sentence_structure":
            produced = modify_sentence_structure(example)
        elif technique == "add_distractors":
            produced = [add_distractors(example)]
        else:  # "cross_domain"
            domain = random.choice(domain_names)
            produced = generate_cross_domain([example], domain)

        # Several techniques hand back the source example unchanged when they
        # cannot transform it; those are not augmentations, so skip them.
        augmented_data.extend(item for item in produced if item != example)

    # Balance the dataset by pronoun type
    augmented_data = balance_by_pronoun(augmented_data)

    # Shuffle the augmented dataset
    random.shuffle(augmented_data)

    # Trim to target size
    return augmented_data[:target_count]

def balance_by_pronoun(examples):
    """Oversample so every pronoun has as many examples as the most frequent one.

    Examples are grouped by lower-cased ``'pronoun'``. Groups smaller than the
    largest group are padded with randomly chosen repeats (the same objects,
    not copies) of their own members.

    Args:
        examples: iterable of dicts that each have a ``'pronoun'`` string.

    Returns:
        A new list, ordered group by group in order of first appearance, where
        every pronoun contributes the same number of entries. An empty input
        yields an empty list.
    """
    # Group examples by pronoun type
    groups = {}
    for example in examples:
        groups.setdefault(example['pronoun'].lower(), []).append(example)

    if not groups:
        return []  # max() below would raise on an empty dataset

    # Find the maximum number of examples per pronoun type
    max_count = max(len(group) for group in groups.values())

    # Balance the dataset
    balanced_examples = []
    for group in groups.values():
        balanced_examples.extend(group)
        shortfall = max_count - len(group)
        if shortfall:
            # Too few examples for this pronoun: duplicate some at random.
            balanced_examples.extend(random.choices(group, k=shortfall))

    return balanced_examples

In [13]:
# Load your original data
import json

def load_data(data_path):
    """Read a JSON dataset, check each record's fields, and shuffle it.

    Args:
        data_path: path of a UTF-8 encoded JSON file.

    Returns:
        The parsed data, shuffled in place with ``random.shuffle``.

    Raises:
        AssertionError: if a record lacks one of the required fields; the
            message names the first missing field.
    """
    required_fields = (
        'text',
        'pronoun',
        'candidates',
        'pronoun_position',
        'correct_candidate_idx',
    )

    with open(data_path, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    # Validation: every record must carry every required field.
    for record in records:
        for field in required_fields:
            assert field in record, f"Missing '{field}' field"

    random.shuffle(records)

    return records

# Read and validate the source dataset (load_data also shuffles it).
original_data = load_data("pronoun__dataset.json")

# Apply augmentation (3x increase in dataset size)
augmented_data = augment_dataset(original_data, augmentation_factor=3)

# Save augmented dataset; ensure_ascii=False writes non-ASCII characters
# as-is instead of \uXXXX escapes.
with open("augmented_pronoun_resolution_data.json", "w", encoding="utf-8") as f:
    json.dump(augmented_data, f, ensure_ascii=False, indent=2)

# Report the dataset sizes before and after augmentation.
print(f"Original dataset: {len(original_data)} examples")
print(f"Augmented dataset: {len(augmented_data)} examples")

Original dataset: 1200 examples
Augmented dataset: 3600 examples
