In [1]:
import random
import numpy as np
from typing import List, Tuple, Dict
import json
import csv
from pathlib import Path

In [2]:
class AlgorithmicNoiseInjector:
    def __init__(self,
                 hold_mu: float = 3.0,
                 hold_sigma: float = 1.0,
                 p_sub: float = 0.10,
                 p_del: float = 0.05,
                 max_pause_frames: int = 2,
                 frame_jitter: bool = True):
        """
        Initialize the noise injector with configurable parameters.

        Args:
            hold_mu: Mean for normal distribution of letter repetitions
            hold_sigma: Standard deviation for letter repetitions
            p_sub: Probability of letter substitution
            p_del: Probability of letter deletion
            max_pause_frames: Maximum pause frames between letters
            frame_jitter: Whether to add frame counter jitter
        """
        self.hold_mu = hold_mu
        self.hold_sigma = hold_sigma
        self.p_sub = p_sub
        self.p_del = p_del
        self.max_pause_frames = max_pause_frames
        self.frame_jitter = frame_jitter

        # Visually similar letter mappings for substitution
        self.similar_letters = {
            'A': ['A', 'H'],
            'B': ['B', 'P', 'R'],
            'C': ['C', 'O'],
            'D': ['D', 'G', 'O'],
            'E': ['E', 'F'],
            'F': ['F', 'E', 'P'],
            'G': ['G', 'D', 'O'],
            'H': ['H', 'A', 'N'],
            'I': ['I', 'J', 'L'],
            'J': ['J', 'I'],
            'K': ['K', 'H'],
            'L': ['L', 'I'],
            'M': ['M', 'N', 'W'],
            'N': ['N', 'M', 'H'],
            'O': ['O', 'C', 'D', 'G'],
            'P': ['P', 'B', 'F', 'R'],
            'Q': ['Q', 'O'],
            'R': ['R', 'B', 'P'],
            'S': ['S', 'Z'],
            'T': ['T', 'F'],
            'U': ['U', 'V'],
            'V': ['V', 'U'],
            'W': ['W', 'M'],
            'X': ['X', 'Y'],
            'Y': ['Y', 'X'],
            'Z': ['Z', 'S']
        }

    def sample_hold_duration(self) -> int:
        """Sample hold duration from normal distribution, minimum 1."""
        return max(1, round(np.random.normal(self.hold_mu, self.hold_sigma)))

    def substitute_letter(self, letter: str) -> str:
        """Substitute a letter with a visually similar one."""
        if letter.upper() in self.similar_letters:
            candidates = self.similar_letters[letter.upper()]
            # Keep original case
            substitution = random.choice(candidates)
            return substitution.lower() if letter.islower() else substitution
        return letter

    def inject_noise(self, clean_text: str) -> Tuple[str, List[Dict]]:
        """
        Inject noise into clean text following the algorithmic approach.

        Args:
            clean_text: Original clean text

        Returns:
            Tuple of (noisy_text, metadata_list)
        """
        noisy_chars = []
        metadata = []
        frame_counter = 0

        for i, char in enumerate(clean_text.upper()):
            char_metadata = {
                'original_char': char,
                'position': i,
                'operations': []
            }

            # Skip spaces and punctuation - just add them as-is
            if not char.isalpha():
                noisy_chars.append(char)
                char_metadata['final_char'] = char
                char_metadata['frame_start'] = frame_counter
                char_metadata['frame_end'] = frame_counter
                frame_counter += 1
                metadata.append(char_metadata)
                continue

            # Step 1: Sample hold duration
            hold = self.sample_hold_duration()
            char_metadata['hold_duration'] = hold
            char_metadata['operations'].append(f'hold_{hold}')

            # Step 2: Deletion check
            if random.random() < self.p_del:
                char_metadata['operations'].append('deleted')
                char_metadata['final_char'] = ''
                char_metadata['frame_start'] = frame_counter
                char_metadata['frame_end'] = frame_counter
                metadata.append(char_metadata)
                continue

            # Step 3: Substitution check
            final_char = char
            if random.random() < self.p_sub:
                final_char = self.substitute_letter(char)
                char_metadata['operations'].append(f'substituted_to_{final_char}')

            # Step 4: Apply hold (repeat the character)
            char_metadata['frame_start'] = frame_counter
            for _ in range(hold):
                noisy_chars.append(final_char)
                if self.frame_jitter and random.random() < 0.1:  # 10% chance of jitter
                    jitter = random.choice([-1, 1])
                    frame_counter += jitter
                    char_metadata['operations'].append(f'jitter_{jitter}')
                frame_counter += 1
            char_metadata['frame_end'] = frame_counter - 1
            char_metadata['final_char'] = final_char

            # Step 5: Insert random pause (no-sign tokens)
            if i < len(clean_text) - 1:  # Don't add pause after last character
                pause_length = random.randint(0, self.max_pause_frames)
                if pause_length > 0:
                    for _ in range(pause_length):
                        noisy_chars.append('[NO_SIGN]')
                        frame_counter += 1
                    char_metadata['operations'].append(f'pause_{pause_length}')

            metadata.append(char_metadata)

        return ''.join(noisy_chars), metadata

    def create_dataset(self,
                      clean_texts: List[str],
                      output_path: str = "noisy_dataset.json") -> Dict:
        """
        Create a complete dataset with noise injection applied to all texts.

        Args:
            clean_texts: List of clean text strings
            output_path: Path to save the dataset

        Returns:
            Dictionary containing the complete dataset
        """
        dataset = {
            'config': {
                'hold_mu': self.hold_mu,
                'hold_sigma': self.hold_sigma,
                'p_sub': self.p_sub,
                'p_del': self.p_del,
                'max_pause_frames': self.max_pause_frames,
                'frame_jitter': self.frame_jitter
            },
            'data': []
        }

        for i, clean_text in enumerate(clean_texts):
            noisy_text, metadata = self.inject_noise(clean_text)

            dataset['data'].append({
                'id': i,
                'clean_text': clean_text,
                'noisy_text': noisy_text,
                'metadata': metadata,
                'stats': {
                    'original_length': len(clean_text),
                    'noisy_length': len(noisy_text),
                    'expansion_ratio': len(noisy_text) / len(clean_text) if len(clean_text) > 0 else 0,
                    'operations_count': sum(len(m['operations']) for m in metadata)
                }
            })

        # Save to file
        with open(output_path, 'w') as f:
            json.dump(dataset, f, indent=2)

        return dataset

In [5]:
import random

def load_common_english_dataset():
    """
    Returns a list of common English sentences, each ~3-5 words.

    Args:
        sample_size (int): Number of sentences to return.

    Returns:
        List[str]: List of sentences.
    """
    # Example pool of common English sentences
    sentences = [
        "How are you today",
        "I love reading books",
        "Can you help me",
        "This is my friend",
        "What is your name",
        "I need some water",
        "Where is the station",
        "She is very kind",
        "I am feeling happy",
        "He likes to run",
        "Do you know this",
        "It is a sunny day",
        "Please give me that",
        "I am going home",
        "We are learning Python",
        "They are very nice",
        "Can I ask something",
        "This is very easy",
        "I want some coffee",
        "What time is it",
        "I am reading now",
        "He is my brother",
        "She is my sister",
        "We are best friends",
        "It is very cold",
        "I like this song",
        "Do you like it",
        "Please sit over there",
        "I will call you",
        "That is my book"
    ]

    return random.sample(sentences, min(sample_size, len(sentences)))

In [6]:
def demonstrate_noise_injection():
    """Demonstrate the noise injection system."""
    print("=== ASL-Based Algorithmic Noise Injection Demo ===\n")

    # Initialize the noise injector
    injector = AlgorithmicNoiseInjector(
        hold_mu=3.0,
        hold_sigma=1.0,
        p_sub=0.10,
        p_del=0.05,
        max_pause_frames=2,
        frame_jitter=True
    )

    # Load some sample texts.
    print("Loading some English sample sentences. (not from any dataset, this is just simple stuff for now.)")
    sample_texts = load_common_english_dataset()

    # Demonstrate on a few examples
    print(f"\nLoaded {len(sample_texts)} sentences. Here are some examples:")
    print("=" * 60)
    for i, text in enumerate(sample_texts[:5]):
        noisy, metadata = injector.inject_noise(text)
        print(f"Example {i+1}:")
        print(f"Clean:  {text}")
        print(f"Noisy:  {noisy}")
        print(f"Ops:    {sum(len(m['operations']) for m in metadata)} operations applied")
        print("-" * 60)

    # Create full dataset
    print(f"Creating full dataset with {len(sample_texts)} samples...")
    dataset = injector.create_dataset(sample_texts, "asl_noisy_dataset.json")

    # Print statistics
    total_samples = len(dataset['data'])
    avg_expansion = np.mean([d['stats']['expansion_ratio'] for d in dataset['data']])
    total_ops = sum([d['stats']['operations_count'] for d in dataset['data']])

    print(f"\nDataset Statistics:")
    print(f"- Total samples: {total_samples}")
    print(f"- Average expansion ratio: {avg_expansion:.2f}x")
    print(f"- Total operations applied: {total_ops}")
    print(f"- Dataset saved to: asl_noisy_dataset.json")

    # Show some ASL substitution examples
    print(f"\nASL Hand Shape Substitutions Used:")
    print("(Common confusions in ASL fingerspelling)")
    print("- D ↔ F (commonly confused)")
    print("- M ↔ N (very similar hand shapes)")
    print("- A ↔ S ↔ T (all closed fist variations)")
    print("- K ↔ P (thumb on middle finger)")
    print("- H ↔ U (similar orientations)")
    print("- B ↔ U ↔ V (extended fingers)")

    return dataset

In [7]:
random.seed(42)
np.random.seed(42)
demonstrate_noise_injection()

=== ASL-Based Algorithmic Noise Injection Demo ===

Loading some English sample sentences. (not from any dataset, this is just simple stuff for now.)

Loaded 30 sentences. Here are some examples:
Example 1:
Clean:  I am reading now
Noisy:  III[NO_SIGN] AAAMMMM[NO_SIGN][NO_SIGN] RRRRREEE[NO_SIGN][NO_SIGN]AAA[NO_SIGN][NO_SIGN]DDDDDJJJJ[NO_SIGN][NO_SIGN]NNN[NO_SIGN][NO_SIGN]GGGG NNN[NO_SIGN]OOOWWW
Ops:    24 operations applied
------------------------------------------------------------
Example 2:
Clean:  This is my friend
Noisy:  TH[NO_SIGN][NO_SIGN]II[NO_SIGN][NO_SIGN]SS[NO_SIGN][NO_SIGN] III[NO_SIGN]SS MM[NO_SIGN][NO_SIGN]YYYY[NO_SIGN][NO_SIGN] FFF[NO_SIGN][NO_SIGN]RRREENNN[NO_SIGN][NO_SIGN]DD
Ops:    26 operations applied
------------------------------------------------------------
Example 3:
Clean:  How are you today
Noisy:  HHHOOWWW AARRRRR[NO_SIGN]EEE[NO_SIGN] YY[NO_SIGN][NO_SIGN]OOOO[NO_SIGN][NO_SIGN]UU TTT[NO_SIGN]O[NO_SIGN]DD[NO_SIGN][NO_SIGN]AAAYYYY
Ops:    30 operations applie

{'config': {'hold_mu': 3.0,
  'hold_sigma': 1.0,
  'p_sub': 0.1,
  'p_del': 0.05,
  'max_pause_frames': 2,
  'frame_jitter': True},
 'data': [{'id': 0,
   'clean_text': 'I am reading now',
   'noisy_text': 'III AAAAA[NO_SIGN][NO_SIGN]M RRRR[NO_SIGN][NO_SIGN]EEE[NO_SIGN]AAA[NO_SIGN]DDD[NO_SIGN]INNN[NO_SIGN]GGG[NO_SIGN] NNNN[NO_SIGN]OO[NO_SIGN]WW',
   'metadata': [{'original_char': 'I',
     'position': 0,
     'operations': ['hold_3'],
     'hold_duration': 3,
     'frame_start': 0,
     'frame_end': 2,
     'final_char': 'I'},
    {'original_char': ' ',
     'position': 1,
     'operations': [],
     'final_char': ' ',
     'frame_start': 3,
     'frame_end': 3},
    {'original_char': 'A',
     'position': 2,
     'operations': ['hold_5', 'pause_2'],
     'hold_duration': 5,
     'frame_start': 4,
     'frame_end': 8,
     'final_char': 'A'},
    {'original_char': 'M',
     'position': 3,
     'operations': ['hold_1'],
     'hold_duration': 1,
     'frame_start': 11,
     'frame_end': 

In [8]:
import os
os.makedirs("/content/drive/MyDrive/QHack", exist_ok = True)

In [9]:
## move asl_noisy_dataset.json from root to '/content/drive/MyDrive/QHack/asl_noisy_dataset.json'
import shutil

src = "asl_noisy_dataset.json"
dst = "/content/drive/MyDrive/QHack/asl_noisy_dataset.json"
shutil.move(src, dst)

'/content/drive/MyDrive/QHack/asl_noisy_dataset.json'