In [1]:
import random
import re
from typing import Dict, Optional, Tuple

import numpy as np
import pandas as pd

In [2]:
# Seed both random number generators used by the noise functions in this
# notebook (the stdlib `random` module and NumPy's legacy global RNG) from a
# single constant, so the two can never drift apart when the seed is changed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Lowercase word -> informal replacement used by the slang-noise step.
SLANG_DICT = {
    # Sound-alike / numeral spellings
    "great": "gr8", "you": "u", "before": "b4",
    "are": "r", "to": "2", "for": "4",
    # Shortened forms
    "please": "pls", "people": "ppl", "with": "w/", "thanks": "thx",
}



In [4]:
def introduce_typos(text: str, typo_rate: float = 0.05) -> str:
    """Simulate typing errors by swapping randomly chosen adjacent characters.

    Args:
        text: The text to corrupt.
        typo_rate: Fraction of the text length that determines how many swaps
            are attempted; the count is ``int(len(text) * typo_rate)``.

    Returns:
        A new string containing the same characters as ``text``. Inputs with
        fewer than two characters come back unchanged. Swap positions are drawn
        independently, so a later swap may move or undo an earlier one.
    """
    letters = list(text)
    swap_count = int(len(letters) * typo_rate)
    last_start = len(letters) - 2  # highest index that still has a right-hand neighbour

    # Nothing can be swapped in a 0- or 1-character text.
    if last_start >= 0:
        for _ in range(swap_count):
            pos = random.randint(0, last_start)
            letters[pos:pos + 2] = reversed(letters[pos:pos + 2])

    return ''.join(letters)



In [5]:
def replace_with_slang(
    text: str,
    slang_rate: float = 0.3,
    slang_dict: Optional[Dict[str, str]] = None,
) -> str:
    """Randomly replace known words in ``text`` with their slang equivalents.

    The text is split on whitespace and each token is looked up in lowercase
    form, so matching is case-insensitive but exact: a token with attached
    punctuation (e.g. ``"you,"``) is not replaced. Because the result is
    re-joined with single spaces, runs of whitespace are collapsed and
    leading/trailing whitespace is dropped.

    Args:
        text: The text to process.
        slang_rate: Probability that a token found in the slang mapping is
            replaced. Defaults to 0.3.
        slang_dict: Mapping of lowercase word -> replacement. When ``None``,
            the module-level ``SLANG_DICT`` is used.

    Returns:
        The processed text with single-space separators.
    """
    mapping = SLANG_DICT if slang_dict is None else slang_dict

    new_words = []
    for word in text.split():
        key = word.lower()
        # Membership is checked first so a random draw is consumed only for
        # tokens that can actually be replaced.
        if key in mapping and random.random() < slang_rate:
            new_words.append(mapping[key])
        else:
            new_words.append(word)
    return ' '.join(new_words)




In [6]:
def flip_sentiment(y: np.ndarray, flip_rate: float = 0.1) -> np.ndarray:
    """Return a copy of ``y`` with a random subset of labels changed.

    ``int(len(y) * flip_rate)`` distinct positions are drawn with
    ``np.random.choice`` (without replacement). At each drawn position whose
    label is one of -2, -1, 0, 1, 2, the label is replaced by a different
    value from that set, picked with ``random.choice``. Drawn positions
    holding any other value are left as they are, so fewer labels than the
    drawn count may actually change.

    Args:
        y: Array of sentiment labels; it is not modified.
        flip_rate: Fraction of positions to draw. It is passed through
            unchecked, so it is expected to lie in [0, 1].

    Returns:
        A new array of the same shape as ``y`` with the noisy labels.
    """
    noisy = y.copy()
    draw_count = int(len(y) * flip_rate)
    drawn_positions = np.random.choice(len(y), draw_count, replace=False)
    valid_labels = [-2, -1, 0, 1, 2]

    for pos in drawn_positions:
        label = noisy[pos]
        if label not in valid_labels:
            continue  # unknown label: leave untouched

        # Candidates keep the order of valid_labels, minus the current label.
        alternatives = [candidate for candidate in valid_labels if candidate != label]
        noisy[pos] = random.choice(alternatives)

    return noisy

In [7]:
def inject_noise(
    df: pd.DataFrame,
    typo_rate: float = 0.05,
    flip_rate: float = 0.05,
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Apply text and label noise to a sentiment dataset.

    Args:
        df: Input frame; must contain the columns ``'Comment'`` and
            ``'Sentiment_Score'``. It is not modified.
        typo_rate: Rate forwarded to ``introduce_typos`` for every comment.
        flip_rate: Rate forwarded to ``flip_sentiment`` for the labels.

    Returns:
        A ``(noisy_df, y_noisy)`` tuple. ``noisy_df`` is a copy of ``df`` whose
        ``'Comment'`` column has had typos and then slang applied; its
        ``'Sentiment_Score'`` column still holds the original labels.
        ``y_noisy`` is the array of labels after flipping.
    """
    noisy_df = df.copy()

    # Keep these as two full passes over the column (all typos first, then all
    # slang). Merging them into one per-row pass would interleave the random
    # draws differently and change the output for a given seed.
    noisy_df['Comment'] = noisy_df['Comment'].apply(introduce_typos, typo_rate=typo_rate)
    noisy_df['Comment'] = noisy_df['Comment'].apply(replace_with_slang)

    # The flipped labels are returned separately rather than written back.
    y_noisy = flip_sentiment(noisy_df['Sentiment_Score'].values, flip_rate=flip_rate)

    return noisy_df, y_noisy



In [8]:
# ========================
# Example usage
# ========================
# Load the processed dataset. inject_noise reads its 'Comment' and
# 'Sentiment_Score' columns, so both must be present in this file.
df = pd.read_csv("../data/processed/processed_sentiments.csv")

# Apply noise: noisy_df is a copy of df with corrupted comments; noisy_y holds
# the labels after random flipping. df itself is left unchanged.
noisy_df, noisy_y = inject_noise(df)

# Store the flipped labels in a new column next to the original
# 'Sentiment_Score', then write the result to a separate CSV (without the index).
noisy_df['Sentiment_Score_Noisy'] = noisy_y
noisy_df.to_csv("../data/processed/processed_sentiment_comments_noisy.csv", index=False)

print("[INFO] Noise injected and new dataset saved.")

[INFO] Noise injected and new dataset saved.
