In [2]:
import pandas as pd
import json

def augment_text(text, hinglish_mappings):
    """Convert Hinglish text to English using mappings"""
    if pd.isna(text) or not isinstance(text, str):
        return ''
    words = text.lower().strip().split()
    augmented_words = [hinglish_mappings.get(word.lower(), word) for word in words]
    return ' '.join(augmented_words)

# Read the dataset
df = pd.read_csv('combined_shuffled train wo augment.csv')

# Create augmented dataframe with the same structure
augmented_df = df.copy()

# Load Hinglish mappings from JSON file
with open('hinglish_mappings.json', 'r', encoding='utf-8') as f:
    hinglish_mappings = json.load(f)

# Apply Hinglish mapping only to crimeaditionalinfo column
augmented_df['crimeaditionalinfo'] = augmented_df['crimeaditionalinfo'].apply(
    lambda x: augment_text(x, hinglish_mappings)
)

# Concatenate original and augmented dataframes
final_df = pd.concat([df, augmented_df], ignore_index=True)

# Remove duplicate rows based on crimeaditionalinfo to avoid exact duplicates
final_df = final_df.drop_duplicates(subset=['crimeaditionalinfo'])

# Save augmented dataset
final_df.to_csv('train_augmented.csv', index=False)

print(f"Original rows: {len(df)}")
print(f"Final rows after augmentation: {len(final_df)}")

Original rows: 92463
Final rows after augmentation: 117334
