In [42]:
import os
from datasets import load_dataset, load_from_disk, DatasetDict

### COMI-LINGUA
- COMI-LINGUA (COde-MIxing and LINGuistic Insights on Natural Hinglish Usage and Annotation) is a high-quality Hindi-English code-mixed dataset, manually annotated by three annotators. It serves as a benchmark for multilingual NLP models by covering multiple foundational tasks.
- Machine Translation (MT): Parallel translations of each sentence into Romanized Hindi, Devanagari Hindi, and English. Initial translation predictions were generated using the Llama 3.3 LLM, which annotators then refined and corrected.

In [46]:
import os
from datasets import load_dataset, load_from_disk
from difflib import SequenceMatcher
import numpy as np

# Hub repo id / config of the raw COMI-LINGUA machine-translation data, and the
# local directory used as its on-disk cache.
dataset_name = "LingoIITGN/COMI-LINGUA"
config_name = "MT"
dataset_path = "./datasets/comi_lingua_mt"

# Load the raw dataset as is: reuse the local copy when its `dataset_dict.json`
# is present, otherwise download it.
if os.path.exists(os.path.join(dataset_path, "dataset_dict.json")):
    print("Loading dataset from local disk...")
    dataset = load_from_disk(dataset_path)
else:
    print("Downloading dataset...")
    dataset = load_dataset(dataset_name, config_name)
    # Persist the untouched download. Without this the "local disk" branch can
    # never trigger, and the processing cell that calls
    # `load_from_disk("./datasets/comi_lingua_mt")` fails on a fresh run.
    dataset.save_to_disk(dataset_path)

# Split used by `compute_similarity` below.
train_data = dataset["train"]

# similarity function
def similarity(a, b):
    """Return the `difflib.SequenceMatcher` ratio between `str(a)` and `str(b)`.

    Both arguments are converted with `str()` first, so non-string values
    are compared by their string form.
    """
    left, right = str(a), str(b)
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

# compute average similarity
def compute_similarity(col1, col2, sample_size=5000, data=None):
    """Average `similarity` between two columns over the first rows of a dataset.

    Args:
        col1: Name of the first column to compare.
        col2: Name of the second column to compare.
        sample_size: Maximum number of leading rows to compare. This is the
            first `sample_size` rows, not a random sample.
        data: Dataset to read from. Defaults to the notebook-level
            `train_data`. It must support `len()` and slicing (`data[:n]`)
            into a mapping of column name -> sequence of values, which is what
            a Hugging Face `Dataset` returns.

    Returns:
        The mean pairwise similarity, or `nan` when there are no rows to
        compare.
    """
    rows = train_data if data is None else data
    n = min(sample_size, len(rows))
    if n <= 0:
        # Same value `np.mean([])` would give, without the RuntimeWarning.
        return float("nan")

    # One slice materialises the rows once; indexing `rows[i]` for each column
    # fetched (and decoded) every row twice.
    batch = rows[:n]
    return np.mean([similarity(a, b) for a, b in zip(batch[col1], batch[col2])])

# Pairwise annotator agreement on the English translations
sim_en_1_2 = compute_similarity(col1="Annotator_1_en_translation", col2="Annotator_2_en_translation")
sim_en_1_3 = compute_similarity(col1="Annotator_1_en_translation", col2="Annotator_3_en_translation")
sim_en_2_3 = compute_similarity(col1="Annotator_2_en_translation", col2="Annotator_3_en_translation")

# Pairwise annotator agreement on the Romanized-Hindi (Hinglish) translations.
# Note the raw column names are spelled differently for annotators 2 and 3.
sim_rh_1_2 = compute_similarity(col1="Annotator_1_RH_translation", col2="annotator2_RH_translation")
sim_rh_1_3 = compute_similarity(col1="Annotator_1_RH_translation", col2="annotator3_RH_translation")
sim_rh_2_3 = compute_similarity(col1="annotator2_RH_translation", col2="annotator3_RH_translation")

# Report both groups in the same "<pair>: <score>" layout
for heading, scores in (
    ("\nEnglish similarity:", (sim_en_1_2, sim_en_1_3, sim_en_2_3)),
    ("\nHinglish similarity:", (sim_rh_1_2, sim_rh_1_3, sim_rh_2_3)),
):
    print(heading)
    for label, score in zip(("1 vs 2:", "1 vs 3:", "2 vs 3:"), scores):
        print(label, score)

# Choose Annotator 1: expose their columns under the short names in every split
annotator_1_columns = {
    "Annotator_1_en_translation": "english",
    "Annotator_1_RH_translation": "hinglish",
}
for split in dataset.keys():
    renamed_split = dataset[split].rename_columns(annotator_1_columns)
    dataset[split] = renamed_split

print("\nColumns after selection:")
print(dataset)


Loading dataset from local disk...

English similarity:
1 vs 2: 0.9983840981273635
1 vs 3: 0.9961508136951346
2 vs 3: 0.9962505055414138

Hinglish similarity:
1 vs 2: 0.9813452195198115
1 vs 3: 0.9728847464060675
2 vs 3: 0.9748953744740767

Columns after selection:
DatasetDict({
    train: Dataset({
        features: ['Sentences', 'Predicted_en_translation', 'Predicted_RH_translation', 'Predicted_DH_translation', 'english', 'hinglish', 'Annotator_1_DH_translation', 'Annotator_2_en_translation', 'annotator2_RH_translation', 'Annotator_2_DH_translation', 'Annotator_3_en_translation', 'annotator3_RH_translation', 'Annotator_3_DH_translation'],
        num_rows: 19558
    })
    test: Dataset({
        features: ['Sentences', 'Predicted_en_translation', 'Predicted_RH_translation', 'Predicted_DH_translation', 'english', 'hinglish', 'Annotator_1_DH_translation', 'Annotator_2_en_translation', 'annotator2_RH_translation', 'Annotator_2_DH_translation', 'Annotator_3_en_translation', 'annotator3_

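- The three annotators' translations are nearly identical (average pairwise similarity ≥ 0.97 on the compared rows above), so Annotator 1 is used throughout:
  - using Annotator_1_en_translation column as **English**
  - using Annotator_1_RH_translation column as **Hinglish**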

In [47]:
import os
from datasets import load_from_disk, DatasetDict

raw_dataset_path = "./datasets/comi_lingua_mt"
processed_path = "./datasets_final/comi_lingua_mt"
hf_marker = os.path.join(processed_path, "dataset_dict.json")

if not os.path.exists(hf_marker):
    # No processed copy yet: derive it from the raw dataset on disk
    print("Processed dataset not found. Building...")

    raw_dataset = load_from_disk(raw_dataset_path)

    # Hold out 10% of the raw train split as validation (fixed seed)
    split_train = raw_dataset["train"].train_test_split(test_size=0.10, seed=42)
    train_dataset, val_dataset = split_train["train"], split_train["test"]
    test_dataset = raw_dataset["test"]

    dataset = DatasetDict({
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset
    })

    # Rename Annotator 1's columns, then drop every other column
    column_renames = {
        "Annotator_1_en_translation": "english",
        "Annotator_1_RH_translation": "hinglish"
    }
    keep_columns = list(column_renames.values())

    for split in dataset.keys():
        renamed = dataset[split].rename_columns(column_renames)
        extra_columns = [col for col in renamed.column_names if col not in keep_columns]
        dataset[split] = renamed.remove_columns(extra_columns)

    dataset.save_to_disk(processed_path)
    print("Saved processed dataset.")

else:
    # Processed copy already on disk: just load it
    print("Loading processed dataset...")
    dataset = load_from_disk(processed_path)

print(dataset)

Loading processed dataset...
DatasetDict({
    train: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 17602
    })
    validation: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 1956
    })
    test: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 5000
    })
})


In [49]:
# Peek at the first training pair of the processed COMI-LINGUA dataset
dataset['train'][0]

{'english': 'Nawazuddin Siddiqui was recently seen at the 22nd Lions Gold Awards, where he was asked if you have any sorrow about this?',
 'hinglish': 'Navaazuddin Siddiqui haal hi mein 22nd Lions Gold Awards mein nazar aaye, vahaan unse poochha gaya ki kya aapko is baat ka koi dukh nahin hai?'}

### PHINC
PHINC (Parallel Hinglish Social Media Code-Mixed Corpus for Machine Translation): The dataset tackles the challenges of translating noisy, informal, code-mixed social media text, offering 13,738 Hinglish-English sentence pairs manually annotated by 54 annotators for the low-resource machine translation task.

The dataset contains the following fields:
- Hinglish Code-Mixed Sentence: The original sentence in Romanized Hindi-English (Hinglish).
- Human Translated English Sentence: The corresponding English translation provided by human annotators.

In [51]:
from ai4bharat.transliteration import XlitEngine

# Hub repo id passed to `load_dataset` when no local copy exists
dataset_name = "LingoIITGN/PHINC"
# Directory the processed PHINC DatasetDict is saved to and reloaded from
dataset_path = "./datasets_final/PHINC"
# File whose presence is used below as the "already saved locally" check
hf_marker = os.path.join(dataset_path, "dataset_dict.json")


def add_transliteration_column(dataset):
    """Add a `hinglish_dev` column with the transliteration of `hinglish`.

    Each split that lacks `hinglish_dev` is mapped in batches of 128 through
    `XlitEngine("hi", beam_width=5).translit_sentence`. Splits that already
    have the column are left untouched. When no split needs work the function
    returns immediately, without constructing the transliteration engine.

    The whole sentence is passed to the engine, so non-Hindi tokens such as
    @handles and URLs are transliterated too (visible in the sample row shown
    after this cell).

    Args:
        dataset: Mapping of split name -> dataset with a `hinglish` column
            (a `DatasetDict`). Updated in place.

    Returns:
        The same `dataset` object, with `hinglish_dev` present in every split.
    """
    pending_splits = [
        split for split in dataset.keys()
        if "hinglish_dev" not in dataset[split].column_names
    ]
    if not pending_splits:
        # Nothing to do, so skip building the engine.
        return dataset

    engine = XlitEngine("hi", beam_width=5)

    def transliterate_batch(batch):
        """Transliterate one batch; missing (None) sentences become ""."""
        transliterated = []

        for s in batch["hinglish"]:
            if s is None:
                transliterated.append("")
                continue

            result = engine.translit_sentence(s)

            # The engine may return a dict like {"hi": "..."} or a plain string
            if isinstance(result, dict):
                transliterated.append(result.get("hi", ""))
            else:
                transliterated.append(result)

        return {"hinglish_dev": transliterated}

    for split in pending_splits:
        dataset[split] = dataset[split].map(
            transliterate_batch,
            batched=True,
            batch_size=128,
        )

    return dataset


# Case 1: Dataset already saved locally
if os.path.exists(hf_marker):
    print("Loading dataset from local disk...")
    dataset = load_from_disk(dataset_path)

    # A saved copy may predate the transliteration column; check every split,
    # matching the per-split check inside `add_transliteration_column`.
    needs_transliteration = any(
        "hinglish_dev" not in dataset[split].column_names
        for split in dataset.keys()
    )
    if needs_transliteration:
        import shutil

        dataset = add_transliteration_column(dataset)

        # `save_to_disk` raises PermissionError ("a dataset can't overwrite
        # itself") when the target is the directory the dataset's Arrow files
        # live in. Write to a staging directory and swap it into place.
        staging_path = dataset_path + "_staging"
        if os.path.exists(staging_path):
            shutil.rmtree(staging_path)
        dataset.save_to_disk(staging_path)

        dataset = None  # release the handles on the old files before deleting them
        shutil.rmtree(dataset_path)
        os.replace(staging_path, dataset_path)
        dataset = load_from_disk(dataset_path)


# Case 2: Dataset not present locally → download
else:
    print("Dataset not found locally. Downloading...")
    os.makedirs(dataset_path, exist_ok=True)

    dataset = load_dataset(dataset_name)

    # If only train split exists → create 70/10/20 split
    if "train" in dataset and len(dataset) == 1:
        full_dataset = dataset["train"]

        # 70% train, 30% held out
        split_1 = full_dataset.train_test_split(test_size=0.30, seed=42)
        temp_dataset = split_1["test"]

        # Held-out 30% → 1/3 validation (10% overall), 2/3 test (20% overall)
        split_2 = temp_dataset.train_test_split(test_size=2/3, seed=42)

        dataset = DatasetDict({
            "train": split_1["train"],
            "validation": split_2["train"],
            "test": split_2["test"]
        })

    # Rename columns if needed
    for split in dataset.keys():
        if "Sentence" in dataset[split].column_names:
            dataset[split] = dataset[split].rename_columns({
                "Sentence": "hinglish",
                "English_Translation": "english"
            })

    dataset = add_transliteration_column(dataset)

    dataset.save_to_disk(dataset_path)
    print("Saved dataset.")


print(dataset)


Loading dataset from local disk...
DatasetDict({
    train: Dataset({
        features: ['hinglish', 'english', 'hinglish_dev'],
        num_rows: 9616
    })
    validation: Dataset({
        features: ['hinglish', 'english', 'hinglish_dev'],
        num_rows: 1374
    })
    test: Dataset({
        features: ['hinglish', 'english', 'hinglish_dev'],
        num_rows: 2748
    })
})


In [52]:
# Peek at the first PHINC training row, including the added `hinglish_dev` column
dataset['train'][0]

{'hinglish': 'Gayatri Mantra se kaam chal jaega kya @anupamamathur1 ?pic.twitter.com/Tq8EMU5P0h',
 'english': 'would gayatri mantra be enough @anupamamathur1 ?pic.twitter.com/Tq8EMU5P0h',
 'hinglish_dev': 'गायत्री मंत्र से काम चल जाएगा क्या @अनुपममथुर१ ?पिक.ट्विटर.कॉम/टीक्यू८ईएमयू५प०ह'}

### HINMIX

HINMIX is a large synthetic Hinglish-English dataset constructed by leveraging a bilingual Hindi-English corpus.

Splits: train, test, valid.

Subsets:

- Hi - Hindi in Devanagari script (Example: अमेरिकी लोग अब पहले जितनी गैस नहीं खरीदते।)
- Hicm - Hindi sentences with codemix words substituted in English (Example: American people अब पहले जितनी gas नहीं खरीदते।)
- Hicmrom - Hicm with romanized hindi words (Example: American people ab pahle jitni gas nahin kharidte.)
- Hicmdvg - Hicm with English words transliterated to Devanagari (Example: अमेरिकन पेओपल अब पहले जितनी गैस नहीं खरीदते।)
- NoisyHicmrom - synthetic noise added to Hicmrom sentences to improve model robustness (Example: Aerican people ab phle jtni gas nain khridte.)

In [55]:
from datasets import load_dataset, load_from_disk
import os

dataset_name = "kartikagg98/HINMIX_hi-en"

# One Hub config per HINMIX subset
configs = [
    "lcsalign-en",
    "lcsalign-hi",
    "lcsalign-hicm",
    "lcsalign-hicmdvg",
    "lcsalign-hicmrom",
    "lcsalign-noisyhicmrom",
]

base_path = "./datasets/HINMIX_hi-en"
os.makedirs(base_path, exist_ok=True)

# Make sure each config has a local copy under base_path/<config>.
# `ds` is reassigned on every pass, so afterwards it holds the last config.
for config in configs:
    save_path = os.path.join(base_path, config)

    if not os.path.exists(save_path):
        print(f"Downloading config: {config}")
        ds = load_dataset(dataset_name, config)
        ds.save_to_disk(save_path)
        continue

    print(f"Loading existing config: {config}")
    ds = load_from_disk(save_path)

print("All configs ready.")

Loading existing config: lcsalign-en
Loading existing config: lcsalign-hi
Loading existing config: lcsalign-hicm
Loading existing config: lcsalign-hicmdvg
Loading existing config: lcsalign-hicmrom
Loading existing config: lcsalign-noisyhicmrom
All configs ready.


In [56]:
# `ds` is reassigned on every pass of the config loop, so right after that loop
# it holds the last entry of `configs` (lcsalign-noisyhicmrom).
ds['train'][7]

{'text': 'kripya is action ke smaarthan men uddeshya aur reasons ko prastut karen.'}

In [74]:
from datasets import load_from_disk, Dataset, DatasetDict
from tqdm import tqdm
import os

# Paths
old_base_path = "./datasets/HINMIX_hi-en"
new_base_path = "./datasets_final/HINMIX_hi-en"
# `save_to_disk` writes this file for a DatasetDict. Checking it, not the bare
# directory, means an empty or leftover folder does not count as "already built".
final_marker = os.path.join(new_base_path, "dataset_dict.json")

if os.path.exists(final_marker):
    print("Final dataset already exists. Skipping creation.")
    # Load it so `final_dataset` is defined on this branch too; later cells
    # display it and must not depend on leftover kernel state.
    final_dataset = load_from_disk(new_base_path)
else:
    # Load required configs: romanized code-mixed Hindi and the English side
    hicmrom_path = os.path.join(old_base_path, "lcsalign-hicmrom")
    en_path = os.path.join(old_base_path, "lcsalign-en")

    hicmrom_ds = load_from_disk(hicmrom_path)
    en_ds = load_from_disk(en_path)

    final_dataset = {}

    for split in hicmrom_ds.keys():

        # Rename 'valid' to 'validation'
        new_split_name = "validation" if split == "valid" else split

        hicmrom_split = hicmrom_ds[split]
        en_split = en_ds[split]

        # Rows are paired by position, so both configs must have equal length
        assert len(hicmrom_split) == len(en_split), "Mismatch in dataset lengths"

        # Read each `text` column in one go; indexing row by row is far slower
        # on the multi-million-row train split.
        final_dataset[new_split_name] = Dataset.from_dict({
            "hinglish": list(hicmrom_split["text"]),
            "english": list(en_split["text"])
        })

    final_dataset = DatasetDict(final_dataset)
    final_dataset.save_to_disk(new_base_path)

    print("Final dataset saved successfully.")

Final dataset already exists. Skipping creation.


In [75]:
# Display the HINMIX `final_dataset` (split names and row counts)
final_dataset

DatasetDict({
    test: Dataset({
        features: ['hinglish', 'english'],
        num_rows: 2507
    })
    train: Dataset({
        features: ['hinglish', 'english'],
        num_rows: 4200000
    })
    validation: Dataset({
        features: ['hinglish', 'english'],
        num_rows: 280
    })
})

### English-Hinglish-TOP

In [57]:
import os
from datasets import load_dataset, load_from_disk

dataset_path = "./datasets/english_hinglish_top"

# Download once and cache under `dataset_path`; later runs read the cached copy
if not os.path.exists(dataset_path):
    print("Dataset not found locally. Downloading...")
    ds = load_dataset("rvv-karma/English-Hinglish-TOP")
    ds.save_to_disk(dataset_path)
else:
    print("Loading dataset from local disk...")
    ds = load_from_disk(dataset_path)

print(ds)

Loading dataset from local disk...
DatasetDict({
    train: Dataset({
        features: ['en', 'hi_en', 'en_parse', 'hi_en_parse', 'domain', 'generated_by'],
        num_rows: 176596
    })
    val: Dataset({
        features: ['en', 'hi_en', 'en_parse', 'hi_en_parse', 'domain', 'generated_by'],
        num_rows: 1390
    })
    test: Dataset({
        features: ['en', 'hi_en', 'en_parse', 'hi_en_parse', 'domain', 'generated_by'],
        num_rows: 6513
    })
})


In [58]:
# Inspect one raw English-Hinglish-TOP training example with all of its columns
ds['train'][7]

{'en': 'What is the UV rating today?',
 'hi_en': 'Today ki UV rating kya hai?',
 'en_parse': '[IN:GET_WEATHER What is the [SL:WEATHER_ATTRIBUTE UV rating ] [SL:DATE_TIME today ] ? ]',
 'hi_en_parse': '[IN:GET_WEATHER [SL:DATE_TIME Today ] ki [SL:WEATHER_ATTRIBUTE UV rating ] kya hai? ]',
 'domain': 'weather',
 'generated_by': 'human'}

In [59]:
import os
from datasets import load_from_disk

# Original dataset path (raw copy saved by the download cell above)
dataset_path = "./datasets/english_hinglish_top"

# New dataset path (where the two-column english/hinglish version is written)
new_dataset_path = "./datasets_final/english_hinglish_top"

# Load original dataset; this cell has no download fallback, so the raw copy must exist
ds = load_from_disk(dataset_path)

# Keep only required columns and rename them
def transform_dataset(dataset):
    """Reduce one split to its `en` / `hi_en` columns, renamed `english` / `hinglish`.

    Every other column is dropped first, then the two kept columns are renamed.
    Returns the transformed split.
    """
    renames = {"en": "english", "hi_en": "hinglish"}

    unwanted = [name for name in dataset.column_names if name not in renames]
    result = dataset.remove_columns(unwanted)

    for old_name, new_name in renames.items():
        result = result.rename_column(old_name, new_name)
    return result

# Apply transformation split-wise.
# An identity `ds.map(lambda x: x, batched=True)` used to run first. It rewrote
# every row of every split without changing anything, so it has been removed.
for split in ds.keys():
    ds[split] = transform_dataset(ds[split])

# NOTE(review): the split names are kept exactly as loaded, so this dataset's
# validation split is called "val", while the other processed datasets in this
# notebook use "validation". Confirm what downstream code expects before renaming.

# Create directory if needed
os.makedirs("./datasets_final", exist_ok=True)

# Save new dataset
ds.save_to_disk(new_dataset_path)

print("New dataset saved at:", new_dataset_path)
print(ds)

Map:   0%|          | 0/176596 [00:00<?, ? examples/s]

Map:   0%|          | 0/1390 [00:00<?, ? examples/s]

Map:   0%|          | 0/6513 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/176596 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1390 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6513 [00:00<?, ? examples/s]

New dataset saved at: ./datasets_final/english_hinglish_top
DatasetDict({
    train: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 176596
    })
    val: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 1390
    })
    test: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 6513
    })
})


### HINGE

In [77]:
import os
from datasets import load_dataset, load_from_disk

dataset_path = "./datasets/hinge"

# Download once and cache under `dataset_path`; later runs read the cached copy
if not os.path.exists(dataset_path):
    print("Dataset not found locally. Downloading...")
    ds = load_dataset("LingoIITGN/HinGE")
    ds.save_to_disk(dataset_path)
else:
    print("Loading dataset from local disk...")
    ds = load_from_disk(dataset_path)

print(ds)


Loading dataset from local disk...
DatasetDict({
    train: Dataset({
        features: ['English', 'Hindi', 'Human-generated Hinglish', 'WAC', 'WAC rating1', 'WAC rating2', 'PAC', 'PAC rating1', 'PAC rating2'],
        num_rows: 1976
    })
})


In [80]:
# Inspect the first raw HinGE row; 'Human-generated Hinglish' is a string holding a Python-style list
ds['train'][0]

{'English': 'It was presented to the Legislative Council in 1856 and was passed in 1860.',
 'Hindi': 'इसे 1856 में विधायी परिषद के समक्ष प्रस्तुत किया गया और1860 में पारित किया गया।',
 'Human-generated Hinglish': "['Ise 1856 mein legislative council ke samaksh prastut kiya gya and 1860 mein paarit kiya gya.', 'Ise 1856 mein vidhai parishad ko present kiya and 1860 mein pass kiya.', 'It was presented to vidhayi parishad in 1856 aur 1860 me parit kiya gaya.', 'Ise 1856 me legislative council ke samaksh present kiya gaya aur 1860  me pass kiya.', '1856 me it was presented to the legislative council aur 1860 me it was passed.', '1856 me ise legislative council ke samaksh prensent kiya gaya aur 1860 me parit kiya gaya.']",
 'WAC': 'ise 1856 men legislative council ke samaksh prastut kiya gaya aur1860 men parit kiya gaya.',
 'WAC rating1': 9,
 'WAC rating2': 6,
 'PAC': 'ise 1856 men legislative council ke samaksh prastut kiya gaya aur1860 men parit kiya gaya.',
 'PAC rating1': 9,
 'PAC ratin

In [95]:
import os
import ast
from datasets import load_from_disk, DatasetDict

# Paths
dataset_path = "./datasets/hinge"
new_dataset_path = "./datasets_final/hinge"

# If final dataset already exists, load it
if os.path.exists(new_dataset_path):
    print("Loading processed dataset from disk...")
    dataset = load_from_disk(new_dataset_path)

else:
    print("Processing dataset...")

    # Load original dataset (single "train" split)
    ds = load_from_disk(dataset_path)
    dataset = ds["train"]

    def transform(example):
        """Map one raw HinGE row to an `english` / `hinglish` pair.

        'Human-generated Hinglish' holds the string form of a list of
        alternative sentences. Only the first alternative is kept. An empty
        list yields "" instead of raising IndexError. A value that does not
        parse to a list, tuple or string is kept as the original raw text.
        """
        raw_value = example["Human-generated Hinglish"]
        hinglish_value = raw_value

        # Convert string representation of list into actual list
        if isinstance(raw_value, str):
            try:
                parsed = ast.literal_eval(raw_value)
            except (ValueError, SyntaxError):
                parsed = [raw_value]

            if isinstance(parsed, (list, tuple, str)):
                hinglish_value = parsed
            else:
                # e.g. a bare number parses to int/float; keep the text itself
                hinglish_value = raw_value

        # If it's a list (or tuple), take the first element; "" when empty
        if isinstance(hinglish_value, (list, tuple)):
            hinglish_value = hinglish_value[0] if hinglish_value else ""

        return {
            "english": example["English"],
            "hinglish": hinglish_value
        }

    dataset = dataset.map(transform, remove_columns=dataset.column_names)

    # First split: 70 percent train, 30 percent temp
    split_1 = dataset.train_test_split(test_size=0.30, seed=42)

    # Split 30 percent temp into 10 percent validation and 20 percent test
    # Since 10/30 = 1/3, validation gets 1/3 of temp
    split_2 = split_1["test"].train_test_split(test_size=2/3, seed=42)

    dataset = DatasetDict({
        "train": split_1["train"],
        "validation": split_2["train"],
        "test": split_2["test"]
    })

    os.makedirs("./datasets_final", exist_ok=True)
    dataset.save_to_disk(new_dataset_path)

print(dataset)

Processing dataset...


Map:   0%|          | 0/1976 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1383 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/197 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/396 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 1383
    })
    validation: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 197
    })
    test: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 396
    })
})


In [96]:
# Peek at one processed HinGE training pair
dataset['train'][1]

{'english': 'Inevitable losses is pushing the profit in unfavourable direction.',
 'hinglish': 'Inevitable losses labh ko pratikul direction mein dhakel rhe hai.'}

### Lince_benchmark

In [88]:
from datasets import load_dataset, DatasetDict, load_from_disk
import os

# The benchmark is published as two Hub repos: train+dev, and test
train_valid_name = "Huggmachas/Lince_benchmark_mt_enghinglish_train_valid"
test_name = "Huggmachas/Lince_benchmark_mt_enghinglish_test"

save_path = "./datasets/Lince_benchmark_mt"

if not os.path.exists(save_path):
    print("Downloading and saving dataset...")
    train_valid = load_dataset(train_valid_name)
    test = load_dataset(test_name)

    # Merge both repos into one DatasetDict; the Hub's "dev" split becomes "validation"
    ds = DatasetDict()
    ds["train"] = train_valid["train"]
    ds["validation"] = train_valid["dev"]
    ds["test"] = test["test"]

    ds.save_to_disk(save_path)
else:
    print("Loading dataset from disk...")
    ds = load_from_disk(save_path)

print(ds)

Loading dataset from disk...
DatasetDict({
    train: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 8060
    })
    validation: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 942
    })
    test: Dataset({
        features: ['english'],
        num_rows: 960
    })
})


In [89]:
# Peek at the first training pair of the Lince benchmark dataset
ds['train'][0]

{'english': 'Batman vs Superman', 'hinglish': 'batman vs superman'}

In [86]:
from datasets import load_dataset, DatasetDict, load_from_disk
import os

# Hub repos (train+dev, and test), used only when no local copy is available
train_valid_name = "Huggmachas/Lince_benchmark_mt_enghinglish_train_valid"
test_name = "Huggmachas/Lince_benchmark_mt_enghinglish_test"

# Raw copy written by the raw-download cell, and the final location
raw_save_path = "./datasets/Lince_benchmark_mt"
save_path = "./datasets_final/Lince_benchmark_mt"

if os.path.exists(save_path):
    print("Loading dataset from disk...")
    ds = load_from_disk(save_path)
elif os.path.exists(raw_save_path):
    # The raw copy already has the train/validation/test layout and the
    # english/hinglish column names, so reuse it; no second download needed.
    print("Copying raw dataset from disk...")
    ds = load_from_disk(raw_save_path)
    ds.save_to_disk(save_path)
else:
    print("Downloading and saving dataset...")
    train_valid = load_dataset(train_valid_name)
    test = load_dataset(test_name)

    ds = DatasetDict({
        "train": train_valid["train"],
        "validation": train_valid["dev"],
        "test": test["test"]
    })

    ds.save_to_disk(save_path)

# NOTE(review): in the printed output the test split has only an `english`
# column (no `hinglish` references), unlike train/validation.
print(ds)

Loading dataset from disk...
DatasetDict({
    train: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 8060
    })
    validation: Dataset({
        features: ['english', 'hinglish'],
        num_rows: 942
    })
    test: Dataset({
        features: ['english'],
        num_rows: 960
    })
})
