In [37]:
import os
import shutil
import ssl
import urllib.request
import zipfile

from datasets import Dataset, DatasetDict, load_from_disk, load_dataset

#### IITB Corpus

In [38]:
url = "https://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.zip"
data_dir = "./datasets/IITB"
zip_path = os.path.join(data_dir, "parallel.zip")
# DatasetDict.save_to_disk() writes this file, so its presence marks a completed build.
hf_marker = os.path.join(data_dir, "dataset_dict.json")
# TLS certificate verification stays on by default. Flip to True only as a
# deliberate, temporary workaround if this host's certificate cannot be verified;
# the archive is extracted locally, so an unverified download is a real risk.
allow_insecure_tls = False

# 1 Load the saved HuggingFace dataset if one exists
if os.path.exists(hf_marker):
    print("Trying to load HuggingFace dataset from disk...")
    # Errors from a damaged saved dataset surface here instead of being
    # swallowed and silently triggering a full rebuild.
    dataset = load_from_disk(data_dir)
    print("Loaded existing dataset.")

else:
    print("No saved HuggingFace dataset found. Building from raw files...")
    os.makedirs(data_dir, exist_ok=True)

    # 2 Download only if zip not present
    if not os.path.exists(zip_path):
        print("Downloading dataset...")
        ssl_context = ssl.create_default_context()
        if allow_insecure_tls:
            # check_hostname must be disabled before verify_mode can be CERT_NONE.
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE

        # Stream to a temporary name and rename only after validation, so an
        # interrupted download never leaves a corrupt parallel.zip that later
        # runs would mistake for a finished one.
        partial_path = zip_path + ".part"
        try:
            with urllib.request.urlopen(url, context=ssl_context) as response, \
                 open(partial_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
            if not zipfile.is_zipfile(partial_path):
                raise zipfile.BadZipFile(
                    f"Download from {url} is not a valid zip archive."
                )
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # 3 Extract only if not already extracted
    extracted_marker = os.path.join(data_dir, "parallel")
    if not os.path.exists(extracted_marker):
        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)

    # 4 Automatically locate .en and .hi files
    en_file = None
    hi_file = None

    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith(".en") and "IITB" in file:
                en_file = os.path.join(root, file)
            if file.endswith(".hi") and "IITB" in file:
                hi_file = os.path.join(root, file)

    if en_file is None or hi_file is None:
        raise FileNotFoundError("Could not locate IITB .en and .hi files.")

    print("Using files:")
    print(en_file)
    print(hi_file)

    # 5 Load corpus
    # newline="\n" makes "\n" the only line terminator. The default
    # universal-newline mode also breaks lines on a bare "\r", which would
    # shift one side of the corpus relative to the other.
    with open(en_file, "r", encoding="utf-8", newline="\n") as f_en, \
         open(hi_file, "r", encoding="utf-8", newline="\n") as f_hi:
        en_sentences = [line.strip() for line in f_en]
        hi_sentences = [line.strip() for line in f_hi]

    # Pairs are matched purely by line number, so unequal lengths mean the
    # files are not aligned; fail loudly rather than truncate to the shorter one.
    if len(en_sentences) != len(hi_sentences):
        raise ValueError(
            f"Line count mismatch: {en_file} has {len(en_sentences)} lines, "
            f"{hi_file} has {len(hi_sentences)} lines."
        )

    full_dataset = Dataset.from_dict({
        "english": en_sentences,
        "hindi": hi_sentences
    })

    # 6 70/10/20 split: hold out 30%, then give 2/3 of that hold-out to test
    split_1 = full_dataset.train_test_split(test_size=0.30, seed=42)
    train_dataset = split_1["train"]
    temp_dataset = split_1["test"]

    split_2 = temp_dataset.train_test_split(test_size=2/3, seed=42)

    dataset = DatasetDict({
        "train": train_dataset,
        "validation": split_2["train"],
        "test": split_2["test"]
    })

    dataset.save_to_disk(data_dir)
    print("Saved HuggingFace dataset to disk.")
# Final output
print(dataset)

Trying to load HuggingFace dataset from disk...
Loaded existing dataset.
DatasetDict({
    train: Dataset({
        features: ['english', 'hindi'],
        num_rows: 1161358
    })
    validation: Dataset({
        features: ['english', 'hindi'],
        num_rows: 165908
    })
    test: Dataset({
        features: ['english', 'hindi'],
        num_rows: 331817
    })
})


In [39]:
# Show the first training row to eyeball the english/hindi pairing.
dataset["train"][0]

{'english': 'The British at once agreed and also made him their agent.',
 'hindi': 'जैसे समय बीतता गया परिस्थितियों में भी सुधार होता गया।'}

#### OpenSubtitles

In [50]:
url = "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2024/moses/en-hi.txt.zip"
data_dir = "./datasets/OpenSubtitles"
zip_path = os.path.join(data_dir, "en-hi.txt.zip")
# DatasetDict.save_to_disk() writes this file, so its presence marks a completed build.
hf_marker = os.path.join(data_dir, "dataset_dict.json")

# load the saved HuggingFace dataset if one exists
if os.path.exists(hf_marker):
    # Errors from a damaged saved dataset surface here instead of being
    # swallowed and silently triggering a full rebuild.
    dataset = load_from_disk(data_dir)
    print("Loaded existing dataset.")

else:
    print("Building dataset from raw files...")
    os.makedirs(data_dir, exist_ok=True)

    # download if zip not present
    if not os.path.exists(zip_path):
        # Default urlopen() verifies the server certificate. Stream to a
        # temporary name and rename only after validation, so an interrupted
        # download never leaves a corrupt zip that later runs would reuse.
        partial_path = zip_path + ".part"
        try:
            with urllib.request.urlopen(url) as response, \
                 open(partial_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
            if not zipfile.is_zipfile(partial_path):
                raise zipfile.BadZipFile(
                    f"Download from {url} is not a valid zip archive."
                )
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # extract if not already extracted
    extract_marker = os.path.join(data_dir, "OpenSubtitles.en-hi.en")
    if not os.path.exists(extract_marker):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)

    # find .en and .hi files
    en_file = None
    hi_file = None

    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith(".en"):
                en_file = os.path.join(root, file)
            if file.endswith(".hi"):
                hi_file = os.path.join(root, file)

    if en_file is None or hi_file is None:
        raise FileNotFoundError("Could not locate .en and .hi files.")

    # load corpus
    # newline="\n" makes "\n" the only line terminator. The default
    # universal-newline mode also breaks lines on a bare "\r", which would
    # shift one side of the corpus relative to the other.
    with open(en_file, "r", encoding="utf-8", newline="\n") as f_en, \
         open(hi_file, "r", encoding="utf-8", newline="\n") as f_hi:
        en_sentences = [line.strip() for line in f_en]
        hi_sentences = [line.strip() for line in f_hi]

    # Pairs are matched purely by line number, so unequal lengths mean the
    # files are not aligned; fail loudly rather than truncate to the shorter one.
    if len(en_sentences) != len(hi_sentences):
        raise ValueError(
            f"Line count mismatch: {en_file} has {len(en_sentences)} lines, "
            f"{hi_file} has {len(hi_sentences)} lines."
        )

    full_dataset = Dataset.from_dict({
        "english": en_sentences,
        "hindi": hi_sentences
    })

    # 70/10/20 split: hold out 30%, then give 2/3 of that hold-out to test
    split_1 = full_dataset.train_test_split(test_size=0.30, seed=42)
    train_dataset = split_1["train"]
    temp_dataset = split_1["test"]

    split_2 = temp_dataset.train_test_split(test_size=2/3, seed=42)

    dataset = DatasetDict({
        "train": train_dataset,
        "validation": split_2["train"],
        "test": split_2["test"]
    })

    dataset.save_to_disk(data_dir)
    print("Saved dataset.")

print(dataset)

Loaded existing dataset.
DatasetDict({
    train: Dataset({
        features: ['english', 'hindi'],
        num_rows: 2103175
    })
    validation: Dataset({
        features: ['english', 'hindi'],
        num_rows: 300454
    })
    test: Dataset({
        features: ['english', 'hindi'],
        num_rows: 600908
    })
})


In [51]:
# Show training rows 1 through 49 as a column-wise dict of lists.
dataset["train"][1:50]

{'english': ['Look at me.',
  "So, I'm gonna get back to work.",
  "I guess there is some small comfort in knowing that either way, we're dead.",
  'They determined that three seconds of eye contact made the other person feel most comfortable.',
  'I never did the work to find out who I am or who I am supposed to be.',
  "He actually didn't deliver the car to Gol Bandya... but to Fayyaz bhaee, the Don.",
  "I'm the Chief of Communications.",
  "Where's my cardigan?",
  '"Where\'s the dust?',
  "I've been hearing you come home every night at God knows what hour.",
  'With pretty names',
  'But that means that Mother Ginger has it.',
  'Why?',
  '- Hey, get the fuck off me!',
  "We'll go over here, you stay there. Don't listen to what we're saying.",
  'Can I have a picture please?',
  "Let's utilize the mindset of unlimited potential to discuss this new business venture the boss man wants to talk about.",
  'I do not know.',
  "'Milk is flowing nonstop!",
  "You're going crazy and don't

#### TED Talks

In [52]:
url = "https://object.pouta.csc.fi/OPUS-TED2020/v1/moses/en-hi.txt.zip"
data_dir = "./datasets/TED2020"
zip_path = os.path.join(data_dir, "en-hi.txt.zip")
# DatasetDict.save_to_disk() writes this file, so its presence marks a completed build.
hf_marker = os.path.join(data_dir, "dataset_dict.json")

# load the saved HuggingFace dataset if one exists
if os.path.exists(hf_marker):
    # Errors from a damaged saved dataset surface here instead of being
    # swallowed and silently triggering a full rebuild.
    dataset = load_from_disk(data_dir)
    print("Loaded existing dataset.")

else:
    print("Building dataset from raw files...")
    os.makedirs(data_dir, exist_ok=True)

    # download if zip not present
    if not os.path.exists(zip_path):
        # Default urlopen() verifies the server certificate. Stream to a
        # temporary name and rename only after validation, so an interrupted
        # download never leaves a corrupt zip that later runs would reuse.
        partial_path = zip_path + ".part"
        try:
            with urllib.request.urlopen(url) as response, \
                 open(partial_path, "wb") as out_file:
                shutil.copyfileobj(response, out_file)
            if not zipfile.is_zipfile(partial_path):
                raise zipfile.BadZipFile(
                    f"Download from {url} is not a valid zip archive."
                )
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # extract if not already extracted
    extract_marker = os.path.join(data_dir, "TED2020.en-hi.en")
    if not os.path.exists(extract_marker):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)

    # find .en and .hi files
    en_file = None
    hi_file = None

    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if file.endswith(".en"):
                en_file = os.path.join(root, file)
            if file.endswith(".hi"):
                hi_file = os.path.join(root, file)

    if en_file is None or hi_file is None:
        raise FileNotFoundError("Could not locate .en and .hi files.")

    # load corpus
    # newline="\n" makes "\n" the only line terminator. The default
    # universal-newline mode also breaks lines on a bare "\r", which would
    # shift one side of the corpus relative to the other.
    with open(en_file, "r", encoding="utf-8", newline="\n") as f_en, \
         open(hi_file, "r", encoding="utf-8", newline="\n") as f_hi:
        en_sentences = [line.strip() for line in f_en]
        hi_sentences = [line.strip() for line in f_hi]

    # Pairs are matched purely by line number, so unequal lengths mean the
    # files are not aligned; fail loudly rather than truncate to the shorter one.
    if len(en_sentences) != len(hi_sentences):
        raise ValueError(
            f"Line count mismatch: {en_file} has {len(en_sentences)} lines, "
            f"{hi_file} has {len(hi_sentences)} lines."
        )

    full_dataset = Dataset.from_dict({
        "english": en_sentences,
        "hindi": hi_sentences
    })

    # 70/10/20 split: hold out 30%, then give 2/3 of that hold-out to test
    split_1 = full_dataset.train_test_split(test_size=0.30, seed=42)
    train_dataset = split_1["train"]
    temp_dataset = split_1["test"]

    split_2 = temp_dataset.train_test_split(test_size=2/3, seed=42)

    dataset = DatasetDict({
        "train": train_dataset,
        "validation": split_2["train"],
        "test": split_2["test"]
    })

    dataset.save_to_disk(data_dir)
    print("Saved dataset.")

print(dataset)


Loaded existing dataset.
DatasetDict({
    train: Dataset({
        features: ['english', 'hindi'],
        num_rows: 33343
    })
    validation: Dataset({
        features: ['english', 'hindi'],
        num_rows: 4763
    })
    test: Dataset({
        features: ['english', 'hindi'],
        num_rows: 9527
    })
})


In [54]:
# Show training rows 1 through 9 as a column-wise dict of lists.
dataset["train"][1:10]

{'english': ['At home, online, in school, in their communities.',
  "And that tells us something, that this isn't entertainment for children anymore.",
  "The final thing is the notion of India as a single market -- because when you didn't think of India as a market, you didn't really bother about a single market, because it didn't really matter.",
  'We had Jim Dombrowski, Albert Ben Smith, who started all kinds of things right in that restaurant, and nobody ever bothered us.',
  "We're using cognition to control our behavior.",
  'Myself, I will go back to the East.',
  "But it isn't that I'm not grateful, but I think, as long as you're living, you've got to keep moving, you've got to keep trying to get up and do what you've got to do.",
  'What will you do with your intentional empty space, with your fresh start?',
  "They don't know, OK, and they're trying to get another member of The 99 to join them."],
 'hindi': ['घर में, ऑनलाइन, स्कूल में, अपने समुदायों में ।',
  'और वह हमें कुछ

#### Samanantar

In [44]:
dataset_path = "./datasets/samanantar"
hf_marker = os.path.join(dataset_path, "dataset_dict.json")

# No marker file means no saved dataset yet: fetch it, reshape it, and save it.
if not os.path.exists(hf_marker):
    print("Dataset not found locally. Downloading...")
    os.makedirs(dataset_path, exist_ok=True)

    dataset = load_dataset("ai4bharat/samanantar", "hi")

    # Carve a 70/10/20 split when "train" is the only split present:
    # hold out 30%, then give 2/3 of that hold-out to test.
    if list(dataset.keys()) == ["train"]:
        train_and_rest = dataset["train"].train_test_split(test_size=0.30, seed=42)
        val_and_test = train_and_rest["test"].train_test_split(test_size=2/3, seed=42)

        dataset = DatasetDict({
            "train": train_and_rest["train"],
            "validation": val_and_test["train"],
            "test": val_and_test["test"],
        })

    # Rename src/tgt to english/hindi in every split that still has a "src" column.
    column_map = {"src": "english", "tgt": "hindi"}
    for split_name in list(dataset.keys()):
        if "src" in dataset[split_name].column_names:
            dataset[split_name] = dataset[split_name].rename_columns(column_map)

    dataset.save_to_disk(dataset_path)
    print("Saved dataset.")

else:
    # Marker file present: reuse the dataset saved by an earlier run.
    print("Loading dataset from local disk...")
    dataset = load_from_disk(dataset_path)

print(dataset)


Loading dataset from local disk...
DatasetDict({
    train: Dataset({
        features: ['idx', 'english', 'hindi'],
        num_rows: 7087994
    })
    validation: Dataset({
        features: ['idx', 'english', 'hindi'],
        num_rows: 1012570
    })
    test: Dataset({
        features: ['idx', 'english', 'hindi'],
        num_rows: 2025142
    })
})


In [45]:
# Show the training row at index 1.
dataset["train"][1]

{'idx': 9583251,
 'english': 'Modi is abusing history for political mileage',
 'hindi': 'मोदी : राजनैतिक फायदे के लिये इतिहास का दुरूपयोग'}

#### Global Voices

In [55]:
url = "https://object.pouta.csc.fi/OPUS-GlobalVoices/v2018q4/moses/en-hi.txt.zip"
data_dir = "./datasets/GlobalVoices"
zip_path = os.path.join(data_dir, "en-hi.txt.zip")
txt_path_en = os.path.join(data_dir, "GlobalVoices.en-hi.en")
txt_path_hi = os.path.join(data_dir, "GlobalVoices.en-hi.hi")

hf_marker = os.path.join(data_dir, "dataset_dict.json")

# if valid HF dataset exists → load
if os.path.exists(hf_marker):
    # Report success only after the load has actually succeeded.
    dataset = load_from_disk(data_dir)
    print("Loaded existing dataset.")

else:
    print("Building dataset...")
    os.makedirs(data_dir, exist_ok=True)

    # download if needed
    if not os.path.exists(zip_path):
        print("Downloading dataset...")
        # Download to a temporary name and rename only after validation, so an
        # interrupted download never leaves a corrupt zip that later runs
        # would mistake for a finished one.
        partial_path = zip_path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            if not zipfile.is_zipfile(partial_path):
                raise zipfile.BadZipFile(
                    f"Download from {url} is not a valid zip archive."
                )
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # extract if needed
    if not os.path.exists(txt_path_en) or not os.path.exists(txt_path_hi):
        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(data_dir)

    # load corpus
    # newline="\n" makes "\n" the only line terminator. The default
    # universal-newline mode also breaks lines on a bare "\r", which would
    # shift one side of the corpus relative to the other.
    with open(txt_path_en, "r", encoding="utf-8", newline="\n") as f_en, \
         open(txt_path_hi, "r", encoding="utf-8", newline="\n") as f_hi:
        en_sentences = [line.strip() for line in f_en]
        hi_sentences = [line.strip() for line in f_hi]

    # Pairs are matched purely by line number, so unequal lengths mean the
    # files are not aligned; fail loudly rather than truncate to the shorter one.
    if len(en_sentences) != len(hi_sentences):
        raise ValueError(
            f"Line count mismatch: {txt_path_en} has {len(en_sentences)} lines, "
            f"{txt_path_hi} has {len(hi_sentences)} lines."
        )

    full_dataset = Dataset.from_dict({
        "english": en_sentences,
        "hindi": hi_sentences
    })

    # 70/10/20 split: hold out 30%, then give 2/3 of that hold-out to test
    split_1 = full_dataset.train_test_split(test_size=0.30, seed=42)
    temp_dataset = split_1["test"]

    split_2 = temp_dataset.train_test_split(test_size=2/3, seed=42)

    dataset = DatasetDict({
        "train": split_1["train"],
        "validation": split_2["train"],
        "test": split_2["test"]
    })

    dataset.save_to_disk(data_dir)
    print("Saved dataset.")

print(dataset)

Loaded existing dataset.
DatasetDict({
    train: Dataset({
        features: ['english', 'hindi'],
        num_rows: 1843
    })
    validation: Dataset({
        features: ['english', 'hindi'],
        num_rows: 263
    })
    test: Dataset({
        features: ['english', 'hindi'],
        num_rows: 528
    })
})


In [57]:
# Show the training row at index 1.
dataset["train"][1]

{'english': 'Videos showed policemen in plainclothes confronting the peaceful protesters, pulling and tearing the signs from Nga’s supporters.',
 'hindi': 'कई वीडियो में सादे कपड़ों में पुलिसकर्मी को शांतिपूर्ण रूप से धरना दे रहे प्रदर्शनकारियों से तख्तियाँ को छीन कर फाड़ते देखा गया।'}