# Install Dependencies

In [None]:
# !pip install datasets==1.18.1

## Please ensure you have uncommented and run the install command above at least once, so that the `datasets` package is available.

# Download Parallel Dataset

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset identified by "cfilt/iitb-english-hindi" through
# `datasets.load_dataset`. The returned DatasetDict exposes the splits "train",
# "validation" and "test", each with a single "translation" feature.
dataset = load_dataset("cfilt/iitb-english-hindi")

### View Parallel Corpus Details

In [6]:
# Bare expression as the cell's last statement: the notebook displays the
# DatasetDict summary (splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})

### Extract Dataset in Source and Target Text files

In [None]:
# Write every split of the parallel corpus to a pair of line-aligned UTF-8 text
# files: source_<suffix>.txt holds the "en" side and target_<suffix>.txt holds
# the "hi" side. Line i of the source file is the translation pair of line i of
# the target file.

# Maps each split name of `dataset` to the suffix used in the output file names.
SPLIT_FILE_SUFFIXES = {"train": "train", "validation": "valid", "test": "test"}


def _as_single_line(sentence):
  """Return `sentence` as exactly one line of text, without a line terminator.

  Leading and trailing newlines are dropped. A newline embedded inside the
  sentence is replaced by a space: written as-is it would make the sentence
  occupy several lines and silently break the one-pair-per-line alignment
  between the source and target files.
  """
  return sentence.strip("\n").replace("\n", " ")


def write_parallel_files(translation_pairs, suffix):
  """Write one split to `source_<suffix>.txt` and `target_<suffix>.txt`.

  Args:
    translation_pairs: iterable of dicts with the keys "en" and "hi".
    suffix: split suffix used in both output file names (e.g. "train").

  Existing files with the same names are overwritten. Both files are closed
  when the function returns, including when writing raises an exception.
  """
  with open(f"source_{suffix}.txt", "w", encoding='utf8') as source_file, \
       open(f"target_{suffix}.txt", "w", encoding='utf8') as target_file:
    for translation_pair in translation_pairs:
      source_file.write(_as_single_line(translation_pair["en"]) + "\n")
      target_file.write(_as_single_line(translation_pair["hi"]) + "\n")


for split_name, file_suffix in SPLIT_FILE_SUFFIXES.items():
  write_parallel_files(dataset[split_name]["translation"], file_suffix)


### Parallel Corpus Sentence Pairs (Linux)

In [None]:
# Count the lines of every extracted file. Each sentence is written on its own
# line, so the source and target file of a split are expected to report the
# same count.
! wc -l source_train.txt target_train.txt source_valid.txt target_valid.txt source_test.txt target_test.txt

# Byte Pair Encoding (BPE)

In [None]:
# ! pip install subword-nmt

## Please ensure you have uncommented and run the install command above at least once, so that the `subword-nmt` command is available.

## Learn BPE codes

Note: Make sure the source and target text files have been generated by running the previous cells before continuing.

In [None]:
import os

# Number of BPE merge operations, published as an environment variable (stored
# as a string) so the shell commands in later cells can read it as
# $NUM_OF_MERGE_OPERATIONS. Any value already present is overwritten.
os.environ.update({'NUM_OF_MERGE_OPERATIONS': "16000"})

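# To use a different number of BPE merge operations, change the value assigned to NUM_OF_MERGE_OPERATIONS in the cell above (running that cell overwrites any value already set in the environment)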

In [None]:
# Concatenate the three splits of each language into one file per language;
# these combined files are the input from which the BPE codes are learned.
# NOTE(review): this puts the validation and test sentences into the text the
# BPE codes are learned from — confirm that this is intended.
! cat source_train.txt source_test.txt source_valid.txt > source_full.txt
! cat target_train.txt target_test.txt target_valid.txt > target_full.txt

In [None]:
# Learn one set of BPE codes per language from the concatenated corpus files.
# The value passed to `-s` comes from the NUM_OF_MERGE_OPERATIONS environment
# variable, which must be set before this cell runs.
! subword-nmt learn-bpe -s $NUM_OF_MERGE_OPERATIONS < source_full.txt > source-bpe.codes
! subword-nmt learn-bpe -s $NUM_OF_MERGE_OPERATIONS < target_full.txt > target-bpe.codes


## Apply BPE

In [None]:
# Encode the source-language files of every split with the source BPE codes.
! subword-nmt apply-bpe -c source-bpe.codes < source_train.txt > source_train_bpe.txt
! subword-nmt apply-bpe -c source-bpe.codes < source_valid.txt > source_valid_bpe.txt
! subword-nmt apply-bpe -c source-bpe.codes < source_test.txt > source_test_bpe.txt

# Encode the target-language files of every split with the target BPE codes.
! subword-nmt apply-bpe -c target-bpe.codes < target_train.txt > target_train_bpe.txt
! subword-nmt apply-bpe -c target-bpe.codes < target_valid.txt > target_valid_bpe.txt
! subword-nmt apply-bpe -c target-bpe.codes < target_test.txt > target_test_bpe.txt


## BPE Codes (Linux)

In [None]:
# Count the lines of the two learned BPE code files.
! wc -l source-bpe.codes target-bpe.codes

## BPE Parallel Corpus Sentence Pairs (Linux)

In [None]:
# Count the lines of every BPE-encoded file.
# NOTE(review): the counts are expected to match those of the corresponding
# un-encoded files — confirm against the earlier `wc -l` output.
! wc -l source_train_bpe.txt target_train_bpe.txt source_valid_bpe.txt target_valid_bpe.txt source_test_bpe.txt target_test_bpe.txt