# <p align = "center"> **English To Hindi Translation - Transformers**</p>
<div align = "center">
    <img src = "https://peak-translations.co.uk/wp-content/uploads/2018/06/Creative-Hindi-alphabet-texture-background-2_1180x400_acf_cropped.jpg">
         </img>
</div>
    

### Downloading required Libraries and Text Vocabulary, embeddings and models

In [1]:
!pip install fasttext
!pip install inltk
!pip install gunzip

## Shall Clear the outputs for clean notebook

In [2]:
# Download the pretrained Fasttext Embeddings For Hindi Vocabulary
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.bin.gz


# Download the pretrained Fasttext Embeddings For English Vocabulary
! wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

## Shall Clear the outputs for clean notebook

In [3]:
# The downloads are zipped, so they are unzipped to get ".bin" files

! gunzip /content/cc.en.300.bin.gz
! gunzip /content/cc.hi.300.bin.gz

In [4]:
import os
from google.colab import drive


import numpy as np
import pandas as pd
import tensorflow as tf

import fasttext
import re

### Connecting to Drive

In [5]:
# The dataset has been loaded in Google Drive.
# Working in Colab, we can't upload the dataset everytime.
drive.mount('/content/drive')

In [6]:
datasets_path = '/content/drive/My Drive/Machine Learning Datasets/Hindi-English'
data_path = datasets_path + '/data.csv'

### Datasets

In [7]:
# Read only the required columns from dataset to save memory
# The Embedding Vectors are huge, and working with them would crash the memory, if not used efficiently
df = pd.read_csv(data_path, usecols = ['english_sentence', 'hindi_sentence'])               

In [8]:
df.head(10)

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,this percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,the ending portion of these vedas is called up...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
5,the then governor of kashmir resisted transfer...,कश्मीर के तत्कालीन गवर्नर ने इस हस्तांतरण का व...
6,in this lies the circumstances of people befor...,इसमें तुमसे पूर्व गुज़रे हुए लोगों के हालात हैं।
7,and who are we to say even that they are wrong,और हम होते कौन हैं यह कहने भी वाले कि वे गलत हैं
8,“”global warming“” refer to warming caused in ...,ग्लोबल वॉर्मिंग से आशय हाल ही के दशकों में हुई...
9,you may want your child to go to a school that...,हो सकता है कि आप चाहते हों कि आप का नऋर्नमेनटे...


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127607 entries, 0 to 127606
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   english_sentence  127575 non-null  object
 1   hindi_sentence    127607 non-null  object
dtypes: object(2)
memory usage: 1.9+ MB


In [10]:
df.dropna(inplace = True)

### Data Cleaning
    -> Remove Punctuation Marks

In [11]:
spec_char_pattern = re.compile(r"""[।,.@_!#$%^&*()<>?/\\|'"}{~:]""")
df.english_sentence = df.english_sentence.apply(lambda x: spec_char_pattern.sub("", x))
df.hindi_sentence = df.hindi_sentence.apply(lambda x: spec_char_pattern.sub("", x))

In [12]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करन...
1,id like to tell you about one such child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी
2,this percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है
3,what we really mean is that theyre bad at not ...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,the ending portion of these vedas is called up...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है


### Text Vectorization
    -> Give a unique integral identity to each word in both English and Hindi Corpuses.
    -> Convert the texts into vectors, with the respective integers representing the words

In [13]:
def get_text_vectorizer(text_ds):

    # A tensorflow Layer that maps words in the given text to integer based on the vocabulary it has
    text_vectorizer = tf.keras.layers.TextVectorization()
    
    # Since we don't allready have a vocab, we let our TextVectorization Layer to adapt to our dataset
    text_vectorizer.adapt(text_ds)

    # After adapting to the dataset, the layer sorts the words based on their frequencies to get a vocabulary
    vocab = text_vectorizer.get_vocabulary()

    # The layer doesn't have a function to give us indices for the words, so we create a dictionary as it shall be used frequently
    word_index = dict(zip(vocab, range(len(vocab))))
    print("Total Number of Unique words in the text -", len(vocab))
    return word_index, text_vectorizer

In [14]:
eng_word_index, eng_vectorizer = get_text_vectorizer(df.english_sentence)
hin_word_index, hin_vectorizer = get_text_vectorizer(df.hindi_sentence)

Total Number of Unique words in the text - 76774
Total Number of Unique words in the text - 84467


In [15]:
feature = eng_vectorizer(df.english_sentence)               # English Sentences
target = hin_vectorizer(df.hindi_sentence)                  # Hindi Sentences

np.save((datasets_path + '/English_vectorized.npy'), feature.numpy())
np.save((datasets_path + '/Hindi_vectorized.npy'), target.numpy())

### Embedding Vectors For Our Vocabulary

In [None]:
embedding_dims = 300
hindi_embed_model = fasttext.load_model('/content/cc.hi.300.bin')
english_embed_model = fasttext.load_model('/content/cc.en.300.bin')

In [17]:
def get_embedding_matrix(word_index, embedding_model):
    num_tokens = len(word_index) + 2

    embedding_matrix = np.zeros((num_tokens, embedding_dims), dtype = np.float32)
    
    for word, idx in word_index.items():
        emb_vector = embedding_model.get_word_vector(word)
        embedding_matrix[idx] = emb_vector

    return embedding_matrix


In [19]:
eng_embedding_matrix = get_embedding_matrix(eng_word_index, english_embed_model)
hin_embedding_matrix = get_embedding_matrix(hin_word_index, hindi_embed_model)

In [20]:
np.save((datasets_path + '/English_embedding_matrix.npy'), eng_embedding_matrix)
np.save((datasets_path + '/Hindi_embedding_matrix.npy'), hin_embedding_matrix)


# END