## TRANSFORMERS DATA CREATION

#### DATA CLEANING

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the English-to-Hindi translation CSV into a DataFrame. The cells below
# rely on the columns 'source', 'english_sentence' and 'hindi_sentence'.
# NOTE(review): absolute path to an external volume; adjust per machine.
input_csv = pd.read_csv('/Volumes/T7/ML/Data/Transformers/English_to_hindi_translation.csv')

In [None]:
input_csv.head()

In [None]:
# Work on a copy without the 'source' column; only the sentence columns are
# used from here on.
cleaned = input_csv.drop(columns=['source'])

In [None]:
cleaned.head()

In [None]:
# Discard every row that has a missing value in any of the remaining columns.
cleaned = cleaned.dropna()

In [None]:
cleaned.isna().sum()

In [None]:
cleaned.describe()

In [None]:
# Presumably the placeholder for out-of-vocabulary words.
# NOTE(review): not referenced by any of the visible cells and never added to
# the vocabularies - confirm whether it is still needed.
unk_token = '<unk>'

In [None]:
import re
import unicodedata

# Matches any single character that is neither a word character (\w) nor
# whitespace (\s).
_NON_WORD_OR_SPACE = re.compile(r'[^\w\s]')


def _drop_unless_combining_mark(match):
    """Return the matched character if it is a combining mark, else ''."""
    char = match.group()
    # Unicode categories Mn/Mc/Me are combining marks (e.g. Devanagari vowel
    # signs, virama, anusvara). They belong to the preceding letter.
    return char if unicodedata.category(char).startswith('M') else ''


def remove_punctuations_and_lower(sentence):
    """Strip punctuation and symbols from *sentence* and lower-case it.

    Word characters, whitespace and combining marks are kept. Python's ``\\w``
    does not match combining marks, so a plain ``[^\\w\\s]`` substitution would
    delete the vowel signs of Devanagari text and corrupt Hindi words; those
    marks are therefore preserved explicitly.

    Args:
        sentence: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    clean_sentence = _NON_WORD_OR_SPACE.sub(_drop_unless_combining_mark, sentence)
    return clean_sentence.lower()

In [None]:
# Store punctuation-free, lower-cased copies of both sentence columns in new
# intermediate columns; the original columns are left untouched.
cleaned['english_sentence_no_punctuations'] = cleaned['english_sentence'].apply(remove_punctuations_and_lower)
cleaned['hindi_sentence_no_punctuations'] = cleaned['hindi_sentence'].apply(remove_punctuations_and_lower)

In [None]:
cleaned.head()

In [None]:
from string import digits

# Translation table mapping every ASCII digit (0-9) to None, i.e. deleting it.
remove_digits = str.maketrans(dict.fromkeys(digits))


def remove_digits_from_english(sentence):
    """Return *sentence* with all ASCII digits removed."""
    without_digits = sentence.translate(remove_digits)
    return without_digits
    
def remove_digits_from_hindi(sentence):
    """Return *sentence* with all ten Devanagari digits removed.

    Only the Devanagari digit characters listed in the pattern are deleted;
    ASCII digits are left in place.
    """
    devanagari_digit = re.compile("[२३०८१५७९४६]")
    return devanagari_digit.sub("", sentence)

In [None]:
# Remove digits from the punctuation-free text; the results become the
# 'english_cleaned' / 'hindi_cleaned' columns used by all later steps.
cleaned['english_cleaned'] = cleaned['english_sentence_no_punctuations'].apply(remove_digits_from_english)
cleaned['hindi_cleaned'] = cleaned['hindi_sentence_no_punctuations'].apply(remove_digits_from_hindi)

In [None]:
# The punctuation-free intermediate columns are no longer needed.
cleaned = cleaned.drop(
    columns=[
        'english_sentence_no_punctuations',
        'hindi_sentence_no_punctuations',
    ]
)

In [None]:
cleaned.head()

In [None]:
cleaned.tail()

#### Adding Start And End Token

In [None]:
def Add_Token(sentence):
    """Collapse whitespace runs and wrap the sentence in <START> / <END>.

    Every run of whitespace becomes a single space. Leading and trailing
    whitespace is collapsed rather than removed, so the result can contain two
    consecutive spaces next to a marker.
    """
    pieces = re.split(r'\s+', sentence)
    sentence = ' '.join(pieces)
    return f"<START> {sentence} <END>"

In [None]:
# Wrap every cleaned sentence in <START> ... <END>, overwriting the columns
# in place.
cleaned['english_cleaned']= cleaned['english_cleaned'].apply(Add_Token)
cleaned['hindi_cleaned']= cleaned['hindi_cleaned'].apply(Add_Token)

In [None]:
cleaned.head()

#### DICTIONARY FORMATION

In [None]:
# Empty vocabularies; they are passed to create_vocab as the starting sets.
HINDI_VOCAB = set()
ENGLISH_VOCAB = set()

In [None]:
def tokenize(sentence):
    """Return the set of distinct tokens in *sentence*.

    Surrounding whitespace is stripped and the text is split on single
    spaces, so consecutive spaces inside the sentence yield an empty-string
    token.
    """
    return set(sentence.strip().split(' '))

In [None]:
def create_vocab(sentences, dictionary):
    """Return a vocabulary made of *dictionary* plus every token in *sentences*.

    Progress is printed every 10,000 sentences.

    Args:
        sentences: Iterable of sentence strings; each is split by ``tokenize``.
        dictionary: Set of already known tokens. It is not modified.

    Returns:
        A new set with the tokens of ``dictionary`` and of all sentences.
    """
    # Copy once and grow in place. Calling ``union`` per sentence would copy
    # the whole vocabulary on every iteration.
    vocab = set(dictionary)
    for count, sentence in enumerate(sentences):
        if count % 10000 == 0:
            print(f"{count} done")
        vocab.update(tokenize(sentence))

    return vocab

In [None]:
# Build one vocabulary per language from the cleaned sentence columns.
ENGLISH_VOCAB = create_vocab(cleaned['english_cleaned'].tolist(),ENGLISH_VOCAB)
print("English done")
HINDI_VOCAB = create_vocab(cleaned['hindi_cleaned'].tolist(),HINDI_VOCAB)

In [None]:
# Show both vocabulary sizes as (english, hindi). A notebook cell only displays
# its last expression, so two separate statements would hide the English size.
len(ENGLISH_VOCAB), len(HINDI_VOCAB)

In [None]:
# Make sure both sentence markers are present in each vocabulary.
ENGLISH_VOCAB.update(("<START>", "<END>"))
HINDI_VOCAB.update(("<START>", "<END>"))

#### Detecting Max Length for input and output sentences

In [None]:
# Placeholders; both are reassigned from get_max_length_in_words below.
ENGLISH_MAX_LENGTH=0
HINDI_MAX_LENGTH=0

In [None]:
def get_max_length_in_words(sentences):
    """Return the largest token count among *sentences*.

    A sentence's token count is the number of pieces produced by splitting it
    on single spaces. The result is a NumPy integer scalar.
    """
    token_counts = list(map(lambda text: len(text.split(' ')), sentences))
    return np.array(token_counts).max()

In [None]:
# Longest sentence per language, measured in space-separated tokens.
ENGLISH_MAX_LENGTH = get_max_length_in_words(cleaned['english_cleaned'].tolist())
HINDI_MAX_LENGTH = get_max_length_in_words(cleaned['hindi_cleaned'].tolist())

In [None]:
ENGLISH_MAX_LENGTH

In [None]:
HINDI_MAX_LENGTH

In [None]:
# Placeholders; both are reassigned from get_avg_length_in_words below.
ENGLISH_AVG_LENGTH=0
HINDI_AVG_LENGTH=0

In [None]:
def get_avg_length_in_words(sentences):
    """Return the mean token count of *sentences*.

    A sentence's token count is the number of pieces produced by splitting it
    on single spaces. The result is a NumPy floating-point scalar.
    """
    token_counts = list(map(lambda text: len(text.split(' ')), sentences))
    return np.array(token_counts).mean()

In [None]:
# Average sentence length per language, measured in space-separated tokens.
ENGLISH_AVG_LENGTH = get_avg_length_in_words(cleaned['english_cleaned'].tolist())
HINDI_AVG_LENGTH = get_avg_length_in_words(cleaned['hindi_cleaned'].tolist())

In [None]:
ENGLISH_AVG_LENGTH

In [None]:
HINDI_AVG_LENGTH

In [None]:
# Fixed sequence length used when truncating and padding sentences.
# NOTE(review): only INPUT_LENGTH is read by the visible cells; OUTPUT_LENGTH
# is assigned here but not referenced - confirm whether Hindi should use it.
INPUT_LENGTH = 30
OUTPUT_LENGTH = 30

In [None]:
def limit_number_of_words_in_the_sentence(sentence, max_length=None):
    """Truncate *sentence* to at most ``max_length - 1`` space-separated words.

    The kept words are joined with single spaces and no leading space, so
    splitting the result on ' ' does not produce a spurious empty first token.

    NOTE(review): truncation keeps only the first words, so a trailing marker
    such as <END> is cut off from sentences that exceed the limit.

    Args:
        sentence: Text whose words are separated by single spaces.
        max_length: Sequence length to fit into; defaults to ``INPUT_LENGTH``.

    Returns:
        The sentence reduced to its first ``max_length - 1`` words.
    """
    if max_length is None:
        max_length = INPUT_LENGTH

    kept_words = sentence.split(' ')[:max_length - 1]
    return ' '.join(kept_words)

In [None]:
# Cap the sentence length in both columns. Both languages use the same limit
# (INPUT_LENGTH); OUTPUT_LENGTH is not applied here.
cleaned['english_cleaned'] = cleaned['english_cleaned'].apply(limit_number_of_words_in_the_sentence)
cleaned['hindi_cleaned'] = cleaned['hindi_cleaned'].apply(limit_number_of_words_in_the_sentence)

In [None]:
cleaned.head()

#### Creating dictionary indexes

In [None]:
# Sort the vocabularies so that the word -> index assignment is deterministic.
ENGLISH_VOCAB_LIST = sorted(ENGLISH_VOCAB)
HINDI_VOCAB_LIST = sorted(HINDI_VOCAB)

In [None]:
# word -> index lookup tables. Indices start at 1, which leaves 0 unused by
# any word.
english_dictionary_index = {
    word: position
    for position, word in enumerate(ENGLISH_VOCAB_LIST, start=1)
}
hindi_dictionary_index = {
    word: position
    for position, word in enumerate(HINDI_VOCAB_LIST, start=1)
}

In [None]:
# index -> word lookup tables, the inverse of the dictionaries above.
reverse_english_dictionary_index = {
    index: word for word, index in english_dictionary_index.items()
}
reverse_hindi_dictionary_index = {
    index: word for word, index in hindi_dictionary_index.items()
}

#### Encoding the sentences

In [None]:
def encode(sentence, index_dictionary):
    """Convert *sentence* into the list of its token indices.

    The sentence is split on single spaces exactly once and each token is
    looked up in ``index_dictionary``.

    Args:
        sentence: Text whose tokens are separated by single spaces.
        index_dictionary: Mapping from token to integer index.

    Returns:
        A list with one index per token, in sentence order.

    Raises:
        KeyError: If a token is missing from ``index_dictionary``.
    """
    return [index_dictionary[token] for token in sentence.split(' ')]

In [None]:
# Encode every English sentence as a list of vocabulary indices.
cleaned['index_list_english'] = cleaned['english_cleaned'].apply(encode,index_dictionary=english_dictionary_index)

In [None]:
# Encode every Hindi sentence as a list of vocabulary indices.
cleaned['index_list_hindi'] = cleaned['hindi_cleaned'].apply(encode,index_dictionary=hindi_dictionary_index)

#### Adding 0 padding to have proper length

In [None]:
def pad(index_list, length=None):
    """Right-pad *index_list* with zeros to a fixed length.

    The result is an integer array. Token ids are kept as integers instead of
    being upcast to float64 by concatenation with a float zero array.

    Args:
        index_list: Sequence of token indices (list or 1-D array).
        length: Target length; defaults to ``INPUT_LENGTH``.

    Returns:
        A 1-D ``int64`` NumPy array of exactly ``length`` elements.

    Raises:
        ValueError: If ``index_list`` is longer than ``length``.
    """
    if length is None:
        length = INPUT_LENGTH

    indices = np.asarray(index_list, dtype=np.int64)
    pad_seq_size = length - len(indices)
    if pad_seq_size < 0:
        raise ValueError(
            f"cannot pad a sequence of {len(indices)} tokens to length {length}"
        )

    zero_arr = np.zeros(pad_seq_size, dtype=np.int64)
    return np.concatenate([indices, zero_arr])

In [None]:
# Pad every encoded sentence with trailing zeros so all rows have equal length.
cleaned['index_list_english'] = cleaned['index_list_english'].apply(pad)
cleaned['index_list_hindi'] = cleaned['index_list_hindi'].apply(pad)

In [None]:
cleaned.head()

#### Creating Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 train/test split of the padded index sequences. English is the input
# (x), Hindi the target (y). The seed is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    cleaned['index_list_english'],
    cleaned['index_list_hindi'],
    test_size=0.3,
    random_state=42,
)

#### Data Storing

In [None]:
import pickle

In [None]:
# Persist the training portion of the English index sequences.
with open("/Volumes/T7/ML/Data/Transformers/utilities/X_train.pkl", 'wb') as file:
    pickle.dump(x_train,file)

In [None]:
# Persist the training portion of the Hindi index sequences.
with open("/Volumes/T7/ML/Data/Transformers/utilities/Y_train.pkl", 'wb') as file:
    pickle.dump(y_train,file)

In [None]:
# Persist the test portion of the English index sequences.
with open("/Volumes/T7/ML/Data/Transformers/utilities/X_test.pkl", 'wb') as file:
    pickle.dump(x_test,file)

In [None]:
# Persist the test portion of the Hindi index sequences.
with open("/Volumes/T7/ML/Data/Transformers/utilities/Y_test.pkl", 'wb') as file:
    pickle.dump(y_test,file)

#### Storing dictionaries

In [None]:
# Persist the English word -> index mapping.
with open("/Volumes/T7/ML/Data/Transformers/utilities/English_Dict.pkl", 'wb') as file:
    pickle.dump(english_dictionary_index,file)

In [None]:
# Persist the Hindi word -> index mapping.
with open("/Volumes/T7/ML/Data/Transformers/utilities/Hindi_Dict.pkl", 'wb') as file:
    pickle.dump(hindi_dictionary_index,file)

In [None]:
# Persist the English index -> word mapping.
with open("/Volumes/T7/ML/Data/Transformers/utilities/Reverse_English_Dict.pkl", 'wb') as file:
    pickle.dump(reverse_english_dictionary_index,file)

In [None]:
# Persist the Hindi index -> word mapping.
with open("/Volumes/T7/ML/Data/Transformers/utilities/Reverse_Hindi_Dict.pkl", 'wb') as file:
    pickle.dump(reverse_hindi_dictionary_index,file)