In [1]:
import csv
import tarfile
import zipfile
import os
import gzip
import pandas as pd
from pprint import pprint
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model



pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

  import pandas.util.testing as tm
Using TensorFlow backend.


In [13]:
def open_gzip_file(path_to_gzip_file):
    
    with gzip.open(path_to_gzip_file,'rb') as file:
        
        ungzipped=file.read().decode('utf-8').split('\n')
    
    print('Raw file split on new lines')
    print('\n',ungzipped[:20],'\n')
    return ungzipped

def frame_the_data(ungzipped_file,path_to_file='hindi_english_parallel_corpus.csv'):
    
    samples= []
    
    for sample in ungzipped_file:
        
        samples.append(sample.split('\t'))
    print('Each sample on tab') 
    print('\n',samples[:20],'\n')
            
    data = pd.DataFrame(samples,columns=['source','alignment_type','alignment_quality','english','hindi'])
    
    print('\n','DataFrame structure','\n')
    pprint(data.sample(10))
    
    data.to_csv(os.path.join(os.getcwd(),path_to_file))
    return data
             
    
    


if __name__=='__main__':
    
    # path_to_zip_file='HindEnCorp 0.5.zip'
    # directory_to_extract_to = os.getcwd()
    # unzip_main_file(path_to_zip_file,directory_to_extract_to)
    
    path_to_gzip_file = 'hindencorp05.plaintext.gz'
    read_file = open_gzip_file(path_to_gzip_file)
    
    data = frame_the_data(read_file)

Raw file split on new lines

 ['wikiner2013inflected\t1-1\t1.000\tSharaabi\tशराबी', 'ted\t1-1\t1.0\tpoliticians do not have permission to do what needs to be done.\tराजनीतिज्ञों के पास जो कार्य करना चाहिए, वह करने कि अनुमति नहीं है .', "ted\t1-1\t1.0\tI'd like to tell you about one such child,\tमई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी,", 'indic2012\t1-1\tmanual\tThis percentage is even greater than the percentage in India.\tयह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।', 'quote-name\t1-1\t1.0\t- John Collins\t- जॉन कॉलिन्स', "ted\t1-1\t1.0\twhat we really mean is that they're bad at not paying attention.\tहम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते", 'launchpad\t1-1\timplied\t%{APPNAME} would like to send notifications, but you need to be signed in to Chrome.\t%{APPNAME} सूचनाएं भेजना चाहता है, लेकिन आपको Chrome में साइन इन होना होगा.', 'launchpad\t1-1\timplied\tImportant Messages\tमहत्वपूर्ण संदेश', "launchpad\t1-1\timplied\tUser authentication required for VPN connecti

In [2]:
# Analyzing the text
path_to_file = 'hindi_english_parallel_corpus.csv'
data = pd.read_csv(path_to_file,index_col=0)


In [17]:
data.head(10)

Unnamed: 0,source,alignment_type,alignment_quality,english,hindi
0,wikiner2013inflected,1-1,1.000,Sharaabi,शराबी
1,ted,1-1,1.0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,ted,1-1,1.0,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,indic2012,1-1,manual,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,quote-name,1-1,1.0,- John Collins,- जॉन कॉलिन्स
5,ted,1-1,1.0,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
6,launchpad,1-1,implied,"%{APPNAME} would like to send notifications, b...","%{APPNAME} सूचनाएं भेजना चाहता है, लेकिन आपको ..."
7,launchpad,1-1,implied,Important Messages,महत्वपूर्ण संदेश
8,launchpad,1-1,implied,User authentication required for VPN connectio...,उपयोक्ता सत्यापन VPN संबंधन '%s' के लिए जरूरी ...
9,launchpad,1-1,implied,Surface width,सतल चौड़ाई


In [19]:
from collections import Counter 

def most_common_words()
cnt = Counter(hindi_docs)

cnt.most_common(10)

[('(हँसी)', 212),
 ('(हंसी)', 194),
 ('(तालियाँ)', 148),
 ('संदर्भ', 136),
 ('बाहरी कड़ियाँ', 107),
 ('(अभिवादन)', 103),
 ('यह भी देखें', 84),
 ('इतिहास', 82),
 ('इन्हें भी देखें', 72),
 ('धन्यवाद.', 37)]

In [25]:
pd.isnull(data).sum()

source               1
alignment_type       1
alignment_quality    1
english              7
hindi                1
dtype: int64

In [3]:
data=data[~pd.isnull(data['english'])]

data.drop_duplicates(inplace=True)

In [4]:
def processing_text(lines):
    #lowercase
    lines['english']=lines['english'].apply(lambda x: x.lower())
    lines['hindi']=lines['hindi'].apply(lambda x: x.lower())
    
    #remove quotes
    lines['english']=lines['english'].apply(lambda x: re.sub("'", '', x))
    lines['hindi']=lines['hindi'].apply(lambda x: re.sub("'", '', x))
    
    exclude = set(string.punctuation) # Set of all special characters
    # Remove all the special characters
    lines['english']=lines['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    lines['hindi']=lines['hindi'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
    
    # Remove all numbers from text
    remove_digits = str.maketrans('', '', digits)
    lines['english']=lines['english'].apply(lambda x: x.translate(remove_digits))
    lines['hindi']=lines['hindi'].apply(lambda x: x.translate(remove_digits))

    lines['hindi'] = lines['hindi'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

    # Remove extra spaces
    lines['english']=lines['english'].apply(lambda x: x.strip())
    lines['hindi']=lines['hindi'].apply(lambda x: x.strip())
    lines['english']=lines['english'].apply(lambda x: re.sub(" +", " ", x))
    lines['hindi']=lines['hindi'].apply(lambda x: re.sub(" +", " ", x))
    
    # Add start and end tokens to target sequences
    lines['hindi'] = lines['hindi'].apply(lambda x : 'START_ '+ x + ' _END')
    
    return lines

In [5]:
data = processing_text(data)

In [7]:
data[['english','hindi']]

Unnamed: 0,english,hindi
0,sharaabi,START_ शराबी _END
1,politicians do not have permission to do what needs to be done,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए वह करने कि अनुमति नहीं है _END
2,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहूंगी _END
3,this percentage is even greater than the percentage in india,START_ यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है। _END
4,john collins,START_ जॉन कॉलिन्स _END
...,...,...
273880,national atmospheric research laboratory,START_ राष्ट्रीय वायुमण्डलीय अनुसंधान प्रयोगशाला _END
273881,adoor gopalakrishnan,START_ अदूर गोपालकृष्णन _END
273882,natural language,START_ प्राकृतिक भाषा _END
273883,unexpected icon chunk in animation,START_ एनीमेशन में अप्रत्याशित चिह्न चंक _END


In [12]:
data[['english','hindi']].to_csv('hindi_english.csv',index=False)

In [15]:
import csv

with open('hindi_english.csv','r',encoding='utf-8') as csvin, open('hindi_english.tsv', 'w',encoding='utf-8') as tsvout:
    csvin = csv.reader(csvin)
    tsvout = csv.writer(tsvout, delimiter='\t')

    for row in csvin:
        tsvout.writerow(row)

In [33]:
### Get English and Hindi Vocabulary
all_eng_words=set()
for eng in data['english']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words=set()
for hin in data['hindi']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)

In [36]:
len(all_eng_words)

106375

In [37]:
len(all_hindi_words)

130741

In [38]:
data['length_eng_sentence']=data['english'].apply(lambda x:len(x.split(" ")))
data['length_hin_sentence']=data['hindi'].apply(lambda x:len(x.split(" ")))

In [51]:
import numpy as np
np.max(data['length_eng_sentence'])

626

In [52]:
np.max(data['length_hin_sentence'])

419

In [53]:
data=data[data['length_eng_sentence']<=419]
data=data[data['length_hin_sentence']<=419]

In [49]:
data[data['length_hin_sentence']>120].shape

(836, 7)

In [54]:
max_length_src=max(data['length_hin_sentence'])
max_length_tar=max(data['length_eng_sentence'])

In [55]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(106375, 130741)

In [58]:
print(input_words)



In [56]:
num_decoder_tokens += 1 #for zero padding

In [57]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [59]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [61]:
data=shuffle(data)
X, y = data['english'], data['hindi']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)
X_train.shape, X_test.shape

((216432,), (54108,))

In [62]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')

In [63]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [64]:
latent_dim=300

In [65]:
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [66]:
# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [67]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [68]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    31912500    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 300)    39222600    input_2[0][0]                    
____________________________________________________________________________________________

In [69]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 100

In [70]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/100


MemoryError: Unable to allocate 24.8 GiB for an array with shape (128, 398, 130742) and data type float32

In [None]:
from keras.