In [1]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import nltk
import numpy as np

# threshold for minimum count to be considered a valid word
MIN_VOCAB_COUNT = 3
OOV_TOKEN = "UNK"

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

def RepresentsInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

# remove numbers. they can be like 100 1090,200 2.123 etc
# strategy is to remove punctuation and then check if its an integer
def isNumber(word):
    word_no_num = re.sub(r'[^\w\s]','',word)
    if RepresentsInt(word_no_num):
        return True
    else:
        return False

#tokenizes raw strings
def getTokenized(lines):
    exclude = set(string.punctuation)
    exclude.add('-')
    exclude.add('।')
    words_list = [] 
    total_sent = len(lines)
    n = 0
    for line in lines:
        line = line.strip()
        words = nltk.word_tokenize(line)
        words_nopunc_nonum = []
        for word in words:
            if word in exclude: # if punctuation
                continue
            else:
                word = word.replace('।', '')
                if(isNumber(word)): # if number
                    word = "NUMBER"
                words_nopunc_nonum.append(word)
#         if(len(words_nopunc_nonum) >= 1):
#             words_nopunc_nonum[-1] = words_nopunc_nonum[-1].replace('।', '')
        words_list.append(words_nopunc_nonum)
        if(n%10000 == 0):
            print((n/total_sent)*100.0, '% done')
        n+= 1
    return words_list


# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

def getVocabulary(tokenized_corpus):
    vocabulary = {}
    total_sent = len(tokenized_corpus)
    n = 0
    for sentence in tokenized_corpus:
        for token in sentence:
            if token not in vocabulary:
                vocabulary[token] = 1
            else:
                vocabulary[token] += 1
        if(n%10000 == 0):
            print((n/total_sent)*100.0, '% done')
        n+= 1
    new_dict = {}
    oov_count = 0 
    # remove infrequent words
    for word, count in vocabulary.items():
        if(count >= MIN_VOCAB_COUNT):
            new_dict[word] = count
        else:
            oov_count += count
    new_dict[OOV_TOKEN] = oov_count
    word2id = {w: idx for (idx, w) in enumerate(new_dict)}
    id2word = {idx: w for (idx, w) in enumerate(new_dict)}
    return new_dict, word2id, id2word

def removeOOV(sentences, vocab):
    new_sentences = []
    for sentence in sentences:
        new_sent = []
        for word in sentence:
            if word in vocab:
                new_sent.append(word)
            else:
                new_sent.append(OOV_TOKEN)
        new_sentences.append(new_sent)
    return new_sentences

In [2]:
hindi = load_doc('data/hindi/hindmonocorp05.plaintext')

In [3]:
len(hindi)

4359377142

In [65]:
hindi = hindi[:100000]

'hwt2013\t<s>\tलेकिन गांव के जगदीश मेघवाल, मोहन...\nspiderling\t<s>\tविटामिन सी शरीर में रोग पैदा करने वाल'

In [4]:
sentences = hindi.split('\n')

In [67]:
len(sentences)

44486483

In [8]:
sentences[:4]

['लेकिन गांव के जगदीश मेघवाल, मोहन...',
 'विटामिन सी शरीर में रोग पैदा करने वाले विषाणुओं से लड़ने की ताकत पैदा करता है और शरीर में इसकी संतुलित मात्रा बने रहने से रोग प्रतिरोधक क्षमता मजबूत रहती है।',
 'इन बोतलों के बहुत कम पैसे मिलते हैं।',
 'कार्टून :- रे लोकपाल आ गया तू ? शाबाश.... 19 0']

In [5]:
new_sentences = []
for line in sentences:
    tokens = line.split('\t')
    if len(tokens) >= 3:
        new_sentences.append(tokens[2])


In [9]:
new_sentences[-1]

'इन दिनों उसी जमीन पर काम चल रहा है।'

In [10]:
sentences = new_sentences
small = sentences[:100000]

In [2]:
x = load_doc('hindi_small.txt')

In [4]:
x = x.split('\n')
len(x)

4448649

In [8]:
x = x[:100000]

In [10]:
tokenized_hindi = getTokenized(x)

0.0 % done
10.0 % done
20.0 % done
30.0 % done
40.0 % done
50.0 % done
60.0 % done
70.0 % done
80.0 % done
90.0 % done


In [11]:
small = x
small[100]

'भारी हिमपात के समय यहां पर 2 फीट बर्फ की मोटी परत जम जाती है।'

In [12]:
fw = open('hindi_small.txt', "wb")
for line in small:
    string_for_output = (line + '\n').encode('utf8', 'replace')
    fw.write(string_for_output)
fw.close()

In [13]:
word = tokenized_hindi[1010][-1]
word
# word.replace('।', '')

'किया'

In [14]:
vocabulary, word2id, id2word = getVocabulary(tokenized_hindi)

0.0 % done
10.0 % done
20.0 % done
30.0 % done
40.0 % done
50.0 % done
60.0 % done
70.0 % done
80.0 % done
90.0 % done


In [15]:
len(word2id)

27625

In [16]:
tokenized_hindi = removeOOV(tokenized_hindi, vocabulary)

In [17]:
for i in range(2000,3000):
    print(id2word[i])

बीड़ा
केजरीवाल
फ्रांस
झरना
नामों
प्रियंका
लौटाई
घनघोर
भावनाएं
रेललाइन
एमएसपी
आभा
उगाहने
वार्नर
शोभा
प्रत्यर्पण
पहुँचाना
बालाओं
ा
ओम्
मोहाली
धारीगाढ
थ्
अल्
लगानी
प्रपत्र
राजस्
शुगर
सांगठनिक
कालू
उन्
बागडोर
बट्टा
ईश
बाइबिल
सुपारी
अनाथ
हफ्तेभर
जहाज़
पेशानी
अधिष्ठापन
मोज़िला
ष्
आत्मविश्वासी
माइ
मुताबित
आयुवानसिंह
पुनर्गठन
कोठों
.
शुभ
मुर्दा
भूलता
प्राथमिक
तत्पश्चात
कारवाई
गाढ़ी
कलंदर
संवारा
सिल्क
चुभती
तनेजा
तत्कालिक
क्रिस्टी
कलाएँ
स्टेट्स
any
भीड़
एजेंडा
कांग्रेसजनों
डि
जिलेभर
करी
सड़ा
देश-काल
आस्तिक
मध्य
गलियों
बिल्डर
मार्गशीर्ष
पूर्वाह्न
सपोर्ट
ऊन
देखते
बेहतरीन
जपते
मढ़
बजती
बहकर
उपलक्ष्य
महिलायें
पीएस
भारतविरोधी
प्रोफेशनल्स
प्लान
ख
प्राइम
गुंडा
तस
दूँगी
कॅंपस
मात्र
यो
झगड़ों
सुहानी
चारा
कपडे
आग्रह
बुलाता
भ्रष्
दुर्गे
हज़ारे
पिछड़ों
प्रवृति
शिमला
संवाददाता-
देसाई
झांसे
सँवर
बेवकूफ़
डम्बलडोर
लोकपाल
कुत्तो
जनार्दन
चिट्ठाचर्चा
बांध
ज़रिये
उमेश
भेद-भाव
बरन
अकिंचन
टेपिंग
रॉयल्स
रघुराज
मित्र
दरबारियों
कहिन
पांवों
ऑप्शन
रुबरु
दिल्ली-
वस्तुएं
एक्स्प्रेस
वाहनों
शैलेश
दुर्गेश
नाराज़गी
पढ़
डालें
र

In [23]:
num_small = 0
MAX_SENTENCE_LENGTH = 10
l = []
hindi_tokenized_maxsent_10 = []
for line in tokenized_hindi:
    length  = len(line)
    if(length <= MAX_SENTENCE_LENGTH):
        num_small += 1
        hindi_tokenized_maxsent_10.append(line)
num_small

39489

In [26]:
concatenated = []
for line in hindi_tokenized_maxsent_10:
    c = ' '.join(line)
    concatenated.append(c)

In [33]:
concatenated[120]

NameError: name 'concatenated' is not defined

In [28]:
fw = open('hindi_small_concat_maxsent_10.txt', "wb")
for line in concatenated:
    string_for_output = (line + '\n').encode('utf8', 'replace')
    fw.write(string_for_output)
fw.close()

In [34]:
x = load_doc('hindi_small_concat_maxsent_10.txt').split('\n')

In [35]:
small_x = x[:2000]

In [36]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import os
import keras
from keras.preprocessing.sequence import pad_sequences

In [37]:
os.environ["CUDA_VISIBLE_DEVICES"]="4"

In [38]:
tokenizer = Tokenizer()

In [39]:
tokenizer.fit_on_texts(small_x)

In [40]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x7f51404eee48>

In [41]:
x = tokenizer.word_index.items()

In [42]:
len(x)

3773

In [43]:
encoded = tokenizer.texts_to_sequences(small_x)

In [44]:
encoded[0]

[40, 147, 5, 646, 1222, 1223]

In [45]:
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 3774


In [46]:
sequences = list()
for line in small_x:
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[i-1:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))
# for i in range(1, len(encoded)):
# 	sequence = encoded[i-1:i+1]
# 	sequences.append(sequence)
# print('Total Sequences: %d' % len(sequences))

Total Sequences: 9941


In [47]:
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 2


In [48]:
# split into input and output elements
sequences = array(sequences)
X, y = sequences[:,:-1],sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)

In [49]:
len(y)

9941

In [50]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1, 50)             188700    
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_2 (Dense)              (None, 3774)              192474    
Total params: 401,374
Trainable params: 401,374
Non-trainable params: 0
_________________________________________________________________
None


In [102]:
small_X = X[:5000]
small_y = y[:5000]

In [55]:
# fit network
model.fit(X, y, epochs=50, verbose=1, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50

KeyboardInterrupt: 

In [56]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# pre-pad sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
		# predict probabilities for each word
		arr = model.predict(encoded, verbose=0)
		arr[0][1] = 0
		arr[0][3] = 0
# 			arr = arr[2:]
		yhat = np.argmax(arr)
		# map predicted word index to word
		out_word = ''
		print(yhat)
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
	return in_text

In [63]:
print(generate_seq(model, tokenizer, max_length-1, 'तुम्हारा', 20))
print(generate_seq(model, tokenizer, max_length-1, '', 4))

12
25
46
9
918
2201
9
918
2201
9
918
2201
9
918
2201
9
918
2201
9
918
तुम्हारा नहीं कर रहे हैं तारे गाते हैं तारे गाते हैं तारे गाते हैं तारे गाते हैं तारे गाते हैं तारे
12
25
46
9
 नहीं कर रहे हैं
