In [73]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import nltk

# threshold for minimum count to be considered a valid word
MIN_VOCAB_COUNT = 3
OOV_TOKEN = "UNK"

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

def RepresentsInt(s):
    try: 
        int(s)
        return True
    except ValueError:
        return False

# remove numbers. they can be like 100 1090,200 2.123 etc
# strategy is to remove punctuation and then check if its an integer
def isNumber(word):
    word_no_num = re.sub(r'[^\w\s]','',word)
    if RepresentsInt(word_no_num):
        return True
    else:
        return False

#tokenizes raw strings
def getTokenized(lines):
    exclude = set(string.punctuation)
    exclude.add('-')
    exclude.add('।')
    words_list = [] 
    total_sent = len(lines)
    n = 0
    for line in lines:
        line = line.strip()
        words = nltk.word_tokenize(line)
        words_nopunc_nonum = []
        for word in words:
            if word in exclude: # if punctuation
                continue
            else:
                word = word.replace('।', '')
                if(isNumber(word)): # if number
                    word = "NUMBER"
                words_nopunc_nonum.append(word)
#         if(len(words_nopunc_nonum) >= 1):
#             words_nopunc_nonum[-1] = words_nopunc_nonum[-1].replace('।', '')
        words_list.append(words_nopunc_nonum)
        if(n%10000 == 0):
            print((n/total_sent)*100.0, '% done')
        n+= 1
    return words_list


# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

def getVocabulary(tokenized_corpus):
    vocabulary = {}
    total_sent = len(tokenized_corpus)
    n = 0
    for sentence in tokenized_corpus:
        for token in sentence:
            if token not in vocabulary:
                vocabulary[token] = 1
            else:
                vocabulary[token] += 1
        if(n%10000 == 0):
            print((n/total_sent)*100.0, '% done')
        n+= 1
    new_dict = {}
    oov_count = 0 
    # remove infrequent words
    for word, count in vocabulary.items():
        if(count >= MIN_VOCAB_COUNT):
            new_dict[word] = count
        else:
            oov_count += count
    new_dict[OOV_TOKEN] = oov_count
    word2id = {w: idx for (idx, w) in enumerate(new_dict)}
    id2word = {idx: w for (idx, w) in enumerate(new_dict)}
    return new_dict, word2id, id2word

def removeOOV(sentences, vocab):
    new_sentences = []
    for sentence in sentences:
        new_sent = []
        for word in sentence:
            if word in vocab:
                new_sent.append(word)
            else:
                new_sent.append(OOV_TOKEN)
        new_sentences.append(new_sent)
    return new_sentences

In [2]:
hindi = load_doc('data/hindi/hindmonocorp05.plaintext')

In [3]:
len(hindi)

4359377142

In [65]:
hindi = hindi[:100000]

'hwt2013\t<s>\tलेकिन गांव के जगदीश मेघवाल, मोहन...\nspiderling\t<s>\tविटामिन सी शरीर में रोग पैदा करने वाल'

In [4]:
sentences = hindi.split('\n')

In [67]:
len(sentences)

44486483

In [8]:
sentences[:4]

['लेकिन गांव के जगदीश मेघवाल, मोहन...',
 'विटामिन सी शरीर में रोग पैदा करने वाले विषाणुओं से लड़ने की ताकत पैदा करता है और शरीर में इसकी संतुलित मात्रा बने रहने से रोग प्रतिरोधक क्षमता मजबूत रहती है।',
 'इन बोतलों के बहुत कम पैसे मिलते हैं।',
 'कार्टून :- रे लोकपाल आ गया तू ? शाबाश.... 19 0']

In [5]:
new_sentences = []
for line in sentences:
    tokens = line.split('\t')
    if len(tokens) >= 3:
        new_sentences.append(tokens[2])


In [9]:
new_sentences[-1]

'इन दिनों उसी जमीन पर काम चल रहा है।'

In [10]:
sentences = new_sentences
small = sentences[:100000]

In [11]:
len(small)

100000

In [74]:
tokenized_hindi = getTokenized(small)

0.0 % done
10.0 % done
20.0 % done
30.0 % done
40.0 % done
50.0 % done
60.0 % done
70.0 % done
80.0 % done
90.0 % done


In [75]:
small[100]

'भारी हिमपात के समय यहां पर 2 फीट बर्फ की मोटी परत जम जाती है।'

In [44]:
fw = open('hindi_small.txt', "wb")
for line in small:
    string_for_output = (line + '\n').encode('utf8', 'replace')
    fw.write(string_for_output)
fw.close()

In [15]:
word = tokenized_hindi[1010][-1]
word
# word.replace('।', '')

'किया'

In [76]:
vocabulary, word2id, id2word = getVocabulary(tokenized_hindi)

0.0 % done
10.0 % done
20.0 % done
30.0 % done
40.0 % done
50.0 % done
60.0 % done
70.0 % done
80.0 % done
90.0 % done


In [77]:
len(word2id)

27625

In [78]:
tokenized_hindi = removeOOV(tokenized_hindi, vocabulary)

In [19]:
for i in range(2000,3000):
    print(id2word[i])

पारायण
डीएनए
रजिस्ट्री
पुडिया
आपूर्ति
मनुष्यों
छील
परतंत्रता
कर्मवीर
दरगाह
दयावती
सटाकर
एक्सपो
ग्रेट
एंट्री
दर्पण
are
चकत्ते
जनतंत्र
अरहर
रहूँ
कलकत्ते
ओक
पण
कृष्णमूर्ति
जरूरतमंद
10वीं
इंडियन
पढाया
प्रभागीय
मात
ज्ञाता
हज़ार
सुनी
तरीको
नाई
हाउ
तमाचा
मुर्गियों
रक्षण
सीएसएस
शक्तिपीठ
शराफत
तमे
उभरी
कैसेट
ब्लॉगजगत
आसार
हाईवे
पहुंचाता
अंधा
बदतर
पीटे
पश्च
संघों
चढ़ाये
देवरिया
रोगाणु
इरशाद
कारें
चहुँ
महिने
स्त्रोतों
प्रतिबिम्बित
शोभायात्रा
सुधरेगी
लेना-देना
भईया
बचेगा
शृंखलाओं
रखती
चिंतन
मजे
रत्ती
चोटियां
नवाबों
विष
उड़ीसा
योगा
टाँग
सब्जियां
सीमाएँ
बालाओं
प्रतीति
कृतियां
रसखानि
राजकपूर
बढावा
फ़िल्टर्ड
कालेजों
गवाहों
बर्तनों
सीवान
उपायों
एकतरफ़ा
डोगरी
कुचलने
उत्तर-
रघुवीर
दुर्बल
बनवा
भरना
टालते
हुयी
तुम्हीं
लटकाने
मक
चित्त
मन्तर
शीश
चमकीले
एकांश
वस्तुयें
मुर्दा
पीसते
जमीन-जायदाद
इकाइयां
साबला
आंच
लागि
अवैज्ञानिक
व्यक्त
अधेड़
कलंक
बिगाड़ना
दीपदान
फंसा
धनबाद
अमरीकी
घबराने
यंग्
जनजाति
पडता
गुनगुनाते
चौरा
सम्मानों
नदारद
चीन
प्रौद्योगिकियों
पकड़ती
बारे
छाती
एक-डेढ़
मेंरे
डि
स्थानांतरित
वॉशिंगटन
हटाई


In [20]:
num_tokens = 0
for key, value in vocabulary.items():
    num_tokens += value
num_tokens

1646555

In [79]:
concatenated = []
for line in tokenized_hindi:
    c = ' '.join(line)
    concatenated.append(c)

In [82]:
concatenated[120]

'और अब कोई जीने में जीना थोड़े ही है दिन पूरे करने हैं UNK'

In [176]:
fw = open('hindi_small_concat.txt', "wb")
for line in concatenated:
    string_for_output = (line + '\n').encode('utf8', 'replace')
    fw.write(string_for_output)
fw.close()

In [68]:
concatenated[100]

'भारी हिमपात के समय यहां पर ### फीट बर्फ की मोटी परत जम जाती है'

In [None]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
import os
import keras
from keras.preprocessing.sequence import pad_sequences

In [28]:
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [83]:
tokenizer = Tokenizer(num_words=30000)

In [84]:
tokenizer.fit_on_texts(concatenated)

In [58]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x7fa0b18f0748>

In [88]:
x = tokenizer.word_index.items()

In [89]:
len(x)

26996

In [101]:
encoded = tokenizer.texts_to_sequences(concatenated)

In [117]:
encoded[2]

[96, 11936, 2, 61, 109, 651, 1187, 11]

In [118]:
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 26997


In [119]:
sequences = list()
for line in concatenated:
	encoded = tokenizer.texts_to_sequences([line])[0]
	for i in range(1, len(encoded)):
		sequence = encoded[:i+1]
		sequences.append(sequence)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 1542713


In [None]:
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % max_length)

In [107]:
# one hot encode outputs
y = to_categorical(y, num_classes=vocab_size)

ValueError: setting an array element with a sequence.

In [112]:
y

array([list([3882, 255, 398, 4, 971, 503, 28, 71, 18298, 6, 2632, 5, 1380, 503, 120, 3, 7, 398, 4, 327, 2108, 1125, 525, 315, 6, 971, 13035, 838, 916, 464, 3]),
       list([96, 11936, 2, 61, 109, 651, 1187, 11]),
       list([5544, 1556, 1990, 115, 27, 766, 1, 9, 9]), ...,
       list([47, 13, 1106, 57, 999]), list([1631, 9, 9, 8, 9, 5696, 12]),
       list([65, 1, 256, 887, 22, 69, 756, 180, 5, 7882, 215])],
      dtype=object)

In [115]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1, 50)             1349850   
_________________________________________________________________
lstm_3 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_3 (Dense)              (None, 26997)             1376847   
Total params: 2,746,897
Trainable params: 2,746,897
Non-trainable params: 0
_________________________________________________________________
None


In [116]:
# compile network
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(X, y, epochs=2, verbose=2)

ValueError: Error when checking target: expected dense_3 to have shape (None, 26997) but got array with shape (99999, 1)