In [1]:
! pip install -q pyyaml h5py
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import RNN
from keras.utils import np_utils
import tensorflow as tf
import gc
import string
from tqdm import tqdm
import os
import math

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [2]:

glove_dir = '../input/glove/glove.6B.200d.txt'
embeddings_index = {} # empty dictionary
f = open(glove_dir, encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [3]:


def clean_captions(caption_list):
        table = str.maketrans('', '', string.punctuation)
 
        for i in range(len(caption_list)):
                desc = caption_list[i]
                # tokenize
                desc = desc.split()
                # convert to lower case
                desc = [word.lower() for word in desc]
                # remove punctuation from each token
                desc = [w.translate(table) for w in desc]
                # remove hanging 's' and 'a'
                desc = [word for word in desc if len(word)>1]
                # remove tokens with numbers in them
                desc = [word for word in desc if word.isalpha()]
                # store as string
                caption_list[i] =  ' '.join(desc)
 
        return (caption_list)


In [4]:


def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [9]:
in_filename = '../input/text-data/1342-0 (1).txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
lines = clean_captions(lines)
lines = lines[0: round((len(lines)/5))]

In [10]:
lines_new= []
for i in tqdm(range(len(lines))):
  if (lines[i] != ''):
    lines_new.append(lines[i])
lines = lines_new
del lines_new
gc.collect()

100%|██████████| 25395/25395 [00:00<00:00, 906165.78it/s]


78

In [11]:
words = list()
for i, key  in enumerate(lines):
    word_list = (lines[i].split(' '))
    for word in word_list:
 
      words.append(word)
vocabulary = set(words)

In [12]:

ixtoword = {}
wordtoix = {}
ix = 1
for w in list(vocabulary):
    wordtoix[w] = ix
    ixtoword[ix] = w
    ix += 1

ixtoword[0] = '<unk>'
wordtoix['<unk>'] = 0

In [13]:
def same_length_caption(caption , max_len=50):
 
    '''
    Takes caption as input and makes them of equal length
    
    Parameters:-
    caption (list) - The list of embedded caption to be made of particular length
    max_len (int) - The max length of the caption
    
    Return type:-
    
    caption (list) :- Returns a list with zero padding of length = max_len
    '''
    
    
    if(len(caption) == max_len):
        return (caption)
    else:
        for i in range((max_len-len(caption))):
            caption.append(0)
    return caption
def word_to_ix(caption , vocab):
    '''
    Maps the words to integers according to custom vocabulary
    
    Parameters:-
    caption (list) - The caption to be embedded
    vocab (dict) - The custom mapping that wil be used as vocabulary
    
    Return type:-
    
    caption (list) :- Returns a list after mapping them according to 'vocab'
    '''
        
    transformed_caption=[]
    for word in caption:
        if (word in wordtoix.keys()):
            transformed_caption.append(wordtoix[word])
        else:
            transformed_caption.append(wordtoix['<unk>'])
    return (transformed_caption)
        
def ix_to_word(caption , vocab):
    '''
    Takes caption as input and maps them to words as defined by 'vocab'
    
    Parameters:-
    caption (list) - The list of embedded caption to be made of particular length
    vocab (dict) - The dictionary that wil be used as mapping
    
    Return type:-
    
    caption (list) :- Returns a list after converting respective integers to words according to vocab
    '''
    
    transformed_caption=[]
    for word in caption:
        transformed_caption.append(ixtoword[word])

    return (transformed_caption)


In [14]:



#words = list()
#for i, key  in enumerate(lines):
#    word_list = (lines[i].split(' '))
#    for word in word_list:
 
#      words.append(word)
#vocabulary = set(words)

#from collections import Counter 
  
#def removeElements(lst, k): 
#    counted = Counter(lst) 
#    return [el for el in lst if counted[el] >= k] 
 
#k = 8
#vocabulary_new = ((removeElements(flat_list, k))) 
#vocabulary_new = set(vocabulary_new)
#vocabulary.update(['<unk>'])


train_len = 12+1
text_sequences = []
for i in range(train_len,len(words)):
    seq = words[i-train_len:i]
    text_sequences.append(seq)


def generator(batch_size=32):
    """
    Yields the next training batch.
    Suppose `samples` is an array [[image1_filename,label1], [image2_filename,label2],...].
    """
    num_samples = len(text_sequences)
    
    while True:
        for offset in range(0, num_samples, batch_size):
            batch_samples = text_sequences[offset:offset+batch_size]
 
            input_seq=[]
            output_seq=[]
 
            for batch_sample in batch_samples:
 
 
                #caption_text = batch_samples.at[batch_sample , 'image_caption']
                #caption = caption_text.split()
                batch_sample = word_to_ix(batch_sample , wordtoix)
                #caption = same_length_caption(caption , max_len = 34)
 
                #print(type(batch_sample))
                #print(len(batch_sample))
                
                
                #samples = word_to_ix(samples , wordtoix)
 
                input_seq.append(np.array(batch_sample[0:-1]))
                output_seq.append(np.array(batch_sample[-1:]))
 
                #print(input_seq)
                #print(output_seq)
            
            input_seq = np.array(input_seq)
            output_seq = np.array(output_seq)
 
            yield(input_seq , output_seq)
  
vocab_size = len(vocabulary) + 1
embedding_dim = 200
# Get 200-dim dense vector for each of the 10000 words in out vocabulary
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in wordtoix.items():
    #if i < max_words:
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector

In [15]:
def build_model():

  inputs1 = tf.keras.layers.Input(shape=(12))
  se1 = tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = embedding_dim)(inputs1)
  se2 = tf.keras.layers.GRU(512,return_sequences=True )(se1)
  se3 = tf.keras.layers.GRU(512,return_sequences=False )(se2)

  output = tf.keras.layers.Dense(vocab_size,activation='softmax')(se3)

  model = tf.keras.Model(inputs=[inputs1],outputs=[output])
  opt = tf.keras.optimizers.Adam(learning_rate=0.01)

  model.compile(loss= 'sparse_categorical_crossentropy', optimizer=opt)

  return model



model = build_model()

#opt = tf.keras.optimizers.Adam(learning_rate=0.001)
#model.compile(loss= tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=opt , metrics= ['accuracy'])

model.layers[1].set_weights([embedding_matrix])
model.layers[1].trainable = False



In [16]:
lr_red = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss', factor=0.2, patience=3, verbose=1, mode='auto',
    min_delta=0.0001, cooldown=0, min_lr=0.0000001)
callbacks = [lr_red ]


In [17]:
BATCH_SIZE = 2048

In [18]:
train_generator = generator(batch_size= BATCH_SIZE)

In [19]:
from tensorflow.keras.backend import manual_variable_initialization
manual_variable_initialization(True)

In [41]:
history = model.fit_generator(
        train_generator,
        steps_per_epoch=(len(text_sequences)/BATCH_SIZE),
        epochs=200, 
        verbose=1,
        callbacks = callbacks
        )


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

KeyboardInterrupt: 

In [42]:
model.save(
    'my_file', overwrite=True, include_optimizer=True, save_format='h5',
    signatures=None, options=None
)

In [43]:
np.save('wordtoix.npy', wordtoix) 
np.save('ixtoword.npy', ixtoword) 

In [None]:
rm -rf /kaggle/working

In [None]:
#model = tf.keras.models.load_model('../input/language-model/my_file')

In [39]:
X_dummy = ['two','dogs' , 'fighting']
X_final = X_dummy

In [40]:

length = 30
for alpha in (range(length)):
  
  X_gamma = word_to_ix(X_dummy , wordtoix)
  X_alpha = np.expand_dims(np.array(X_gamma) , axis = 0)
  yhat = model.predict(X_alpha)
  yhat = np.argmax(yhat)
  word_to_add = ix_to_word([yhat] , ixtoword)
  X_final.append(word_to_add[0])
  print(X_dummy)
  #print(word_to_add[0])
  X_dummy.append(word_to_add[0])
  if(len(X_dummy)>12):
        X_dummy.pop(0)

#print(X_dummy)


['two', 'dogs', 'fighting', 'between']
['two', 'dogs', 'fighting', 'between', 'between', 'the']
['two', 'dogs', 'fighting', 'between', 'between', 'the', 'the', 'coachman']
['two', 'dogs', 'fighting', 'between', 'between', 'the', 'the', 'coachman', 'coachman', 'the']
['two', 'dogs', 'fighting', 'between', 'between', 'the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman']
['dogs', 'fighting', 'between', 'between', 'the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman', 'coachman', 'after']
['fighting', 'between', 'between', 'the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman', 'coachman', 'after', 'after', 'and']
['between', 'between', 'the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman', 'coachman', 'after', 'after', 'and', 'and', 'those']
['between', 'the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman', 'coachman', 'after', 'after', 'and', 'and', 'those', 'those', 'had']
['the', 'the', 'coachman', 'coachman', 'the', 'the', 'coachman', 'coac

In [None]:
np.save('wordtoix.npy', wordtoix) 

# Load
#ixtoword_copy = np.load('ixtoword.npy',allow_pickle='TRUE').item()
#print(ixtoword_copy['boy'])

In [None]:
a_file = open("ixtoword.json", "r")
ixtoword_copy = a_file.read()

In [None]:
ixtoword_copy

In [None]:
json_file = model.to_json()
with open(json_file_path, "w") as file:
   file.write(json_file)
# serialize weights to HDF5
model.save_weights(h5_file)

In [None]:
rm -rf /kaggle/working