In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jflegconll14c4/Total_final_dataset.csv


In [2]:
df = pd.read_csv("/kaggle/input/jflegconll14c4/Total_final_dataset.csv", dtype=str)

In [3]:
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, Embedding, TimeDistributed, Dense, Concatenate, dot, Activation
from tensorflow.keras.models import Model, load_model
from keras.utils.vis_utils import plot_model
import pickle

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67645 entries, 0 to 67644
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   67645 non-null  object
 1   target  67645 non-null  object
dtypes: object(2)
memory usage: 1.0+ MB


In [5]:
df = df.dropna()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67645 entries, 0 to 67644
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   67645 non-null  object
 1   target  67645 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [7]:
tkn_all = Tokenizer(filters='', lower=False, split= ' ', oov_token = "<OOV>", char_level = False)
tkn_all.fit_on_texts(df['input'] + df['target'])

In [8]:
start_token = '<sos>'
stop_token = '<eos>'

len(tkn_all.word_index)

206639

In [9]:
tkn_all.word_index = {word: index + 1 for index, word in enumerate(tkn_all.word_index)}
tkn_all.word_index[start_token] = 206640
tkn_all.word_index[stop_token] = 206641
tkn_all.index_word[206640] = '<sos>'
tkn_all.index_word[206641] = '<eos>'

In [10]:
st_token = tkn_all.word_index['<sos>']
ed_token = tkn_all.word_index['<eos>']

In [11]:
def preprocessing_df_corr(df, col, mxlen):
    seq = tkn_all.texts_to_sequences(start_token + " " + df[col]+ " "+ stop_token)
    seq_padded = pad_sequences(seq, maxlen = mxlen, padding = 'post', truncating= 'post')
    return seq_padded

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train, test = train_test_split(df, test_size = 0.2, random_state = 200)

In [14]:
vocab_size = len(tkn_all.word_index) + 1

In [15]:
inc_train_seq = preprocessing_df_corr(train,'input', 50)
inc_test_seq =preprocessing_df_corr(test,'input', 50)

corr_train_seq = preprocessing_df_corr(train,'target', 50)
corr_test_seq = preprocessing_df_corr(test,'target', 50)

In [16]:
encoder_units = 64
decoder_units = 64
embedding_dim = 50

In [17]:
# Define the encoder model
encoder_inputs = Input(shape=(inc_train_seq.shape[1],))
encoder_embedding = Embedding(vocab_size, embedding_dim, input_length=inc_train_seq.shape[1])(encoder_inputs)
encoder = LSTM(encoder_units, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder model

# decoder_embedding = Embedding(vocab_size, embedding_dim)
# decoder_embedded = decoder_embedding(decoder_inputs)

decoder = LSTM(decoder_units, return_sequences=True, return_state=True)
decoder_outputs = decoder(encoder_embedding, initial_state=encoder_states)[0]

# Attention Mechanism
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax', name='attention')(attention)

context = dot([attention, encoder_outputs], axes=[2,1])
decoder_combined_context = Concatenate()([context, decoder_outputs])

# Dense layer for prediction
outputs = TimeDistributed(Dense(vocab_size, activation='softmax'))(decoder_combined_context)

model = Model(inputs=encoder_inputs, outputs=outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 50)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 50, 50)       10332100    ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 50, 64),     29440       ['embedding[0][0]']              
                                 (None, 64),                                                      
                                 (None, 64)]                                                      
                                                                                              

In [19]:
from keras.callbacks import CSVLogger

In [20]:
log_csv = CSVLogger('logs.csv', separator=",", append=False)

In [21]:
import re
def correct_sentence(sentence):
    sequence = tkn_all.texts_to_sequences([start_token + " " + sentence + " " + stop_token])
    padded = pad_sequences(sequence, maxlen=inc_train_seq.shape[1], padding = 'post', truncating = 'post')
    encoded = model.predict(padded)
    y = np.argmax(encoded, axis = 2)
    y = np.reshape(y, (1,50))
    decoded = []
    decoded_np = np.ndarray(50)
    for i in range(encoded.shape[1]):
        word_index = np.argmax(encoded[0, i, :])
        decoded_np = np.append(decoded_np,word_index)
        if word_index > 0:
            decoded.append(tkn_all.index_word[word_index])
            if word_index == tkn_all.word_index[stop_token]:
                break
        else:
            decoded.append('<OOV>')
    return ' '.join(decoded)


def correctAllsentences(input_sentences, target_sentences):
  target_value = target_sentences
  value = []
  for i in input_sentences:
    sen = correct_sentence(i)
    sen = sen.replace("<OOV>" , "")
    sen = sen.replace("<sos>", "")
    sen = sen.replace("<eos>", "")
    sen = re.sub(" +", " " , sen)
    if sen[0] == " ":
      sen = sen[1:]
    value.append(sen)
  return value, target_value


In [21]:
from nltk.translate.bleu_score import sentence_bleu 
def calculate_bleu(candidate, references):
  bleu_score = []
  for i in range(len(candidate)):
      score = sentence_bleu(references, candidate, weights=(0.5,0.5,0,0))
      bleu_score.append(score)
  return bleu_score

In [22]:
import csv
class CustomCallback(keras.callbacks.Callback):
  def __init__(self, candidates=[], references=[]):
    self.candidates, self.references = correctAllsentences(df['input'][900:1000], df['target'][900:1000])
    self.bleu_list = []
    self.epoch_list = []
  def on_epoch_end(self, epoch, logs=None):
    keys = list(logs.keys())
    bleu_score = calculate_bleu(self.candidates, self.references)
    print("\nThe bleu Score is  {} ".format(epoch), sum(bleu_score)/ len(bleu_score))
    self.bleu_list.append(sum(bleu_score)/ len(bleu_score))
    self.epoch_list.append(epoch)
  def on_train_end(self, logs=None):
    print(self.bleu_list)
    print(self.epoch_list)
    with open('data.csv', 'w', newline='') as file:
      writer = csv.writer(file)
      writer.writerow(['Epoch', 'Bleu'])

      for i in range(len(self.epoch_list)):
          writer.writerow([self.epoch_list, self.bleu_list])



In [22]:
history = model.fit(inc_train_seq, corr_train_seq, epochs=15, batch_size=64, verbose = 1, validation_split = 0.2, callbacks = [log_csv])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [23]:
cor_sentence = correct_sentence("He eating food")
cor_sentence



'<sos> He eating food <eos>'

In [26]:
history = model.fit(inc_train_seq, corr_train_seq, epochs=15, batch_size=64, verbose = 1, validation_split = 0.2, callbacks = [log_csv])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [27]:
model.save('model_encoder_decoder_attention_30_epochs.h5')

AttributeError: 'History' object has no attribute 'accuracy'

In [28]:
from IPython.display import FileLink

FileLink(r'model_encoder_decoder_attention_30_epochs.h5')