<a href="https://colab.research.google.com/github/a8nguyen/4845_projects/blob/main/nlp5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only: mount Google Drive so the corpus files referenced below under
# /content/drive/MyDrive are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

import os
from keras.models import Model
from keras.layers import Input, LSTM, Dense, GRU
import matplotlib.pyplot as plt
import seaborn as sns

from keras.utils import to_categorical
from numpy import argmax


import re
import os
import sys
import numpy as np
import time
import math
from datetime import datetime

from nltk.translate.bleu_score import corpus_bleu

### **Loading Dataset**

In [None]:
def load_data(filePath, encoding="utf-8"):
  """Read a whole text file and return its content lower-cased.

  Args:
    filePath: Path of the text file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so that
      accented characters are decoded the same way on every platform instead
      of depending on the locale default.

  Returns:
    The complete file content as a single lower-cased string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid text in `encoding`.
  """
  # The context manager closes the handle even if reading fails.
  with open(filePath, 'r', encoding=encoding) as handle:
    return handle.read().lower()

### **Defining Preprocessing Class**

In [None]:
class Language_Translation_Model:
  """Character-level preprocessing for one side of a parallel corpus.

  `preprocess_data` fits the character vocabulary and the maximum sentence
  length on a training text; `preprocess_test_data` re-uses that fitted state
  to encode unseen text. Sentences are expected to be wrapped in
  `<s> ... </s>` markers.

  Attributes (all None until `preprocess_data` has been called):
    all_Unique_Characters: Sorted list of the characters seen in training.
    length_all_Unique_Characters: Vocabulary size (width of a one-hot row).
    length_max_sentence: Longest training sentence, in characters (number of
      rows of every one-hot matrix).
    char_to_num / num_to_char: Character <-> index lookup tables.
  """

  # Two-character sequences whose stray space is removed during cleaning.
  _SPACED_PUNCTUATION = (' .', ' ,', ' ;', ' ?', ' !', '( ', ' )')

  def __init__(self, lang="") -> None:
    self.lang = lang
    self.all_Unique_Characters = None
    self.length_all_Unique_Characters = None
    self.length_max_sentence = None
    self.char_to_num = None
    self.num_to_char = None

  def sentence_to_array(self, sentence):
    """Map every character of `sentence` to its vocabulary index.

    Characters that are not in the fitted vocabulary are mapped to index 0,
    i.e. they become indistinguishable from whichever character sorts first.
    """
    return [self.char_to_num.get(char, 0) for char in sentence]

  def sentenceArray_to_OHE(self, sentenceArray):
    """One-hot encode a list of character indices into a fixed-size matrix.

    Returns:
      A float32 array of shape (length_max_sentence,
      length_all_Unique_Characters). Rows past the end of the sentence are
      all-zero padding. A sentence longer than `length_max_sentence` (possible
      for unseen test data) is truncated rather than raising an error.
    """
    n_rows = self.length_max_sentence
    n_cols = self.length_all_Unique_Characters
    one_hot = np.zeros((n_rows, n_cols), dtype='float32')
    # Truncate so the row indices never exceed the fixed matrix height.
    indices = np.asarray(sentenceArray[:n_rows], dtype=int)
    one_hot[np.arange(len(indices)), indices] = 1.0
    return one_hot

  def OHE_to_sentenceArray(self, sentence_array_OHE):
    """Convert a one-hot matrix back to character indices, skipping padding rows."""
    return [np.argmax(row) for row in sentence_array_OHE if np.sum(row) != 0]

  def _split_sentences(self, text):
    """Clean `text` and split it into stripped sentences.

    Returns:
      (cleaned_text, sentences). `cleaned_text` still contains the `</s>`
      markers; the piece after the last `</s>` is dropped as incomplete.
    """
    text = text.replace("\n", " ")
    for token in self._SPACED_PUNCTUATION:
      text = text.replace(token, token.strip())
    text = text.replace("<s>", "")
    sentences = [piece.strip() for piece in text.split("</s>")[:-1]]
    return text, sentences

  def _encode_sentences(self, sentences):
    """Build the DataFrame with `sentence`, `sentence_array` and `sentence_array_OHE` columns."""
    data = pd.DataFrame()
    data['sentence'] = sentences
    data['sentence_array'] = data['sentence'].apply(self.sentence_to_array)
    data['sentence_array_OHE'] = data['sentence_array'].apply(self.sentenceArray_to_OHE)
    return data

  def preprocess_data(self, text):
    """Fit the vocabulary and max sentence length on `text`, then encode it.

    Args:
      text: Training text whose sentences are terminated by `</s>`.

    Returns:
      DataFrame with one row per sentence and the columns `sentence`,
      `sentence_array` (index list) and `sentence_array_OHE` (one-hot matrix).

    Raises:
      ValueError: If `text` contains no complete `</s>`-terminated sentence.
    """
    text, sentences = self._split_sentences(text)
    if not sentences:
      raise ValueError("text contains no complete '</s>'-terminated sentence")

    # The vocabulary is every character of the cleaned text except the markers.
    self.all_Unique_Characters = sorted(set(text.replace("</s>", "")))
    self.length_all_Unique_Characters = len(self.all_Unique_Characters)
    self.char_to_num = {char: num for num, char in enumerate(self.all_Unique_Characters)}
    self.num_to_char = {num: char for char, num in self.char_to_num.items()}

    self.length_max_sentence = max(len(sentence) for sentence in sentences)

    return self._encode_sentences(sentences)

  def preprocess_test_data(self, text):
    """Encode unseen `text` with the vocabulary fitted by `preprocess_data`.

    The fitted state is not modified. Unknown characters map to index 0 and
    sentences longer than the fitted maximum are truncated in the one-hot
    matrix (see `sentenceArray_to_OHE`).

    Returns:
      DataFrame with the same columns as `preprocess_data`.
    """
    _, sentences = self._split_sentences(text)
    return self._encode_sentences(sentences)

Reference link: https://www.kaggle.com/code/digvijayyadav/neural-machine-translation-de-l-anglais-vers-le-f/notebook

### **Process Input and Target datasets**

In [None]:
# Training source-side text on the mounted Drive.
filePath = "/content/drive/MyDrive/NLP/NLP_Assingnmnet05/data/train-05/train-source.txt"
# load_data returns the whole file as one lower-cased string.
train_input_data = load_data(filePath)

In [None]:
# Fit the source-side vocabulary and max sentence length.
# NOTE: the slice takes the first 10,000 *characters* of the text, not 10,000
# sentences; a sentence cut off by the slice is dropped by preprocess_data.
Input_LTM = Language_Translation_Model()
Input_data = Input_LTM.preprocess_data(train_input_data[0:10000])
Input_data.head()

Unnamed: 0,sentence,sentence_array,sentence_array_OHE
0,"cinnte go leór, thiocfadh dóbhtha bás a fhaghá...","[16, 22, 26, 26, 31, 18, 0, 20, 27, 0, 24, 18,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,(bhí sé follasach go rabh an poll sin ag foscl...,"[4, 15, 21, 36, 0, 30, 35, 0, 19, 27, 24, 24, ...","[[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,..."
2,) d'fhéadfadh siad bás ' fhagháil ar a bhruach...,"[5, 0, 17, 3, 19, 21, 35, 14, 17, 19, 14, 17, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,..."
3,thiocfadh dóbhtha fosta lámh a chur ina mbás f...,"[31, 21, 22, 27, 16, 19, 14, 17, 21, 0, 17, 37...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,"' na dhiaidh sin, bhí rud éigin do-chreidte ag...","[3, 0, 26, 14, 0, 17, 21, 22, 14, 22, 17, 21, ...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Training target-side text on the mounted Drive (re-uses the `filePath` name).
filePath = "/content/drive/MyDrive/NLP/NLP_Assingnmnet05/data/train-05/train-target.txt"
# load_data returns the whole file as one lower-cased string.
train_target_data = load_data(filePath)

In [None]:
# Fit the target-side vocabulary and max sentence length.
# NOTE(review): source and target are each cut at 10,000 characters, so the two
# frames are not guaranteed to hold the same number of sentences — verify.
Output_LTM = Language_Translation_Model()
Output_data = Output_LTM.preprocess_data(train_target_data[0:10000])
Output_data.head()

Unnamed: 0,sentence,sentence_array,sentence_array_OHE
0,"cinnte go leor, thiocfadh dóibh bás a fháil ar...","[15, 21, 25, 25, 30, 17, 0, 19, 26, 0, 23, 17,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,bhí sé follasach go raibh an poll sin ag foscl...,"[14, 20, 35, 0, 29, 34, 0, 18, 26, 23, 23, 13,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,d'fhéadfadh siad bás a fháil ar a bhruach agus...,"[16, 2, 18, 20, 34, 13, 16, 18, 13, 16, 20, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,thiocfadh dóibh fosta lámh a chur ina mbás féi...,"[30, 20, 21, 26, 15, 18, 13, 16, 20, 0, 16, 36...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,ina dhiaidh sin bhí rud éigin dochreidte agus ...,"[21, 25, 13, 0, 16, 20, 21, 13, 21, 16, 20, 0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Encoder input: one (max_len, n_source_chars) one-hot matrix per source sentence.
encoder_input_data = np.stack(list(Input_data['sentence_array_OHE']))

# Decoder target: the one-hot encoded reference sentence.
decoder_target_data = np.stack(list(Output_data['sentence_array_OHE']))

# Decoder input for teacher forcing: the target shifted one timestep to the
# right, so that at step t the decoder sees character t-1 and must predict
# character t. The previous version zeroed feature column 0 instead of
# shifting along the time axis, which fed the decoder the very character it
# was asked to predict.
decoder_input_data = np.zeros_like(decoder_target_data)
decoder_input_data[:, 1:, :] = decoder_target_data[:, :-1, :]
# Start token at step 0: index 0, the same start character that
# Model_LSTM.predict seeds its first decoding step with.
decoder_input_data[:, 0, 0] = 1.0

### **Model Preparation & Training**

In [None]:
class Model_LSTM:
  """Character-level LSTM encoder-decoder with training, greedy decoding and BLEU scoring.

  Three Keras models share the same layers:
    model: training model, maps [encoder_inputs, decoder_inputs] to the
      per-step softmax over target characters.
    encoder_model: maps an encoded source sentence to the LSTM states [h, c].
    decoder_model: runs one decoding step from a previous character and states.

  Args:
    Input_LTM: Fitted source-side `Language_Translation_Model`.
    Output_LTM: Fitted target-side `Language_Translation_Model`.
    num_samples: Stored on the instance; not used by any method of this class.
    latent_dim: Size of the LSTM hidden state.
    batch_size: Batch size passed to `fit`.
    epochs: Number of epochs passed to `fit`.
  """

  def __init__(self, Input_LTM, Output_LTM, num_samples=10000, latent_dim=256, batch_size = 128, epochs=1):
    self.batch_size = batch_size
    self.epochs = epochs
    self.latent_dim = latent_dim
    self.num_samples = num_samples
    self.Input_LTM = Input_LTM
    self.Output_LTM = Output_LTM

    # Encoder: only the final states are kept, the output sequence is discarded.
    encoder_inputs = Input(shape=(None, self.Input_LTM.length_all_Unique_Characters))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    encoder_states = [state_h, state_c]

    decoder_inputs = Input(shape=(None, self.Output_LTM.length_all_Unique_Characters))

    # The decoder returns full sequences and its states. The states are
    # ignored by the training model but needed by the inference model below.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(self.Output_LTM.length_all_Unique_Characters, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Training model ------------------
    self.model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    self.model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

    # Inference models (re-use the trained layers) ------------------
    self.encoder_model = Model(encoder_inputs, encoder_states)

    decoder_states_inputs = [Input(shape=(latent_dim,)), Input(shape=(latent_dim,))]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    self.decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

  def train(self, encoder_input_data, decoder_input_data, decoder_target_data):
    """Fit the training model, holding out 20% of the samples for validation, and print the elapsed time."""
    startTime = datetime.now()

    self.model.fit([encoder_input_data, decoder_input_data],
          decoder_target_data,
          batch_size=self.batch_size,
          epochs=self.epochs,
          validation_split=0.2)
    endTime = datetime.now()
    print("Training is Completed Successfully in time : ", endTime-startTime)

  def predict(self, input_seq):
    """Greedy-decode a translation for one encoded source sentence.

    Args:
      input_seq: Array holding a batch of exactly one one-hot encoded source
        sentence.

    Returns:
      The decoded string. The target vocabulary built by
      `Language_Translation_Model` never contains '\\n' (newlines are replaced
      during cleaning), so in practice decoding stops only once the output
      exceeds `Output_LTM.length_max_sentence` characters.
    """
    n_target_chars = self.Output_LTM.length_all_Unique_Characters

    # Encode the input as state vectors.
    states_value = self.encoder_model.predict(input_seq, verbose=0)
    # Seed the decoder with the start character (index 0).
    target_seq = np.zeros((1, 1, n_target_chars))
    target_seq[0, 0, 0] = 1.

    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
      # verbose=0 keeps the per-character progress bars out of the cell output.
      output_tokens, h, c = self.decoder_model.predict([target_seq] + states_value, verbose=0)

      # Pick the most probable next character.
      sampled_token_index = np.argmax(output_tokens[0, -1, :])
      sampled_char = self.Output_LTM.num_to_char[sampled_token_index]
      decoded_sentence += sampled_char

      # Exit condition: stop character found or max length exceeded.
      if (sampled_char == '\n' or
          len(decoded_sentence) > self.Output_LTM.length_max_sentence):
        stop_condition = True

      # Feed the sampled character and the new states into the next step.
      target_seq = np.zeros((1, 1, n_target_chars))
      target_seq[0, 0, sampled_token_index] = 1.
      states_value = [h, c]

    return decoded_sentence

  def evaluation(self, test_Input_text, test_Output_text, max_chars=100):
    """Translate a test text and score the predictions with BLEU.

    Args:
      test_Input_text: Raw source-side test text.
      test_Output_text: Raw target-side (reference) test text.
      max_chars: Only the first `max_chars` characters of each text are used
        (the original hard-coded limit of 100 is the default). Pass None to
        use the full texts.

    Returns:
      (frame, score): the preprocessed reference frame with an added
      `sentences_predictions` column, and the corpus BLEU score.

    Raises:
      ValueError: If source and reference yield different sentence counts.
    """
    # Both frames come from the method's own arguments. The previous version
    # read a notebook global for the reference text and encoded row 0 for
    # every sentence.
    test_Input_data = self.Input_LTM.preprocess_test_data(test_Input_text[0:max_chars])
    test_Output_data = self.Output_LTM.preprocess_test_data(test_Output_text[0:max_chars])
    if len(test_Input_data) != len(test_Output_data):
      raise ValueError(
          f"source has {len(test_Input_data)} sentences but reference has {len(test_Output_data)}")

    # Predictions -------------------
    predicted_sentences = []
    for encoded_sentence in test_Input_data['sentence_array_OHE']:
      input_record = np.array([encoded_sentence])
      predicted_sentences.append(self.predict(input_record))
    test_Output_data['sentences_predictions'] = predicted_sentences
    actual_sentences = list(test_Output_data['sentence'])

    # Score finding -----------------
    blueScore = self.blueScore(actual_sentences, predicted_sentences)
    return test_Output_data, blueScore

  def blueScore(self, actual_sentences, predicted_sentences):
    """Corpus BLEU over whitespace-separated words.

    Each prediction is first clipped to the character length of its
    reference, as in the original implementation. Both arguments may be lists
    of sentence strings or two single strings (one sentence pair).

    Raises:
      ValueError: If the two lists differ in length.
    """
    if isinstance(actual_sentences, str) and isinstance(predicted_sentences, str):
      actual_sentences, predicted_sentences = [actual_sentences], [predicted_sentences]
    if len(actual_sentences) != len(predicted_sentences):
      raise ValueError("actual_sentences and predicted_sentences must have the same length")

    # corpus_bleu expects, per hypothesis, a LIST of tokenised references,
    # and every hypothesis as a token list (not a raw string).
    references = []
    hypotheses = []
    for actual, predicted in zip(actual_sentences, predicted_sentences):
      clipped = predicted[0:len(actual)]
      references.append([actual.split()])
      hypotheses.append(clipped.split())
    return corpus_bleu(references, hypotheses)

  def blueScore1(self, actual_sentences, predicted_sentences):
    """Character-level corpus BLEU with both sides space-padded to the max sentence length.

    Accepts two lists of sentences or two single strings. The inputs are not
    modified. Because the padding is part of the scored tokens, short
    sentences score higher than with `blueScore`.

    Raises:
      TypeError: If the arguments are neither two lists nor two strings.
      ValueError: If the two lists differ in length.
    """
    if isinstance(actual_sentences, str) and isinstance(predicted_sentences, str):
      actual_sentences, predicted_sentences = [actual_sentences], [predicted_sentences]
    elif not (isinstance(actual_sentences, list) and isinstance(predicted_sentences, list)):
      raise TypeError("expected two lists of sentences or two strings")
    if len(actual_sentences) != len(predicted_sentences):
      raise ValueError("actual_sentences and predicted_sentences must have the same length")

    maxLength = self.Output_LTM.length_max_sentence

    def pad(sentence):
      # Copy into a fresh token list so the caller's data is never mutated.
      tokens = list(sentence)
      return tokens + [" "] * (maxLength - len(tokens))

    references = [[pad(sentence)] for sentence in actual_sentences]
    hypotheses = [pad(sentence) for sentence in predicted_sentences]
    return corpus_bleu(references, hypotheses)

  def saveModel(self, modelPath, fileName="machine_translation_model"):
    """Save the training model's weights to `modelPath + fileName + '.h5'`.

    `modelPath` is concatenated as-is, so it must end with a path separator.
    """
    self.model.save_weights(modelPath+fileName+'.h5')
    print(f"Model is stored successfully @ {modelPath+fileName+'.h5'}")

  def loadModel(self, modelFilePath):
    """Load weights from `modelFilePath` into `self.model`."""
    self.model.load_weights(modelFilePath)
    print("Loading Model is successfull in a variable @ 'self.model'")


In [None]:
# Build the encoder-decoder with the constructor defaults
# (latent_dim=256, batch_size=128, epochs=1).
ModelLSTM = Model_LSTM(Input_LTM, Output_LTM)

In [None]:
# Fit the model; train() holds out 20% of the samples for validation and prints the elapsed time.
ModelLSTM.train(encoder_input_data, decoder_input_data, decoder_target_data)







Training is Completed Successfully in time :  0:00:08.886584


### **Prediction**

In [None]:
# Show the first training pair: the source sentence, then its reference translation.
input_sentence, output_sentence = (
    Input_data.loc[0, 'sentence'],
    Output_data.loc[0, 'sentence'],
)
print(input_sentence, output_sentence, sep="\n")

cinnte go leór, thiocfadh dóbhtha bás a fhagháil ar imeall an phuill udaí.
cinnte go leor, thiocfadh dóibh bás a fháil ar imeall an phoill úd.


In [None]:
# Wrap the first encoded source sentence in a batch of size one, the form predict() expects.
input_record = np.array([Input_data.loc[0, 'sentence_array_OHE']])
print(input_record.shape)

(1, 234, 39)


In [None]:
# Greedy-decode a translation for the single encoded sentence.
predicted_sentence = ModelLSTM.predict(input_record)



In [None]:
# Reference translation first, model prediction second.
print(output_sentence)
print(predicted_sentence)

In [None]:
# Character counts of the reference and of the prediction.
len(output_sentence), len(predicted_sentence)

(67, 237)

In [None]:
# Score the single prediction against its reference.
# NOTE(review): both arguments are plain strings here, whereas evaluation()
# passes lists of sentences — confirm blueScore handles both forms.
ModelLSTM.blueScore(output_sentence, predicted_sentence)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


9.966150368347767e-232

### **Evaluation**

**Loading Input_Test data**

In [None]:
# Test source-side text on the mounted Drive.
filePath = "/content/drive/MyDrive/NLP/NLP_Assingnmnet05/data/test-05/test-source.txt"
# load_data returns the whole file as one lower-cased string.
test_Input_text = load_data(filePath)

In [None]:
# Encode the first 100 characters of the test source with the vocabulary
# that Input_LTM fitted on the training source text.
test_Input_data = Input_LTM.preprocess_test_data(test_Input_text[0:100])  # Input_LTM is already defined
test_Input_data.head()

Unnamed: 0,sentence,sentence_array,sentence_array_OHE
0,scéal chathail freeman - téid mo dhearbhráthai...,"[30, 16, 35, 14, 24, 0, 16, 21, 14, 31, 21, 14...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Test target-side (reference) text on the mounted Drive.
filePath = "/content/drive/MyDrive/NLP/NLP_Assingnmnet05/data/test-05/test-target.txt"
# load_data returns the whole file as one lower-cased string.
test_output_text = load_data(filePath)

In [None]:
# Encode the first 100 characters of the test reference with the vocabulary
# that Output_LTM fitted on the training target text.
test_Output_data = Output_LTM.preprocess_test_data(test_output_text[0:100])  # Output_LTM is already defined
test_Output_data.head()

Unnamed: 0,sentence,sentence_array,sentence_array_OHE
0,scéal chathail freeman - téann mo dheartháir c...,"[29, 15, 34, 13, 23, 0, 15, 20, 13, 30, 20, 13...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [None]:
# Translate the first 100 characters of the test set and score the predictions with BLEU.
# Returns the reference frame (with a `sentences_predictions` column) and the score.
ModelLSTM.evaluation(test_Input_text[0:100], test_output_text[0:100])



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


(                                            sentence  \
 0  scéal chathail freeman - téann mo dheartháir c...   
 
                                       sentence_array  \
 0  [29, 15, 34, 13, 23, 0, 15, 20, 13, 30, 20, 13...   
 
                                   sentence_array_OHE  \
 0  [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
 
                                sentences_predictions  
 0  hhh hh h hh aaaaaaa a aa aa aa aa aa aa aa aa ...  ,
 8.477028509227734e-232)