In [None]:
# Mount Google Drive at /content/drive; every dataset and model path below
# lives under that mount point. google.colab is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import re

In [None]:
# Load the pickled tokenizer that correct_sentence() uses to map words to ids
# and back.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pickle that you produced yourself.
with open('/content/drive/MyDrive/dataset/tokenizer_lang8.pickle', 'rb') as temp:
    tkn_all = pickle.load(temp)

In [None]:
# Sentence boundary markers that correct_sentence() wraps around every input.
# Presumably the same markers the models were trained with - TODO confirm.
start_token = '<sos>'
stop_token = '<eos>'

In [None]:
# Load the evaluation set; dtype=str makes pandas parse every column as text.
df_eval = pd.read_csv('/content/drive/MyDrive/dataset/eval_total_dataset.csv', dtype = str)

In [None]:
df_eval.head()

Unnamed: 0,input,target
0,"In most cases , people complain about the time .",", people complain about time ."
1,Technologies had also been designed to help th...,Technologies have also been designed to help t...
2,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...
3,While current research and policies have helpe...,While current research and policies have helpe...
4,In cases where the elder do not have any child...,In cases where the elderly do not have any chi...


In [None]:
# Reserve one empty output column per model: 'predicted_ed' for the plain
# encoder-decoder and 'predicted_at' for the attention model.
# DataFrame.insert raises ValueError when the column already exists, so each
# insert is guarded to keep this cell safe to re-run.
for position, column in ((2, 'predicted_ed'), (3, 'predicted_at')):
    if column not in df_eval.columns:
        df_eval.insert(position, column, '')

In [None]:
df_eval.head()

Unnamed: 0,input,target,predicted_ed,predicted_at
0,"In most cases , people complain about the time .",", people complain about time .",,
1,Technologies had also been designed to help th...,Technologies have also been designed to help t...,,
2,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,,
3,While current research and policies have helpe...,While current research and policies have helpe...,,
4,In cases where the elder do not have any child...,In cases where the elderly do not have any chi...,,


In [None]:
def clip_text(text, max_len=32):
    """Clip every '.'-delimited sentence of ``text`` to at most ``max_len`` words.

    The text is split on '.', each piece is stripped, empty pieces are
    dropped, and any piece with more than ``max_len`` whitespace-separated
    words is cut down to its first ``max_len`` words. The surviving pieces are
    joined back together with '. '.

    Caveats that follow from splitting on '.':
      * a trailing period is not restored in the result;
      * the limit applies to each sentence separately, not to the whole text.

    Args:
        text: Input string.
        max_len: Maximum number of words kept per sentence. Defaults to 32,
            the value that used to be hard-coded.

    Returns:
        The clipped text as one string ('' when no non-empty sentence exists).
    """
    clipped_sentences = []
    for raw_sentence in text.split("."):
        sentence = raw_sentence.strip()
        words = sentence.split()
        if len(words) > max_len:
            # Too long: keep only the leading max_len words.
            clipped_sentences.append(' '.join(words[:max_len]))
        elif sentence:
            # Short enough: keep the stripped sentence exactly as written.
            clipped_sentences.append(sentence)
    return '. '.join(clipped_sentences)

In [None]:
# Reserve the 'clipped_input' column at position 2. Guarded because
# DataFrame.insert raises ValueError on an existing column, which would make
# re-running this cell fail.
if 'clipped_input' not in df_eval.columns:
    df_eval.insert(2, 'clipped_input', '')

In [None]:
# Run clip_text over every input so each sentence is cut to its word limit.
df_eval['clipped_input'] = df_eval['input'].apply(clip_text)

In [None]:
df_eval.head()

Unnamed: 0,input,target,clipped_input,predicted_ed,predicted_at
0,"In most cases , people complain about the time .",", people complain about time .","In most cases , people complain about the time",,
1,Technologies had also been designed to help th...,Technologies have also been designed to help t...,Technologies had also been designed to help th...,,
2,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,,
3,While current research and policies have helpe...,While current research and policies have helpe...,While current research and policies have helpe...,,
4,In cases where the elder do not have any child...,In cases where the elderly do not have any chi...,In cases where the elder do not have any child...,,


In [None]:
def correct_sentence(sentence, model, max_len=32):
    """Run ``sentence`` through ``model`` and greedily decode the output.

    The sentence is wrapped in the notebook-level ``start_token`` /
    ``stop_token``, converted to ids with ``tkn_all``, post-padded/truncated
    to ``max_len`` ids and passed to ``model.predict``. For every output
    position the highest-scoring id is mapped back to a word; decoding stops
    right after the first ``stop_token``.

    Args:
        sentence: Raw input sentence.
        model: Object whose ``predict`` returns an array indexed as
            [batch, position, vocabulary id]; only batch entry 0 is decoded.
        max_len: Length the id sequence is padded/truncated to. Defaults to
            32, the value that used to be hard-coded.

    Returns:
        The decoded words joined by single spaces. The string can still
        contain the stop token and '<OOV>' markers; remove_dict_words()
        strips them.

    Raises:
        KeyError: If ``stop_token`` is missing from ``tkn_all.word_index``.
    """
    wrapped = start_token + " " + sentence + " " + stop_token
    sequence = tkn_all.texts_to_sequences([wrapped])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')
    encoded = model.predict(padded)

    # A single argmax over the vocabulary axis yields the best id per position.
    token_ids = np.argmax(encoded[0], axis=-1)
    stop_id = tkn_all.word_index[stop_token]

    decoded = []
    for token_id in token_ids:
        token_id = int(token_id)
        if token_id > 0:
            # An id without an index_word entry is reported as '<OOV>' rather
            # than aborting the whole evaluation run with a KeyError.
            decoded.append(tkn_all.index_word.get(token_id, '<OOV>'))
            if token_id == stop_id:
                break
        else:
            # Id 0 is the default padding value of pad_sequences and is never
            # assigned to a word, so it is emitted as the '<OOV>' marker.
            decoded.append('<OOV>')
    return ' '.join(decoded)

In [None]:
# Marker tokens to strip from decoded model output. Despite the name this is
# a set, which is all remove_dict_words() needs for its membership test.
word_dict = {"<sos>", "<OOV>", "<eos>"}

def remove_dict_words(text, word_dict):
    """Return ``text`` without the whitespace-separated tokens in ``word_dict``.

    Args:
        text: String to filter.
        word_dict: Container of tokens to drop (anything supporting ``in``).

    Returns:
        The remaining tokens joined by single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in word_dict]
    return ' '.join(kept_tokens)

In [None]:
# Load the saved attention model (encoder-decoder with attention, per the
# checkpoint file name) from Drive.
model_attGEC = load_model('/content/drive/MyDrive/models/model_encoder_decoder_attention_10_epochs.h5')

In [None]:
# Load the saved plain encoder-decoder model (per the checkpoint file name)
# from Drive.
model_ED_GEC = load_model('/content/drive/MyDrive/models/model_encoder_decoder_10epochs.h5')

In [None]:
model_attGEC.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 32)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 32, 32)       4612640     ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 32, 32),     8320        ['embedding[0][0]']              
                                 (None, 32),                                                      
                                 (None, 32)]                                                      
                                                                                              

In [None]:
# Smoke test: correct one short sentence with both models and strip the
# marker tokens. text1 comes from the attention model, text2 from the plain
# encoder-decoder.
text = 'he lik fishing'
text1 = remove_dict_words(correct_sentence(text, model_attGEC), word_dict)
text2 = remove_dict_words(correct_sentence(text, model_ED_GEC), word_dict)



In [None]:
print(text1, text2)

He like fishing He likes fishing


In [None]:
# Pick one evaluation row (position 456) as a longer example; note this reads
# the raw 'input' column, not 'clipped_input'.
text = df_eval.iloc[456]['input']
print(text)

Vegetables would also demonstrate the same benefits in antioxidant capabilities , for example , Genistein in soybeans works as cancer genes ' enzymes ' inhibitor .


In [None]:
len(text.split(' '))

26

In [None]:
# Show the attention model's correction of the example with markers removed.
print(remove_dict_words(correct_sentence(text, model_attGEC), word_dict))

The would also a the same benefits in situation the , for example , , , , works as as be ' ' ' . .


In [None]:
# Generate the encoder-decoder correction for every row.
# Series.apply keeps each result aligned with its own row label. Reading by
# position (iloc[i]) while writing by label (.at[i, ...]) is only correct for
# a default RangeIndex and would add or misplace rows otherwise.
df_eval['predicted_ed'] = df_eval['clipped_input'].apply(
    lambda sentence: remove_dict_words(correct_sentence(sentence, model_ED_GEC), word_dict)
)



In [None]:
# Generate the attention-model correction for every row.
# Series.apply keeps each result aligned with its own row label. Reading by
# position (iloc[i]) while writing by label (.at[i, ...]) is only correct for
# a default RangeIndex and would add or misplace rows otherwise.
df_eval['predicted_at'] = df_eval['clipped_input'].apply(
    lambda sentence: remove_dict_words(correct_sentence(sentence, model_attGEC), word_dict)
)



In [None]:
df_eval.head(10)

Unnamed: 0,input,target,clipped_input,predicted_ed,predicted_at
0,"In most cases , people complain about the time .",", people complain about time .","In most cases , people complain about the time","In most cases , people complain about the time","In most cases , people complain about the time"
1,Technologies had also been designed to help th...,Technologies have also been designed to help t...,Technologies had also been designed to help th...,My had also been designed to help the elderly ...,Generally had also been designed to help the e...
2,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...,Nuclear power has always been a very controver...
3,While current research and policies have helpe...,While current research and policies have helpe...,While current research and policies have helpe...,While current research and policies have helpe...,While current research and policies have helpe...
4,In cases where the elder do not have any child...,In cases where the elderly do not have any chi...,In cases where the elder do not have any child...,In cases where the elder do not have any child...,In cases where the elderly do not have any chi...
5,He knows many countries around the world .,He knows of many countries around the world .,He knows many countries around the world,He knows many countries around the world world,He knows many countries around the world world
6,A question has been set up in Straits Times la...,A question has been brought up in Straits Time...,A question has been set up in Straits Times la...,"A question has been set up in the , lately , W...",A question has been set up in the Times lately...
7,""" From August 2008 to July this year , 172 peo...",""" From August 2008 to July this year , 172 peo...",""" From August 2008 to July this year , 172 peo...","From August festival to July this year , , , f...","The From August of to July this year , , peopl..."
8,"From the example of Hiroshima , we can see how...","From the example of Hiroshima , we can see how...","From the example of Hiroshima , we can see how...","From the example of Hiroshima , we can see how...","From the example of Hiroshima , we can see how..."
9,"For exemple , you were working like broadcaste...","For example , you were working like a broadcas...","For exemple , you were working like broadcaste...","For example , you were working like , but in t...","For example , you were working like , but in t..."


In [None]:
# Persist the inputs, targets and both models' predictions to Drive.
# NOTE(review): to_csv writes the index by default, so reloading this file
# yields an extra 'Unnamed: 0' column - pass index=False if that is unwanted.
df_eval.to_csv('/content/drive/MyDrive/dataset/evaluation_result_EDandAtt.csv')

In [None]:
# Empty frame that the next cell fills with ROUGE-1 precision, recall and
# F-measure values.
df_rg1 = pd.DataFrame(columns = ['precision','recall','fmeasure'])

In [None]:
# Copy the ROUGE-1 precision / recall / F-measure of every entry in
# rouge_score into row i of df_rg1, reading each entry's `.high` value.
# NOTE(review): `rouge_score` is not defined in any cell visible here, so this
# raises NameError on a fresh Restart & Run All unless it is created
# elsewhere - confirm where it comes from.
# NOTE(review): `.high` looks like the upper bound of an aggregated
# low/mid/high score - verify that `.high` rather than `.mid` is intended.
for i in range(len(rouge_score)):
  df_rg1.at[i,'precision'] = rouge_score[i]['rouge1'].high.precision
  df_rg1.at[i, 'recall'] = rouge_score[i]['rouge1'].high.recall
  df_rg1.at[i, 'fmeasure'] = rouge_score[i]['rouge1'].high.fmeasure