In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense,  Dropout, Input, LSTM, Embedding,SpatialDropout1D, BatchNormalization, Flatten, LayerNormalization
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import tensorflow_text as text

import pandas_tfrecords as pdtfr

import matplotlib.pyplot as plt
import re

from pathlib import Path

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import pickle
from sklearn.metrics import confusion_matrix

import keras_tuner as kt
import seaborn as sns
import tensorflow_datasets as tfds


# Seed the TensorFlow and NumPy global random generators.
tf.random.set_seed(0)
np.random.seed(0)
# Base English stop-word list from NLTK; a copy of it is extended with rare words further below.
stop_words_list = stopwords.words('english') 

#tf.config.run_functions_eagerly(True)
#import tensorflow_data_validation as tfdv
#tf.debugging.set_log_device_placement(True)
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

from sklearn.model_selection import train_test_split
import keras_nlp
from transformers import  RobertaTokenizer, TFRobertaModel



# Read dataset


In [None]:
# Load the 'imdb_reviews' dataset through TensorFlow Datasets.
# with_info=True also returns the dataset metadata object (`info`);
# as_supervised=True is requested so each element can be unpacked as a 2-tuple (see the next cell).
dataset, info = tfds.load('imdb_reviews', with_info=True, 
                          as_supervised=True)

# Keep the 'train' and 'test' splits under their own names.
train_dataset, test_dataset = dataset['train'], dataset['test']
train_dataset

In [None]:
# Peek at a single element of the training split: print the shapes and the raw values.
for i, o in train_dataset.take(1):
    print(i.shape, o.shape)
    print(i, o)

In [None]:
def convert_tfdataframe_to_pddataframe(tf_df):
    """Materialise a dataset of (text, label) pairs as a pandas DataFrame.

    Args:
        tf_df: sized iterable of ``(features, label)`` pairs. Both elements
            must expose ``.numpy()``; ``features.numpy()`` must return
            UTF-8 encoded bytes.

    Returns:
        pandas.DataFrame with the columns ``review`` (decoded ``str``) and
        ``sentiment`` (whatever ``label.numpy()`` returns).
    """
    rows = []
    for position, (text_tensor, label_tensor) in enumerate(tf_df):
        # Progress line every 5000 examples.
        if position % 5000 == 0:
            print(f"{position}/{len(tf_df)}")

        decoded_text = text_tensor.numpy().decode('utf-8')
        rows.append((decoded_text, label_tensor.numpy()))

    return pd.DataFrame(rows, columns=["review", "sentiment"])

# Create a Pandas DataFrame from the training split.
# NOTE: this rebinds `train_dataset` from the TFDS dataset to a DataFrame, so the cell
# cannot be re-run without first re-running the loading cell.
train_dataset = convert_tfdataframe_to_pddataframe(train_dataset)
train_dataset

In [None]:
# Create a Pandas DataFrame from the test split.
# NOTE: this rebinds `test_dataset` from the TFDS dataset to a DataFrame, so the cell
# cannot be re-run without first re-running the loading cell.
test_dataset = convert_tfdataframe_to_pddataframe(test_dataset)
test_dataset

# Check statistics

In [None]:
# Column dtypes of the training DataFrame.
train_dataset.dtypes

In [None]:
# # Instantiate a StatsOptions class and define the feature_allowlist property
# stats_options = tfdv.StatsOptions(
#     feature_allowlist=train_dataset.columns.tolist(),
#     enable_semantic_domain_stats= True # Because we are processing text
#     )
# stats_options

In [None]:
# train_stats = tfdv.generate_statistics_from_dataframe(train_dataset, stats_options) 

# # get the number of features used to compute statistics
# print(f"Number of features used: {len(train_stats.datasets[0].features)}")

# # check the number of examples used
# print(f"Number of examples used: {train_stats.datasets[0].num_examples}")

# # check the column names of the first and last feature
# print(f"First feature: {train_stats.datasets[0].features[0].path.step[0]}")
# print(f"Last feature: {train_stats.datasets[0].features[-1].path.step[0]}")
# tfdv.visualize_statistics(train_stats)


# Set Parameters

In [None]:
# Batch size used when the tf.data pipelines are batched.
batch_size=128

# True: recompute the stop-word cleaning and refresh the pickled caches.
# False: load the cached pickles from DATA_CHECKPOINTS_DIR instead.
ETL_STOPWORDS= False
# NOTE(review): PREPARE_TRAINIG_DATA and TRAIN_MODE are not read anywhere in the visible cells.
PREPARE_TRAINIG_DATA= False
TRAIN_MODE= True

# True: run the Keras Tuner hyper-parameter search.
TUNNING = True
# NOTE(review): INFERENCE_MODE is not read anywhere in the visible cells.
INFERENCE_MODE = True

# NOTE(review): threshold and classN are not read anywhere in the visible cells.
threshold = 1
classN=1
# Words occurring fewer than min_freq times are added to the stop-word list.
min_freq = 4



In [None]:

# Directory holding the cached intermediate artefacts (pickled DataFrames and word statistics).
DATA_CHECKPOINTS_DIR = './checkpoints'
tfrecord_filename = "train_tmp.tfrecord"
train_tmp_record_path = f'{DATA_CHECKPOINTS_DIR}/{tfrecord_filename}'

# Pure-Python equivalent of `mkdir -p`: works on every platform and when the notebook
# is exported to a plain .py script; does nothing if the directory already exists.
os.makedirs(DATA_CHECKPOINTS_DIR, exist_ok=True)



# Define functions to perform data cleaning and feature engineering

In [None]:
def check_word_frequency(train_df):
    """Count how often every word occurs in the ``review`` column.

    A Keras ``Tokenizer`` is fitted on the reviews and its ``word_counts``
    are returned as a plain dictionary.

    Args:
        train_df: object with a ``review`` column of strings (accessed as
            ``train_df.review``) that also supports ``len()``.

    Returns:
        dict: ``word -> number of occurrences``.
    """
    print("Dataset lentgh : ",len(train_df))

    # The filter list strips most punctuation; '.', '<' and '>' are not in it.
    word_counter = Tokenizer(
        filters='!"#$%&()*+,-/:;=?@[\\]^_´`«»{|}~\t\n\'',
        oov_token='<oov>',
    )
    review_texts = train_df.review.values.tolist()
    word_counter.fit_on_texts(review_texts)

    return dict(word_counter.word_counts)


def update_stop_words(word_frequency, freq,max_freq,stop_word):
    """Extend a stop-word collection with words that are too rare or too frequent.

    Args:
        word_frequency: mapping ``word -> occurrence count``.
        freq: words whose count is strictly below this value are added.
        max_freq: words whose count is strictly above this value are added.
        stop_word: iterable of words that are already stop words. It is NOT
            modified; a new list is returned.

    Returns:
        list: de-duplicated union of ``stop_word`` and the selected words.
        The order is unspecified because the result goes through a ``set``.
    """
    too_rare = [word for word, count in word_frequency.items() if count < freq]
    too_common = [word for word, count in word_frequency.items() if count > max_freq]

    # Print a small sample (at most 10) of the rare words being added.
    for word in too_rare[:10]:
        print(word)

    # Build the union in a fresh set so the caller's list is left untouched.
    return list(set(stop_word).union(too_rare, too_common))


# Word counts over the raw training reviews.
word_frequency= check_word_frequency(train_dataset)
# max_freq is set to (highest count + 1), so no word can exceed it: only the lower bound
# (min_freq) actually adds words to the stop-word list here.
max_freq = np.max(list(word_frequency.values())) +1 
# Update stop words list with the less frequent words (a copy is passed so the NLTK list is kept intact).
updated_stop_word = update_stop_words(word_frequency, min_freq, max_freq, stop_words_list.copy())

In [None]:
# Size of the extended stop-word list relative to the number of distinct words counted.
# The list also contains the NLTK stop words, so this is not strictly a fraction of the vocabulary.
len(updated_stop_word)/len(word_frequency)

In [None]:

def limit_words_tf(review, sentiment):
    """Truncate a review tensor to its first ``sequence_size`` tokens.

    ``sequence_size`` is a module-level name that is looked up when the
    function is called, so it must be defined before the first call.

    Args:
        review: string tensor that ``tf.strings.split`` can split.
        sentiment: label, passed through unchanged.

    Returns:
        tuple: ``(truncated_review, sentiment)`` where the kept tokens are
        re-joined with single spaces.
    """
    kept_tokens = tf.strings.split(review)[:sequence_size]
    shortened_review = tf.strings.reduce_join(kept_tokens, separator=' ')
    return shortened_review, sentiment



def save_record(dataset, record_path):
    """Write every row of ``dataset`` to a TFRecord file as a ``tf.train.Example``.

    Args:
        dataset: DataFrame-like object supporting ``iterrows()`` whose rows
            provide ``input_ids``, ``attention_mask`` and ``sentiment``.
        record_path: destination path of the TFRecord file.

    Each example stores ``input_ids`` as an int64 list, ``attention_mask``
    as a float list and ``sentiment`` as a single-element int64 list.
    """

    def _to_serialized_example(row):
        # One tf.train.Example per row, serialised to bytes.
        features = tf.train.Features(feature={
            'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=row["input_ids"])),
            'attention_mask': tf.train.Feature(float_list=tf.train.FloatList(value=row["attention_mask"])),
            'sentiment': tf.train.Feature(int64_list=tf.train.Int64List(value=[row["sentiment"]])),
        })
        return tf.train.Example(features=features).SerializeToString()

    with tf.io.TFRecordWriter(record_path) as record_writer:
        for _, row in dataset.iterrows():
            record_writer.write(_to_serialized_example(row))

def clean_reviews(review, sentiment):
    """Lower-case a review, replace '<br />' tags with spaces and drop stop words.

    Reads the module-level ``updated_stop_word`` list.

    Args:
        review: string tensor holding one review.
        sentiment: label, passed through unchanged.

    Returns:
        tuple: ``(cleaned_review, sentiment, number_of_remaining_tokens)``.
    """
    
    lowercase = tf.strings.lower(review)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    
    tokens = tf.strings.split(stripped_html)
    # Filter out words to remove
    words_to_remove_tensor = tf.constant(updated_stop_word)
    # Broadcasting compares every stop word (rows) with every token (columns);
    # a token is kept only if no stop word equals it.
    mask = tf.logical_not(tf.reduce_any(tf.equal(tokens, words_to_remove_tensor[:, tf.newaxis]), axis=0))
    
    # Filter out words to remove
    filtered_tokens = tf.boolean_mask(tokens, mask)
    
    # Join the remaining tokens back into a string
    processed_data = tf.strings.reduce_join(filtered_tokens, separator=' ')
    
    # NOTE(review): len() is applied to a tensor here; presumably this relies on the
    # conversion done when the function runs inside Dataset.map — confirm.
    sentence_size = len(tf.strings.split(processed_data))
    return (processed_data, sentiment , sentence_size)
    
def process_stop_words(td_dataset):
    """Apply ``clean_reviews`` to every row and collect the result in a DataFrame.

    Args:
        td_dataset: DataFrame-like object with ``review`` and ``sentiment``
            columns.

    Returns:
        pandas.DataFrame with the columns ``review`` (as returned by
        ``.numpy()`` on the cleaned tensor), ``sentiment`` and ``size``
        (token count after cleaning).
    """
    # Wrap the two columns in a tf.data pipeline.
    raw_pairs = tf.data.Dataset.from_tensor_slices((td_dataset['review'], td_dataset['sentiment']))

    for i, o in raw_pairs.take(1):
        print(i.shape, o.shape)
        print("Pre processed ds : ",i, o )

    cleaned_pairs = raw_pairs.map(clean_reviews, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Show a few cleaned examples: size, label, text.
    for i, o, l in cleaned_pairs.take(3):
        print("Post processed ds : ",l, o, i )

    total_len = len(cleaned_pairs)
    print(f"Starting Extraction in batches {total_len} ...")

    records = []
    for position, (features, label, size) in enumerate(cleaned_pairs.take(total_len)):
        # Progress line every 1000 examples.
        if position % 1000 == 0:
            print(f"{position}/{total_len}")
        records.append((features.numpy(), label.numpy(), size.numpy()))

    print("Creating dataframe....")
    return pd.DataFrame(records, columns=["review", "sentiment", "size"])

# Either recompute the cleaned DataFrames and cache them as pickles,
# or load the previously cached pickles from DATA_CHECKPOINTS_DIR.
if ETL_STOPWORDS:
    train_ds = process_stop_words(train_dataset)
    train_ds.to_pickle(f"{DATA_CHECKPOINTS_DIR}/train_ds.pkl")  
    
    display(train_ds)
    test_ds  = process_stop_words(test_dataset)
    test_ds.to_pickle(f"{DATA_CHECKPOINTS_DIR}/test_ds.pkl") 
    display(test_ds) 
else:
    # Requires the pickle files written by an earlier run with ETL_STOPWORDS = True.
    train_ds = pd.read_pickle(f"{DATA_CHECKPOINTS_DIR}/train_ds.pkl")  
    test_ds = pd.read_pickle(f"{DATA_CHECKPOINTS_DIR}/test_ds.pkl") 



In [None]:
def _decode_review(value):
    """Return ``value`` as ``str``, decoding UTF-8 bytes when necessary."""
    return value.decode("utf-8") if isinstance(value, bytes) else value


# The cleaned reviews are stored as bytes; convert them to str.
# The isinstance guard keeps the cell idempotent: re-running it on already
# decoded strings no longer raises AttributeError ('str' has no 'decode').
train_ds["review"] = train_ds["review"].apply(_decode_review)
test_ds["review"] = test_ds["review"].apply(_decode_review)

In [None]:
# Persist the word statistics when they were freshly computed, otherwise reload the cached
# versions (which replace the values computed earlier in the notebook).
# NOTE: pickle.load executes arbitrary code from the file — only load pickles written by this notebook.
if ETL_STOPWORDS:

    with open(f'{DATA_CHECKPOINTS_DIR}/word_frequency.pkl', 'wb') as f:
        pickle.dump(word_frequency, f)

    with open(f'{DATA_CHECKPOINTS_DIR}/updated_stop_word.pkl', 'wb') as f:
        pickle.dump(updated_stop_word, f)

else:
    
    with open(f'{DATA_CHECKPOINTS_DIR}/word_frequency.pkl', 'rb') as f:
        word_frequency = pickle.load(f)
        
    with open(f'{DATA_CHECKPOINTS_DIR}/updated_stop_word.pkl', 'rb') as f:
        updated_stop_word = pickle.load(f)

        


# Check sentence size
 

In [None]:
# Summary statistics of the per-review token count (after stop-word removal) in the training set.
train_ds["size"].describe()

In [None]:
# Show one training review with more than 800 tokens.
# NOTE: `.values[0]` raises IndexError if no review is that long.
train_ds[train_ds["size"] > 800].review.values[0]

In [None]:
# Box plot and summary statistics of the per-review token count in the test set.
sns.boxplot(x=test_ds['size'])
test_ds["size"].describe()

Find the outliers in order to fix the sentence size. Every sample whose size is considered an outlier will be trimmed.

In [None]:

# Quartiles and 1.5 * IQR bounds of the training review sizes.
Q1_train, Q3_train = (train_ds["size"].quantile(q) for q in (0.25, 0.75))
IQR_train = Q3_train - Q1_train
lower_bound_train = Q1_train - 1.5 * IQR_train
upper_bound_train = Q3_train + 1.5 * IQR_train

print("Train Q1_train : ",Q1_train)
print("Train Q3_train : ",Q3_train)
print("Train IQR : ",IQR_train)

# Rows at or below the upper bound are untouched; rows above it will be trimmed.
rows_within_bound_train = train_ds[train_ds["size"] <= upper_bound_train]
rows_above_bound_train = train_ds[train_ds["size"] > upper_bound_train]
print("Final training dataset size : ",lower_bound_train,upper_bound_train, len(rows_within_bound_train))
print("Training data to be trimed : ",len(rows_above_bound_train))


In [None]:
# Quartiles and 1.5 * IQR bounds of the test review sizes.
Q1_test, Q3_test = (test_ds["size"].quantile(q) for q in (0.25, 0.75))
IQR_test = Q3_test - Q1_test
lower_bound_test = Q1_test - 1.5 * IQR_test
upper_bound_test = Q3_test + 1.5 * IQR_test

print("Test IQR : ",IQR_test)

# Rows at or below the upper bound are untouched; rows above it will be trimmed.
rows_within_bound_test = test_ds[test_ds["size"] <= upper_bound_test]
rows_above_bound_test = test_ds[test_ds["size"] > upper_bound_test]
print("Final testing dataset size : ",lower_bound_test,upper_bound_test, len(rows_within_bound_test))
print("Testing data to be trimed : ",len(rows_above_bound_test))


Apply the sentence cut

In [None]:
# Despite its name, IQR_total is the larger of the two upper bounds (Q3 + 1.5 * IQR) of train and test.
# NOTE(review): the test-set statistics take part in choosing the sequence length — confirm this is intended.
IQR_total = np.maximum((Q3_train + 1.5 * IQR_train),(Q3_test + 1.5 * IQR_test))
# Maximum number of tokens kept per review (int() truncates the fractional part).
sequence_size = int(IQR_total)
sequence_size

In [None]:
# Drop reviews that became empty after stop-word removal.
# .copy() makes the filtered frames independent objects, so the column assignments
# performed on them later do not trigger pandas' SettingWithCopyWarning.
test_ds = test_ds[test_ds['size'] > 0].copy()
train_ds = train_ds[train_ds['size'] > 0].copy()

In [None]:

def limit_words(review, max_words=None):
    """Keep only the first ``max_words`` whitespace-separated tokens of a review.

    Args:
        review: text to truncate.
        max_words: maximum number of tokens to keep. When omitted, the
            module-level ``sequence_size`` is used (the original behaviour).

    Returns:
        str: the kept tokens joined by single spaces (runs of whitespace in
        the input are therefore collapsed).
    """
    if max_words is None:
        # Looked up at call time so the notebook-level setting is honoured.
        max_words = sequence_size

    return ' '.join(review.split()[:max_words])

def _count_words(text):
    """Number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Trim every review to the configured length and record the resulting token count.
for split_frame, banner in (
    (train_ds, "Cleaning training dataset - sentences size processing "),
    (test_ds, "Cleaning testing dataset - sentences size processing "),
):
    print(banner)
    split_frame["cleaned_review"] = split_frame.review.apply(limit_words)
    split_frame["cleaned_size"] = split_frame.cleaned_review.apply(_count_words)

# Number of distinct labels in the training set.
num_labels = len(train_ds.sentiment.unique())

display(train_ds)
display(test_ds)

Before trimming, we must check that no important sentiment information will be deleted.

# Tokenize and prepare embeddings

In [None]:
# Histograms of the sentence size and of the sentiment labels, for both splits.
# The column is selected with ["size"]: attribute access (`frame.size`) returns
# DataFrame.size — the total number of cells — not the "size" column.
for split_frame, split_label in ((train_ds, "training"), (test_ds, "test")):
    sentence_sizes = split_frame["size"]

    plt.figure()
    sentence_sizes.hist(bins=10)
    plt.title(f"Sentence Size for {split_label} dataset")

    plt.figure()
    split_frame["sentiment"].hist()
    plt.title(f"Sentiment for {split_label} dataset")

    plt.show()
    print(sentence_sizes.describe())

In [None]:
# Keep only the model inputs/labels and store the label as int8.
model_columns = ["cleaned_review", "sentiment"]
label_dtype = {"sentiment": "int8"}

train_ds = train_ds[model_columns].astype(label_dtype)
test_ds = test_ds[model_columns].astype(label_dtype)

display(train_ds)
display(test_ds)

In [None]:

# Hold out 20% of the training data for validation.
# random_state pins the split so it does not depend on how much of the global
# NumPy random state earlier cells have consumed.
# NOTE: `train_ds` is rebound here, so re-running this cell alone splits the
# already reduced training set again.
train_ds, val_ds = train_test_split(train_ds, test_size=0.2, train_size=0.8, random_state=0)
display(train_ds)
display(val_ds)

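Build the `tf.data` input pipelines from the DataFrames. (Original note: save the data to disk so that memory can be freed afterwards — the cells below no longer write anything to disk.)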

In [None]:
# Text-only (no labels) dataset of the training reviews; it is the input used to
# compute the WordPiece vocabulary further below.
# NOTE: shuffle() is applied after batch(), so whole batches are shuffled, not individual reviews.
train_tfds = tf.data.Dataset.from_tensor_slices(
        (
            train_ds.cleaned_review.tolist()
        )
    ).batch(batch_size).shuffle(buffer_size=len(train_ds)).prefetch(tf.data.AUTOTUNE)
# Print the first batch as a sanity check.
for x in train_tfds.take(1):
    print(x)

In [None]:
# Validation and test pipelines of (review, sentiment) pairs.
# Evaluation data is NOT shuffled: metrics do not depend on the order, and a
# fixed order keeps predictions aligned with the rows of val_ds / test_ds.
dev_tfds = (
    tf.data.Dataset.from_tensor_slices(
        (val_ds.cleaned_review.tolist(), val_ds.sentiment.tolist())
    )
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
# Print the first batch as a sanity check.
for x in dev_tfds.take(1):
    print(x)

test_tfds = (
    tf.data.Dataset.from_tensor_slices(
        (test_ds.cleaned_review.tolist(), test_ds.sentiment.tolist())
    )
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
for x in test_tfds.take(1):
    print(x)

In [None]:
# Upper bound for the vocabulary: the number of distinct words counted earlier.
vocab_size = len(word_frequency)
(vocab_size, sequence_size)

In [None]:
## Train the tokenizer vocabulary on the training data only. Words that occur only in the test data
## will be unknown, which is the expected behaviour in a real-world environment.

# Learn a WordPiece vocabulary of at most `vocab_size` entries from the text-only training dataset.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    train_tfds,
    vocabulary_size=vocab_size,
    lowercase=True,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
)

# WordPieceTokenizer is an efficient implementation of the WordPiece algorithm used by BERT and other models.
# sequence_length is passed so the tokenizer output has a fixed length of `sequence_size`.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab,
    sequence_length=sequence_size,
    lowercase=True,
)

# Training pipeline of (review, sentiment) pairs; replaces the text-only dataset used for the vocabulary.
# shuffle() comes BEFORE batch() so individual examples are re-mixed every epoch;
# shuffling after batching would only permute fixed batches.
train_tfds = (
    tf.data.Dataset.from_tensor_slices(
        (train_ds.cleaned_review.tolist(), train_ds.sentiment.tolist())
    )
    .shuffle(buffer_size=len(train_ds))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)
# Print the first batch as a sanity check.
for x in train_tfds.take(1):
    print(x)

In [None]:
# The packer adds a start token ([BOS]) and an end token ([EOS]) and produces
# sequences of length `sequence_size`.
start_packer = keras_nlp.layers.StartEndPacker(
    sequence_length=sequence_size,
    start_value=tokenizer.token_to_id("[BOS]"),
    end_value = tokenizer.token_to_id("[EOS]"),
)

def preprocess(inputs,output):
    """Tokenize a batch of reviews and wrap it with the start/end tokens.

    Args:
        inputs: batch of review strings.
        output: labels, passed through unchanged.

    Returns:
        tuple: ``(token_ids, output)``.
    """
    # NOTE(review): the tokenizer is built with a fixed sequence_length, so its output is
    # presumably already padded when it reaches the packer; the end token may then land
    # after the padding instead of right after the last word — confirm against keras_nlp.
    tokenized_inputs = tokenizer(inputs)
    features = start_packer(tokenized_inputs)
    return features, output

# Replace the raw-text pipelines with tokenized ones.
# NOTE: the names are rebound, so re-running this cell would feed token ids to the tokenizer again.
train_tfds= train_tfds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)

dev_tfds= dev_tfds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)


test_tfds= test_tfds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(
    tf.data.AUTOTUNE
)

# For each split, show the first example of the first batch: token ids, the labels of
# the batch and the text obtained by detokenizing the ids.
for banner, split_ds in (
    ("Training set ...", train_tfds),
    ("Dev set ...", dev_tfds),
    ("Testing set ...", test_tfds),
):
    print(banner)
    for i, o in split_ds.take(1):
        first_example = i[0]
        print("input : ",first_example)
        print("output : ",o)
        print("Detokenized input: ",tokenizer.detokenize(first_example))


# Attention model training process

### Run training process 


In [None]:
class AttentionSentimentModel(tf.keras.Model):
    """Sentiment classifier: token+position embedding, attention blocks, dense head.

    The forward pass embeds the token ids, runs them through
    ``num_att_layers`` ``TransformerDecoder`` blocks (each called with the
    sequence only), flattens the result, applies ``num_dense_layers`` ReLU
    dense layers and finishes with a sigmoid output layer.

    Args:
        n_classes: number of units of the sigmoid output layer.
        vocab_len: vocabulary size of the embedding (default bound to the
            module-level ``vocab_size`` when the class is defined).
        window_size: sequence length of the position embedding (default
            bound to the module-level ``sequence_size`` at definition time).
        embedding_dim: embedding width.
        num_att_layers: number of attention blocks.
        n_attention_head: attention heads per block.
        feed_forward_dim: per-block intermediate width; needs at least
            ``num_att_layers`` entries.
        dropout_rate: per-block dropout; needs at least ``num_att_layers``
            entries.
        num_dense_layers: number of dense layers in the head.
        dense_units_list: units per dense layer; needs at least
            ``num_dense_layers`` entries.
    """

    def __init__(
        self, 
        n_classes = 1,
        vocab_len = vocab_size, 
        window_size = sequence_size, 
        embedding_dim = 100,  
        num_att_layers = 1, 
        n_attention_head = 2, 
        feed_forward_dim = [128],
        dropout_rate = [0.0],
        num_dense_layers = 1,
        dense_units_list = [32]):
        super().__init__()

        # Keep the attention configuration on the instance.
        self.num_att_layers = num_att_layers
        self.n_attention_head = n_attention_head
        self.feed_forward_dim = feed_forward_dim
        self.dropout_rate = dropout_rate

        # mask_zero=True: token id 0 is treated as padding by the embedding.
        self.embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
            vocabulary_size=vocab_len,
            sequence_length=window_size,
            embedding_dim=embedding_dim,
            mask_zero=True,
        )

        # One attention block per configured layer, each with its own width and dropout.
        attention_blocks = []
        for block_idx in range(self.num_att_layers):
            attention_blocks.append(
                keras_nlp.layers.TransformerDecoder(
                    num_heads=self.n_attention_head,
                    intermediate_dim=self.feed_forward_dim[block_idx],
                    dropout=dropout_rate[block_idx],
                )
            )
        self.decoder_layer = attention_blocks

        # Dense classification head.
        self.num_dense_layers = num_dense_layers
        head_layers = []
        for dense_idx in range(self.num_dense_layers):
            head_layers.append(Dense(dense_units_list[dense_idx], activation='relu'))
        self.dense_layers = head_layers
        self.output_dense = Dense(n_classes, activation='sigmoid')
        self.flatten = tf.keras.layers.Flatten()

    def call(self, inputs):
        """Return the sigmoid output for a batch of token-id sequences."""
        hidden = self.embedding_layer(inputs)
        for attention_block in self.decoder_layer:
            hidden = attention_block(hidden)

        # Collapse the (sequence, feature) axes before the dense head.
        hidden = self.flatten(hidden)
        for dense_layer in self.dense_layers:
            hidden = dense_layer(hidden)

        return self.output_dense(hidden)

# test_sentiment_model =AttentionSentimentModel(n_classes = 1)
# test_sentiment_model.compile(optimizer=Adam(learning_rate=0.0001), loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), metrics =[tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.AUC()])
# for i, o in train_tfds.take(1):
#     test_sentiment_model(i)
# test_sentiment_model.summary()


In [None]:
# test_sentiment_model.fit(train_tfds,  epochs=5, batch_size=batch_size, validation_data = dev_tfds, verbose =1)

In [None]:
# TensorBoard logging for the hyper-parameter search runs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/attention_tunning/")

def get_model_tunning(hp):
    """Build and compile an ``AttentionSentimentModel`` from tuner hyper-parameters.

    Args:
        hp: Keras Tuner ``HyperParameters`` object; the search space is
            declared through the ``hp.Int`` / ``hp.Float`` calls below.

    Returns:
        A compiled model that has been called once on a training batch so
        its weights exist.
    """
    # --- Dense head ---
    n_dense_layers = hp.Int('max_n_dense', 1, 2)
    dense_units_list =  [hp.Int('dense_units_'+str(i), 32, 128) for i in range(n_dense_layers)]

    # --- Attention blocks ---
    num_att_layers = hp.Int('num_att_layers', 1, 3)
    n_attention_head = hp.Int('n_attention_head', 1, 3)
    feed_forward_dim =  [hp.Int('feed_forward_dim_'+str(i), 32, 128) for i in range(num_att_layers)]
    dropout_units_list =  [hp.Float('dropout_rate_'+str(i), 0.0, 0.5, step = 0.1) for i in range(num_att_layers)]

    hp_learning_rate = hp.Float('learning_rate', 0.000001, 0.001)

    embedding_dim = hp.Int('embedding_dim', 50, 200, step = 50)

    model = AttentionSentimentModel(
        n_classes = 1,
        vocab_len = vocab_size,
        window_size = sequence_size,
        embedding_dim = embedding_dim,
        num_att_layers = num_att_layers,
        n_attention_head = n_attention_head,
        feed_forward_dim = feed_forward_dim,
        dropout_rate = dropout_units_list,
        num_dense_layers = n_dense_layers,
        dense_units_list = dense_units_list,
    )

    # Every metric gets an explicit name. With auto-generated names the two AUC
    # metrics are numbered by construction order, so which one is reported as
    # 'auc' (and therefore monitored as 'val_auc' by the tuner) is not obvious
    # and can change from one build to the next.
    metrics = [
        tf.keras.metrics.BinaryAccuracy(name="binary_accuracy"),
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.AUC(curve="PR", name="pr_auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ]

    model.compile(
        optimizer=Adam(learning_rate=hp_learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=metrics,
    )

    # Call the model once on a real batch so the subclassed model builds its weights.
    for i, o in train_tfds.take(1):
        model(i)

    return model


# Bayesian hyper-parameter search that maximises the validation metric named 'val_auc'.
tuner = kt.BayesianOptimization(
        get_model_tunning,
        objective= kt.Objective('val_auc', direction="max"),
        max_trials = 30,
        directory=r"Hyperparam_tunning",
        project_name='keras_attention_tunning',
    )

# Stop a run when the validation loss has not improved for 50 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=50)
if TUNNING:
    # batch_size is not passed: the tf.data pipelines are already batched.
    tuner.search(
        train_tfds,
        epochs=100,
        validation_data=dev_tfds,
        verbose=1,
        callbacks=[stop_early, tensorboard_callback],
    )


In [None]:
# Retrain the three best hyper-parameter configurations from scratch.
# Validation (and therefore early stopping) uses the dev split so that the test
# split stays untouched until the final evaluation.
trained_models = []
for i, best_hps in enumerate(tuner.get_best_hyperparameters(num_trials=3)):
    print(f"Best Hyperparameters: {best_hps.__dict__}")
    text_generator_model = tuner.hypermodel.build(best_hps)  # Build the model with best hyperparameters

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=f"logs/attention_training/model{i}")
    # batch_size is not passed: the tf.data pipelines are already batched.
    text_generator_model.fit(
        train_tfds,
        epochs=300,
        validation_data=dev_tfds,
        verbose=1,
        callbacks=[stop_early, tensorboard_callback],
    )
    # Keep every trained model instead of only the last one.
    trained_models.append(text_generator_model)

In [None]:
# Deliberate stop for "Run All": the cells below use names that are not defined in the
# visible part of this notebook (e.g. get_model_dynamic, FIT_MODEL, x_test, y_test).
raise RuntimeError(
    "Intentional stop: the remaining cells depend on names that are not defined "
    "earlier in this notebook."
)

In [None]:
# NOTE(review): get_model_dynamic, FIT_MODEL, y, x_dev, y_dev and PlotLossesCallback are not
# defined in the visible cells of this notebook — presumably left over from an earlier version.
model_dynamic = get_model_dynamic(best_hps)

if FIT_MODEL:
    
    # Train, then persist both the model and its training history.
    history = model_dynamic.fit(x[0], y, 
              batch_size=batch_size, 
              epochs=300,
              shuffle=True,
              validation_data = (x_dev[0],y_dev),
              callbacks=[PlotLossesCallback() ,stop_early]
             )
    
    model_dynamic.save(f'{DATA_CHECKPOINTS_DIR}_dynamic/model_dynamic')

    with open(f'{DATA_CHECKPOINTS_DIR}_dynamic/history.pickle', 'wb') as handle:
        pickle.dump(history.history, handle, protocol=pickle.HIGHEST_PROTOCOL)


else:
    # Reload the saved model and plot the stored accuracy / loss curves.
    model_dynamic = tf.keras.models.load_model(f'{DATA_CHECKPOINTS_DIR}_dynamic/model_dynamic')

    with open(f'{DATA_CHECKPOINTS_DIR}_dynamic/history.pickle', 'rb') as handle:
        # Here `history` is the plain dict that was pickled, not a Keras History object.
        history = pickle.load(handle)
        plt.figure()
        plt.plot(history['binary_accuracy'])
        plt.plot(history['val_binary_accuracy'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')

        plt.figure()
        plt.plot(history['loss'])
        plt.plot(history['val_loss'])
        plt.title('Model training loss')
        plt.ylabel('Loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'val'], loc='upper left')

        plt.show()

In [None]:
# NOTE(review): x_test, y_test and y are not defined in the visible cells of this notebook.
# Predict the first 100 test inputs and threshold the outputs at 0.5.
yhat =model_dynamic.predict(x_test[0][0:100])
yhat[yhat >=0.5] = 1
yhat[yhat < 0.5] = 0 
yhat

# Reshape the labels into a column vector.
y_test = y_test.reshape((len(y_test),1))
y_test
print("x 1º line : "+str(x))
print("y : ",y) 
print("y_test : ",y_test) 

In [None]:

# Predicted probabilities are kept separately from the thresholded labels:
# the PR-AUC needs the continuous scores, while precision / recall (and the
# confusion matrix cells below) use the 0/1 decisions.
y_prob = model_dynamic.predict(x_test)
yhat = (y_prob >= 0.5).astype(y_prob.dtype)
y_test = y_test.reshape((len(y_test),1))

pr = tf.keras.metrics.AUC(curve="PR")
print()

p = tf.keras.metrics.Precision()
p.update_state(y_test,yhat)

r = tf.keras.metrics.Recall()
r.update_state(y_test,yhat)

# Area under the precision-recall curve, computed on the probabilities.
base_pr = pr(y_test, y_prob).numpy()
base_p = p.result().numpy()
base_r = r.result().numpy()
print("AUC",base_pr," | Precision : ",base_p, " | Recall : ",base_r)


In [None]:
# Confusion matrix of the true labels versus the thresholded predictions.
confusion_matrix(list(y_test),list(yhat))

In [None]:

# Unpack the flattened confusion matrix into its four counts
# (the four-way unpacking assumes a 2x2 matrix, i.e. binary labels).
base_tn, base_fp, base_fn, base_tp = confusion_matrix(list(y_test),list(yhat)).ravel()
base_tn, base_fp, base_fn, base_tp



In [None]:
def print_predictions(X, pred):
    """Print up to the first 11 inputs as text next to predicted and real labels.

    Reads the module-level ``tokenizer`` (its ``index_word`` mapping is used
    to turn ids back into words; ids without an entry are skipped) and the
    module-level ``y_test``.

    Args:
        X: indexable collection of id sequences.
        pred: indexable collection of predictions, aligned with ``X``.
    """
    shown = min(len(X), 11)
    for row in range(shown):
        id_to_word = tokenizer.index_word.get
        words = [word for word in map(id_to_word, X[row]) if word is not None]
        print(' '.join(words), "Prediction :", int(pred[row])," - Real :",y_test[row][0],"\n")

print_predictions(x_test[0], yhat)