<a href="https://colab.research.google.com/github/alilotfi90/A-Natural-Language-Processing-Journey/blob/main/steam-review-attention-model-2-tuning-included.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding



nltk.download('punkt')
nltk.download('stopwords')

drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/steam_data_set.zip" -d "/content/"

Archive:  /content/drive/MyDrive/steam_data_set.zip
  inflating: /content/test_gr/test.csv  
  inflating: /content/train_gr/game_overview.csv  
  inflating: /content/train_gr/train.csv  


In [3]:
data_path = "/content/train_gr/train.csv"
df = pd.read_csv(data_path)

# Explore first few rows of the DataFrame
print(df.head())

   review_id                        title    year  \
0          1  Spooky's Jump Scare Mansion  2016.0   
1          2  Spooky's Jump Scare Mansion  2016.0   
2          3  Spooky's Jump Scare Mansion  2016.0   
3          4  Spooky's Jump Scare Mansion  2015.0   
4          5  Spooky's Jump Scare Mansion  2015.0   

                                         user_review  user_suggestion  
0  I'm scared and hearing creepy voices.  So I'll...                1  
1  Best game, more better than Sam Pepper's YouTu...                1  
2  A littly iffy on the controls, but once you kn...                1  
3  Great game, fun and colorful and all that.A si...                1  
4  Not many games have the cute tag right next to...                1  


In [4]:
# Tokenizing and Padding
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df['user_review'])

# Getting vocab size from tokenizer word index
vocab_size = len(tokenizer.word_index) + 1

padded_sequences = pad_sequences(tokenizer.texts_to_sequences(df['user_review']), maxlen=100, padding='post', truncating='post')


In [6]:
import tensorflow as tf
from tensorflow.keras import layers

class AttentionLayer(layers.Layer):
    def __init__(self, units):
        super(AttentionLayer, self).__init__()
        self.W1 = layers.Dense(units)
        self.W2 = layers.Dense(units)
        self.V = layers.Dense(1)

    def call(self, features, hidden):
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.layers import Input

embedding_dim = 32
max_length = 100

input_text = Input(shape=(max_length,))
embedding_layer = layers.Embedding(vocab_size, embedding_dim)(input_text)

# LSTM layer
lstm_out, lstm_hidden, _ = layers.LSTM(256, return_sequences=True, return_state=True)(embedding_layer)

# Attention layer
attention_layer = AttentionLayer(64)
context_vector, attention_weights = attention_layer(lstm_out, lstm_hidden)

# Fully connected layers
dense_layer = layers.Dense(64, activation='relu')(context_vector)
output_layer = layers.Dense(1, activation='sigmoid')(dense_layer)


model = models.Model(inputs=input_text, outputs=output_layer)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [14]:

from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split



# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['user_suggestion'].values, test_size=0.2, random_state=42)

# Model creation
input_text = Input(shape=(max_length,))
embedding_layer = layers.Embedding(vocab_size, embedding_dim)(input_text)
lstm_out, lstm_hidden, _ = layers.LSTM(256, return_sequences=True, return_state=True)(embedding_layer)
attention_layer = AttentionLayer(64)
context_vector, attention_weights = attention_layer(lstm_out, lstm_hidden)
dense_layer = layers.Dense(64, activation='relu')(context_vector)
output_layer = layers.Dense(1, activation='sigmoid')(dense_layer)
model = models.Model(inputs=input_text, outputs=output_layer)

#model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training for different epoch values
epoch_list = [5, 10, 20, 40]
total_epochs = 0

for epochs in epoch_list:
    print(f"\nTraining for {epochs} epochs...\n")

    # Training
    model.fit(X_train, y_train, epochs=epochs, initial_epoch=total_epochs, batch_size=32, validation_data=(X_test, y_test))

    total_epochs = epochs

    # Evaluation
    loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
    print(f"\nTest accuracy after a total of {epochs} epochs: {accuracy*100:.2f}%\n")





Training for 5 epochs...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
110/110 - 1s - loss: 0.5612 - accuracy: 0.8062 - 668ms/epoch - 6ms/step

Test accuracy after a total of 5 epochs: 80.62%


Training for 10 epochs...

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
110/110 - 1s - loss: 1.3786 - accuracy: 0.8008 - 660ms/epoch - 6ms/step

Test accuracy after a total of 10 epochs: 80.08%


Training for 20 epochs...

Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
110/110 - 1s - loss: 1.6348 - accuracy: 0.7934 - 540ms/epoch - 5ms/step

Test accuracy after a total of 20 epochs: 79.34%


Training for 40 epochs...

Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
110/110 - 1s - loss: 1.9668 - accuracy: 0.7894 - 823ms/

In [18]:
!pip install keras-tuner

import kerastuner as kt
from tensorflow.keras import layers, models, Input
from sklearn.model_selection import train_test_split


def build_model(hp):
    input_text = Input(shape=(max_length,))
    embedding_layer = layers.Embedding(
        vocab_size,
        hp.Int('embedding_dim', min_value=32, max_value=512, step=32)
    )(input_text)
    lstm_out, lstm_hidden, _ = layers.LSTM(
        hp.Int('lstm_units', min_value=128, max_value=512, step=32),
        return_sequences=True,
        return_state=True
    )(embedding_layer)
    attention_layer = AttentionLayer(
        hp.Int('attention_units', min_value=32, max_value=128, step=32)
    )
    context_vector, attention_weights = attention_layer(lstm_out, lstm_hidden)
    dense_layer = layers.Dense(
        hp.Int('dense_units', min_value=32, max_value=128, step=32),
        activation='relu'
    )(context_vector)
    output_layer = layers.Dense(1, activation='sigmoid')(dense_layer)
    model = models.Model(inputs=input_text, outputs=output_layer)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=3,
    directory='my_dir',
    project_name='helloworld'
)

tuner.search(X_train, y_train, epochs=5, validation_data=(X_test, y_test))


# best model
best_model = tuner.get_best_models(num_models=1)[0]

# Additional training
epoch_list = [5, 10, 15, 20]

for epochs in epoch_list:
    print(f"\nTraining for {epochs} epochs...\n")

    # Further Training
    best_model.fit(X_train, y_train, epochs=epochs, initial_epoch=0, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate model
    loss, accuracy = best_model.evaluate(X_test, y_test, verbose=2)
    print(f"\nTest accuracy after a total of {epochs} epochs: {accuracy*100:.2f}%\n")


Trial 5 Complete [00h 04m 07s]
val_accuracy: 0.8347146908442179

Best val_accuracy So Far: 0.8428122401237488
Total elapsed time: 00h 30m 48s

Training for 5 epochs...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
110/110 - 1s - loss: 0.7526 - accuracy: 0.8165 - 752ms/epoch - 7ms/step

Test accuracy after a total of 5 epochs: 81.65%


Training for 10 epochs...

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
110/110 - 1s - loss: 1.2049 - accuracy: 0.8111 - 853ms/epoch - 8ms/step

Test accuracy after a total of 10 epochs: 81.11%


Training for 15 epochs...

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
110/110 - 1s - loss: 1.9967 - accuracy: 0.8099 - 628ms/epoch - 6ms/step

Test accuracy after a total of 15 epochs: 80.99%


Training for 20 epochs...

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epo