<a href="https://colab.research.google.com/github/alilotfi90/A-Natural-Language-Processing-Journey/blob/main/steam-review-sequential-model-2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding



nltk.download('punkt')
nltk.download('stopwords')

drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive


In [2]:
!unzip "/content/drive/MyDrive/steam_data_set.zip" -d "/content/"

Archive:  /content/drive/MyDrive/steam_data_set.zip
  inflating: /content/test_gr/test.csv  
  inflating: /content/train_gr/game_overview.csv  
  inflating: /content/train_gr/train.csv  


In [3]:
data_path = "/content/train_gr/train.csv"
df = pd.read_csv(data_path)

# Explore first few rows of the DataFrame
print(df.head())

   review_id                        title    year  \
0          1  Spooky's Jump Scare Mansion  2016.0   
1          2  Spooky's Jump Scare Mansion  2016.0   
2          3  Spooky's Jump Scare Mansion  2016.0   
3          4  Spooky's Jump Scare Mansion  2015.0   
4          5  Spooky's Jump Scare Mansion  2015.0   

                                         user_review  user_suggestion  
0  I'm scared and hearing creepy voices.  So I'll...                1  
1  Best game, more better than Sam Pepper's YouTu...                1  
2  A littly iffy on the controls, but once you kn...                1  
3  Great game, fun and colorful and all that.A si...                1  
4  Not many games have the cute tag right next to...                1  


In [4]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization and case normalization
    tokens = [word for word in tokens if word.isalpha()]  # Keep words only
    tokens = [word for word in tokens if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(tokens)

df['cleaned_review'] = df['user_review'].apply(preprocess_text)

df.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,cleaned_review
0,1,Spooky's Jump Scare Mansion,2016.0,I'm scared and hearing creepy voices. So I'll...,1,scared hearing creepy voices pause moment writ...
1,2,Spooky's Jump Scare Mansion,2016.0,"Best game, more better than Sam Pepper's YouTu...",1,best game better sam pepper youtube account ne...
2,3,Spooky's Jump Scare Mansion,2016.0,"A littly iffy on the controls, but once you kn...",1,littly iffy controls know play easy master mad...
3,4,Spooky's Jump Scare Mansion,2015.0,"Great game, fun and colorful and all that.A si...",1,great game fun colorful side note though getti...
4,5,Spooky's Jump Scare Mansion,2015.0,Not many games have the cute tag right next to...,1,many games cute tag right next horror tag firs...


In [8]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_review'])
sequences = tokenizer.texts_to_sequences(df['cleaned_review'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')
padded_sequences_1 = pad_sequences(sequences, maxlen=200, padding='post', truncating='post')

# Build LSTM Model 1
model_1 = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=200),
    LSTM(32),
    Dense(1, activation='sigmoid')
])


# Build LSTM Model 2
model_2 = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=100),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

In [9]:
model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['user_suggestion'].values, test_size=0.3, random_state=42)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(padded_sequences_1, df['user_suggestion'].values, test_size=0.3, random_state=42)

# List of epochs
epoch_list = [5, 10, 20,40]

# Model Training with different epochs
for epoch in epoch_list:
    print(f'\nTraining model for {epoch} epochs...\n')

    model_1.fit(X_train_1, y_train_1, validation_data=(X_test_1, y_test_1), epochs=epoch, batch_size=32)

    test_loss, test_acc = model_1.evaluate(X_test_1, y_test_1, verbose=2)
    print(f'\nTest accuracy after {epoch} epochs for model_1 which had input_lenght = 200:', test_acc, '\n')


    model_2.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=epoch, batch_size=32)

    test_loss, test_acc = model_2.evaluate(X_test, y_test, verbose=2)
    print(f'\nTest accuracy after {epoch} epochs for model_2 which had output_dim = 64:', test_acc, '\n')


Training model for 5 epochs...

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
165/165 - 1s - loss: 0.6963 - accuracy: 0.5854 - 887ms/epoch - 5ms/step

Test accuracy after 5 epochs for model_1 which had input_lenght = 200: 0.5854448676109314 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
165/165 - 1s - loss: 0.5539 - accuracy: 0.7630 - 532ms/epoch - 3ms/step

Test accuracy after 5 epochs for model_2 which had output_dim = 64: 0.7630024552345276 


Training model for 10 epochs...

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
165/165 - 1s - loss: 0.7102 - accuracy: 0.5856 - 667ms/epoch - 4ms/step

Test accuracy after 10 epochs for model_1 which had input_lenght = 200: 0.5856353640556335 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
165/165 - 1s - loss: 0.5383 - accuracy: 0.7735 - 814ms/epoch - 5ms/step

Test accuracy after 10 epochs for model_2 whi