### Functions

In [2]:
pip install numpy pandas scikit-learn torch torchvision torchaudio transformers nltk openai tqdm matplotlib seaborn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import os
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
import openai
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.nn.utils import clip_grad_norm_
import torch.nn as nn


In [4]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

def preprocess_text(text):
    return str(text).replace('&gt', '').replace('\n', ' ').strip().lower()

class BertBinaryMultiLabelClassifier(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs.pooler_output)
        return self.classifier(pooled)  # raw logits for BCEWithLogitsLoss

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

cuda


In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('all')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

custom_stopwords = stop_words - {'not', 'no'}
custom_stopwords.update({'he', 'she', 'it', 'they', 'them', 'his', 'her', 'hers', 'him', 'you', 'your', 'yourself', 'etc', 'anything', 'else', 'early' , 'access'})

auxiliary_verbs = {
    'would', 'could', 'should', 'might', 'may', 'must', 'will', 'can', 'does', 'did', 'do',
    'am', 'is', 'are', 'was', 'were', 'been', 'being',
    'has', 'have', 'had',
    'shall', 'ought', 'shall',
    'need', 'dare', 'used'
}

contraction_mapping = {
    "wouldn't": "would not",
    "wouldnt" : "would not",
    "couldn't": "could not",
    "couldnt": "could not",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "mightn't": "might not",
    "mightnt": "might not",
    "mustn't": "must not",
    "mustnt": "must not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "dont": "do not",
    "didn't": "did not",
    "didnt": "did not",
    "aren't": "are not",
    "arent": "are not",
    "isn't": "is not",
    "isnt": "is not",
    "wasn't": "was not",
    "wasnt": "was not",
    "weren't": "were not",
    "werent": "were not",
    "hasn't": "has not",
    "hasnt": "has not",
    "haven't": "have not",
    "havent": "have not",
    "hadn't": "had not",
    "hadnt": "had not",
    "won't": "will not",
    "wont": "will not",
    "can't": "can not",
    "cant": "can not",
    "ain't": "am not",
    "aint" : "am not",
    "should've": "should have",
    "shouldve": "should have",
    "could've": "could have",
    "couldve": "could have",
    "would've": "would have",
    "wouldve": "would have",
    "might've": "might have",
    "mightve": "might have",
    "must've": "must have",
    "mustve": "must have",
    "haven't": "have not",
    "had've": "had have",
    "hadve": "had have",
    "needn't": "need not",
    "neednt": "need not",
    "dare'nt": "dare not",
    "darent": "dare not",
    "used'nt": "used not",
    "usednt": "used not",
    "im": "i am",
    "youre": "you are",
    "youve": "you have",
    "youll": "you will",
    "youre": "you are",
    "youve": "you have",
    "youll": "you will",
    "hes": "he is",
    "shes": "she is",
    "its": "it is",
    "theyre": "they are",
    "theyve": "they have",
    "theyll": "they will",
    "i've": "i have",
    "we're": "we are",
    "we've": "we have",
    "we'll": "we will",
    "ive": "i have",
    "ill": "i will",
    "i'll": "i will",
    "we'll": "we will",
    "you'll": "you will",
}

def preprocess_text(text):
    return str(text).replace('&gt', '').replace('\n', ' ').strip().lower()

def expand_contractions(text):

    for contraction, expanded in contraction_mapping.items():
        text = re.sub(r'\b' + re.escape(contraction) + r'\b', expanded, text, flags=re.IGNORECASE)
    return text

def preprocess_text1(text, protected_keywords=None):

    text = expand_contractions(text)

    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(text)

    tokens = [
        lemmatizer.lemmatize(word.lower(), pos=wn.VERB)
        for word in tokens
        if word.lower() not in custom_stopwords and word.lower() not in auxiliary_verbs and not word.isdigit()
    ]

    return ' '.join(tokens)

sample_text = " cant The brown isnt fox im am aint jump ive over combining the combined lazy dog. It's not a bright sunny day!. 2133d"
processed_text = preprocess_text1(sample_text)
print("Original Text:", sample_text)
print("Processed Text:", processed_text)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_

Original Text:  cant The brown isnt fox im am aint jump ive over combining the combined lazy dog. It's not a bright sunny day!. 2133d
Processed Text: not brown not fox not jump combine combine lazy dog not bright sunny day


In [6]:
def adjust_prediction_with_negation(text, prob, flip_threshold=0.7):
    text = text.lower()
    tokens = re.findall(r'\w+', text)

    for i, word in enumerate(tokens):
        if word in {"not", "no"}:
            scope = tokens[i+1:i+4]
            if scope:
                if prob >= flip_threshold:
                    prob = 1 - prob * 0.9
                elif prob <= 1 - flip_threshold:
                    prob = 1 - prob * 0.9
                break
    return prob

### Class Assigner Model

In [None]:
df = pd.read_csv('/content/Relabeled_Cleaned_Story_Fixed.csv',  encoding='ISO-8859-1')
df.head()
print(df.shape)
df['user_suggestion'] = df['user_suggestion'].replace({'Recommended': 1, 'Not Recommended': 0})
df.head()
label_columns = ["gameplay_mechanics", "visual_quality", "story_depth","pricing_and_value", "bugs_and_stability"]
X = df["user_review"]
y = df[label_columns].values.astype(np.float32)

(31585, 7)


In [None]:
X = X.apply(preprocess_text)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(y_train))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(y_val))

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=64)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=64)

In [None]:
model = BertBinaryMultiLabelClassifier(num_labels=len(label_columns)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
loss_fn = nn.BCEWithLogitsLoss()

clip_value =50.0
epochs = 3

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        model.zero_grad()
        logits = model(b_input_ids, b_input_mask)
        loss = loss_fn(logits, b_labels.float())
        loss.backward()

        if step % 100 == 0:
            print(f"\n--- Gradient Report at Step {step}, Epoch {epoch+1} ---")
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    print(f"{name}: grad min = {param.grad.min():.6f}, grad max = {param.grad.max():.6f}")

        clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        logits = model(b_input_ids, b_input_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(probs)
        true_labels.extend(b_labels.cpu().numpy())

pred_array = np.array(predictions)
pred_labels = (pred_array >= 0.5).astype(int)
true_labels = np.array(true_labels)

f1 = f1_score(true_labels, pred_labels, average='macro')
print("\nMacro F1 Score (binary multi-label):", round(f1, 4))

print("\nClassification Report:")
print(classification_report(true_labels, pred_labels, target_names=label_columns))


--- Gradient Report at Step 0, Epoch 1 ---
bert.embeddings.word_embeddings.weight: grad min = -0.006132, grad max = 0.004829
bert.embeddings.position_embeddings.weight: grad min = -0.006639, grad max = 0.005507
bert.embeddings.token_type_embeddings.weight: grad min = -0.024659, grad max = 0.023417
bert.embeddings.LayerNorm.weight: grad min = -0.003387, grad max = 0.001892
bert.embeddings.LayerNorm.bias: grad min = -0.001438, grad max = 0.001354
bert.encoder.layer.0.attention.self.query.weight: grad min = -0.001225, grad max = 0.000499
bert.encoder.layer.0.attention.self.query.bias: grad min = -0.000142, grad max = 0.000187
bert.encoder.layer.0.attention.self.key.weight: grad min = -0.000485, grad max = 0.000477
bert.encoder.layer.0.attention.self.key.bias: grad min = -0.000000, grad max = 0.000000
bert.encoder.layer.0.attention.self.value.weight: grad min = -0.003010, grad max = 0.002802
bert.encoder.layer.0.attention.self.value.bias: grad min = -0.000756, grad max = 0.001126
bert.enc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#torch.save(model.state_dict(), "Class_Assigner.pt")
#print("✅ Model state saved to: Class_Assigner.pt")

✅ Model state saved to: Class_Assigner.pt


In [7]:
model = BertBinaryMultiLabelClassifier(num_labels=5)
model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Class_Assigner.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.to(device)
model.eval()
print("✅ Mention Model Loaded...")

  model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Class_Assigner.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))


✅ Mention Model Loaded...


In [None]:
#client = openai.OpenAI(api_key= API)  # use your real key

def speller(user_review):
    system_prompt = """
You are a gamer who fixes spelling in reviews and follows the rules below. Return only the corrected review.

EXAMPLES:
Input: grafix is amazing
Corrected: graphics is amazing

Input: storii was boring but coope was fun
Corrected: story was boring but co-op was fun

Input: grameplay has glithces and chaters
Corrected: gameplay has glitches and cheaters

Input: fpss drop is bad and bgs
Corrected: fps drop is bad and bugs

IMPORTANT RULES:
1. If the word 'storyline' appears (or any variation like 'storyine'), always change it to 'story').
2. If the word 'glitches' appears, always change it to glitch.
3. If the word 'textures' appears, always change it to texture.

ONLY return the corrected review. No explanation.
"""

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_review}
        ],
        temperature=0,
        max_tokens=150
    )
    return response.choices[0].message.content.strip()

user_review = "now i do understand why the fuck people was mad at the end of the game, like actually what in the hell was that ending?"
print("Original:", user_review)
print("Corrected:", speller(user_review))

Original: now i do understand why the fuck people was mad at the end of the game, like actually what in the hell was that ending?
Corrected: now I do understand why the fuck people were mad at the end of the game, like actually what in the hell was that ending?


In [27]:
model.eval()

label_columns = ["gameplay_mechanics", "visual_quality", "story_depth","pricing_and_value", "bugs_and_stability", "user_suggestion"]

def classify_review(text, threshold=0.5):
    text =  str(text).replace('&gt', '').replace('\n', ' ').strip().lower()
    text = speller(text)
    encoded = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=256,
        return_tensors="pt"
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    results = {}
    for i, prob in enumerate(probs):
        results[label_columns[i]] = (int(prob >= threshold), round(prob, 3))

    print("\nPrediction (1 = mentioned, 0 = not):")
    for label, (pred, score) in results.items():
        print(f"{label:25}: {pred}  (confidence: {score:.3f})")

classify_review("This game will reward you with a high trust factor even after thousands of hours spent queueing with friends and playing fair — so now you'll enjoy matches with equally fair, skilled, and communicative teammates as a reward for your dedication. The best part is how smooth and consistent the experience feels, even when things get tense. If you can appreciate occasional bugs or quirks without letting them ruin the fun: absolutely give it a try. As a game, it's brilliantly crafted and incredibly addictive. While no system is perfect (some minor matchmaking inconsistencies or rare bugs), it still earns my full recommendation — both for casual players and competitive enthusiasts.")


Prediction (1 = mentioned, 0 = not):
gameplay_mechanics       : 1  (confidence: 0.996)


### Sentiment Overall Analysis          

In [6]:
df = pd.read_csv("/content/train.csv")
X = df["user_review"]
y = df["user_suggestion"].tolist()

In [7]:
X = X.apply(preprocess_text)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=0)

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")

train_dataset = TensorDataset(train_encodings['input_ids'],train_encodings['attention_mask'],torch.tensor(y_train).float().view(-1, 1))
val_dataset = TensorDataset(val_encodings['input_ids'],val_encodings['attention_mask'],torch.tensor(y_val).float().view(-1, 1))

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)

model = BertBinaryMultiLabelClassifier(num_labels=1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

#clip_value = 50.0
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        model.zero_grad()
        logits = model(b_input_ids, b_input_mask)
        loss = loss_fn(logits, b_labels.float())
        loss.backward()

        if step % 50 == 0:
            print(f"\n--- Gradient Report at Step {step}, Epoch {epoch+1} ---")
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    print(f"{name}: grad min = {param.grad.min():.6f}, grad max = {param.grad.max():.6f}")

        #clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        logits = model(b_input_ids, b_input_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(probs)
        true_labels.extend(b_labels.cpu().numpy())

pred_array = np.array(predictions)
pred_labels = (pred_array >= 0.5).astype(int)
true_labels = np.array(true_labels)

f1 = f1_score(true_labels, pred_labels)
print("\n F1 Score:", round(f1, 4))
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


--- Gradient Report at Step 0, Epoch 1 ---
bert.embeddings.word_embeddings.weight: grad min = -0.010083, grad max = 0.008088
bert.embeddings.position_embeddings.weight: grad min = -0.007905, grad max = 0.006482
bert.embeddings.token_type_embeddings.weight: grad min = -0.043356, grad max = 0.050525
bert.embeddings.LayerNorm.weight: grad min = -0.009820, grad max = 0.004467
bert.embeddings.LayerNorm.bias: grad min = -0.002695, grad max = 0.003100
bert.encoder.layer.0.attention.self.query.weight: grad min = -0.003737, grad max = 0.004054
bert.encoder.layer.0.attention.self.query.bias: grad min = -0.000776, grad max = 0.000616
bert.encoder.layer.0.attention.self.key.weight: grad min = -0.002317, grad max = 0.002978
bert.encoder.layer.0.attention.self.key.bias: grad min = -0.000000, grad max = 0.000000
bert.encoder.layer.0.attention.self.value.weight: grad min = -0.005181, grad max = 0.006767
bert.encoder.layer.0.attention.self.value.bias: grad min = -0.002481, grad max = 0.003607
bert.enc

In [12]:
def predict_sentiment(text, model, tokenizer, max_length=256):
    model.eval()

    #text = speller(text)
    text = preprocess_text(text)
    #text = preprocess_text1(text)
    #text = extract_context_window(text, window=2)
    #print(text)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        prob = torch.sigmoid(logits).item()

    #prob = adjust_prediction_with_negation(text, prob, flip_threshold=0.7)
    label = 1 if prob >= 0.51 else 0
    print(f"\n📝 Review: {text}")
    print(f"🔢 Predicted Sentiment: {label} (Confidence: {prob:.4f})")

    return label, prob
test_review = "this game ois so bad becuase the price and the graphics , paid for nothing"
predict_sentiment(test_review, model, tokenizer)


📝 Review: this game ois so bad becuase the price and the graphics , paid for nothing
🔢 Predicted Sentiment: 0 (Confidence: 0.0061)


(0, 0.006145811174064875)

SyntaxError: 'return' outside function (4238111109.py, line 1)

In [None]:
# torch.save(model.state_dict(), "Sentiment_Overall_Model.pt")
# print("✅ Model state saved to: Sentiment_Overall_Model.pt")

✅ Model state saved to: Sentiment_Overall_Model.pt


In [11]:
model = BertBinaryMultiLabelClassifier(num_labels=1)
model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Sentiment_Overall_Model.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.to(device)
model.eval()
print("✅ Gameplay Sentiment Overall Model loaded...")

  model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Sentiment_Overall_Model.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))


✅ Gameplay Sentiment Overall Model loaded...


### Sentiment Classes Model

In [None]:
df = pd.read_csv("/content/train.csv")
X = df["user_review"]
y = df["user_suggestion"].tolist()

In [None]:
X = X.apply(preprocess_text)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=0)

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding="max_length", max_length=256, return_tensors="pt")

train_dataset = TensorDataset(train_encodings['input_ids'],train_encodings['attention_mask'],torch.tensor(y_train).float().view(-1, 1))
val_dataset = TensorDataset(val_encodings['input_ids'],val_encodings['attention_mask'],torch.tensor(y_val).float().view(-1, 1))

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=32)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=32)

model = BertBinaryMultiLabelClassifier(num_labels=1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.BCEWithLogitsLoss()

#clip_value = 50.0
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_loader):
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        model.zero_grad()
        logits = model(b_input_ids, b_input_mask)
        loss = loss_fn(logits, b_labels.float())
        loss.backward()

        if step % 50 == 0:
            print(f"\n--- Gradient Report at Step {step}, Epoch {epoch+1} ---")
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    print(f"{name}: grad min = {param.grad.min():.6f}, grad max = {param.grad.max():.6f}")

        #clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader):.4f}")

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        b_input_ids, b_input_mask, b_labels = [b.to(device) for b in batch]
        logits = model(b_input_ids, b_input_mask)
        probs = torch.sigmoid(logits).cpu().numpy()
        predictions.extend(probs)
        true_labels.extend(b_labels.cpu().numpy())

pred_array = np.array(predictions)
pred_labels = (pred_array >= 0.5).astype(int)
true_labels = np.array(true_labels)

f1 = f1_score(true_labels, pred_labels)
print("\n F1 Score:", round(f1, 4))
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

In [14]:
gameplay_keywords = [
    "gameplay", "controls", "movement", "mechanics", "combat", "shooting", "driving",
    "jumping", "pacing", "balance", "physics", "interaction", "difficulty", "ai", 'playstyle',
    'play style', 'technique', 'techniques', 'cpu', 'play', 'difficult', "attack", "attack", "system",

]

visual_keywords = {
    "graphic", "graphics", "art", "drawing", "drawings", "style",
    "animation", "visual", "visuals","design", "texture", "render",
    "appearance", "resolution", "resolutions","color", "environment"
}

story_keywords = {
    "story", "storyline", "plot", "narrative", "writing", "dialogue", "script",
    "cutscene", "cutscenes", "character", "characters", "backstory",
    "development", "lore", "quest", "quests", "theme", "themes",
    "protagonist", "antagonist", "emotion", "emotional", "twist", "ending",
    "arc", "worldbuilding", "immersive", "immersion", "journey", "setting",
    "endings", "end", "narrator", "arcs"
}

pricing_keywords = {
    "price", "priced", "pricing", "cheap", "expensive", "cost", "costs", "value", "worth",
    "deal", "deals", "sale", "sales", "discount", "discounted", "money", "pay", "paid",
    "purchase", "purchased", "buy", "bought", "refund", "refunded", "dlc", "microtransaction",
    "microtransactions", "transaction", "transactions", "season pass", "loot box", "loot boxes",
    "in-game purchase", "in-game purchases", "in game", "add-on", "add-ons", "add on", "add ons","affordable", "overpriced", "underpriced",
    "wallet", "wallet-friendly", "pricing model", "price tag", "not worth", "worth it", "ripoff",
    "overcharge", "reasonable", "expensive", "free", "free-to-play", "freemium", "premium", "addons"
}

bug_keywords = {
    "bug", "bugs", "glitch", "glitches", "crash", "crashes", "crashed", "crashing",
    "lag", "laggy", "freeze", "freezes", "freezing", "stutter", "stuttering",
    "frame drop", "frame rate", "fps drop", "low fps", "optimization", "unoptimized",
    "unstable", "desync", "broken", "gamebreaking", "game-breaking", "issue", "issues",
    "problem", "problems", "technical", "update broke", "patched", "patch broke",
    "unplayable", "error", "errors", "connection issue", "server lag", "network lag",
    "buggy", "glitchy", "softlock", "hardlock", "lag spikes", "performance", "load times",
    "loading bug", "input lag", "jank", "debug", "runtime error", "black screen",
    "infinite loading", "flickering", "corrupt save", "save bug", "fix client", "client",
    "stuck", "stick"
}

In [None]:
def extract_context_window(text, window=4):
    words = re.findall(r"\w+", str(text).lower())
    
    indices = [i for i, word in enumerate(words) if word in visual_keywords]
    if not indices:
        return ""
    i = indices[0]
    start = max(0, i - window)
    end = min(len(words), i + window + 1)
    return " ".join(words[start:end])


def predict_sentiment(text, model, tokenizer, max_length=256):
    model.eval()

    text = speller(text)
    text = preprocess_text(text)
    text = preprocess_text1(text)
    text = extract_context_window(text, window=2)
    #print(text)

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        prob = torch.sigmoid(logits).item()

    #prob = adjust_prediction_with_negation(text, prob, flip_threshold=0.7)
    label = 1 if prob >= 0.51 else 0
    print(f"\n📝 Review: {text}")
    print(f"🔢 Predicted Sentiment: {label} (Confidence: {prob:.4f})")

    return label, prob
test_review = "this game ois so bad becuase the price and the graphics , paid for nothing"
predict_sentiment(test_review, model, tokenizer)


📝 Review: bad price graphics pay nothing
🔢 Predicted Sentiment: 0 (Confidence: 0.0588)


(0, 0.05880578234791756)

In [None]:
# torch.save(model.state_dict(), "Sentiment_Classes_Model_Final.pt")
# print("✅ Model state saved to: Sentiment_Classes_Model_Final.pt")

In [13]:
model = BertBinaryMultiLabelClassifier(num_labels=1)
model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Sentiment_Overall_Model.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))
model.to(device)
model.eval()
print("✅ Gameplay Sentiment Classes Model loaded...")

  model.load_state_dict(torch.load("C:/Users/ASUS/Desktop/University/second semester 2025 (END)/Capstone2/Sentiment_Overall_Model.pt", map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu')))


✅ Gameplay Sentiment Classes Model loaded...


In [24]:
def extract_context_window(text, window=4):
    words = re.findall(r"\w+", str(text).lower())
    all_keywords = set(gameplay_keywords) | visual_keywords | story_keywords | pricing_keywords | bug_keywords
    indices = [i for i, word in enumerate(words) if word in all_keywords]
    if not indices:
        return ""
    i = indices[0]
    start = max(0, i - window)
    end = min(len(words), i + window + 1)
    return " ".join(words[start:end])

def get_matched_keywords(text, keyword_set):
    words = re.findall(r"\w+", text.lower())
    return list(set(words) & keyword_set)

def predict_sentiment(text, model, tokenizer, max_length=256):
    model.eval()

    original_text = text  # Save the original for keyword extraction
    text = speller(text)
    text = preprocess_text(text)
    text = preprocess_text1(text)
    text = extract_context_window(text, window=2)

    # Extract matched keywords for explainability
    matched_keywords = get_matched_keywords(original_text, pricing_keywords.union(visual_keywords, story_keywords, gameplay_keywords, bug_keywords))

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        prob = torch.sigmoid(logits).item()

    label = 1 if prob >= 0.51 else 0
    sentiment_text = "positive" if label == 1 else "negative"

    print(f"\n📝 Review: {text}")
    print(f"🔍 Detected Keywords: {matched_keywords}")
    print(f"🔢 Predicted Sentiment: {label} (Confidence: {prob:.4f})")

    if matched_keywords:
        reasons = ", ".join(matched_keywords)
        print(f"🗯️ The game review is {sentiment_text}, likely due to: {reasons}.")
    else:
        print(f"🗯️ The game review is {sentiment_text}.")

    return label, prob, matched_keywords



In [26]:
test_review = "This game will reward you with a high trust factor even after thousands of hours spent queueing with friends and playing fair — so now you'll enjoy matches with equally fair, skilled, and communicative teammates as a reward for your dedication. The best part is how smooth and consistent the experience feels, even when things get tense. If you can appreciate occasional bugs or quirks without letting them ruin the fun: absolutely give it a try. As a game, it's brilliantly crafted and incredibly addictive. While no system is perfect (some minor matchmaking inconsistencies or rare bugs), it still earns my full recommendation — both for casual players and competitive enthusiasts."
predict_sentiment(test_review, model, tokenizer)



📝 Review: queue friends play fair enjoy
🔍 Detected Keywords: ['bugs', 'system']
🔢 Predicted Sentiment: 1 (Confidence: 0.9795)
🗯️ The game review is positive, likely due to: bugs, system.


(1, 0.9794834852218628, ['bugs', 'system'])