In [93]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import torch, re
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
import time

torch.manual_seed(1)

<torch._C.Generator at 0x7f9bb670da50>

In [97]:
conn = sqlite3.connect("scraped_data.db")

df = pd.read_sql_query("SELECT * FROM reviews", conn)
#sample_df = df.sample(n=6500, random_state=42)  # random_state for reproducibility

# Rows with at least one NaN
rows_with_nan = df[df.isna().any(axis=1)]
print("Number of Rows without score:", len(rows_with_nan))

# Drop all rows with at least one NaN
df_clean = df.dropna()

#Test on only few data examples
#df_clean = df.sample(n=200, random_state=42)  # random_state for reproducibility


X = df_clean['Tokenized_Long_Text']
Y = df_clean['Score']
pd.set_option('display.max_colwidth', None)


Number of Rows without score: 5091


In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)


In [99]:
def simple_tokenise(text):
    # very basic: split on non‑letters, lowercase
    return re.findall(r"[A-Za-z0-9']+", text.lower())

# ➊ tokenise every review
tokenised_train = [simple_tokenise(t) for t in X_train]
tokenised_val   = [simple_tokenise(t) for t in X_val]
tokenised_test  = [simple_tokenise(t) for t in X_test]

# ➋ count words and map to indices
counter = Counter(w for sent in tokenised for w in sent)
specials = ["<pad>", "<unk>"]
stoi = {w:i+len(specials) for i,(w,_) in enumerate(counter.most_common())}
stoi.update({sp:i for i,sp in enumerate(specials)})  # add specials
unk, pad = stoi["<unk>"], stoi["<pad>"]

In [100]:
max_len = 300  # truncate / pad length

def encode(sent):
    ids = [stoi.get(w, unk) for w in sent][:max_len]
    return torch.tensor(ids + [pad]*(max_len-len(ids)), dtype=torch.long)

encoded_train = torch.stack([encode(s) for s in tokenised_train])
encoded_val   = torch.stack([encode(s) for s in tokenised_val])
encoded_test  = torch.stack([encode(s) for s in tokenised_test])


In [101]:
y_train = torch.tensor(y_train.values - 1, dtype=torch.long)  
y_val = torch.tensor(y_val.values - 1, dtype=torch.long)
y_test = torch.tensor(y_test.values - 1, dtype=torch.long)

In [102]:
batch_size = 64
train_loader = DataLoader(
    TensorDataset(encoded_train, y_train), shuffle=True, batch_size=batch_size
)
val_loader = DataLoader(
    TensorDataset(encoded_val, y_val), shuffle=False, batch_size=batch_size
)
test_loader = DataLoader(
    TensorDataset(encoded_test, y_test), shuffle=False, batch_size=batch_size
)

In [103]:
class ReviewLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, lstm_units=128, num_classes=10):
        super().__init__()
        self.emb  = nn.Embedding(vocab_size, embed_dim, padding_idx=pad)
        self.lstm = nn.LSTM(embed_dim, lstm_units, batch_first=True, bidirectional=True)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc   = nn.Linear(lstm_units*2, num_classes)    # change to 1 for regression

    def forward(self, x):
        x = self.emb(x)                         # (B, L, E)
        x, _ = self.lstm(x)                     # (B, L, 2H)
        x = self.pool(x.transpose(1,2)).squeeze(-1)  # (B, 2H)
        return self.fc(x)                       # (B, C)

model = ReviewLSTM(len(stoi))


In [104]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using", device)

Using cpu


In [105]:
# ── 2.  Model / optimiser / loss ────────────────────────────────
model = ReviewLSTM(vocab_size=len(stoi)).to(device)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=1e-3)

# ── 3.  Training loop with early stopping ───────────────────────
patience = 2
best_val_acc = 0
epochs_no_improve = 0
num_epochs = 5

In [106]:
for epoch in range(1, num_epochs + 1):
    t0 = time.time()
    # ---- train ----
    model.train()
    train_loss = 0
    for Xb, yb in train_loader:
        Xb, yb = Xb.to(device), yb.to(device)
        optimiser.zero_grad()
        logits = model(Xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimiser.step()
        train_loss += loss.item() * Xb.size(0)

    # ---- validate ----
    model.eval()
    val_loss, preds, gts = 0, [], []
    with torch.no_grad():
        for Xb, yb in val_loader:
            Xb, yb = Xb.to(device), yb.to(device)
            logits = model(Xb)
            loss = criterion(logits, yb)
            val_loss += loss.item() * Xb.size(0)
            preds.extend(logits.argmax(1).cpu().tolist())
            gts.extend(yb.cpu().tolist())

    train_loss /= len(train_loader.dataset)
    val_loss   /= len(val_loader.dataset)
    val_acc     = accuracy_score(gts, preds)

    print(f"Epoch {epoch:2d} | "
          f"train loss {train_loss:.4f} | "
          f"val loss {val_loss:.4f} | "
          f"val acc {val_acc*100:5.2f}% | "
          f"time {time.time()-t0:.1f}s")


    # ---- early stopping ----
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_review_lstm.pth")
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break

Epoch  1 | train loss 1.8327 | val loss 1.8846 | val acc 30.20% | time 4.2s
Epoch  2 | train loss 1.6932 | val loss 1.8393 | val acc 30.20% | time 3.9s
Epoch  3 | train loss 1.6453 | val loss 1.8330 | val acc 25.50% | time 3.8s
Early stopping triggered.


In [107]:
# ── 4.  Load best & evaluate on test set ────────────────────────
#model.load_state_dict(torch.load("best_review_lstm.pth"))
model.eval()
preds, gts = [], []
with torch.no_grad():
    for Xb, yb in test_loader:
        Xb = Xb.to(device)
        logits = model(Xb)
        preds.extend(logits.argmax(1).cpu().tolist())
        gts.extend(yb.tolist())
        
test_acc = accuracy_score(gts, preds)
print(f"\nTest accuracy: {test_acc*100:.2f}%")

# optional: confusion matrix
cm = confusion_matrix(gts, preds)
print("Confusion‑matrix (rows=true, cols=pred):\n", cm)


Test accuracy: 26.15%
Confusion‑matrix (rows=true, cols=pred):
 [[ 0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  6  0  0  0]
 [ 0  0  0  0  0  0 15  1  0  0]
 [ 0  0  0  0  0  0 22  2  0  0]
 [ 0  0  0  0  0  0 47  7  0  0]
 [ 0  0  0  0  0  0 80 15  0  0]
 [ 0  0  0  0  0  0 98 16  0  0]
 [ 0  0  0  0  0  0 48 10  1  0]
 [ 0  0  0  0  0  0  1  0  0  0]]
