In [56]:
import sqlite3
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, random_split

In [39]:
conn = sqlite3.connect("scraped_data.db")

df = pd.read_sql_query("SELECT Preprocessed_Long_Text, Score FROM reviews", conn)
#sample_df = df.sample(n=6500, random_state=42)  # random_state for reproducibility

# Rows with at least one NaN
rows_with_nan = df[df.isna().any(axis=1)]
print("Number of Rows without score:", len(rows_with_nan))

# Drop all rows with at least one NaN
df_clean = df.dropna()


X = df_clean['Preprocessed_Long_Text']
Y = df_clean['Score']

Number of Rows without score: 19


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

#Ridge Regression
model = make_pipeline(TfidfVectorizer(), Ridge())
model.fit(X_train, y_train)

preds = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, preds))

RMSE: 1.4377544929197066


In [60]:
# 1. Scale Y
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(Y.to_numpy().reshape(-1, 1)).flatten()

# 2. TF-IDF -> takes bigrams input
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_tfidf = vectorizer.fit_transform(X).toarray()

# 3. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_scaled, test_size=0.2, random_state=42)

# 4. Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# 5. Validation split
full_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)


In [61]:
# 6. Model definition (keep your deeper model)
class RegressionModel(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)

model = RegressionModel(input_dim=X_train.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

# 7. Training with early stopping
best_val_loss = float('inf')
patience = 3
patience_counter = 0
epochs = 50

for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # Validation loss
    model.eval()
    total_val_loss = 0
    with torch.no_grad():
        for val_x, val_y in val_loader:
            val_preds = model(val_x)
            val_loss = criterion(val_preds, val_y)
            total_val_loss += val_loss.item()
    avg_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    # Early stopping
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model_state = model.state_dict()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# 8. Evaluation on test set
model.load_state_dict(best_model_state)
model.eval()
with torch.no_grad():
    test_preds = model(X_test_tensor).cpu().numpy()
    test_preds_rescaled = scaler_y.inverse_transform(test_preds)
    y_test_rescaled = scaler_y.inverse_transform(y_test_tensor.cpu().numpy())
    rmse = mean_squared_error(y_test_rescaled, test_preds_rescaled)
    print("Final RMSE:", rmse)

Epoch 1, Train Loss: 1.0019, Val Loss: 0.9694
Epoch 2, Train Loss: 0.8949, Val Loss: 0.6682
Epoch 3, Train Loss: 0.4563, Val Loss: 0.2958
Epoch 4, Train Loss: 0.2510, Val Loss: 0.2663
Epoch 5, Train Loss: 0.2004, Val Loss: 0.2547
Epoch 6, Train Loss: 0.1636, Val Loss: 0.2536
Epoch 7, Train Loss: 0.1360, Val Loss: 0.2512
Epoch 8, Train Loss: 0.1169, Val Loss: 0.2596
Epoch 9, Train Loss: 0.1019, Val Loss: 0.2601
Epoch 10, Train Loss: 0.0841, Val Loss: 0.2601
Early stopping triggered.
Final RMSE: 1.2136303186416626


In [62]:
# do sentences: 