In [7]:
# ---- Run configuration (edit here, then re-run the notebook) ----
DATA_PATH = "filmReview_1.csv"     # input CSV file with the reviews
USE_BERT = False                   # set to True to use BERT embeddings
MAX_WORDS, MAX_LEN = 20000, 200    # tokenizer vocabulary limit, padded sequence length
EPOCHS = 5                         # presumably the number of training epochs; not used in the cells shown here

# =========================
# 1. Load & preprocess
# =========================
import re, pandas as pd
from sklearn.model_selection import train_test_split

def clean_text(s):
    """Normalize one raw review into lowercase, single-space-separated text.

    Steps, in order: replace HTML-like tags (``<...>``) with a space, replace
    every run of characters other than ASCII letters, digits, apostrophes and
    spaces with a space, collapse whitespace, trim, and lowercase.

    Args:
        s: The raw review. Non-string values are converted with ``str()``.
            ``None`` and float NaN (the usual marker for a missing cell in a
            pandas column) are treated as empty text.

    Returns:
        The cleaned string; ``""`` when the input is missing.
    """
    # Without this guard, str(None) / str(nan) would inject the literal
    # tokens "none" / "nan" into the corpus as if they were review words.
    if s is None or (isinstance(s, float) and s != s):  # NaN is the only float != itself
        return ""

    text = re.sub(r"<.*?>", " ", str(s))             # drop tags such as <br />
    text = re.sub(r"[^0-9a-zA-Z' ]+", " ", text)     # keep alphanumerics, apostrophes, spaces
    return re.sub(r"\s+", " ", text).strip().lower()

# Read the reviews and derive the two columns used downstream:
#   clean_text - normalized review text
#   label      - 1 when rating >= 6, otherwise 0
df = pd.read_csv(DATA_PATH).assign(
    clean_text=lambda frame: frame["review_text"].apply(clean_text),
    label=lambda frame: (frame["rating"] >= 6).astype(int),
)

# Hold out 15% of the rows for validation, stratified on the label so both
# splits keep the same class balance; fixed random_state for a repeatable split.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["label"],
    random_state=42,
)

print("Train size:", len(train_df), " Val size:", len(val_df))

Train size: 4250  Val size: 750


In [8]:
import numpy as np

# Build the model inputs (X_train / X_val) from the cleaned text of each split.
if USE_BERT:
    # Encode the cleaned reviews with a pretrained SentenceTransformer model.
    from sentence_transformers import SentenceTransformer
    bert = SentenceTransformer("all-MiniLM-L6-v2")

    X_train = bert.encode(train_df["clean_text"].tolist(), batch_size=64, show_progress_bar=True)
    X_val   = bert.encode(val_df["clean_text"].tolist(), batch_size=64, show_progress_bar=True)

else:
    # Word2Vec + tokenizer for CNN/RNN
    try:
        from gensim.models import Word2Vec
    except ImportError as exc:
        # Older gensim releases import `triu` from scipy.linalg, which newer
        # SciPy releases no longer provide; surface an actionable message
        # instead of the bare import failure.
        raise ImportError(
            f"Could not import gensim's Word2Vec ({exc}). "
            "If the error mentions 'triu' from 'scipy.linalg', the installed gensim "
            "is incompatible with the installed SciPy: upgrade gensim "
            "(`pip install -U gensim`) or install an older SciPy "
            "(`pip install \"scipy<1.13\"`), then restart the kernel."
        ) from exc
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # Word2Vec is fit on the training split only. `w2v` is not used again in
    # this cell - presumably a later cell consumes it; verify.
    sentences = [t.split() for t in train_df["clean_text"]]
    w2v = Word2Vec(sentences, vector_size=128, window=5, min_count=2, workers=4, epochs=5)

    # The tokenizer is also fit on training text only, so the validation
    # split does not influence the vocabulary.
    tokenizer = Tokenizer(num_words=MAX_WORDS)
    tokenizer.fit_on_texts(train_df["clean_text"])

    # Integer sequences padded/truncated to MAX_LEN tokens per review.
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_df["clean_text"]), maxlen=MAX_LEN)
    X_val   = pad_sequences(tokenizer.texts_to_sequences(val_df["clean_text"]), maxlen=MAX_LEN)

# Labels are identical for both feature types, so extract them once here
# rather than in each branch.
y_train, y_val = train_df["label"].values, val_df["label"].values

print("Embedding ready:", X_train.shape, y_train.shape)

ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\Users\phamt\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\linalg\__init__.py)