In [None]:
!pip install -U gdown -q

In [None]:
import ast
import os

import gdown
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

In [None]:
# Google Drive file ID and the direct-download URL built from it
file_id = '1xeQGP9I99eTMasv8GVUtBhuhsgpt-AqW'
url = f'https://drive.google.com/uc?id={file_id}'

# Local path the dataset is stored at
DATA_PATH = 'data.csv'

# Download only when no local copy exists, so re-running the notebook
# does not fetch the ~145 MB file again.
if not os.path.exists(DATA_PATH):
    gdown.download(url, DATA_PATH, quiet=False)

# Read the CSV. low_memory=False makes pandas infer each column's dtype from
# the whole column instead of chunk by chunk, which avoids the mixed-type
# DtypeWarning that chunked inference can emit on large files.
df = pd.read_csv(DATA_PATH, low_memory=False)

# Preview the first 5 rows
df.head()

Downloading...
From (original): https://drive.google.com/uc?id=1xeQGP9I99eTMasv8GVUtBhuhsgpt-AqW
From (redirected): https://drive.google.com/uc?id=1xeQGP9I99eTMasv8GVUtBhuhsgpt-AqW&confirm=t&uuid=eec542a6-31fe-4e28-a9be-23dcdc249b3c
To: /content/data.csv
100%|██████████| 145M/145M [00:02<00:00, 56.2MB/s]
  df = pd.read_csv('data.csv')


Unnamed: 0,ratings,title,text,author,date_stayed,offering_id,num_helpful_votes,date,id,via_mobile
0,"{'service': 3.0, 'cleanliness': 3.0, 'overall'...",“Great location!”,This hotel is average as far as NYC hotels go....,"{'username': 'whatever240', 'num_cities': 11, ...",June 2012,99387.0,0.0,2012-06-08,131585395.0,False
1,{'overall': 3.0},“W is for Whatever”,We spent a girls' getaway overnight. At check-...,"{'username': 'jetsetsugar', 'num_cities': 5, '...",October 2009,112066.0,0.0,2009-11-30,50382077.0,False
2,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“A REAL New York Experience”,At first we were a little unsure of what to ex...,"{'username': 'Sammifli', 'num_cities': 18, 'nu...",December 2010,1791248.0,1.0,2011-01-21,93942470.0,False
3,"{'cleanliness': 5.0, 'sleep_quality': 5.0, 'ov...",“Enjoyable stay”,We booked a King Premier as they are at the ba...,"{'username': 'LadytravelsEssex', 'num_cities':...",May 2011,217616.0,1.0,2011-06-05,111617949.0,False
4,"{'service': 5.0, 'cleanliness': 5.0, 'overall'...",“Had a fun Time”,Thanks to All the staff for the friendly and w...,"{'username': 'Barry C', 'num_cities': 4, 'num_...",October 2011,1858923.0,0.0,2012-01-06,122653425.0,False


In [None]:
def _extract_overall(raw):
    """Return the 'overall' score from one cell of the `ratings` column.

    `raw` is normally the string form of a dict (e.g. "{'overall': 3.0}").
    An already-parsed dict is accepted as well. Any other value (such as a
    missing/NaN cell) yields None, as does a dict without an 'overall' key.
    A string that is not a valid Python literal still raises, so corrupt
    data is not silently hidden.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    if not isinstance(raw, dict):
        return None
    return raw.get('overall')


# Guarded so the cell can be re-run: after the first run the `ratings`
# column no longer exists and only the row filter below is applied.
if 'ratings' in df.columns:
    df['rating_overall'] = df['ratings'].map(_extract_overall)
    df = df.drop(columns=['ratings'])

# Reviews without an overall score cannot be given a sentiment label;
# drop them instead of letting them fall through to a default label later.
df = df.dropna(subset=['rating_overall']).reset_index(drop=True)

In [None]:
def label_sentiment(x):
    """Map a numeric overall rating to a sentiment label.

    Returns 'positive' for ratings of 4 or more, 'negative' for ratings of
    2 or less, and 'neutral' for everything else. Because both comparisons
    are false for NaN, a NaN rating also ends up as 'neutral'.
    """
    if x >= 4:
        return 'positive'
    if x <= 2:
        return 'negative'
    return 'neutral'

# Derive the sentiment label of every review from its overall rating.
df['sentiment'] = df['rating_overall'].map(label_sentiment)


In [None]:
# TF-IDF vector hóa
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['text'])

# Mã hóa sentiment: 0-negative, 1-neutral, 2-positive
label_encoder = LabelEncoder()
y_class = label_encoder.fit_transform(df['sentiment'])

In [None]:
# Multiclass logistic regression on the TF-IDF features.
# `multi_class` is not passed explicitly: the parameter is deprecated in
# recent scikit-learn releases, and with the lbfgs solver a multinomial
# model is already what is fitted for a multiclass target.
clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X, y_class)

# Probability of the 'positive' class for every review.
# predict_proba columns follow clf.classes_ (the encoded labels seen during
# fit), so the column is looked up through the encoded value of 'positive'
# rather than assumed to match its position in label_encoder.classes_.
positive_label = label_encoder.transform(['positive'])[0]
class_index = list(clf.classes_).index(positive_label)
y_score = clf.predict_proba(X)[:, class_index]




In [None]:
# Single seed shared by the train/validation split and the SVD projection,
# so every number printed below is reproducible across runs.
SEED = 42

# Display label -> fraction of the rows held out for validation.
ratios = {
    '4:1': 0.2,
    '7:3': 0.3,
    '6:4': 0.4
}

# Regressors to compare, keyed by the name used in the printed report.
models = {
    'Linear Regression': LinearRegression(),
    'XGBoost': XGBRegressor(n_estimators=50, max_depth=6, learning_rate=0.1, n_jobs=-1, verbosity=0)
}

# Keep one third of the feature dimensions (at least one component).
n_components = max(1, X.shape[1] // 3)


def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model` on (X_fit, y_fit) and return (R2, MSE) on (X_eval, y_eval)."""
    model.fit(X_fit, y_fit)
    y_hat = model.predict(X_eval)
    return r2_score(y_eval, y_hat), mean_squared_error(y_eval, y_hat)


for label, test_ratio in ratios.items():
    print(f"\n==== TỈ LỆ TRAIN:VALIDATION = {label} ====")

    # 1. Train/validation split
    X_train, X_test, y_train, y_test = train_test_split(X, y_score, test_size=test_ratio, random_state=SEED)

    # 2. Dimensionality reduction, fitted on the training part only
    svd = TruncatedSVD(n_components=n_components, random_state=SEED)
    X_train_reduced = svd.fit_transform(X_train)
    X_test_reduced = svd.transform(X_test)

    # Report tag -> (training features, validation features):
    # 'Gốc' is the original feature matrix, 'Giảm' the SVD-reduced one.
    feature_sets = (
        ('Gốc', X_train, X_test),
        ('Giảm', X_train_reduced, X_test_reduced),
    )

    # 3. Refit every model on each feature set and report R2 / MSE
    for name, model in models.items():
        for tag, X_fit, X_eval in feature_sets:
            r2, mse = _fit_and_score(model, X_fit, y_train, X_eval, y_test)
            print(f"{name} | {tag} | R2: {r2:.4f} | MSE: {mse:.4f}")



==== TỈ LỆ TRAIN:VALIDATION = 4:1 ====
Linear Regression | Gốc | R2: 0.8890 | MSE: 0.0106
Linear Regression | Giảm | R2: 0.8293 | MSE: 0.0163
XGBoost | Gốc | R2: 0.6442 | MSE: 0.0341
XGBoost | Giảm | R2: 0.7600 | MSE: 0.0230

==== TỈ LỆ TRAIN:VALIDATION = 7:3 ====
Linear Regression | Gốc | R2: 0.8871 | MSE: 0.0108
Linear Regression | Giảm | R2: 0.8355 | MSE: 0.0157
XGBoost | Gốc | R2: 0.6446 | MSE: 0.0339
XGBoost | Giảm | R2: 0.7625 | MSE: 0.0227

==== TỈ LỆ TRAIN:VALIDATION = 6:4 ====
Linear Regression | Gốc | R2: 0.8864 | MSE: 0.0108
Linear Regression | Giảm | R2: 0.8332 | MSE: 0.0158
XGBoost | Gốc | R2: 0.6419 | MSE: 0.0340
XGBoost | Giảm | R2: 0.7572 | MSE: 0.0231
