<a href="https://colab.research.google.com/github/annabelleshea/yelp_final/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [1]:
# Colab-only: mount Google Drive at /content/drive so files stored under
# MyDrive can be read with ordinary file paths. Prompts for authorisation
# the first time it runs in a session.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the cleaned review data on the mounted Google Drive.
CLEANED_DATA_CSV = '/content/drive/MyDrive/MGSC 410 Final Project/cleaned_data.csv'
data = pd.read_csv(CLEANED_DATA_CSV)

In [4]:
# Keep only the rows whose `user_rating_variance` is present.
# No other column is checked for missing values here.
has_rating_variance = data['user_rating_variance'].notna()
data = data[has_rating_variance]

In [None]:
# Model input columns, one per line so additions/removals diff cleanly.
# The order here is the column order of every X matrix built from this list.
features = [
    'review_count_x',
    'average_stars',
    'restaurant_review_count',
    'latitude',
    'is_open',
    'longitude',
    'Alcohol_full_bar',
    'Alcohol_beer_and_wine',
    'RestaurantsDelivery',
    'HasTV',
    'RestaurantsTableService',
    'RestaurantsGoodForGroups',
    'DriveThru',
    'Parking_street',
    'touristy',
    'intimate',
    'classy',
    'RestaurantsAttire_casual',
    'user_AvgRestaurantCat_rating',
    'latenight',
    'sentiment_score',
    'hour_of_day',
    'is_weekend',
    'is_in_best_food_city',
]

# Prepare the data

def prepare_time_aware_split(data, feature_cols=None, target_col='rating'):
    """Split reviews into train / validation / test sets by recency per user.

    For every ``user_id`` the row(s) carrying that user's latest ``date``
    form the test set. From the remaining rows, the row(s) with the latest
    date per user form the validation ("cv") set. Everything left over is
    training data.

    Consequences of this rule:

    * a user with a single review appears only in the test set;
    * a user with two distinct review dates contributes nothing to training;
    * rows of one user that share the same latest date all land in the
      same split.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``user_id``, ``date``, ``target_col`` and every column
        in ``feature_cols``. The frame is NOT modified; date parsing happens
        on a copy.
    feature_cols : list of str, optional
        Columns returned in the X frames. Defaults to the module-level
        ``features`` list, which keeps existing one-argument calls working.
    target_col : str, default 'rating'
        Column returned as the y series.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_cv, y_cv, X_test, y_test)``; X values are
        DataFrames, y values are Series, all ordered by user then date.
    """
    if feature_cols is None:
        feature_cols = features
    else:
        # A tuple would be treated as a single column key by DataFrame.__getitem__.
        feature_cols = list(feature_cols)

    # assign() returns a new frame, so the caller's `date` column keeps its
    # original dtype instead of being silently converted in place.
    ordered = (
        data
        .assign(date=pd.to_datetime(data['date']))
        .sort_values(['user_id', 'date'])
    )

    # Latest review(s) of each user -> test set.
    is_test = ordered.groupby('user_id')['date'].transform('max') == ordered['date']
    test_rows = ordered[is_test]
    remaining = ordered[~is_test]

    # Latest of what is left for each user -> validation set.
    is_cv = remaining.groupby('user_id')['date'].transform('max') == remaining['date']
    cv_rows = remaining[is_cv]
    train_rows = remaining[~is_cv]

    return (
        train_rows[feature_cols], train_rows[target_col],
        cv_rows[feature_cols], cv_rows[target_col],
        test_rows[feature_cols], test_rows[target_col],
    )

# Work on a reproducible random subset (up to 50,000 rows) of the data.
# The size is capped at len(data) because DataFrame.sample raises ValueError
# when n exceeds the number of rows and replace=False (the default).
sample_size = min(50000, len(data))
sample_data = data.sample(n=sample_size, random_state=42)
X_train, y_train, X_cv, y_cv, X_test, y_test = prepare_time_aware_split(sample_data)

# Two-step estimator: standardise the inputs, then fit an XGBoost regressor
# configured with a squared-error objective and RMSE as its evaluation metric.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor(objective='reg:squarederror', eval_metric='rmse')),
])

# Candidate hyperparameter values. The 'xgb__' prefix addresses each setting
# to the pipeline step named 'xgb'.
param_grid = {
    f'xgb__{hyperparameter}': candidates
    for hyperparameter, candidates in (
        ('n_estimators', [100, 200, 300]),
        ('max_depth', [3, 5, 7]),
        ('learning_rate', [0.01, 0.05, 0.1]),
        ('subsample', [0.6, 0.8, 1.0]),
        ('colsample_bytree', [0.6, 0.8, 1.0]),
        ('gamma', [0, 0.1, 0.2]),
    )
}

from sklearn.model_selection import RandomizedSearchCV

# Randomised hyperparameter search over `param_grid`, scored by negative MSE
# with 5-fold cross-validation on the training rows.
# NOTE(review): the X_cv / y_cv hold-out returned by the time-aware split is
# not used by this search; the folds are drawn from X_train only. Confirm
# whether that is intended.
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=100,  # Sample 100 of the 3**6 = 729 possible combinations
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42
)

# Train the model. Fit exactly once: a second identical fit() call repeats
# all 100 x 5 cross-validation fits and, with the fixed random_state, selects
# the same parameters, so it only doubles the runtime.
random_search.fit(X_train, y_train)

# Output the best hyperparameters found by the randomised search
# (the label names the search class that was actually used).
print("Best parameters from RandomizedSearchCV:", random_search.best_params_)

# Evaluate the best pipeline found by the search on the held-out test rows.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)

In [None]:
# Additional test-set metrics for the predictions in `y_pred`.
r2 = r2_score(y_test, y_pred)
# RMSE is computed as sqrt(MSE) rather than via `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises TypeError. This form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)

print(f"Test RMSE: {rmse}")
print(f"Test MAE: {mae}")
print(f"Test R²: {r2}")