CatBoost Google Maps

In [None]:
# Install the listed packages through an IPython shell escape (notebook-only syntax).
!pip install catboost scikit-learn pandas tqdm optuna

# Mount Google Drive at /content/drive; the dataset path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd, json, numpy as np, optuna, random
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm import tqdm
from collections import defaultdict

# Load the review dump: the file is parsed as one JSON object per line.
file_path = '/content/drive/MyDrive/review-South_Carolina_10.json'
# The context manager closes the file handle deterministically (a bare open()
# inside the comprehension leaves it to the garbage collector). Blank lines are
# skipped so a stray empty line cannot crash json.loads, and the encoding is
# pinned because JSON text is UTF-8 regardless of the platform default.
with open(file_path, encoding='utf-8') as fh:
    data = [json.loads(line) for line in fh if line.strip()]

# Keep only the four interaction columns and drop rows missing any of them.
df = pd.DataFrame(data)[['user_id', 'gmap_id', 'rating', 'time']].dropna()
# 'time' is interpreted as epoch milliseconds.
df['timestamp'] = pd.to_datetime(df['time'], unit='ms')
# Min-max scale the raw timestamp to [0, 1].
# NOTE(review): if every row shares the same 'time', the denominator is 0 and
# time_norm becomes NaN -- confirm that cannot happen for this dataset.
time_min, time_max = df['time'].min(), df['time'].max()
df['time_norm'] = (df['time'] - time_min) / (time_max - time_min)

# Feature engineering
# Per-user and per-item aggregates (review count, mean rating, age in days of
# the first review relative to the newest review in df) are joined onto every
# interaction row.
# NOTE(review): these aggregates are computed over all of df, and the
# train/val/test split is made afterwards, so each row's own rating feeds its
# user/item mean -- confirm this target leakage is acceptable.
latest_review = df['timestamp'].max()

user_stats = (
    df.groupby('user_id')
    .agg(
        user_num_reviews=pd.NamedAgg(column='rating', aggfunc='count'),
        user_avg_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
        user_first_review=pd.NamedAgg(column='timestamp', aggfunc='min'),
    )
    .reset_index()
)
user_stats['user_days_since_first'] = (latest_review - user_stats['user_first_review']).dt.days

item_stats = (
    df.groupby('gmap_id')
    .agg(
        item_num_reviews=pd.NamedAgg(column='rating', aggfunc='count'),
        item_avg_rating=pd.NamedAgg(column='rating', aggfunc='mean'),
        item_first_review=pd.NamedAgg(column='timestamp', aggfunc='min'),
    )
    .reset_index()
)
item_stats['item_days_since_first'] = (latest_review - item_stats['item_first_review']).dt.days

# Attach the aggregates (without the raw first-review timestamps) to each row,
# then discard the helper datetime column.
user_cols = user_stats.drop(columns='user_first_review')
item_cols = item_stats.drop(columns='item_first_review')
df = (
    df.merge(user_cols, on='user_id', how='left')
    .merge(item_cols, on='gmap_id', how='left')
    .drop(columns=['timestamp'])
)

# Keep interactions whose user AND item each have at least 3 reviews. Counts
# are the pre-filter ones computed above; they are not recomputed afterwards.
has_enough_history = (df['user_num_reviews'] >= 3) & (df['item_num_reviews'] >= 3)
df = df[has_enough_history]

# === 3. Split Dataset ===
# Two-stage random split with a fixed seed: 20% of df is held out as the test
# set, then 25% of the remainder becomes validation (roughly 60/20/20 overall).
train_val_df, test_df = train_test_split(df, random_state=42, test_size=0.2)
train_df, val_df = train_test_split(train_val_df, random_state=42, test_size=0.25)

# Model inputs: the two ID columns are categorical; everything else is numeric.
cat_features = ['user_id', 'gmap_id']
num_features = (
    ['time_norm']
    + ['user_num_reviews', 'user_avg_rating', 'user_days_since_first']
    + ['item_num_reviews', 'item_avg_rating', 'item_days_since_first']
)
all_features = [*cat_features, *num_features]

# Train the rating regressor on train_df and report RMSE / MAE on test_df.
train_pool = Pool(train_df[all_features], label=train_df['rating'], cat_features=cat_features)
test_pool = Pool(test_df[all_features], cat_features=cat_features)

# NOTE(review): val_df is never passed to fit() as an eval_set, so all 1000
# iterations always run with no early stopping -- confirm that is intended.
# task_type="GPU" with devices='0' means training requires GPU device 0.
final_model = CatBoostRegressor(
    iterations=1000,
    depth=8,
    learning_rate=0.02,
    loss_function='RMSE',
    cat_features=cat_features,
    task_type="GPU",
    devices='0',
    verbose=0
)
final_model.fit(train_pool)

# Score through the Pool prepared above (it was previously built and then
# ignored in favour of re-converting the DataFrame). assign() rebinds test_df
# to a fresh frame instead of writing a column into the object returned by
# train_test_split, which can trigger pandas' SettingWithCopyWarning.
test_df = test_df.assign(pred_rating=final_model.predict(test_pool))

mse = mean_squared_error(test_df['rating'], test_df['pred_rating'])
rmse = np.sqrt(mse)
mae = mean_absolute_error(test_df['rating'], test_df['pred_rating'])

print("\n=== Test Set Rating Metrics ===")
print(f"RMSE: {rmse:.4f}")
print(f"MAE:  {mae:.4f}")
# Sampled top-K ranking evaluation.
# For each test user with a liked item (rating >= 4), the first such item is
# the positive; it is ranked by predicted rating against N_NEG sampled
# negatives, and Hit/Precision/Recall/NDCG@K are averaged over users.
K = 10
N_NEG = 99
EVAL_SEED = 42  # fixed seed so the sampled negatives are reproducible


def _sample_negatives(rng, item_pool, excluded, n_neg):
    """Draw `n_neg` distinct items from `item_pool` that are not in `excluded`.

    Assumes every element of `excluded` also occurs in `item_pool`. Returns
    None when fewer than `n_neg` items remain, otherwise a list of `n_neg`
    items. Rejection sampling avoids materialising the full negative list per
    user; when the remaining pool is small it falls back to an explicit list
    so the loop cannot spin on a nearly exhausted pool.
    """
    available = len(item_pool) - len(excluded)
    if available < n_neg:
        return None
    if available < 2 * n_neg:
        remaining = [item for item in item_pool if item not in excluded]
        return rng.sample(remaining, n_neg)
    taken = set(excluded)
    chosen = []
    while len(chosen) < n_neg:
        candidate = rng.choice(item_pool)
        if candidate not in taken:
            taken.add(candidate)
            chosen.append(candidate)
    return chosen


all_items = set(df['gmap_id'].unique())
# Sorted list: set iteration order is not stable across runs, so sampling
# from the set directly would not be reproducible even with a seeded RNG.
item_pool = sorted(all_items)
seen_items = train_val_df.groupby('user_id')['gmap_id'].apply(set).to_dict()
# Every item a user interacted with in any split. Negatives must avoid all of
# these; excluding only train/val items lets a user's other test interactions
# be sampled as "negatives".
interacted_items = df.groupby('user_id')['gmap_id'].apply(set).to_dict()
test_positives = test_df[test_df['rating'] >= 4].groupby('user_id').first().reset_index()

user_feats = df[['user_id', 'user_num_reviews', 'user_avg_rating', 'user_days_since_first']].drop_duplicates('user_id').set_index('user_id').to_dict('index')
item_feats = df[['gmap_id', 'item_num_reviews', 'item_avg_rating', 'item_days_since_first']].drop_duplicates('gmap_id').set_index('gmap_id').to_dict('index')

precision_list, recall_list, ndcg_list, hit_list = [], [], [], []
rng = random.Random(EVAL_SEED)

user_item_pairs = zip(test_positives['user_id'], test_positives['gmap_id'])
for user_id, pos_item in tqdm(user_item_pairs, total=len(test_positives)):
    # Only evaluate users that have some train/val history.
    if user_id not in seen_items:
        continue

    # User features are identical for every candidate, so look them up once.
    user_f = user_feats.get(user_id)
    if not user_f:
        continue

    excluded = interacted_items.get(user_id, set()) | {pos_item}
    sampled_negs = _sample_negatives(rng, item_pool, excluded, N_NEG)
    if sampled_negs is None:
        continue

    candidate_items = [pos_item] + sampled_negs

    # time_norm is fixed at 1.0, i.e. every candidate is scored as if it were
    # reviewed at the latest timestamp in the data.
    records = [
        {
            'user_id': user_id,
            'gmap_id': item_id,
            'time_norm': 1.0,
            **user_f,
            **item_feats[item_id],
        }
        for item_id in candidate_items
        if item_feats.get(item_id)
    ]

    if len(records) < K:
        continue

    candidate_df = pd.DataFrame(records)
    candidate_df['score'] = final_model.predict(candidate_df[all_features])

    top_k = candidate_df.sort_values('score', ascending=False).head(K)
    pred_items = top_k['gmap_id'].tolist()

    # With exactly one positive per user, recall@K equals hit@K and
    # precision@K is hit / K.
    hit = int(pos_item in pred_items)
    rank = pred_items.index(pos_item) + 1 if hit else 0
    ndcg = 1 / np.log2(rank + 1) if hit else 0.0

    hit_list.append(hit)
    precision_list.append(hit / K)
    recall_list.append(1.0 if hit else 0.0)
    ndcg_list.append(ndcg)

print("\n=== Sampled Top-K Ranking Metrics ===")
if hit_list:
    print(f"Precision@{K}: {np.mean(precision_list):.4f}")
    print(f"Recall@{K}:    {np.mean(recall_list):.4f}")
    print(f"NDCG@{K}:      {np.mean(ndcg_list):.4f}")
    print(f"Hit@{K}:       {np.mean(hit_list):.4f}")
else:
    # np.mean([]) would emit a RuntimeWarning and print nan.
    print("No eligible users: ranking metrics are undefined.")




Precision@10: 0.0034

Recall@10:    0.0344

NDCG@10:      0.0166

Hit@10:       0.0344