LightGBM + Sentence Embeddings: Rating Prediction on Google Maps Reviews

In [None]:
!pip install -q lightgbm tqdm sentence-transformers

from google.colab import drive
drive.mount('/content/drive')

import os
import gc
import json
import torch
import re
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# Report whether a GPU is visible to torch before any heavy work starts.
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

file_path = '/content/drive/MyDrive/review-South_Carolina_10.json'
# The file is parsed as JSON Lines (one JSON object per line). Decode it as
# UTF-8 explicitly instead of relying on the platform default, and skip blank
# lines (e.g. a trailing newline), which would otherwise make json.loads raise.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
# Work on a reproducible 80% sample of the rows, re-indexed 0..n-1.
df = pd.DataFrame(data).sample(frac=0.8, random_state=42).reset_index(drop=True)

# Text cleaning
def clean_text(text, max_words=80):
    """Normalize a review string for embedding.

    Removes URLs (``http...``), ``@mentions`` and every character that is
    neither a word character nor whitespace, lowercases the result, collapses
    whitespace and keeps at most ``max_words`` whitespace-separated tokens.

    Args:
        text: The raw string to clean.
        max_words: Maximum number of tokens kept in the returned string.

    Returns:
        The cleaned, single-space-joined string.
    """
    # Order matters: URLs and mentions must go before punctuation is stripped,
    # otherwise their delimiters disappear and the patterns no longer match.
    for pattern in (r"http\S+", r"@\w+", r"[^\w\s]"):
        text = re.sub(pattern, "", text)
    tokens = text.lower().split()
    return ' '.join(tokens[:max_words])

# Count '!' on the RAW text. clean_text() removes all punctuation, so counting
# after cleaning (as was done before) always produced 0 for every row.
raw_text = df['text'].fillna('')
df['num_exclamations'] = raw_text.str.count('!')
df['text'] = raw_text.apply(clean_text)

# Feature engineering
df['text_length'] = df['text'].str.len()  # length of the cleaned (and truncated) text
# 'time' is interpreted as epoch milliseconds; unparseable values become NaT.
df['time'] = pd.to_datetime(df['time'], unit='ms', errors='coerce')
df['hour'] = df['time'].dt.hour
df['weekday'] = df['time'].dt.weekday
df['month'] = df['time'].dt.month
# Bucket the hour into a day part. Rows whose hour is missing match no range
# and therefore fall through to 'night', exactly like hours outside 5..20.
hour = df['hour']
day_part = np.select(
    [
        ((hour >= 5) & (hour < 12)).to_numpy(),
        ((hour >= 12) & (hour < 17)).to_numpy(),
        ((hour >= 17) & (hour < 21)).to_numpy(),
    ],
    ['morning', 'afternoon', 'evening'],
    default='night',
)
df['time_of_day'] = pd.Series(day_part, index=df.index).astype('category')
# weekday 5 and 6 are Saturday/Sunday; a missing weekday compares False -> 0.
df['is_weekend'] = (df['weekday'] >= 5).astype(int)
# Age of each review relative to the most recent review in the sample.
df['days_since_review'] = (df['time'].max() - df['time']).dt.days
df['user_id'] = df['user_id'].astype('category')
df['gmap_id'] = df['gmap_id'].astype('category')
# Shift ratings down by one so the labels start at 0 for the multiclass objective.
y = df['rating'] - 1

# Structured (non-embedding) model inputs.
# 'time_of_day' is the only column handed to LightGBM as categorical.
categorical_features = ['time_of_day']
simple_features = [
    'hour', 'weekday', 'month', 'is_weekend',
    'time_of_day', 'days_since_review',
    'text_length', 'num_exclamations'
]
# Positional 0..n-1 index so the frame can be concatenated column-wise with
# the embedding frame.
X_simple = df.loc[:, simple_features].reset_index(drop=True)

# Embedding setup
embedding_path = '/content/drive/MyDrive/full_embeddings.npy'
texts = df['text'].tolist()
total_rows = len(texts)
# Width assumed when re-opening a cached file; overwritten with the value
# reported by the model whenever the embeddings are recomputed below.
embedding_dim = 384

# The cache is a raw float16 memmap (no .npy header despite the extension), so
# a usable file is exactly rows * dim * 2 bytes. A file of any other size was
# written for a different number of rows and must not be reused: re-opening it
# with the current shape would either fail or silently misalign rows.
# NOTE: a same-sized cache produced from different texts cannot be detected here.
expected_bytes = total_rows * embedding_dim * np.dtype('float16').itemsize
cache_is_valid = (
    os.path.exists(embedding_path)
    and os.path.getsize(embedding_path) == expected_bytes
)

if cache_is_valid:
    print("Embedding file already exists, skipping recomputation.")
else:
    if os.path.exists(embedding_path):
        print("Cached embedding file does not match the current data size; recomputing.")

    # Use the GPU when present, otherwise fall back to CPU instead of crashing.
    embed_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embedder = SentenceTransformer('all-MiniLM-L6-v2', device=embed_device)
    if embed_device == 'cuda':
        embedder.half()  # fp16 weights only on the GPU path
    batch_size = 1024
    embedding_dim = embedder.encode(["test"], device=embed_device, convert_to_numpy=True).shape[1]

    # Write to a side file and rename at the very end. np.memmap(mode='w+')
    # allocates the full-size file immediately, so an interrupted run would
    # otherwise leave a correctly-sized file of zeros that passes the check above.
    partial_path = embedding_path + '.partial'
    embedding_array = np.memmap(partial_path, dtype='float16', mode='w+', shape=(total_rows, embedding_dim))

    print("Embedding text to Google Drive...")
    cursor = 0
    for i in tqdm(range(0, total_rows, batch_size), desc="Embedding"):
        batch = texts[i:i+batch_size]
        emb = embedder.encode(batch, batch_size=batch_size, device=embed_device, convert_to_numpy=True)
        embedding_array[cursor:cursor+len(emb)] = emb.astype('float16')
        cursor += len(emb)
        # Release per-batch buffers eagerly to keep host/GPU memory flat.
        del batch, emb
        gc.collect()
        if embed_device == 'cuda':
            torch.cuda.empty_cache()

    embedding_array.flush()
    del embedding_array
    gc.collect()
    os.replace(partial_path, embedding_path)
    print("Embeddings saved.")

# Re-open the cached embeddings read-only; the shape must match what was written.
embedding_loaded = np.memmap(
    embedding_path, mode='r', dtype='float16',
    shape=(total_rows, embedding_dim),
)
embedding_df = pd.DataFrame(embedding_loaded)

# Column-wise join of structured features and embeddings. Both sides are
# re-indexed 0..n-1 so rows pair up by position.
feature_blocks = [X_simple.reset_index(drop=True), embedding_df.reset_index(drop=True)]
X = pd.concat(feature_blocks, axis=1)

# 80/20 random split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_data = lgb.Dataset(
    X_train, label=y_train, categorical_feature=categorical_features
)
# reference=train_data is passed so the held-out set is built against the
# training Dataset.
test_data = lgb.Dataset(
    X_test, label=y_test, reference=train_data,
    categorical_feature=categorical_features
)

# LightGBM configuration: 5-way classification over the zero-based rating labels.
params = dict(
    objective='multiclass',
    num_class=5,
    metric='multi_logloss',
    learning_rate=0.03,
    num_leaves=64,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=5,
    verbosity=-1,
    device='gpu',
    max_bin=255,
)

print("Training LightGBM...")
# NOTE(review): test_data doubles as the early-stopping validation set, so any
# metric later computed on X_test is optimistically biased. Consider carving a
# separate validation split.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=20),
    lgb.log_evaluation(period=10),
]
model = lgb.train(
    params,
    train_data,
    num_boost_round=500,
    valid_sets=[train_data, test_data],
    callbacks=training_callbacks,
)

# Per-class scores for the held-out rows; the predicted label is the arg-max column.
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)
acc, f1 = (
    accuracy_score(y_test, y_pred_labels),
    f1_score(y_test, y_pred_labels, average='weighted'),
)
print(f"\nTest Accuracy: {acc:.4f}")
print(f"Test F1 Score: {f1:.4f}")

# Item -> embedding lookup: each gmap_id is represented by the first non-null
# embedding values found among its rows, stored as {gmap_id: {column: value}}.
item_ids = df[['gmap_id']].reset_index(drop=True)
embed_lookup_df = pd.concat([item_ids, embedding_df], axis=1)
first_embedding_per_item = embed_lookup_df.groupby('gmap_id').first()
gmap_id_to_embedding = first_embedding_per_item.to_dict(orient='index')


Test Accuracy: 0.6183

Test F1 Score: 0.5009

In [None]:
def evaluate_recommender_metrics(model, df, gmap_id_to_embedding,
                                 user_col='user_id', item_col='gmap_id',
                                 K_list=[5, 10, 20], num_negatives=99, seed=None):
    """Rank one positive item against sampled negatives for every user.

    For each user in ``df`` one of the user's own items is drawn as the
    positive (fixed ``random_state=42``) and up to ``num_negatives`` items the
    user never interacted with are drawn as negatives. Every candidate is
    scored with ``model.predict`` using the structured features of the user's
    first row plus the candidate item's embedding, and the candidates are
    ranked by descending score. Users with no unseen item are skipped.

    Args:
        model: Object exposing ``predict(DataFrame)``. If the result is 2-D,
            the last column is used as the ranking score.
        df: Frame containing ``user_col``, ``item_col`` and the eight
            structured feature columns listed in ``feature_cols`` below.
        gmap_id_to_embedding: Non-empty mapping ``item -> {column: value}``.
            Items missing from it are given an all-zero embedding.
        user_col: Name of the user identifier column.
        item_col: Name of the item identifier column.
        K_list: Cut-offs at which the metrics are computed (not mutated).
        num_negatives: Maximum number of negatives sampled per user.
        seed: Optional seed for negative sampling. ``None`` keeps using the
            global ``random`` module state.

    Returns:
        ``{k: {'recall', 'precision', 'hit', 'ndcg'}}`` with the mean of each
        metric over the evaluated users (NaN when no user was evaluated).
        The same numbers are printed.

    Raises:
        ValueError: If ``gmap_id_to_embedding`` is empty.
    """
    print("Evaluating Recommender Metrics")

    if not gmap_id_to_embedding:
        raise ValueError("gmap_id_to_embedding is empty; cannot build item features")

    feature_cols = [
        'hour', 'weekday', 'month', 'is_weekend',
        'time_of_day', 'days_since_review',
        'text_length', 'num_exclamations'
    ]
    # A private generator makes sampling reproducible without touching global state.
    rng = random.Random(seed) if seed is not None else random

    all_items = df[item_col].unique().tolist()
    # Fallback for unknown items: zeros under the same keys as a real embedding,
    # so the resulting columns always line up.
    zero_embedding = dict.fromkeys(next(iter(gmap_id_to_embedding.values())), 0)

    results_by_k = {k: {'recall': [], 'precision': [], 'hit': [], 'ndcg': []} for k in K_list}

    # One grouped pass instead of a boolean scan of the whole frame per user.
    # observed=True avoids iterating unused categories of a categorical key;
    # sort=False keeps users in order of first appearance.
    grouped = df.groupby(user_col, sort=False, observed=True)
    for _, user_df in tqdm(grouped, desc="Users", total=grouped.ngroups):
        seen_items = set(user_df[item_col])
        pos_item = user_df.sample(1, random_state=42)[item_col].values[0]

        negative_pool = [item for item in all_items if item not in seen_items]
        if not negative_pool:
            # Nothing to rank the positive against; counting it would be a
            # guaranteed hit (previously this case raised inside random.sample).
            continue
        # The pool already excludes every seen item (the positive included),
        # so no extra "- 1" is needed when capping the sample size.
        negatives = rng.sample(negative_pool, min(num_negatives, len(negative_pool)))
        candidates = [pos_item] + negatives
        candidate_labels = np.array([1] + [0] * len(negatives))

        # Every candidate shares the structured features of the user's first
        # row; only the item embedding differs between candidates.
        X_struct = user_df[feature_cols].iloc[[0] * len(candidates)].reset_index(drop=True)
        X_struct['time_of_day'] = X_struct['time_of_day'].astype('category')

        X_embed = pd.DataFrame([
            gmap_id_to_embedding.get(item, zero_embedding)
            for item in candidates
        ])
        X_batch = pd.concat([X_struct, X_embed], axis=1)

        scores = model.predict(X_batch)
        # Multi-output models: rank by the last output column.
        scores = scores[:, -1] if scores.ndim == 2 else scores

        ranked_labels = candidate_labels[np.argsort(-scores)]
        # 1-based position of the single positive; independent of k.
        rank = int(np.where(ranked_labels == 1)[0][0]) + 1

        for k in K_list:
            top_k = ranked_labels[:k]
            hit = int(1 in top_k)
            ndcg = 1.0 / np.log2(rank + 1) if rank <= k else 0
            precision = top_k.sum() / k
            # With exactly one positive per user, recall@k equals hit@k.
            recall = 1.0 if hit else 0

            results_by_k[k]['recall'].append(recall)
            results_by_k[k]['precision'].append(precision)
            results_by_k[k]['hit'].append(hit)
            results_by_k[k]['ndcg'].append(ndcg)

    summary = {
        k: {
            name: (float(np.mean(values)) if values else float('nan'))
            for name, values in results_by_k[k].items()
        }
        for k in K_list
    }

    print("\n Ranking Metrics:")
    for k in K_list:
        recall = summary[k]['recall']
        precision = summary[k]['precision']
        hit = summary[k]['hit']
        ndcg = summary[k]['ndcg']
        print(f"@{k}: Recall={recall:.4f}, Precision={precision:.4f}, HitRate={hit:.4f}, NDCG={ndcg:.4f}")

    return summary

# Run the ranking evaluation for every user in the review frame.
# NOTE(review): `df` is the same frame the training rows were drawn from, so
# these numbers are not a held-out estimate.
evaluate_recommender_metrics(
    model, df, gmap_id_to_embedding,
    user_col='user_id', item_col='gmap_id',
    K_list=[5, 10, 20], num_negatives=99,
)

Ranking Metrics:

@5: Recall=0.0410, Precision=0.0082, HitRate=0.0410, NDCG=0.0284

@10: Recall=0.0845, Precision=0.0085, HitRate=0.0845, NDCG=0.0423

@20: Recall=0.1712, Precision=0.0086, HitRate=0.1712, NDCG=0.0639