LightGBM + Vader GoogleMaps

In [None]:

!pip install -q lightgbm tqdm
import nltk
nltk.download('vader_lexicon')
from google.colab import drive
drive.mount('/content/drive')
import json

import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch

# Report whether PyTorch can see a CUDA device, and name the first one if so
has_cuda = torch.cuda.is_available()
print("CUDA Available:", has_cuda)
if has_cuda:
    print("GPU:", torch.cuda.get_device_name(0))

# Location of the review dump on the mounted Drive
file_path = '/content/drive/MyDrive/review-South_Carolina_10.json'

# The file is parsed as one JSON object per line. Decode explicitly as UTF-8
# rather than relying on the platform default, and skip blank lines (e.g. a
# trailing newline) that would otherwise make json.loads raise.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(data)

# Fail fast if any column the rest of the script reads is absent
expected_cols = {'user_id', 'gmap_id', 'rating', 'time', 'text'}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in dataset: {missing}")

# Reviews without text become empty strings so sentiment scoring gets a str
df['text'] = df['text'].fillna('')

# VADER Sentiment Analysis
# One module-level analyzer instance; vader_sentiment_batch reads it as a
# global. The nltk.download('vader_lexicon') call earlier in the file
# presumably supplies the lexicon this constructor loads.
sia = SentimentIntensityAnalyzer()

def vader_sentiment_batch(texts, batch_size=10000):
    """Return the VADER compound score of every text, in input order.

    Args:
        texts: Sliceable sequence of strings to score.
        batch_size: Number of texts handled per progress-bar step.

    Returns:
        A list with one compound score per input text.
    """
    compound_scores = []
    batch_starts = range(0, len(texts), batch_size)
    # The progress bar ticks once per slice rather than once per text
    for start in tqdm(batch_starts, desc="VADER Sentiment", ncols=100):
        for text in texts[start:start + batch_size]:
            compound_scores.append(sia.polarity_scores(text)['compound'])
    return compound_scores

# Score every review text with VADER
df['vader_sentiment'] = vader_sentiment_batch(df['text'].tolist())

# Time features
# 'time' is parsed as epoch milliseconds; values that cannot be parsed become NaT
review_time = pd.to_datetime(df['time'], unit='ms', errors='coerce')
df['time'] = review_time
df['hour'] = review_time.dt.hour
df['weekday'] = review_time.dt.weekday
df['month'] = review_time.dt.month

def time_of_day(hour):
    """Map an hour of the day to a coarse part-of-day label.

    Args:
        hour: Hour as a number, or a missing value (NaN/None).

    Returns:
        'unknown' for a missing hour; otherwise 'morning' for [5, 12),
        'afternoon' for [12, 17), 'evening' for [17, 21) and 'night' for
        everything else.
    """
    if pd.isna(hour):
        return 'unknown'
    # Ascending exclusive upper bounds; the first bound above `hour` wins
    boundaries = (
        (5, 'night'),
        (12, 'morning'),
        (17, 'afternoon'),
        (21, 'evening'),
    )
    for upper_bound, label in boundaries:
        if hour < upper_bound:
            return label
    return 'night'

# Part-of-day bucket per review, stored as a categorical column
df['time_of_day'] = df['hour'].map(time_of_day).astype('category')

# weekday >= 5 is flagged as weekend; a missing weekday compares False and yields 0
df['is_weekend'] = df['weekday'].ge(5).astype('int64')

# Age of each review in whole days, relative to the newest review in the data
latest_time = df['time'].max()
review_age = latest_time - df['time']
df['days_since_review'] = review_age.dt.days

# Encode IDs
for id_col in ('user_id', 'gmap_id'):
    df[id_col] = df[id_col].astype('category')

# Define features for LightGBM
features = [
    'user_id',
    'gmap_id',
    'hour',
    'weekday',
    'month',
    'is_weekend',
    'time_of_day',
    'days_since_review',
    'vader_sentiment',
]

# Model inputs and the rating target
X, y = df[features], df['rating']

# Split data: 20% of rows held out, fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train LightGBM
# Columns LightGBM is told to treat as categorical in both datasets
categorical_cols = ['user_id', 'gmap_id', 'time_of_day']

train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_cols)
# The held-out set is built with the training set as its reference
test_data = lgb.Dataset(
    X_test,
    label=y_test,
    reference=train_data,
    categorical_feature=categorical_cols,
)

# LightGBM training parameters: RMSE-evaluated regression on the rating.
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.03,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
    # without it the 0.8 row subsampling above is silently ignored.
    'bagging_freq': 1,
    'verbosity': -1
}

print("Training LightGBM...")
# Filled by the record_evaluation callback with the per-iteration metric
# history of every set in valid_sets, so learning curves can be inspected.
evals_result = {}
# NOTE(review): test_data doubles as the early-stopping validation set and is
# also the split the later test RMSE is computed on, so that RMSE is likely
# optimistic -- consider carving out a separate validation split.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
        lgb.log_evaluation(period=10),
        lgb.record_evaluation(evals_result),
    ]
)



# Evaluate
# Root-mean-squared error of the predicted ratings on the held-out rows
y_pred = model.predict(X_test)
squared_errors = np.square(y_test - y_pred)
rmse = np.sqrt(np.mean(squared_errors))
print(f"\nTest RMSE: {rmse:.4f}")

#  Feature importance
# Top 15 features ranked by total gain
lgb.plot_importance(model, importance_type='gain', max_num_features=15)
plt.show()

from collections import defaultdict
from tqdm import tqdm
import random

# Setup for the ranking evaluation. Each object is built exactly once; the
# groupby over all training rows is the expensive part.
K_list = [5, 10, 20]
results_by_k = {k: {'recall': [], 'precision': [], 'hit': [], 'ndcg': []} for k in K_list}

# Build interaction map from training set: user_id -> set of gmap_ids.
# user_id is categorical; observed=True restricts the groups to users that
# actually occur in the training rows, so "user_id in train_user_items"
# means "user has training interactions".
train_user_items = (
    df.loc[X_train.index]
    .groupby('user_id', observed=True)['gmap_id']
    .apply(set)
    .to_dict()
)
all_items = df['gmap_id'].unique().tolist()
test_users = df.loc[X_test.index]['user_id'].unique()

# NOTE(review): despite its name this is the full feature frame (train and
# test rows) plus the rating; nothing in the visible file reads it.
X_test_full = X.copy()
X_test_full['rating'] = y

# Precompute features
# One row per user: the context columns of that user's first review in df
user_feature_cols = [
    'user_id', 'hour', 'weekday', 'month',
    'is_weekend', 'time_of_day', 'days_since_review',
]
first_review_per_user = df[user_feature_cols].drop_duplicates('user_id')
user_feats = first_review_per_user.set_index('user_id')

# One row per place: the sentiment score of that place's first review in df
first_review_per_item = df[['gmap_id', 'vader_sentiment']].drop_duplicates('gmap_id')
item_feats = first_review_per_item.set_index('gmap_id')

# Prepare candidate rows: for each evaluated user, one held-out positive item
# followed by sampled negatives; `labels` is aligned row-for-row with
# `candidate_rows` (1 for the positive, 0 for each negative).
candidate_rows = []
labels = []

print("\n Preparing candidate batch...")

# Negatives drawn per user
num_negatives = 99
# Dedicated seeded generator so the sampled negatives are identical on every run
rng = random.Random(42)
# Group the held-out rows once instead of re-scanning X_test with a boolean
# mask for every user; observed=True yields only users present in X_test.
test_rows_by_user = X_test.groupby('user_id', observed=True)

for user_id, user_df in tqdm(test_rows_by_user, desc="Users"):
    # Skip users without a set of training items (absent from the map, or
    # mapped to a non-set placeholder).
    user_seen = train_user_items.get(user_id)
    if not isinstance(user_seen, set):
        continue

    pos_item = user_df.sample(1, random_state=42)['gmap_id'].values[0]

    # A negative must be an item the user never interacted with: exclude the
    # training items and every held-out item of this user (which includes
    # pos_item), so no true positive is scored with label 0.
    user_test_items = set(user_df['gmap_id'])
    negatives = [
        item for item in all_items
        if item not in user_seen and item not in user_test_items
    ]
    if len(negatives) < num_negatives:
        continue
    sampled_negs = rng.sample(negatives, num_negatives)

    candidate_items = [pos_item] + sampled_negs
    user_data = user_feats.loc[user_id]

    for item_id in candidate_items:
        # User-side context comes from user_feats, item-side sentiment from item_feats
        candidate_rows.append({
            'user_id': user_id,
            'gmap_id': item_id,
            'hour': user_data['hour'],
            'weekday': user_data['weekday'],
            'month': user_data['month'],
            'is_weekend': user_data['is_weekend'],
            'time_of_day': user_data['time_of_day'],
            'days_since_review': user_data['days_since_review'],
            'vader_sentiment': item_feats.loc[item_id, 'vader_sentiment'],
        })
    labels.extend([1] + [0] * num_negatives)

# Build batch dataframe
candidate_df = pd.DataFrame(candidate_rows)

# Match categorical types
# Re-use the category lists of df so the candidate columns carry the same categories
for cat_col in ('user_id', 'gmap_id', 'time_of_day'):
    shared_dtype = pd.api.types.CategoricalDtype(categories=df[cat_col].cat.categories)
    candidate_df[cat_col] = candidate_df[cat_col].astype(shared_dtype)

print("\n Predicting on batch...")
# Score every (user, candidate item) row in a single predict call
scores = model.predict(candidate_df[features])
candidate_df = candidate_df.assign(score=scores, label=labels)

print("\n Evaluating...")
# user_id is categorical: observed=True iterates only over users that have
# candidate rows instead of producing a group for every category.
grouped = candidate_df.groupby('user_id', observed=True)

for user_id, group in tqdm(grouped, desc="Scoring users"):
    # Labels ordered from highest to lowest predicted score
    ranked_labels = group.sort_values('score', ascending=False)['label'].values

    # The rank of the positive does not depend on k, so locate it once
    pos_indices = np.flatnonzero(ranked_labels == 1)
    if len(pos_indices) == 0:
        continue  # skip users with no positive item
    rank = pos_indices[0] + 1  # 1-based position of the positive

    for k in K_list:
        top_k = ranked_labels[:k]
        hit = int(1 in top_k)

        # Reciprocal-log discount at the positive's rank, or 0 when it falls outside the top k
        ndcg = 1.0 / np.log2(rank + 1) if rank <= k else 0
        precision = top_k.sum() / k
        recall = 1.0 if hit else 0

        results_by_k[k]['recall'].append(recall)
        results_by_k[k]['precision'].append(precision)
        results_by_k[k]['hit'].append(hit)
        results_by_k[k]['ndcg'].append(ndcg)


# Print the per-user average of every ranking metric at each cutoff
print("\n Final Ranking Metrics:")
for k in K_list:
    per_user = results_by_k[k]
    mean_recall = np.mean(per_user['recall'])
    mean_precision = np.mean(per_user['precision'])
    mean_hit = np.mean(per_user['hit'])
    mean_ndcg = np.mean(per_user['ndcg'])
    print(
        f"@{k}: Recall={mean_recall:.4f}, "
        f"Precision={mean_precision:.4f}, "
        f"HitRate={mean_hit:.4f}, "
        f"NDCG={mean_ndcg:.4f}"
    )




Final Ranking Metrics:

@5: Recall=0.2365, Precision=0.0473, HitRate=0.2365, NDCG=0.1416

@10: Recall=0.2880, Precision=0.0288, HitRate=0.2880, NDCG=0.1586

@20: Recall=0.3132, Precision=0.0157, HitRate=0.3132, NDCG=0.1648