# Negative Sampling for LightGBM Recommendation System

This notebook demonstrates how to use the HybridNegativeSampler to generate negative samples for training a LightGBM recommendation model.

In [None]:
import sys
import os

# Make the `src` directory that sits beside this notebook's folder importable,
# so the project modules imported below can be resolved.
# NOTE: the location is derived from the kernel's current working directory
# (its parent + 'src'), not from the notebook file's own path.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')

# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# Import the required modules
from data.negative_sampler import HybridNegativeSampler
from data.data_loader import load_interaction_features
from utils.utils import get_default_config, save_to_csv

import pandas as pd
from tqdm import tqdm

## 1. Load Configuration and Data

In [None]:
# Fetch the project's default settings; later cells read file paths from `config`.
config = get_default_config()

# Resolve the configured location of the interaction features, then load them.
features_path = config['data']['interaction_features_path']
interaction_features = load_interaction_features(features_path)
print(f"Loaded interaction features with shape: {interaction_features.shape}")

## 2. Initialize Negative Sampler

In [None]:
# Relative weights of the three signals passed to the sampler (they sum to 1.0).
signal_weights = {
    'popularity_weight': 0.5,  # popularity (transaction count)
    'recency_weight': 0.3,     # recency (days since last purchase)
    'promo_weight': 0.2,       # promotion usage
}

# NOTE(review): 'cuda' is requested unconditionally here; whether the sampler
# falls back to CPU when no GPU is present is not visible in this notebook — confirm.
sampler = HybridNegativeSampler(
    interaction_features=interaction_features,
    device='cuda',
    verbose=True,  # ask the sampler for detailed logs
    **signal_weights
)

## 3. Generate Negative Samples

The next cell generates negative samples at a 1:1 ratio to positive samples (`neg_ratio=1`). The sampler combines popularity, recency, and promotion signals to select the most relevant negative samples.

In [None]:
# Settings for the sampling run, collected in one place so they are easy to tune.
sampling_options = dict(
    batch_size=10000,         # number of customers processed per batch
    neg_ratio=1,              # one negative sample per positive sample
    min_samples_per_user=1,   # lower bound on negative samples per user
)

# Build the training set containing both positive and negative samples.
training_data = sampler.generate_samples(**sampling_options)

## 4. Examine the Results

In [None]:
# Preview the top of the generated data (5 rows, the default for `head`;
# assumes `training_data` is a pandas DataFrame — TODO confirm).
training_data.head(n=5)

In [None]:
# Report the share of each label in the `target` column;
# normalize=True yields fractions rather than raw counts.
print("Class distribution:")
class_shares = training_data['target'].value_counts(normalize=True)
print(class_shares)

## 5. Save the Training Data

In [None]:
# Write the generated training data as gzip-compressed Parquet
# to the destination taken from the configuration.
parquet_path = config['data']['training_data_path']
training_data.to_parquet(parquet_path, compression='gzip')
print(f"Training data saved to {config['data']['training_data_path']}")