# LightGBM Model Training and Inference

This notebook demonstrates how to train a LightGBM model, use it to generate top-k product recommendations, evaluate those recommendations with hitrate@k, and write a submission file.

In [None]:
import sys
import os

# Add the src directory to the path so we can import our modules
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

# Import the required modules
from data.data_loader import load_interaction_features, load_test_data, prepare_train_test_split
from model.lightgbm_model import train_lightgbm_model, predict_and_rank, get_feature_importance
from model.evaluation import hitrate_at_k, format_submission
from utils.utils import get_default_config, save_model, save_to_csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb

## 1. Load Configuration and Data

In [None]:
# Start from the project's default settings; every later cell reads from `config`.
config = get_default_config()

# The model's input columns come from the 'features' section of the config.
feature_settings = config['features']
numeric_features = feature_settings['numeric_features']
print("Using the following features:", numeric_features)

In [None]:
# Read the prepared training set from parquet.
# NOTE(review): this path is hard-coded and relative to the notebook's working
# directory, unlike the test data paths, which come from `config`.
training_path = 'full_training_data_for_model'
training_data = pd.read_parquet(training_path)

# Quick sanity check: overall shape, then per-column summary.
print(f"Loaded training data with shape: {training_data.shape}")
training_data.info()

## 2. Prepare Data for Training

In [None]:
# Split the training data into the four pieces used for fitting and validation.
# Split size and seed are read from the 'model' section of the config.
# NOTE(review): the exact splitting strategy lives in data.data_loader — confirm there.
split_settings = config['model']
X_train, X_val, y_train, y_val = prepare_train_test_split(
    training_data,
    feature_columns=numeric_features,
    target_column='target',
    test_size=split_settings['test_size'],
    random_state=split_settings['random_state'],
)

# Report the size of each feature set.
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

## 3. Train LightGBM Model

In [None]:
# Fit the model on the training split, passing the validation split alongside it.
# Hyperparameters and the early-stopping setting come from the 'model' config section.
training_settings = config['model']
model = train_lightgbm_model(
    X_train,
    y_train,
    X_val,
    y_val,
    params=training_settings['params'],
    early_stopping_rounds=training_settings['early_stopping_rounds'],
    verbose=True,
)

In [None]:
# Optional cleanup: unbind the split objects from the notebook namespace so they
# can be garbage-collected once nothing else references them.
# Re-running this cell without re-running the split cell raises NameError.
del X_train, X_val
del y_train, y_val

## 4. Analyze Feature Importance

In [None]:
# Ask the project helper for the importance of each feature in `numeric_features`
# for the trained model. The plotting cell reads the result's 'Feature' and
# 'Importance' entries.
feature_importance = get_feature_importance(model, numeric_features)
# Bare expression on the last line: the notebook renders it as the cell output.
feature_importance

In [None]:
# Horizontal bar chart of feature importances, drawn with the explicit
# Figure/Axes API instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importance['Feature'], feature_importance['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('LightGBM Feature Importance')
fig.tight_layout()
plt.show()

## 5. Generate Predictions on Test Data

In [None]:
# Load the test set through the project loader; both input paths come from the
# 'data' section of the config, and only the model's feature columns are requested.
data_paths = config['data']
test_data = load_test_data(
    test_path=data_paths['test_data_path'],
    interaction_features_path=data_paths['interaction_features_path'],
    needed_columns=numeric_features,
)
print(f"Loaded test data with shape: {test_data.shape}")

In [None]:
# Score the test set and keep the top-k items; k comes from the 'evaluation' config section.
ranking_size = config['evaluation']['top_k']
ranked_output = predict_and_rank(
    model,
    test_data,
    feature_columns=numeric_features,
    top_k=ranking_size,
)
# The helper returns a pair: the test data with predictions, and the top-k recommendations.
test_data_with_preds, top_k_recommendations = ranked_output

# Preview the first ten rows of the recommendations (rendered as the cell output).
print("Top 10 Recommendations:")
top_k_recommendations.head(10)

## 6. Evaluate Model Performance

In [None]:
# Load the ground-truth data used for evaluation. This is the same gzip-compressed
# CSV path that the test data loader is given ('test_data_path').
ground_truth_path = config['data']['test_data_path']
true_data = pd.read_csv(ground_truth_path, compression='gzip')
print(f"Loaded true data with shape: {true_data.shape}")

In [None]:
# Score the recommendations against the ground truth with hitrate@k,
# using the same k that the 'evaluation' config section defines.
eval_k = config['evaluation']['top_k']
hitrate = hitrate_at_k(true_data, top_k_recommendations, k=eval_k)
print(f"Hitrate@{config['evaluation']['top_k']}: {hitrate:.4f}")

## 7. Prepare Submission File

In [None]:
# Build the submission from the top-k recommendations, limited to the customer
# range given in the 'submission' config section.
customer_range = config['submission']['customer_range']
submission = format_submission(
    top_k_recommendations,
    customer_range=customer_range,
)

# Preview the first rows of the submission (rendered as the cell output).
submission.head()

In [None]:
# Report the number of rows in the submission.
# NOTE(review): this equals the number of customers only if the submission holds
# exactly one row per customer — confirm against format_submission.
print(f"Number of customers in submission: {submission.shape[0]}")

In [None]:
# Write the submission to the CSV path given in the 'submission' config section,
# without the index and with the helper's verbose flag on.
output_path = config['submission']['output_path']
save_to_csv(submission, path=output_path, index=False, verbose=True)