# LightFM Predictor Demo

This notebook demonstrates how to use the LightFMPredictor class to generate recommendations.

In [1]:
from pyspark.sql import SparkSession

class SparkSessionFactory:
    """Creates the SparkSession used by this notebook with a fixed set of options."""

    def create_spark_session(self, app_name: str):
        """
        Build (or fetch, via ``getOrCreate``) a SparkSession on the ``local[*]`` master.

        Args:
            app_name: Application name handed to the session builder

        Returns:
            The SparkSession, with its context log level set to WARN
        """
        print("Initializing Spark Session...")

        # Applied one by one, in this order, before the master is set.
        settings = (
            ("spark.driver.memory", "8g"),
            ("spark.executor.memory", "8g"),
            ("spark.sql.shuffle.partitions", "10"),
            ("spark.memory.fraction", "0.8"),
            ("spark.sql.files.maxPartitionBytes", "128m"),
            ("spark.sql.adaptive.enabled", "true"),
            ("spark.sql.adaptive.coalescePartitions.enabled", "true"),
            ("spark.sql.adaptive.skewJoin.enabled", "true"),
            ("spark.driver.host", "localhost"),
            ("spark.driver.bindAddress", "127.0.0.1"),
            ("spark.network.timeout", "800s"),
            ("spark.cleaner.periodicGC.interval", "1min"),
            ("spark.sql.files.openCostInBytes", "1048576"),
            ("spark.sql.broadcastTimeout", "300"),
            ("spark.sql.parquet.filterPushdown", "true"),
            ("spark.sql.inMemoryColumnarStorage.compressed", "true"),
            ("spark.sql.parquet.writeLegacyFormat", "false"),
            ("spark.default.parallelism", "10"),
        )

        builder = SparkSession.builder.appName(app_name)
        for option, value in settings:
            builder = builder.config(option, value)
        session = builder.master("local[*]").getOrCreate()

        session.sparkContext.setLogLevel("WARN")
        print("Spark Session initialized.")
        return session

In [16]:
import logging
from lightfm import LightFM
import numpy as np
import scipy.sparse as sp
import pickle
from pyspark.sql.functions import col, udf, array
from pyspark.sql.types import ArrayType, FloatType, StringType, StructType, StructField


logger = logging.getLogger(__name__)


class LightFMPredictor:
    """
    Rank items for users with a pickled LightFM model and parquet feature tables.

    On construction the model is unpickled and both feature tables are read
    through Spark and converted to SciPy CSR matrices. Item ids are kept in the
    same row order as the item feature matrix so that a score at position ``i``
    can be mapped back to ``item_ids[i]``.
    """

    def __init__(self, model_path: str, user_features_path: str, item_features_path: str):
        """
        Initialize the predictor

        Args:
            model_path: Path to saved LightFM model
            user_features_path: Path to user features parquet
            item_features_path: Path to item features parquet
        """
        self.spark = SparkSessionFactory().create_spark_session("LightFM Predictor")
        self.model = self._load_model(model_path)
        self.user_features_path = user_features_path
        self.item_features_path = item_features_path
        self.user_features = None
        self.item_features = None
        self.item_ids = None
        self._load_features()

    def _load_model(self, model_path: str) -> LightFM:
        """
        Load the trained LightFM model

        Args:
            model_path: Path to the pickle file holding the model

        Returns:
            The unpickled object

        Raises:
            Exception: whatever opening or unpickling the file raises, after logging it
        """
        logger.info(f"Loading model from {model_path}")
        try:
            with open(model_path, 'rb') as f:
                # SECURITY: unpickling executes arbitrary code embedded in the file.
                # Only point model_path at artifacts produced by a trusted pipeline.
                return pickle.load(f)
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise

    def _load_features(self):
        """
        Load and prepare user and item features

        Populates ``self.user_features``, ``self.item_features`` and
        ``self.item_ids``. Errors are logged and re-raised.
        """
        logger.info("Loading features...")

        try:
            # Load user features
            user_features_df = self.spark.read.parquet(self.user_features_path)
            self.user_features = self._convert_features_to_sparse(user_features_df, "user_features")

            # Load item ids and item features from the SAME collected rows.
            # Two separate collect() calls are not guaranteed to return rows in
            # the same order, which would silently pair scores with the wrong
            # item id when mapping predictions back to items.
            item_features_df = self.spark.read.parquet(self.item_features_path)
            item_rows = item_features_df.select("page", "features").collect()
            self.item_features = self._rows_to_sparse(item_rows, "features")
            self.item_ids = [row["page"] for row in item_rows]

            logger.info("Features loaded successfully")
        except Exception as e:
            logger.error(f"Error loading features: {str(e)}")
            raise

    @staticmethod
    def _rows_to_sparse(rows, feature_col):
        """
        Stack the vectors in ``feature_col`` of already-collected rows into a CSR matrix.

        Each row's value must expose ``toArray()``. If stacking yields more than
        two dimensions, the trailing dimensions are flattened per row.
        """
        features_array = np.array([row[feature_col].toArray() for row in rows])
        if features_array.ndim > 2:
            features_array = features_array.reshape(features_array.shape[0], -1)
        return sp.csr_matrix(features_array)

    def _convert_features_to_sparse(self, df, feature_col):
        """Convert DataFrame features to sparse matrix"""
        return self._rows_to_sparse(df.select(feature_col).collect(), feature_col)

    def predict_for_user(self, user_id: str, n_items: int = 10) -> list:
        """
        Generate recommendations for a single user

        Args:
            user_id: User ID to generate predictions for
            n_items: Number of items to recommend; zero or a negative value
                yields an empty list

        Returns:
            List of tuples (item_id, score), best first. Scores are min-max
            normalized to [0, 1] across all items; when every item receives the
            same raw score (including a single-item catalogue) all normalized
            scores are 0.0. An empty list is returned when the user is unknown,
            when there are no items, or when prediction fails (the error is
            logged with its traceback).
        """
        # A negative slice bound would drop items from the tail rather than
        # return "nothing", so reject non-positive requests up front.
        if n_items is not None and n_items <= 0:
            return []

        try:
            # Get user features with a single Spark job (no separate count()).
            user_rows = (
                self.spark.read.parquet(self.user_features_path)
                .filter(col("userId") == user_id)
                .select("user_features")
                .collect()
            )
            if not user_rows:
                logger.warning(f"User {user_id} not found in features")
                return []

            user_features = self._rows_to_sparse(user_rows, "user_features")

            n_items_total = len(self.item_ids)
            if n_items_total == 0:
                logger.warning("No items available to score")
                return []

            # The per-user feature matrix built above has this user at row 0,
            # so every (user, item) pair refers to user index 0.
            user_ids = np.zeros(n_items_total, dtype=np.int32)
            item_ids = np.arange(n_items_total, dtype=np.int32)

            # Generate predictions for all items
            scores = self.model.predict(
                user_ids=user_ids,
                item_ids=item_ids,
                user_features=user_features,
                item_features=self.item_features
            )

            # Normalize scores to [0,1] range; a zero range would divide by
            # zero and turn every score into NaN.
            lowest = scores.min()
            spread = scores.max() - lowest
            if spread == 0:
                scores = np.zeros_like(scores)
            else:
                scores = (scores - lowest) / spread

            # Get top N items; the stable sort keeps ties in catalogue order.
            top_items_idx = np.argsort(-scores, kind="stable")[:n_items]
            return [(self.item_ids[idx], float(scores[idx])) for idx in top_items_idx]

        except Exception as e:
            # Best-effort API: callers get an empty list, the log keeps the traceback.
            logger.error(
                f"Error generating predictions for user {user_id}: {str(e)}",
                exc_info=True,
            )
            return []

    def batch_predict(self, user_ids: list, n_items: int = 10) -> dict:
        """
        Generate predictions for multiple users

        Args:
            user_ids: List of user IDs
            n_items: Number of items to recommend per user

        Returns:
            Dictionary mapping user IDs to their recommendations
        """
        return {user_id: self.predict_for_user(user_id, n_items) for user_id in user_ids}


## Initialize the Predictor

First, we'll create an instance of the LightFMPredictor by providing paths to the model and feature files.

In [17]:
import os

# Define paths to required files
MODEL_PATH = '../models/lightfm_model.pkl'
USER_FEATURES_PATH = '../datalake/gold/lightfm_user_features/user_features.parquet'
ITEM_FEATURES_PATH = '../datalake/gold/lightfm_item_features/item_features.parquet'

# Fail fast before starting Spark, and name the artifact that is actually
# missing (previously all three checks reported a missing "Model file").
REQUIRED_ARTIFACTS = {
    "Model file": MODEL_PATH,
    "User features file": USER_FEATURES_PATH,
    "Item features file": ITEM_FEATURES_PATH,
}
for artifact_label, artifact_path in REQUIRED_ARTIFACTS.items():
    if not os.path.exists(artifact_path):
        raise FileNotFoundError(f"{artifact_label} not found at path: {artifact_path}")

# Initialize the predictor
predictor = LightFMPredictor(
    model_path=MODEL_PATH,
    user_features_path=USER_FEATURES_PATH,
    item_features_path=ITEM_FEATURES_PATH
)

Initializing Spark Session...
Spark Session initialized.


## Generate Recommendations for a Single User

Let's generate recommendations for a single user and examine the results.

In [18]:
# Example user ID - replace with an actual user ID from your dataset
user_id = "e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9"

# How many recommendations to request
n_recommendations = 10

# Get recommendations for the user
recommendations = predictor.predict_for_user(user_id, n_items=n_recommendations)

if recommendations:
    # Report the number actually returned so the heading always matches the list.
    print(f"Top {len(recommendations)} recommendations for user {user_id}:")
    for item_id, score in recommendations:
        print(f"Item: {item_id}, Score: {score:.4f}")
else:
    # predict_for_user returns an empty list for unknown users and on errors.
    print(f"No recommendations returned for user {user_id}")

Top 5 recommendations for user e25fbee3a42d45a2914f9b061df3386b2ded2d8cc1f3d4b901419051126488b9:
Item: 6a52871e-1a78-4575-8999-765c98d6aafc, Score: 1.0000
Item: 92875985-34de-4549-9dc8-0735ea34d972, Score: 0.9969
Item: a47afb19-8d6f-4550-ad38-9b64b754ab3f, Score: 0.9539
Item: f7a88bf3-c3c3-4b37-b51d-e7eabe570fce, Score: 0.9503
Item: 05d4ac1a-822b-4751-a9c4-0a7b1fd5cdd3, Score: 0.9461
Item: 2f478e54-eedc-44d3-b1d9-da6a5c28a800, Score: 0.9402
Item: e1462137-046f-458d-bad2-85f4cf66b723, Score: 0.9365
Item: 8bf71ac1-9412-4df7-87e7-10d5c55236e4, Score: 0.9265
Item: b2f3b10f-6e2f-4cd8-9554-c0de2ada1c81, Score: 0.9245
Item: 3343e1a0-133e-4012-89f5-467793780d25, Score: 0.9220
