In [1]:

import os

In [2]:
# Move from the notebook's folder up to the project root so the later
# `src.hybrid_recommender` imports and the relative config/artifact paths resolve.
# The guard keeps the cell idempotent: re-running it once `src` is already visible
# from the working directory no longer climbs one level higher each time.
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:
%pwd

'd:\\recommendation-engine'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings consumed by the model-evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_evaluation_config``.
    NOTE(review): the fields are annotated as ``Path`` but the manager passes the
    YAML values through unconverted, so they may be plain strings at runtime.
    """

    # Output directory of this stage; the configuration manager creates it.
    root_dir: Path
    # CSV with the held-out interactions; read with ``pd.read_csv``.
    test_data_path: Path
    # Serialized implicit-feedback model; read with ``joblib.load``.
    implicit_model_path: Path
    # Saved Keras model; read with ``tf.keras.models.load_model``.
    nn_model_path: Path
    # Serialized score scaler; read with ``joblib.load``.
    scaler_path: Path
    # Serialized mapping holding the user/item/organizer encoders; read with ``joblib.load``.
    encoders_path: Path
    # Training parameters, echoed into the metrics file under "model_parameters".
    all_params: dict
    # Destination of the JSON metrics report.
    metrics_file_name: Path
    # Number of recommendations kept per user (candidates are drawn at k * 3).
    k: int

In [5]:
from src.hybrid_recommender.constants import *
from src.hybrid_recommender.utils.common import read_yaml, create_directories, save_json

In [6]:
class ConfigurationManager:
    """Loads the project's YAML config and params files and builds stage configs."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
    ):
        """Read both YAML files and make sure the artifacts root directory exists.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Assemble the settings for the model-evaluation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``ModelEvaluationConfig`` populated from the ``model_evaluation``
            section of the config file and the ``HybridRecommender`` section of
            the params file.
        """
        stage_cfg = self.config.model_evaluation
        hybrid_params = self.params.HybridRecommender

        create_directories([stage_cfg.root_dir])

        # Collected in the dataclass' field order before being handed over.
        settings = dict(
            root_dir=stage_cfg.root_dir,
            test_data_path=stage_cfg.test_data_path,
            implicit_model_path=stage_cfg.implicit_model_path,
            nn_model_path=stage_cfg.nn_model_path,
            scaler_path=stage_cfg.scaler_path,
            encoders_path=stage_cfg.encoders_path,
            all_params=hybrid_params,
            metrics_file_name=stage_cfg.metrics_file_name,
            k=hybrid_params.k,
        )
        return ModelEvaluationConfig(**settings)

In [7]:
import numpy as np
import pandas as pd
import joblib
import tensorflow as tf
from typing import Dict, Tuple, List
import json
from scipy.sparse import csr_matrix
from src.hybrid_recommender import logger

In [None]:
class HybridRecommenderEvaluator:
    def __init__(self, config: ModelEvaluationConfig):
        logger.info("Initializing HybridRecommenderEvaluator")
        self.config = config
        self.load_models()
        logger.info("HybridRecommenderEvaluator initialized successfully")
        
    def load_models(self):
        """Read every persisted artifact from disk and attach it to the instance.

        Sets ``implicit_model``, ``nn_model``, ``scaler``, the three encoders
        (``user_encoder``, ``item_encoder``, ``organizer_encoder``) and their
        inverse mappings (``*_decoder``).

        Raises:
            Exception: Any failure is logged and then re-raised unchanged.
        """
        logger.info("Loading models and encoders")

        def _inverted(mapping):
            # Swap keys and values: encoded index -> original identifier.
            return {code: raw_id for raw_id, code in mapping.items()}

        try:
            cfg = self.config

            # NOTE(review): joblib.load unpickles its input, which can execute
            # arbitrary code. Only point these paths at artifacts you trust.
            self.implicit_model = joblib.load(cfg.implicit_model_path)
            logger.info(f"Loaded implicit model from {cfg.implicit_model_path}")

            self.nn_model = tf.keras.models.load_model(cfg.nn_model_path)
            logger.info(f"Loaded neural network model from {cfg.nn_model_path}")

            self.scaler = joblib.load(cfg.scaler_path)
            logger.info(f"Loaded scaler from {cfg.scaler_path}")

            encoder_bundle = joblib.load(cfg.encoders_path)
            logger.info(f"Loaded encoders from {cfg.encoders_path}")

            self.user_encoder = encoder_bundle['user_encoder']
            self.item_encoder = encoder_bundle['item_encoder']
            self.organizer_encoder = encoder_bundle['organizer_encoder']
            logger.info("Successfully extracted user, item and organizer encoders")

            self.user_decoder = _inverted(self.user_encoder)
            self.item_decoder = _inverted(self.item_encoder)
            self.organizer_decoder = _inverted(self.organizer_encoder)
            logger.info("Created inverse mappings for encoders")
        except Exception as e:
            logger.error(f"Error loading models: {str(e)}")
            raise
        

    def evaluate_recommendations(self, test_data: pd.DataFrame) -> Dict[str, float]:
        """Evaluate recommendation quality using multiple metrics"""
        logger.info("Starting recommendation evaluation")
        
        original_size = len(test_data)
        test_data = test_data[
            test_data['user_id'].isin(self.user_encoder) & 
            test_data['event_id'].isin(self.item_encoder)
        ]
        filtered_size = len(test_data)
        logger.info(f"Filtered test data from {original_size} to {filtered_size} records")

        
        user_events = test_data.groupby('user_id')['event_id'].apply(set).to_dict()
        logger.info(f"Prepared evaluation data for {len(user_events)} users")
        
        metrics = {
            'precision@k': [],
            'recall@k': [],
            'ndcg@k': [],
            'map@k': [],
            'coverage': self.calculate_coverage(test_data),
            'popularity_bias': self.calculate_popularity_bias(test_data)
        }
        
        logger.info("Initialized metrics dictionary")
        
        for i, (user_id, actual_events) in enumerate(user_events.items()):
            if i % 100 == 0:
                logger.debug(f"Processing user {i+1}/{len(user_events)}")
            recommended = self._recommend(user_id)
            metrics['precision@k'].append(self._precision(actual_events, recommended))
            metrics['recall@k'].append(self._recall(actual_events, recommended))
            metrics['ndcg@k'].append(self._ndcg(actual_events, recommended))
            metrics['map@k'].append(self._average_precision(actual_events, recommended))
        
        avg_precision = np.mean(metrics['precision@k'])
        avg_recall = np.mean(metrics['recall@k'])
        
        results = {
            'precision@k': avg_precision,
            'recall@k': avg_recall,
            'ndcg@k': np.mean(metrics['ndcg@k']),
            'map@k': np.mean(metrics['map@k']),
            'coverage': metrics['coverage'],
            'popularity_bias': metrics['popularity_bias'],
            'f1_score': 2 * (avg_precision * avg_recall) / (avg_precision + avg_recall) 
                if (avg_precision + avg_recall) > 0 else 0
        }
        
        logger.info("Evaluation completed. Metrics calculated:")
        for metric, value in results.items():
            logger.info(f"{metric}: {value:.4f}")
        
        return results

    def _precision(self, actual: set, recommended: list) -> float:
        """Calculate precision@k"""
        relevant = len(set(recommended) & actual)
        precision = relevant / len(recommended) if recommended else 0
        logger.debug(f"Precision calculation - relevant: {relevant}, recommended: {len(recommended)}, precision: {precision:.4f}")  
        return precision

    def _recall(self, actual: set, recommended: list) -> float:
        """Calculate recall@k"""
        relevant = len(set(recommended) & actual)
        recall = relevant / len(actual) if actual else 0
        logger.debug(f"Recall calculation - relevant: {relevant}, actual: {len(actual)}, recall: {recall:.4f}")
        return recall

    def _ndcg(self, actual: set, recommended: list) -> float:
        """Calculate Normalized Discounted Cumulative Gain"""
        relevances = [1 if event in actual else 0 for event in recommended]
        discounts = np.log2(np.arange(2, len(relevances) + 2))
        dcg = np.sum(relevances / discounts)
        ideal_relevances = [1] * min(len(actual), len(recommended))
        idcg = np.sum(ideal_relevances / np.log2(np.arange(2, len(ideal_relevances) + 2)))
        ndcg = dcg / idcg if idcg > 0 else 0
        logger.debug(f"NDCG calculation - DCG: {dcg:.4f}, IDCG: {idcg:.4f}, NDCG: {ndcg:.4f}")
        return ndcg

    def _average_precision(self, actual: set, recommended: list) -> float:
        """Calculate Average Precision"""
        relevant = []
        for i, event in enumerate(recommended):
            if event in actual:
                relevant.append(self._precision(actual, recommended[:i+1]))
        ap = np.mean(relevant) if relevant else 0
        logger.debug(f"Average Precision calculation - relevant points: {len(relevant)}, AP: {ap:.4f}")
        return ap

    def calculate_coverage(self, test_data: pd.DataFrame) -> float:
        """Calculate what percentage of events can be recommended"""
        logger.info("Calculating coverage metric")
        all_events = set(self.item_decoder.values())
        recommended_events = set()
        
        for user_id in test_data['user_id'].unique():
            recommended_events.update(self._recommend(user_id))
        coverage = len(recommended_events) / len(all_events)
        logger.info(f"Coverage: {len(recommended_events)}/{len(all_events)} = {coverage:.4f}")
        return coverage

    def calculate_popularity_bias(self, test_data: pd.DataFrame) -> float:
        """Calculate how biased recommendations are toward popular events"""
        logger.info("Calculating popularity bias metric")
        event_popularity = test_data['event_id'].value_counts().to_dict()
        recommendations_popularity = []
        
        for user_id in test_data['user_id'].unique():
            for event_id in self._recommend(user_id):
                recommendations_popularity.append(event_popularity.get(event_id, 0))
        
        avg_rec_pop = np.mean(recommendations_popularity) if recommendations_popularity else 0
        avg_all_pop = np.mean(list(event_popularity.values()))
        
        bias = avg_rec_pop / avg_all_pop if avg_all_pop > 0 else 0
        logger.info(f"Popularity bias: {bias:.4f} (rec avg: {avg_rec_pop:.2f}, all avg: {avg_all_pop:.2f})")
        
        return bias

    def _recommend(self, user_id: int) -> List[str]:
        """Generate recommendations for a single user"""
        logger.debug(f"Generating recommendations for user {user_id}")
        
        try:
            user_encoded = self.user_encoder[user_id]
            
            user_items = csr_matrix((1, len(self.item_encoder)), dtype=np.float32)
            
            implicit_recs = self.implicit_model.recommend(
                userid=user_encoded,
                user_items=user_items,
                N=self.config.k * 3,
                filter_already_liked_items=False
            )
            
            logger.debug(f"Implicit model returned {len(implicit_recs[0])} recommendations")
            
            recommended_events = [self.item_decoder[item] for item in implicit_recs[0]]
            organizer_ids = [self._get_organizer_for_event(event) for event in recommended_events]
            organizer_encoded = [self.organizer_encoder.get(o, 0) for o in organizer_ids]
            
            user_array = np.array([user_encoded] * len(implicit_recs[0]))
            event_array = np.array(implicit_recs[0])
            organizer_array = np.array(organizer_encoded)
            
            nn_scores = self.nn_model.predict(
                [user_array, event_array, organizer_array], 
                verbose=0
            )
            nn_scores = self.scaler.inverse_transform(nn_scores.reshape(-1, 1)).flatten()
            
            combined_scores = implicit_recs[1] * 0.6 + nn_scores * 0.4
            top_indices = np.argsort(combined_scores)[::-1][:self.config.k]
            
            final_recommendations = [self.item_decoder[implicit_recs[0][i]] for i in top_indices]
            logger.debug(f"Generated {len(final_recommendations)} final recommendations")
            
            return final_recommendations
        
        except Exception as e:
            logger.error(f"Error generating recommendations for user {user_id}: {str(e)}")
            return []

    def _get_organizer_for_event(self, event_id: str) -> str:
        """Helper method to get organizer for an event"""
        organizer = str(hash(event_id) % 1000)
        logger.debug(f"Getting organizer for event {event_id} -> {organizer}")
        return organizer

    def save_results(self):
        """Run evaluation and save metrics"""
        logger.info("Starting evaluation and saving results")
        
        try:
            test_data = pd.read_csv(self.config.test_data_path)
            metrics = self.evaluate_recommendations(test_data)
            
            full_results = {
                **metrics,
                "model_parameters": self.config.all_params,
                "num_users": len(self.user_encoder),
                "num_events": len(self.item_encoder),
                "num_organizers": len(self.organizer_encoder),
                "evaluation_time": pd.Timestamp.now().isoformat()
            }
            logger.info(f"Saving results to {self.config.metrics_file_name}")

            
            with open(self.config.metrics_file_name, 'w') as f:
                json.dump(full_results, f, indent=4)
            
            logger.info(f"Evaluation results saved to {self.config.metrics_file_name}")
            
        except Exception as e:
            logger.error(f"Error during evaluation or saving results: {str(e)}")
            raise

In [9]:
# Run the whole stage: read the configuration, load the models, evaluate and
# write the metrics file.
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    evaluator = HybridRecommenderEvaluator(config=model_evaluation_config)
    evaluator.save_results()
except Exception:
    # logger.exception records the traceback; the bare `raise` then re-raises
    # the active exception as-is instead of re-raising it through a bound name.
    logger.exception("Error during model evaluation")
    raise

[2025-07-01 11:39:05,590: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-07-01 11:39:05,592: INFO: common: yaml file: params.yaml loaded successfully]
[2025-07-01 11:39:05,594: INFO: common: created directory at: artifacts]
[2025-07-01 11:39:05,594: INFO: common: created directory at: artifacts/model_evaluation]
[2025-07-01 11:39:05,594: INFO: 2512233877: Initializing HybridRecommenderEvaluator]
[2025-07-01 11:39:05,594: INFO: 2512233877: Loading models and encoders]
[2025-07-01 11:39:05,627: INFO: 2512233877: Loaded implicit model from artifacts/model_trainer/implicit_model.joblib]
[2025-07-01 11:39:05,741: INFO: 2512233877: Loaded neural network model from artifacts/model_trainer/nn_model.h5]
[2025-07-01 11:39:05,758: INFO: 2512233877: Loaded scaler from artifacts/model_trainer/scaler.joblib]
[2025-07-01 11:39:05,798: INFO: 2512233877: Loaded encoders from artifacts/model_trainer/encoders.joblib]


  from .autonotebook import tqdm as notebook_tqdm


[2025-07-01 11:39:05,798: INFO: 2512233877: Successfully extracted user, item and organizer encoders]
[2025-07-01 11:39:05,798: INFO: 2512233877: Created inverse mappings for encoders]
[2025-07-01 11:39:05,798: INFO: 2512233877: HybridRecommenderEvaluator initialized successfully]
[2025-07-01 11:39:05,798: INFO: 2512233877: Starting evaluation and saving results]
[2025-07-01 11:39:05,904: INFO: 2512233877: Starting recommendation evaluation]
[2025-07-01 11:39:05,920: INFO: 2512233877: Filtered test data from 50000 to 50000 records]
[2025-07-01 11:39:06,024: INFO: 2512233877: Prepared evaluation data for 4999 users]
[2025-07-01 11:39:06,024: INFO: 2512233877: Calculating coverage metric]
[2025-07-01 11:47:07,759: INFO: 2512233877: Coverage: 4657/10000 = 0.4657]
[2025-07-01 11:47:07,760: INFO: 2512233877: Calculating popularity bias metric]
[2025-07-01 11:54:59,992: INFO: 2512233877: Popularity bias: 0.9837 (rec avg: 4.95, all avg: 5.03)]
[2025-07-01 11:54:59,993: INFO: 2512233877: Initi