In [1]:
from pathlib import Path
import logging
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
MODELS_DIR = Path("C:\\Users\\allan\\Documents\\recommendation_system\\src\\models")
DATA_DIR = Path("C:\\Users\\allan\\Documents\\recommendation_system\\data\\processed")

print(MODELS_DIR, DATA_DIR)

C:\Users\allan\Documents\recommendation_system\src\models C:\Users\allan\Documents\recommendation_system\data\processed


In [3]:
# Cell 3: Helper Functions
def load_model(filename):
    filepath = MODELS_DIR / filename
    if not filepath.exists():
        raise FileNotFoundError(f"Model file {filename} not found in {MODELS_DIR}.")
    return joblib.load(filepath)

def load_processed_data(filename="processed_data.csv"):
    filepath = DATA_DIR / filename
    if not filepath.exists():
        raise FileNotFoundError(f"Data file {filename} not found in {DATA_DIR}.")
    return pd.read_csv(filepath)

In [4]:
def evaluate_model(knn_model, feature_matrix, index, top_n=10, n_samples=100):
    """
    Evaluate the recommendation model using Precision, Recall, MRR, and Coverage.
    """
    logging.info("Evaluating the recommendation model.")
    
    random_indices = np.random.choice(len(index), size=n_samples, replace=False)
    precision_scores, recall_scores, reciprocal_ranks = [], [], []
    all_recommended_items = set()
    total_items = len(index)
    detailed_data = []

    for idx in random_indices:
        _, indices = knn_model.kneighbors(feature_matrix[idx].reshape(1,-1), n_neighbors=top_n + 1)
        recommended_items = set(indices[0][1:])
        all_recommended_items.update(recommended_items)
        
        # Simulate ground truth (replace with your ground truth logic)
        simulated_truth = set(np.random.choice(len(index), size=top_n, replace=False))
        
        intersection = recommended_items & simulated_truth
        precision = len(intersection) / len(recommended_items) if recommended_items else 0
        recall = len(intersection) / len(simulated_truth) if simulated_truth else 0
        precision_scores.append(precision)
        recall_scores.append(recall)
        
        reciprocal_rank = 0
        for rank, item in enumerate(indices[0][1:], start=1):
            if item in simulated_truth:
                reciprocal_rank = 1 / rank
                break
        reciprocal_ranks.append(reciprocal_rank)

        detailed_data.append({
            "Index": idx,
            "Precision": precision,
            "Recall": recall,
            "Reciprocal Rank": reciprocal_rank,
            "Recommended Items": len(recommended_items)
        })
    
    coverage = len(all_recommended_items) / total_items
    results = {
        "Precision": np.mean(precision_scores),
        "Recall": np.mean(recall_scores),
        "MRR": np.mean(reciprocal_ranks),
        "Coverage": coverage
    }
    logging.info(f"Evaluation Results: {results}")
    return results, pd.DataFrame(detailed_data)

In [5]:
def plot_metrics_heatmap(detailed_metrics):
    """
    Plota um heatmap das métricas detalhadas.
    """
    plt.figure(figsize=(10, 8))
    sns.heatmap(detailed_metrics.drop(columns=["product_id"]).corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap of Metrics")
    plt.show()

In [6]:
def plot_cumulative_mrr(detailed_metrics):
    """
    Plota o MRR cumulativo.
    """
    detailed_metrics = detailed_metrics.sort_values(by="MRR", ascending=False).reset_index(drop=True)
    cumulative_mrr = np.cumsum(detailed_metrics["MRR"]) / np.arange(1, len(detailed_metrics) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(cumulative_mrr, label="Cumulative MRR", color="blue")
    plt.xlabel("Product Rank")
    plt.ylabel("Cumulative MRR")
    plt.title("Cumulative Mean Reciprocal Rank (MRR)")
    plt.legend()
    plt.grid()
    plt.show()

In [7]:
def plot_recommendation_coverage(total_recommended_items, total_items):
    """
    Plota a cobertura das recomendações.
    """
    coverage = len(total_recommended_items) / total_items * 100
    labels = ["Recommended", "Not Recommended"]
    sizes = [coverage, 100 - coverage]

    plt.figure(figsize=(8, 8))
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=["skyblue", "lightgray"])
    plt.title("Recommendation Coverage")
    plt.axis("equal")  # Equal aspect ratio ensures that pie is drawn as a circle.
    plt.show()

In [8]:
def plot_precision_recall_distribution(detailed_metrics):
    """
    Plota a distribuição de Precision e Recall.
    """
    plt.figure(figsize=(12, 6))

    sns.histplot(detailed_metrics["Precision"], kde=True, color="green", label="Precision", bins=15)
    sns.histplot(detailed_metrics["Recall"], kde=True, color="blue", label="Recall", bins=15)

    plt.xlabel("Score")
    plt.ylabel("Frequency")
    plt.title("Distribution of Precision and Recall")
    plt.legend()
    plt.show()

In [9]:
# Load models and data
knn_model = load_model("based_content_knn_model.pkl")
feature_matrix = load_model("based_content_feature_matrix.pkl")
index = load_model("based_content_index.pkl")
data = load_processed_data()

metrics, detailed_metrics = evaluate_model(knn_model, feature_matrix, index)
print("\n=== Métricas Gerais ===")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

plot_metrics_heatmap(detailed_metrics)
plot_cumulative_mrr(detailed_metrics)
plot_recommendation_coverage(set(index[:100]), len(index))
plot_precision_recall_distribution(detailed_metrics)


=== Métricas Gerais ===
Precision: 0.0000
Recall: 0.0000
MRR: 0.0000
Coverage: 0.0000


KeyError: "['product_id'] not found in axis"

<Figure size 1000x800 with 0 Axes>