In [22]:
import numpy as np
import polars as pl
import pickle

class ContentBasedRecommender:
    """
    A content-based recommender system based on joke embeddings and features.
    """

    def __init__(self, similarity_matrix, joke_ids):
        self.similarity_matrix = similarity_matrix
        self.joke_ids = joke_ids

    def recommend(self, joke_id, top_n=10):
        """
        Recommends similar jokes to the given joke ID.

        Args:
            joke_id (int): ID of the joke to recommend similar jokes for.
            top_n (int): Number of similar jokes to return.

        Returns:
            list: Top-N recommended joke IDs.
        """
        if joke_id not in self.joke_ids:
            raise ValueError(f"Joke ID {joke_id} not found in the dataset.")
        
        idx = self.joke_ids.index(joke_id)
        similarity_scores = self.similarity_matrix[idx]
        top_indices = np.argsort(similarity_scores)[::-1][1:top_n + 1]
        return [self.joke_ids[i] for i in top_indices]

# Load data
train_path = "../data/processed/train_data.csv"
jokes_path = "../data/processed/jokes_with_clusters.parquet"

train_df = pl.read_csv(train_path)
jokes_df = pl.read_parquet(jokes_path)

# Step 1: Merge jokes with features and prepare embeddings
jokes_features = jokes_df.select(
    ["jokeId", "text_length", "word_count", "num_ratings", "avg_rating", "rating_std", "embeddings"]
)

# Expand embeddings into separate columns for processing
embeddings = np.vstack(jokes_features["embeddings"].to_list())
jokes_features = jokes_features.drop("embeddings")

# Combine features and embeddings
combined_features = np.hstack([
    jokes_features.drop("jokeId").to_numpy(),
    embeddings
])

# Step 2: Compute cosine similarity matrix
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(combined_features)

# Step 3: Save similarity matrix and joke IDs using pickle
joke_ids = jokes_features["jokeId"].to_list()
model_path = "../models/content_based_recommender.pkl"
with open(model_path, "wb") as f:
    pickle.dump({"similarity_matrix": similarity_matrix, "joke_ids": joke_ids}, f)

print(f"Model saved to {model_path}")

# Step 4: Define function to load and recommend
def recommend_from_pickle(joke_id, top_n=10):
    with open(model_path, "rb") as f:
        data = pickle.load(f)
    
        similarity_matrix = data["similarity_matrix"]
        joke_ids = data["joke_ids"]
    
    if joke_id not in joke_ids:
        raise ValueError(f"Joke ID {joke_id} not found in the dataset.")
    
    idx = joke_ids.index(joke_id)
    similarity_scores = similarity_matrix[idx]
    top_indices = np.argsort(similarity_scores)[::-1][1:top_n + 1]
    return [joke_ids[i] for i in top_indices]

# Example usage
joke_id = joke_ids[0]  # Replace with a valid jokeId
recommendations = recommend_from_pickle(joke_id)
print(f"Recommendations for joke {joke_id}: {recommendations}")


In [23]:
import numpy as np
import polars as pl
from sklearn.metrics.pairwise import cosine_similarity
import onnx
import onnxruntime as ort
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn
import pickle
import os

class ContentBasedRecommender:
    """
    A content-based recommender system based on joke embeddings and features.
    """

    def __init__(self, similarity_matrix, joke_ids):
        self.similarity_matrix = similarity_matrix
        self.joke_ids = joke_ids

    def recommend(self, joke_id, top_n=10):
        """
        Recommends similar jokes to the given joke ID.

        Args:
            joke_id (int): ID of the joke to recommend similar jokes for.
            top_n (int): Number of similar jokes to return.

        Returns:
            list: Top-N recommended joke IDs.
        """
        if joke_id not in self.joke_ids:
            raise ValueError(f"Joke ID {joke_id} not found in the dataset.")
        
        idx = self.joke_ids.index(joke_id)
        similarity_scores = self.similarity_matrix[idx]
        top_indices = np.argsort(similarity_scores)[::-1][1:top_n + 1]
        return [self.joke_ids[i] for i in top_indices]


# Load data
train_path = "../data/processed/train_data.csv"
jokes_path = "../data/processed/jokes_with_clusters.parquet"

train_df = pl.read_csv(train_path)
jokes_df = pl.read_parquet(jokes_path)

# Step 1: Merge jokes with features and prepare embeddings
jokes_features = jokes_df.select(
    ["jokeId", "text_length", "word_count", "num_ratings", "avg_rating", "rating_std", "embeddings"]
)

# Expand embeddings into separate columns for processing
embeddings = np.vstack(jokes_features["embeddings"].to_list())
jokes_features = jokes_features.drop("embeddings")

# Combine features and embeddings
combined_features = np.hstack([
    jokes_features.drop("jokeId").to_numpy(),
    embeddings
])

# Step 2: Compute cosine similarity matrix
similarity_matrix = cosine_similarity(combined_features)

# Step 3: Save similarity matrix and recommender model in ONNX format
joke_ids = jokes_features["jokeId"].to_list()
recommender = ContentBasedRecommender(similarity_matrix, joke_ids)

# Save similarity matrix and joke IDs in ONNX
onnx_path = "../models/content_based_recommender.onnx"

# ONNX conversion (no direct way to save similarity; wrap in an API-like structure)
input_type = [('input_joke_index', FloatTensorType([None, 1]))]
onnx_model = convert_sklearn(recommender, initial_types=input_type)
onnx.save_model(onnx_model, onnx_path)

print(f"Content-Based Recommender saved as ONNX at {onnx_path}")

# Step 4: Usage example
# Loading and recommending
ort_session = ort.InferenceSession(onnx_path)

def recommend_from_onnx(joke_id, top_n=10):
    joke_index = joke_ids.index(joke_id)
    input_feed = {'input_joke_index': np.array([[joke_index]], dtype=np.float32)}
    output = ort_session.run(None, input_feed)[0]
    top_indices = np.argsort(output)[::-1][1:top_n + 1]
    return [joke_ids[i] for i in top_indices]

# Example usage:
joke_id = joke_ids[0]  # Replace with a valid jokeId
recommendations = recommend_from_onnx(joke_id)
print(f"Recommendations for joke {joke_id}: {recommendations}")

MissingShapeCalculator: Unable to find a shape calculator for type '<class '__main__.ContentBasedRecommender'>'.
It usually means the pipeline being converted contains a
transformer or a predictor with no corresponding converter
implemented in sklearn-onnx. If the converted is implemented
in another library, you need to register
the converted so that it can be used by sklearn-onnx (function
update_registered_converter). If the model is not yet covered
by sklearn-onnx, you may raise an issue to
https://github.com/onnx/sklearn-onnx/issues
to get the converter implemented or even contribute to the
project. If the model is a custom model, a new converter must
be implemented. Examples can be found in the gallery.
