In [1]:
import os

import numpy as np
import bottleneck as bn
from hyperopt import hp
from recpack.pipelines import PipelineBuilder, HyperoptInfo
from recpack.pipelines.registries import ALGORITHM_REGISTRY
from recpack.preprocessing.filters import NMostPopular
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.scenarios import StrongGeneralization

from database.utils_database_connector.core import Database
from database.rec_data import *
from recommenders.ease import myEASE

In [2]:
# Database handle used by the data-access helpers
db = Database("fc4eosc")

# Communities to process, one pipeline run per entry
communities = {
    "beopen",
    "dh-ch",
    "enermaps",
    "eosc",
    "dariah",
}

# Root folder under which the per-community result folders are created
base_directory = "communities"

# Make the custom EASE variant available to the pipeline under its class name
ALGORITHM_REGISTRY.register(myEASE.__name__, myEASE)

# Hyperparameter search budget: wall-clock limit (in seconds) ...
SECONDS = 12 * 3600
# ... and maximum number of evaluations
EVALUATIONS = 25

# Number of users scored per batch when generating recommendations
BATCH_SIZE = 20000

# Recommendation list sizes at which the metrics are computed
K = [10, 20, 50]

In [3]:
def get_topn_indices(R_hat, n):
    """
    Return, for each row of R_hat, the column indices of its n largest values.

    Within each row the indices are ordered by decreasing score. When n is
    greater than or equal to the number of columns, all column indices are
    returned (ranked by decreasing score) instead of failing in the partition
    step, whose pivot position must lie inside the row.

    Parameters
    ----------
    R_hat : 2-D numpy array of predicted relevance scores (one row per user).
    n : number of top-scoring columns to return per row.

    Returns
    -------
    2-D integer array with min(n, number of columns) indices per row.
    """
    users, items = R_hat.shape

    # negate once so that "smallest first" orderings rank the highest scores first
    neg_scores = -R_hat

    # fewer candidates than requested: a pivot at position n does not exist,
    # so simply rank every column
    if n >= items:
        return np.argsort(neg_scores, axis=1)

    # find the indices that partition each row so that its first n entries are the n largest scores
    idx_topn_part = bn.argpartition(neg_scores, n, axis=1)[:, :n]

    # keep only the n largest scores of each row (still unordered)
    rows = np.arange(users)[:, np.newaxis]
    topn_part = R_hat[rows, idx_topn_part]

    # order those n scores from highest to lowest and map back to column indices of R_hat
    idx_part = np.argsort(-topn_part, axis=1)
    return idx_topn_part[rows, idx_part]

In [4]:
# Number of recommendations generated and stored per author
TOP_N = 20

# For every community: load its citation data, tune EASE with hyperopt,
# retrain with the best l2 on the full interaction matrix and write the
# top-N recommendations for every author to the database.
for community in communities:

    # Collect data
    df = get_citations_by_community(db, community)
    if df.empty:
        # No interactions for this community: there is nothing to split, tune or recommend
        print('DataFrame is empty!')
        continue

    df_pp = DataFramePreprocessor("result_id", "author_id")  # define preprocessor

    pop_limit = min(df["result_id"].nunique(), int(1e3)) # conditionally set the limit for NMostPopular filter

    # Define filters
    n_most_popular_filter = NMostPopular(pop_limit, "result_id")

    # Apply filters
    df_pp.add_filter(n_most_popular_filter)

    # Create interaction matrix object
    im = df_pp.process(df)

    scenario = StrongGeneralization(validation=True, seed=1452)
    scenario.split(im)

    # Set optimisation details: log-uniform search over the l2 regularisation strength
    optimisation_info_ease = HyperoptInfo(
        {"l2": hp.loguniform("l2", np.log(1e0), np.log(1e4))},
        timeout = SECONDS,
        max_evals = EVALUATIONS,
    )

    results_folder = os.path.join(base_directory, f"{community}-results")
    os.makedirs(results_folder, exist_ok=True)  # no-op when the folder already exists

    # Start pipeline for fine-tuning
    pb = PipelineBuilder(folder_name=results_folder)
    pb.set_data_from_scenario(scenario)
    pb.add_algorithm("myEASE", optimisation_info=optimisation_info_ease, params={"method": "item"})
    pb.set_optimisation_metric("NDCGK", 10)
    pb.add_metric("NDCGK", K)
    pb.add_metric("RecallK", K)
    pb.add_metric("CoverageK", K)

    pipe = pb.build()
    pipe.run()
    pipe.save_metrics() # save results

    # Training model with optimal parameters: pick the trial with the highest NDCG@10
    opt_results = pipe.optimisation_results
    myEASE_rows = opt_results[opt_results["algorithm"] == "myEASE"]
    opt_myEASE_row = myEASE_rows.loc[myEASE_rows["NDCGK_10"].idxmax()]
    ease_params = {"l2": opt_myEASE_row["params"]["l2"]}

    num_users = im.values.shape[0]

    model = myEASE(ease_params["l2"], method="item")
    model.fit(im)

    # Fetch the id mappings once per community instead of once per batch / per recommended item;
    # nothing below modifies the preprocessor, so the lookups are loop-invariant.
    # NOTE(review): recpack appears to expose these mappings as properties that rebuild a
    # DataFrame on every access, which made the per-item lookup very expensive - confirm.
    author_ids = df_pp.user_id_mapping['author_id']
    result_ids = df_pp.item_id_mapping["result_id"]

    # Process recommendations in batches
    for start_index in range(0, num_users, BATCH_SIZE):
        end_index = min(start_index + BATCH_SIZE, num_users)
        batch_data = im.values[start_index:end_index, :]
        # presumably the mapping rows are ordered by internal user id, so a positional
        # slice lines up with the matrix rows of this batch - verify against recpack
        author_ids_batch = author_ids[start_index:end_index].tolist()

        # Get recommendations for the current batch
        predictions = model.predict(batch_data).toarray()
        topn_lists = get_topn_indices(predictions, TOP_N)
        # translate internal item indices back to the original result ids
        topn_lists_real_ids = [[result_ids[idx] for idx in topn] for topn in topn_lists]

        data_tuples = prepare_recommendation_data(author_ids_batch, topn_lists_real_ids, community)
        write_recommendations(db, data_tuples)