# Personalized recommendations to authors

### Notebook for training the community recommendation engines.

In [None]:
import numpy as np

import pickle
from scipy.sparse import save_npz

from prettytable import PrettyTable

from recpack.matrix import InteractionMatrix
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import Deduplicate, MinItemsPerUser, NMostPopular
from recpack.pipelines import PipelineBuilder, HyperoptInfo
from recpack.pipelines.registries import ALGORITHM_REGISTRY
from recpack.scenarios import WeakGeneralization

from hyperopt import hp

from src.recommenders.ease import myEASE
from src.helpers.db_interactions import *

In [None]:
from dotenv import load_dotenv
import os

# Populate the process environment from the local .env file
load_dotenv()

# Database connection settings, looked up under the key names used in .env
# (each value is None when its key is not set)
db_host, db_user, db_pass, db_port = (
    os.environ.get(env_key) for env_key in ("HOST", "DB_USER", "PASSWORD", "PORT")
)

In [None]:
# Communities for which interaction data is collected and a recommender is trained
communities = [
    "zbMATH",
    "Transport Research",
    "Digital Humanities and Cultural Heritage",
    "Energy Research",
]

#### Collect data from DB to train recommenders

In [None]:
def _to_native(value):
    """Return *value* as a built-in ``int`` if it is a NumPy integer scalar, else unchanged.

    The mapping tuples are handed to the DB helpers, so NumPy integer scalars of
    any width (not only ``np.int64``) are normalised to plain Python integers.
    """
    return int(value) if isinstance(value, np.integer) else value


for community in communities:
    # zbMATH is read from / written to "zbmath"; every other community uses "fc4eosc"
    db_name = "zbmath" if community == "zbMATH" else "fc4eosc"
    df = get_interactions_by_community(db_name, db_host, db_user, db_pass, db_port, community)

    # "result_id" is the item column and "author_id" the user column
    df_pp = DataFramePreprocessor("result_id", "author_id")

    # Keep at most the 5000 most popular items, or all of them if there are fewer
    pop_limit = min(df["result_id"].nunique(), int(5e3))

    # Filters are applied in the order they are added: drop duplicate
    # interactions, keep the most popular items, then keep only users with at
    # least 10 remaining items
    df_pp.add_filter(Deduplicate("result_id", "author_id"))
    df_pp.add_filter(NMostPopular(pop_limit, "result_id"))
    df_pp.add_filter(MinItemsPerUser(10, "result_id", "author_id"))

    # Create interaction matrix object
    im = df_pp.process(df)

    # Per-community output directory, e.g. "communities/energy-research".
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    community_slug = community.replace(' ', '-').lower()
    directory = f"communities/{community_slug}"
    os.makedirs(directory, exist_ok=True)

    # Save the InteractionMatrix object
    im.save(f"{directory}/im")

    # Save the InteractionMatrix in CSR format
    save_npz(f"{directory}/im_csr.npz", im.values)

    # Persist the (internal id, original id, community) mappings so that matrix
    # indices can be translated back to database identifiers
    user_mapping = df_pp.user_id_mapping
    item_mapping = df_pp.item_id_mapping
    users_tuples = [
        (_to_native(uid), _to_native(author_id), community)
        for uid, author_id in zip(user_mapping["uid"], user_mapping["author_id"])
    ]
    items_tuples = [
        (_to_native(iid), _to_native(result_id), community)
        for iid, result_id in zip(item_mapping["iid"], item_mapping["result_id"])
    ]
    write_users_mappings(db_name, db_host, db_user, db_pass, db_port, users_tuples)
    write_items_mappings(db_name, db_host, db_user, db_pass, db_port, items_tuples)

#### Compute interaction statistics for a particular community

In [None]:
# Load the persisted interaction matrix of the community to inspect
im = InteractionMatrix.load("communities/energy-research/im")

# Sparsity (after filtering) is the complement of the matrix density
sparsity = 1 - im.density

# Summary table: active users, active items, interactions and sparsity in percent
statTable1 = PrettyTable(["data set", "|U|", "|I|", "int(I)", "sparsity"])
summary_row = ["Community"] + [
    str(value)
    for value in (
        im.num_active_users,
        im.num_active_items,
        im.num_interactions,
        round(sparsity * 100, 2),
    )
]
statTable1.add_row(summary_row)
print(statTable1)

# Binarised interactions, fetched once and reused for both popularity vectors
binary_interactions = im.binary_values

# User popularity: row-wise sum of the binary matrix
user_popularity = np.array(binary_interactions.sum(axis=1)).squeeze()
print("User Popularity - Min:", user_popularity.min(), ", Max:", user_popularity.max())

# Item popularity: column-wise sum of the binary matrix
item_popularity = np.array(binary_interactions.sum(axis=0)).squeeze()
print("Item Popularity - Min:", item_popularity.min(), ", Max:", item_popularity.max())

#### Train recommenders

In [None]:
# Register the custom EASE variant so the pipeline can resolve it by name.
# Guarded so the cell can be re-run without registering the same key twice.
# NOTE(review): relies on the registry supporting `in` and rejecting duplicate
# keys — confirm against the installed recpack version.
if myEASE.__name__ not in ALGORITHM_REGISTRY:
    ALGORITHM_REGISTRY.register(myEASE.__name__, myEASE)

# Hyperparameter search budget: wall-clock limit (12 hours) and number of evaluations
SECONDS = 12 * 3600
EVALUATIONS = 50

# Define recommendation list sizes for metrics
K = [10, 20, 50]

# Cutoff of the NDCG metric that drives the hyperparameter optimisation
OPTIMISATION_K = 10

for community in communities:
    # Slug used for both the community data directory and the results folder
    community_slug = community.replace(' ', '-').lower()
    im = InteractionMatrix.load(f"communities/{community_slug}/im")

    # Define evaluation scenario (fixed seed so the split is reproducible)
    scenario = WeakGeneralization(validation=True, seed=1452)
    scenario.split(im)

    # Search the L2 regularisation strength log-uniformly in [1e0, 1e4]
    optimisation_info_ease = HyperoptInfo(
        {"l2": hp.loguniform("l2", np.log(1e0), np.log(1e4))},
        timeout=SECONDS,
        max_evals=EVALUATIONS,
    )

    # Start pipeline for fine-tuning
    pb = PipelineBuilder(folder_name=f"{community_slug}-results")
    pb.set_data_from_scenario(scenario)
    pb.add_algorithm("myEASE", optimisation_info=optimisation_info_ease, params={"method": "item"})
    pb.set_optimisation_metric("NDCGK", OPTIMISATION_K)
    pb.add_metric("NDCGK", K)
    pb.add_metric("RecallK", K)
    pb.add_metric("CoverageK", K)

    pipe = pb.build()
    pipe.run()

    # Pick the trial with the highest optimisation-metric score and keep its l2
    results = pipe.optimisation_results
    myEASE_rows = results[results["algorithm"] == "myEASE"]
    opt_myEASE_row = myEASE_rows.loc[myEASE_rows[f"NDCGK_{OPTIMISATION_K}"].idxmax()]
    ease_params = {"l2": opt_myEASE_row["params"]["l2"]}

    # Define and train model with optimal parameters on the full training data
    model = myEASE(ease_params["l2"], method="item")
    model.fit(scenario.full_training_data)

    # Save model; the context manager closes the file even if pickling fails
    with open(f"communities/{community_slug}/ease.pkl", "wb") as model_file:
        pickle.dump(model, model_file)

    # Save and print results in test set. Inside a loop a bare expression is not
    # displayed by the notebook, so the metrics are printed explicitly.
    pipe.save_metrics()
    print(f"Results for {community}:")
    print(pipe.get_metrics(short=True))