# zbMATH Community

### Notebook for training the community recommendation engine of zbMATH.

In [None]:
import pandas as pd
import numpy as np

from scipy.sparse import save_npz

from prettytable import PrettyTable

from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import Deduplicate, MinItemsPerUser, MinUsersPerItem
from recpack.pipelines import PipelineBuilder, HyperoptInfo
from recpack.pipelines.registries import ALGORITHM_REGISTRY
from recpack.scenarios import WeakGeneralization

from hyperopt import hp

# models
from recommenders.ease import myEASE

import pickle

In [None]:
# Directory of the community being processed. Every path read or written in
# this cell is derived from it, so switching community is a one-line change.
# Directories this notebook has been run against:
#   "communities/zbmath", "communities/dariah",
#   "communities/digital-humanities", "communities/transport-research"
community_dir = "communities/transport-research"

# load the raw interactions
df = pd.read_csv(f"{community_dir}/interaction.csv")

# define preprocessor
df_pp = DataFramePreprocessor("item_id", "user_id")

# define filters (thresholds: 25 users per item, 25 items per user)
deduplicate = Deduplicate("item_id", "user_id")
min_users_per_item_filter = MinUsersPerItem(25, "item_id", "user_id")
min_items_per_user_filter = MinItemsPerUser(25, "item_id", "user_id")

# add filters to pre-processor
# NOTE(review): filters presumably run in the order they are added, so the
# per-item threshold is evaluated before users are pruned -- confirm in the
# RecPack documentation.
for interaction_filter in (
    deduplicate,
    min_users_per_item_filter,
    min_items_per_user_filter,
):
    df_pp.add_filter(interaction_filter)

# create interaction matrix object
im = df_pp.process(df)

# save sparse matrix
save_npz(f"{community_dir}/csr", im.binary_values)

# save mappings
df_pp.item_id_mapping.to_parquet(f"{community_dir}/items_mapping.parquet", index=False)
df_pp.user_id_mapping.to_parquet(f"{community_dir}/users_mapping.parquet", index=False)

In [None]:
# sparsity of the filtered interaction matrix (complement of its density)
sparsity = 1 - im.density

# interaction counts per user (row sums) and per item (column sums)
binary_interactions = im.binary_values
interactions_per_user = binary_interactions.sum(axis=1)
users_per_item = binary_interactions.sum(axis=0)
print(
    f"User interaction ranges from {interactions_per_user.min()} to {interactions_per_user.max()}. "
    f"Item popularity ranges from {users_per_item.min()} to {users_per_item.max()}."
)

# summary table: active users, active items, interactions, sparsity in percent
header = ["data set", "|U|", "|I|", "int(I)", "sparsity"]
summary_row = ["Community"] + [
    str(value)
    for value in (
        im.num_active_users,
        im.num_active_items,
        im.num_interactions,
        round(sparsity * 100, 2),
    )
]
stats_table = PrettyTable(header)
stats_table.add_row(summary_row)
print(stats_table)

In [None]:
# hyperparameter-search budget: number of evaluations and time threshold
# (12 hours expressed in seconds)
EVALUATIONS = 50
SECONDS = 12 * 60 * 60

# split the interaction matrix with a WeakGeneralization scenario that also
# produces validation data; the seed is fixed
scenario = WeakGeneralization(seed=1452, validation=True)
scenario.split(im)

In [None]:
# Hyperparameter search for myEASE: "l2" is sampled log-uniformly between
# 1e0 and 1e4; the search stops after SECONDS seconds or EVALUATIONS
# evaluations.
optimisation_info_ease = HyperoptInfo(
    {
        "l2": hp.loguniform("l2", np.log(1e0), np.log(1e4))
    },
    timeout=SECONDS,
    max_evals=EVALUATIONS,
)

# Register the custom algorithm so the pipeline can look it up by name.
# Guarded so that re-running this cell in the same kernel does not fail on
# an already-registered key.
if myEASE.__name__ not in ALGORITHM_REGISTRY:
    ALGORITHM_REGISTRY.register(myEASE.__name__, myEASE)

# start pipeline
pb = PipelineBuilder()
pb.set_data_from_scenario(scenario)
pb.add_algorithm("myEASE", optimisation_info=optimisation_info_ease, params={"method": "item"})
pb.set_optimisation_metric("NDCGK", 10)
pb.add_metric("NDCGK", [10, 20])
pb.add_metric("RecallK", [10, 20])
pb.add_metric("CoverageK", [10, 20])

pipe = pb.build()
pipe.run()

# save optimal parameters
results = pipe.optimisation_results

# myEASE: keep the evaluated configuration with the highest NDCGK_10
myEASE_rows = results[results["algorithm"] == "myEASE"]
opt_myEASE_row = myEASE_rows.loc[myEASE_rows["NDCGK_" + str(10)].idxmax()]
ease_params = {"l2": opt_myEASE_row["params"]["l2"]}

# define model with optimal parameters
model = myEASE(ease_params["l2"], method="item")

# train model
model.fit(scenario.full_training_data)

# save model; the context manager guarantees the file is flushed and closed.
# Keep this path in sync with the community directory the interactions were
# loaded from.
with open("communities/transport-research/ease.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

# save results in test set
pipe.save_metrics()

# show results in test set (displayed as the cell's output)
pipe.get_metrics(short=True)