In [3]:
"""
Welcome to the S2AND tutorial!

We will cover a few aspects of the S2AND pipeline:

(1) Load a test dataset.
(2) Fit a pairwise model + bells & whistles.
(3) Fit a clusterer.
(4) Evaluate the pairwise model and the clusterer.
""";

In [4]:
import os

os.environ["OMP_NUM_THREADS"] = "8"

import json
import copy
import argparse
import logging
import pickle
from typing import Dict, Any, Optional, List
from collections import defaultdict

import numpy as np
import pandas as pd

from s2and.data import PDData
from s2and.featurizer import featurize, FeaturizationInfo
from s2and.model import PairwiseModeler, Clusterer, FastCluster
from s2and.eval import pairwise_eval, cluster_eval, facet_eval
from s2and.consts import FEATURIZER_VERSION, DEFAULT_CHUNK_SIZE, PROJECT_ROOT_PATH
from s2and.file_cache import cached_path
from s2and.plotting_utils import plot_facets
from hyperopt import hp

In [5]:
# number of cpus to use (passed to the data loader and the clusterer below)
n_jobs = 4
# random seed; this is the value we used for the ablations table
random_seed = 42

In [6]:
# Load the small `test` dataset. Its papers and clusters JSON files are
# assumed to be already present in the project's `data/` directory.

dataset_name = 'test'
DATA_DIR = os.path.join(PROJECT_ROOT_PATH, 'data')

# input files are named "<dataset_name>_papers.json" / "<dataset_name>_clusters.json"
papers_path = os.path.join(DATA_DIR, dataset_name + "_papers.json")
clusters_path = os.path.join(DATA_DIR, dataset_name + "_clusters.json")

pddata = PDData(
    # --- what to load ---
    name=dataset_name,
    papers=papers_path,
    clusters=clusters_path,
    mode="train",  # can also be 'inference' if just predicting
    # --- predefined pairwise splits (none here) ---
    train_pairs=None,
    val_pairs=None,
    test_pairs=None,
    # --- how many pairs to use for each pairwise split ---
    train_pairs_size=100000,
    val_pairs_size=10000,
    test_pairs_size=10000,
    # --- runtime options ---
    load_name_counts=False,
    n_jobs=n_jobs,
    random_seed=random_seed,
)

2022-10-10 19:33:34,881 - s2and - INFO - loading papers
2022-10-10 19:33:35,115 - s2and - INFO - loaded papers
2022-10-10 19:33:35,116 - s2and - INFO - loading clusters
2022-10-10 19:33:35,117 - s2and - INFO - loaded clusters, loading specter
2022-10-10 19:33:35,118 - s2and - INFO - loaded specter, loading cluster seeds
2022-10-10 19:33:35,118 - s2and - INFO - loaded cluster seeds
2022-10-10 19:33:35,119 - s2and - INFO - making paper to cluster id
2022-10-10 19:33:35,121 - s2and - INFO - made paper to cluster id
2022-10-10 19:33:35,128 - s2and - INFO - preprocessing papers
Preprocessing papers: 100%|██████████| 3601/3601 [00:01<00:00, 2031.81it/s]
2022-10-10 19:33:37,054 - s2and - INFO - preprocessed papers


In [7]:
# to train the pairwise model, we define which feature categories to use
# here it is all of them
features_to_use = [
    "author_similarity",
    "venue_similarity",
    "year_diff",
    "title_similarity",
    "abstract_similarity",
]

# we store all the information about the features in this convenient wrapper
featurization_info = FeaturizationInfo(features_to_use=features_to_use, featurizer_version=FEATURIZER_VERSION)

# now we can actually go and get the pairwise training, val and test data.
# `n_jobs` comes from the configuration cell (instead of a hard-coded 4) so that
# featurization uses the same worker count as the data loader and the clusterer.
train, val, test = featurize(  # type: ignore
    pddata,
    featurization_info,
    n_jobs=n_jobs,
    use_cache=False,
    chunk_size=DEFAULT_CHUNK_SIZE,
    nan_value=np.nan,
)

# each split unpacks into (features, labels, nameless features)
X_train, y_train, nameless_X_train = train
X_val, y_val, nameless_X_val = val
X_test, y_test, nameless_X_test = test

2022-10-10 19:33:38,284 - s2and - INFO - featurizing train
2022-10-10 19:33:38,286 - s2and - INFO - Creating 2107 pieces of work
2022-10-10 19:33:38,290 - s2and - INFO - Created pieces of work
2022-10-10 19:33:38,291 - s2and - INFO - Cached changed, doing 2107 work in parallel
2022-10-10 19:33:38,576 - s2and - INFO - Work completed
2022-10-10 19:33:38,578 - s2and - INFO - Making numpy arrays for features and labels
2022-10-10 19:33:38,580 - s2and - INFO - Numpy arrays made
2022-10-10 19:33:38,583 - s2and - INFO - featurized train, featurizing val
2022-10-10 19:33:38,584 - s2and - INFO - Creating 418 pieces of work
2022-10-10 19:33:38,586 - s2and - INFO - Created pieces of work
2022-10-10 19:33:38,587 - s2and - INFO - Cached changed, doing 418 work in parallel
2022-10-10 19:33:38,711 - s2and - INFO - Work completed
2022-10-10 19:33:38,714 - s2and - INFO - Making numpy arrays for features and labels
2022-10-10 19:33:38,715 - s2and - INFO - Numpy arrays made
2022-10-10 19:33:38,716 - s2an

In [8]:
# define the pairwise modeler, then fit it on the train/val pairs

# we use monotonicity constraints to make the model more sensible
monotone_constraints = featurization_info.lightgbm_monotone_constraints

pairwise_modeler = PairwiseModeler(
    n_iter=25,  # number of hyperparameter search iterations
    estimator=None,  # None -> the default LightGBM classifier
    search_space=None,  # None -> the default LightGBM search space
    monotone_constraints=monotone_constraints,
    random_state=random_seed,
)
pairwise_modeler.fit(X_train, y_train, X_val, y_val)

100%|██████████| 25/25 [00:05<00:00,  4.32trial/s, best loss: -0.9976514560972196]


<hyperopt.base.Trials at 0x7fcceda7add8>

In [12]:
# with the pairwise classifier trained, build and fit the clusterer

# average linkage agglomerative clustering
cluster_model = FastCluster(linkage='average')
# the hyperparameters for the clustering algorithm (searched over [0, 1])
eps_search_space = {"eps": hp.uniform("choice", 0, 1)}

clusterer = Clusterer(
    featurization_info,
    pairwise_modeler.classifier,  # the actual pairwise classifier
    cluster_model=cluster_model,
    search_space=eps_search_space,
    n_jobs=n_jobs,
    use_cache=False,
    random_state=random_seed,
)
clusterer.fit(pddata)
print(clusterer.best_params)

2022-10-10 19:34:14,067 - s2and - INFO - Fitting clusterer
2022-10-10 19:34:14,164 - s2and - INFO - Making 10 distance matrices
2022-10-10 19:34:14,166 - s2and - INFO - Initializing pairwise_probas
2022-10-10 19:34:14,167 - s2and - INFO - Pairwise probas initialized, starting making all pairs
2022-10-10 19:34:14,168 - s2and - INFO - Featurizing batch 0/1
2022-10-10 19:34:14,169 - s2and - INFO - Getting constraints
2022-10-10 19:34:14,232 - s2and - INFO - Creating 16719 pieces of work
2022-10-10 19:34:14,255 - s2and - INFO - Created pieces of work
2022-10-10 19:34:14,256 - s2and - INFO - Cached changed, doing 16719 work in parallel


Finished loading model, total used 770 iterations


Doing work: 100%|██████████| 16719/16719 [00:00<00:00, 27796.19it/s]
2022-10-10 19:34:15,048 - s2and - INFO - Work completed
2022-10-10 19:34:15,049 - s2and - INFO - Making numpy arrays for features and labels
2022-10-10 19:34:15,055 - s2and - INFO - Numpy arrays made
2022-10-10 19:34:15,059 - s2and - INFO - Making predict flags
2022-10-10 19:34:15,062 - s2and - INFO - Pairwise classification
2022-10-10 19:34:15,106 - s2and - INFO - Starting to make matrices
Writing matrices: 100%|██████████| 16719/16719 [00:00<00:00, 680953.28it/s]
2022-10-10 19:34:15,134 - s2and - INFO - 10 distance matrices made


100%|██████████| 25/25 [00:00<00:00, 117.64trial/s, best loss: -0.769]

2022-10-10 19:34:15,358 - s2and - INFO - Clusterer fit



{'eps': 0.6762519627824752}


In [10]:
# how good are our models? start with the pairwise model on the test pairs
figures_dir = os.path.join(PROJECT_ROOT_PATH, "data", "tutorial_figures")  # where to put the figures
figures_name = "tutorial_figures"  # what to call the figures
feature_names = featurization_info.get_feature_names()

pairwise_metrics = pairwise_eval(
    X_test,
    y_test,
    pairwise_modeler,
    figures_dir,
    figures_name,
    feature_names,
    # if your model isn't a tree-based model, put True here and no SHAP figures are made
    skip_shap=False,
)
print(pairwise_metrics)

C extension was not built during install!


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
All-NaN slice encountered


{'AUROC': 0.994, 'Average Precision': 0.993, 'F1': 0.983, 'Precision': 0.985, 'Recall': 0.982}


In [11]:
# same idea for the clusterer: evaluate the clustering performance
eval_split = "test"  # which part of the data to evaluate on, can also be 'val'

cluster_metrics, b3_metrics_per_signature = cluster_eval(
    pddata,
    clusterer,
    split=eval_split,
    # set to true if you want to see how the old S2 system does
    use_s2_clusters=False,
)
print(cluster_metrics)

2022-10-10 19:33:55,211 - s2and - INFO - Making 11 distance matrices
2022-10-10 19:33:55,212 - s2and - INFO - Initializing pairwise_probas
2022-10-10 19:33:55,213 - s2and - INFO - Pairwise probas initialized, starting making all pairs
2022-10-10 19:33:55,214 - s2and - INFO - Featurizing batch 0/1
2022-10-10 19:33:55,214 - s2and - INFO - Getting constraints
2022-10-10 19:33:55,235 - s2and - INFO - Creating 6717 pieces of work
2022-10-10 19:33:55,245 - s2and - INFO - Created pieces of work
2022-10-10 19:33:55,246 - s2and - INFO - Cached changed, doing 6717 work in parallel
2022-10-10 19:33:55,637 - s2and - INFO - Work completed
2022-10-10 19:33:55,639 - s2and - INFO - Making numpy arrays for features and labels
2022-10-10 19:33:55,641 - s2and - INFO - Numpy arrays made
2022-10-10 19:33:55,643 - s2and - INFO - Making predict flags
2022-10-10 19:33:55,646 - s2and - INFO - Pairwise classification
2022-10-10 19:33:55,668 - s2and - INFO - Starting to make matrices
Writing matrices: 100%|█████

{'B3 (P, R, F1)': (0.639, 0.981, 0.774), 'Pred bigger ratio (mean, count)': (1.64, 78), 'True bigger ratio (mean, count)': (2.0, 1)}
