In [1]:
"""
Welcome to the S2AND tutorial!

We will cover a few aspects of the S2AND pipeline:

(1) Load a test dataset.
(2) Fit a pairwise model + bells & whistles.
(3) Fit a clusterer.
(4) Evaluate the pairwise model and the clusterer.
""";

In [2]:
import os

os.environ["OMP_NUM_THREADS"] = "8"

import json
import copy
import argparse
import logging
import pickle
from typing import Dict, Any, Optional, List
from collections import defaultdict

import numpy as np
import pandas as pd

from s2and.data import ANDData
from s2and.featurizer import featurize, FeaturizationInfo
from s2and.model import PairwiseModeler, Clusterer, FastCluster
from s2and.eval import pairwise_eval, cluster_eval, facet_eval
from s2and.consts import FEATURIZER_VERSION, DEFAULT_CHUNK_SIZE, PROJECT_ROOT_PATH
from s2and.file_cache import cached_path
from s2and.plotting_utils import plot_facets
from hyperopt import hp



In [3]:
# Random seed reused by the later cells (data loading, pairwise modelers, clusterer).
# This is the value we used for the ablations table.
random_seed = 42
# Number of CPUs to use; handed to later cells through their `n_jobs` argument.
n_jobs = 4

In [4]:
# Load the arnetminer dataset.
# The files are assumed to be already downloaded to the `S2AND/data/` directory.
dataset_name = 'arnetminer'
DATA_DIR = os.path.join(PROJECT_ROOT_PATH, 'data', dataset_name)

anddata = ANDData(
    name=dataset_name,
    # --- input files, all named "<dataset_name>_<kind>.<ext>" inside DATA_DIR ---
    signatures=os.path.join(DATA_DIR, f"{dataset_name}_signatures.json"),
    papers=os.path.join(DATA_DIR, f"{dataset_name}_papers.json"),
    clusters=os.path.join(DATA_DIR, f"{dataset_name}_clusters.json"),
    specter_embeddings=os.path.join(DATA_DIR, f"{dataset_name}_specter.pickle"),
    # the name counts derived from the entire S2 corpus need to be loaded separately
    load_name_counts=True,
    # --- run mode ---
    mode="train",  # use 'inference' instead if you are just predicting
    block_type="s2",  # 'original' is the other option
    # --- pairwise splits: pass predefined pairs here if you have them ---
    train_pairs=None,
    val_pairs=None,
    test_pairs=None,
    # --- how many pairs to use for the pairwise models ---
    train_pairs_size=100000,
    val_pairs_size=10000,
    test_pairs_size=10000,
    # --- misc ---
    preprocess=True,
    n_jobs=n_jobs,
    random_seed=random_seed,
)

2020-12-16 12:32:59,849 - s2and - INFO - loading papers
2020-12-16 12:33:01,273 - s2and - INFO - loaded papers
2020-12-16 12:33:01,274 - s2and - INFO - loading signatures
2020-12-16 12:33:01,426 - s2and - INFO - loaded signatures
2020-12-16 12:33:01,429 - s2and - INFO - loading clusters
2020-12-16 12:33:01,437 - s2and - INFO - loaded clusters, loading specter
2020-12-16 12:33:01,509 - s2and - INFO - loaded specter, loading cluster seeds
2020-12-16 12:33:01,510 - s2and - INFO - loaded cluster seeds
2020-12-16 12:33:01,511 - s2and - INFO - making signature to cluster id
2020-12-16 12:33:01,513 - s2and - INFO - made signature to cluster id
2020-12-16 12:33:01,514 - s2and - INFO - loading name counts
2020-12-16 12:33:24,297 - s2and - INFO - loaded name counts
2020-12-16 12:33:24,530 - s2and - INFO - preprocessing papers
Preprocessing papers 1/2: 100%|██████████| 53959/53959 [00:03<00:00, 13828.07it/s]
Preprocessing papers 2/2: 100%|██████████| 53959/53959 [00:09<00:00, 5990.95it/s] 
2020-1

In [5]:
# To train the pairwise model we choose which feature categories to use.
# Here it is all of them.
features_to_use = [
    "name_similarity",
    "affiliation_similarity",
    "email_similarity",
    "coauthor_similarity",
    "venue_similarity",
    "year_diff",
    "title_similarity",
    "reference_features",
    "misc_features",
    "name_counts",
    "embedding_similarity",
    "journal_similarity",
    "advanced_name_similarity",
]

# There is also a special second "nameless" model that doesn't use any name-based features.
# It helps to improve clustering performance by preventing model overreliance on names.
nameless_features_to_use = [
    feature_name
    for feature_name in features_to_use
    if feature_name not in {"name_similarity", "advanced_name_similarity", "name_counts"}
]

# All the information about the features is stored in this convenient wrapper.
featurization_info = FeaturizationInfo(
    features_to_use=features_to_use,
    featurizer_version=FEATURIZER_VERSION,
)
nameless_featurization_info = FeaturizationInfo(
    features_to_use=nameless_features_to_use,
    featurizer_version=FEATURIZER_VERSION,
)

# Now we can actually go and get the pairwise training, val and test data.
# Each split unpacks into (all-feature matrix, labels, nameless feature matrix).
train, val, test = featurize(  # type: ignore
    anddata,
    featurization_info,
    n_jobs=n_jobs,  # use the notebook-level setting rather than a hard-coded 4
    use_cache=False,
    chunk_size=DEFAULT_CHUNK_SIZE,
    nameless_featurizer_info=nameless_featurization_info,
    nan_value=np.nan,
)
X_train, y_train, nameless_X_train = train
X_val, y_val, nameless_X_val = val
X_test, y_test, nameless_X_test = test

2020-12-16 12:33:46,027 - s2and - INFO - featurizing train
2020-12-16 12:33:46,040 - s2and - INFO - Creating 100000 pieces of work
2020-12-16 12:33:46,137 - s2and - INFO - Created pieces of work
2020-12-16 12:33:46,138 - s2and - INFO - Cached changed, doing 100000 work in parallel
Doing work: 100%|██████████| 100000/100000 [00:14<00:00, 6732.01it/s]
2020-12-16 12:34:01,778 - s2and - INFO - Work completed
2020-12-16 12:34:01,780 - s2and - INFO - Making numpy arrays for features and labels
2020-12-16 12:34:01,863 - s2and - INFO - Numpy arrays made
2020-12-16 12:34:01,882 - s2and - INFO - featurized train, featurizing val
2020-12-16 12:34:01,885 - s2and - INFO - Creating 10000 pieces of work
2020-12-16 12:34:01,917 - s2and - INFO - Created pieces of work
2020-12-16 12:34:01,918 - s2and - INFO - Cached changed, doing 10000 work in parallel
2020-12-16 12:34:04,273 - s2and - INFO - Work completed
2020-12-16 12:34:04,275 - s2and - INFO - Making numpy arrays for features and labels
2020-12-16 

In [6]:
# Define and fit the pairwise modelers.
# As mentioned above, there are 2: one with all features and a nameless one.
# Both use the same search settings and differ only in their monotonicity
# constraints and in the feature matrices they are fit on.
shared_modeler_settings = dict(
    n_iter=25,  # number of hyperparameter search iterations
    estimator=None,  # this will use the default LightGBM classifier
    search_space=None,  # this will use the default LightGBM search space
    random_state=random_seed,
)

# all-features model; monotonicity constraints make the model more sensible
pairwise_modeler = PairwiseModeler(
    monotone_constraints=featurization_info.lightgbm_monotone_constraints,
    **shared_modeler_settings,
)
pairwise_modeler.fit(X_train, y_train, X_val, y_val)

# nameless model: same labels, nameless feature matrices
nameless_pairwise_modeler = PairwiseModeler(
    monotone_constraints=nameless_featurization_info.lightgbm_monotone_constraints,
    **shared_modeler_settings,
)
nameless_pairwise_modeler.fit(nameless_X_train, y_train, nameless_X_val, y_val)

100%|██████████| 25/25 [01:51<00:00,  4.46s/trial, best loss: -0.961766757760418]
100%|██████████| 25/25 [01:39<00:00,  3.99s/trial, best loss: -0.8150096589558353]


<hyperopt.base.Trials at 0x7fa72533f6d8>

In [8]:
# Fit the clusterer itself.

# average linkage agglomerative clustering
linkage_model = FastCluster(linkage='average')

# the hyperparameters for the clustering algorithm
# NOTE(review): the hyperopt label is "choice" while the dict key is "eps" -- confirm this is intended.
eps_search_space = {"eps": hp.uniform("choice", 0, 1)}

clusterer = Clusterer(
    featurization_info,
    pairwise_modeler.classifier,  # the actual pairwise classifier
    cluster_model=linkage_model,
    search_space=eps_search_space,
    # the nameless pairwise classifier and its feature description
    nameless_classifier=nameless_pairwise_modeler.classifier,
    nameless_featurizer_info=nameless_featurization_info,
    # this is an option used by the S2 production system but not in the S2AND paper
    use_default_constraints_as_supervision=False,
    use_cache=False,
    n_jobs=n_jobs,
    random_state=random_seed,
)
clusterer.fit(anddata)

2020-12-16 12:38:22,052 - s2and - INFO - Fitting clusterer
2020-12-16 12:38:22,093 - s2and - INFO - Making 10 distance matrices
2020-12-16 12:38:22,094 - s2and - INFO - Initializing pairwise_probas
2020-12-16 12:38:22,096 - s2and - INFO - Pairwise probas initialized, starting making all pairs
2020-12-16 12:38:22,096 - s2and - INFO - Featurizing batch 0
2020-12-16 12:38:22,098 - s2and - INFO - Getting constraints


Finished loading model, total used 1728 iterations
Finished loading model, total used 1242 iterations


2020-12-16 12:38:22,313 - s2and - INFO - Creating 160277 pieces of work
Creating work: 160277it [00:00, 724510.10it/s]
2020-12-16 12:38:22,537 - s2and - INFO - Created pieces of work
2020-12-16 12:38:22,538 - s2and - INFO - Cached changed, doing 160277 work in parallel
Doing work: 100%|██████████| 160277/160277 [00:06<00:00, 24641.02it/s]
2020-12-16 12:38:32,758 - s2and - INFO - Work completed
2020-12-16 12:38:32,759 - s2and - INFO - Making numpy arrays for features and labels
2020-12-16 12:38:32,872 - s2and - INFO - Numpy arrays made
2020-12-16 12:38:32,884 - s2and - INFO - Making predict flags
2020-12-16 12:38:32,910 - s2and - INFO - Pairwise classification
2020-12-16 12:38:36,240 - s2and - INFO - Starting to make matrices
Writing matrices: 100%|██████████| 160277/160277 [00:00<00:00, 1036705.22it/s]
2020-12-16 12:38:36,400 - s2and - INFO - 10 distance matrices made


100%|██████████| 25/25 [00:00<00:00, 57.90trial/s, best loss: -0.887]

2020-12-16 12:38:36,882 - s2and - INFO - Clusterer fit





<s2and.model.Clusterer at 0x7fa6a5b8eb00>

In [9]:
# But how good are our models?
# First, let's look at the quality of the pairwise evaluation.

# where to put the figures
figures_dir = os.path.join(PROJECT_ROOT_PATH, "data", "tutorial_figures")

pairwise_metrics = pairwise_eval(
    X_test,
    y_test,
    pairwise_modeler,
    figures_dir,
    "tutorial_figures",  # what to call the figures
    featurization_info.get_feature_names(),
    # the nameless model is evaluated alongside, on its own feature matrix
    nameless_classifier=nameless_pairwise_modeler,
    nameless_X=nameless_X_test,
    nameless_feature_names=nameless_featurization_info.get_feature_names(),
    # if your model isn't a tree-based model, put True here and it will not make SHAP figures
    skip_shap=False,
)
print(pairwise_metrics)

LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
All-NaN slice encountered


{'AUROC': 0.995, 'Average Precision': 0.996, 'F1': 0.947, 'Precision': 0.944, 'Recall': 0.954}


In [10]:
# The same kind of evaluation, this time for the clustering performance.
#   split           -- which part of the data to evaluate on; can also be 'val'
#   use_s2_clusters -- set to True if you want to see how the old S2 system does
cluster_metrics, b3_metrics_per_signature = cluster_eval(
    anddata, clusterer, split="test", use_s2_clusters=False
)
print(cluster_metrics)

2020-12-16 12:39:09,201 - s2and - INFO - Making 13 distance matrices
2020-12-16 12:39:09,202 - s2and - INFO - Initializing pairwise_probas
2020-12-16 12:39:09,203 - s2and - INFO - Pairwise probas initialized, starting making all pairs
2020-12-16 12:39:09,204 - s2and - INFO - Featurizing batch 0
2020-12-16 12:39:09,204 - s2and - INFO - Getting constraints
2020-12-16 12:39:09,263 - s2and - INFO - Creating 44855 pieces of work
2020-12-16 12:39:09,296 - s2and - INFO - Created pieces of work
2020-12-16 12:39:09,296 - s2and - INFO - Cached changed, doing 44855 work in parallel
Doing work: 100%|██████████| 44855/44855 [00:01<00:00, 27100.38it/s]
2020-12-16 12:39:14,681 - s2and - INFO - Work completed
2020-12-16 12:39:14,682 - s2and - INFO - Making numpy arrays for features and labels
2020-12-16 12:39:14,723 - s2and - INFO - Numpy arrays made
2020-12-16 12:39:14,729 - s2and - INFO - Making predict flags
2020-12-16 12:39:14,737 - s2and - INFO - Pairwise classification
2020-12-16 12:39:15,919 - 

{'B3 (P, R, F1)': (0.92, 0.985, 0.951), 'Cluster (P, R F1)': (0.979, 0.997, 0.988), 'Cluster Macro (P, R, F1)': (0.911, 0.977, 0.938), 'Pred bigger ratio (mean, count)': (1.86, 609), 'True bigger ratio (mean, count)': (1.76, 36)}


In [None]:
# if you want to reproduce the facet plots from the S2AND paper, check out the `facet_eval` function also!