In [1]:
# Display label -> cache key. Each key is a tuple of names that is handed to
# GeneratedDataset.from_cache further down to load the corresponding samples.
transformations = {
    "WyFormerDffCSP++": ("WyckoffTransformer", "DiffCSP++"),
    "SymmCD": ("SymmCD",),
}

In [2]:
import sys
sys.path.append("..")
from evaluation.generated_dataset import GeneratedDataset
from evaluation.novelty import record_to_anonymous_fingerprint

In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.


In [3]:
# Load one cached generated dataset per entry of `transformations`,
# keeping the same display labels as keys.
datasets = {
    label: GeneratedDataset.from_cache(cache_key)
    for label, cache_key in transformations.items()
}

In [4]:
# Reference splits, loaded from the cache under the ("split", <name>) keys.
train, val = (
    GeneratedDataset.from_cache(("split", split_name))
    for split_name in ("train", "val")
)

In [5]:
from collections import Counter
def record_to_naive_fingerprint(record):
    """Build an order-insensitive fingerprint of a single record.

    Args:
        record: Mapping-like object providing ``"spacegroup_number"``,
            ``"site_symmetries"`` and ``"sites_enumeration"``. The latter two
            are iterated in parallel, so they are expected to be aligned
            sequences.

    Returns:
        A 2-tuple ``(spacegroup_number, pair_counts)`` where ``pair_counts`` is
        a frozenset of ``((site_symmetry, enumeration), count)`` items, i.e.
        how many times each (site symmetry, enumeration) pair occurs.
    """
    spacegroup = record["spacegroup_number"]
    # zip already yields tuples, so each pair is directly hashable by Counter.
    site_pairs = zip(record["site_symmetries"], record["sites_enumeration"])
    pair_counts = Counter(site_pairs)
    return spacegroup, frozenset(pair_counts.items())

The SymmCD paper reports 2794 unique templates, 1654 of them novel, and 3318 unique templates in the training set.
The likely cause of the difference from the counts computed below is that they don't take equivalent positions into account.

In [6]:
# Report template uniqueness/novelty once per fingerprint definition, so the
# naive and the anonymous fingerprints can be compared side by side.
for fingerprinting_function in (record_to_naive_fingerprint, record_to_anonymous_fingerprint):
    print(f"Fingerprinting function: {fingerprinting_function.__name__}")

    # Reference template sets for the current fingerprint definition.
    train_template_set = frozenset(train.data.apply(fingerprinting_function, axis=1))
    print(f"Unique templates in train: {len(train_template_set)}")
    val_template_set = frozenset(val.data.apply(fingerprinting_function, axis=1))
    train_val_template_set = train_template_set | val_template_set
    sample_size = 1000

    def print_template_stats(dataset):
        """Print unique and novel template counts for ``dataset``.

        ``dataset`` must support ``.apply(func, axis=1)`` (presumably a pandas
        DataFrame with one record per row -- confirm against GeneratedDataset).
        Uses the fingerprint function and reference sets of the enclosing
        loop iteration.
        """
        templates = frozenset(dataset.apply(fingerprinting_function, axis=1))
        print(f"Unique templates: {len(templates)}")
        novel_vs_train = templates.difference(train_template_set)
        print(f"Novel Unique templates w.r.t. train: {len(novel_vs_train)}")
        novel_vs_train_val = templates.difference(train_val_template_set)
        print(f"Novel Unique templates w.r.t. train+val: {len(novel_vs_train_val)}")

    for dataset_name, dataset in datasets.items():
        print(dataset_name)
        print(f"Dataset size: {len(dataset.data)}")
        print_template_stats(dataset.data)
        if len(dataset.data) <= sample_size:
            # Nothing to subsample: the dataset is already at most sample_size.
            continue
        # Also report on a fixed-seed subsample so that datasets of different
        # sizes are compared on an equal number of records.
        print(f"Sampling {sample_size} records from {dataset_name} dataset")
        sampled_data = dataset.data.sample(sample_size, random_state=42)
        print_template_stats(sampled_data)

Fingerprinting function: record_to_naive_fingerprint
Unique templates in train: 3291
WyFormerDffCSP++
Dataset size: 1000
Unique templates: 596
Novel Unique templates w.r.t. train: 246
Novel Unique templates w.r.t. train+val: 234
SymmCD
Dataset size: 9475
Unique templates: 2277
Novel Unique templates w.r.t. train: 1035
Novel Unique templates w.r.t. train+val: 990
Sampling 1000 records from SymmCD dataset
Unique templates: 510
Novel Unique templates w.r.t. train: 123
Novel Unique templates w.r.t. train+val: 116
Fingerprinting function: record_to_anonymous_fingerprint
Unique templates in train: 2600
WyFormerDffCSP++
Dataset size: 1000
Unique templates: 538
Novel Unique templates w.r.t. train: 191
Novel Unique templates w.r.t. train+val: 184
SymmCD
Dataset size: 9475
Unique templates: 1870
Novel Unique templates w.r.t. train: 750
Novel Unique templates w.r.t. train+val: 719
Sampling 1000 records from SymmCD dataset
Unique templates: 476
Novel Unique templates w.r.t. train: 95
Novel Unique 