In this notebook our goal is to analyze the problem reported in PKG-2277, where some blocks of authors contain huge profiles related to one OAId but only one S2Id.

In [1]:
import json
import os

import numpy as np
from sklearn.cluster import AgglomerativeClustering

AUTHOR_PATH = '../data/ha-zhang-single/'

In [2]:
def read_jsonl(path):
    """Parse a JSON Lines file into a list of decoded records.

    Args:
        path: Location of the file; every non-blank line must hold one
            JSON value.

    Returns:
        list: One decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by convention; do not depend on the locale default.
    with open(path, encoding='utf-8') as f:
        # Stream the handle instead of materialising every raw line first,
        # and skip blank lines (e.g. a trailing empty line) that would make
        # json.loads raise.
        return [json.loads(line) for line in f if line.strip()]


def get_signatures(ap_results):
    """Flatten the signature ids of all author profiles into a single list.

    Profiles are visited in order and each profile's ``signatureIds`` are
    emitted in their stored order; repeated ids are kept.
    """
    return [
        signature_id
        for profile in ap_results
        for signature_id in profile['signatureIds']
    ]


def fill_dmatrix(distances, signature_list):
    """Build a symmetric pairwise distance matrix from distance records.

    Args:
        distances: Iterable of dicts with the keys ``signatureId1``,
            ``signatureId2`` and ``distance``.
        signature_list: Signature ids; the position of an id in this list is
            its row/column index in the matrix.

    Returns:
        tuple: ``(dmatrix, signature_to_idx)``. ``dmatrix`` is a square
        ``numpy.ndarray`` with side ``len(signature_list)``; pairs without a
        record stay at 0. ``signature_to_idx`` maps each signature id to its
        index.

    Raises:
        KeyError: If a record references a signature that is not in
            ``signature_list``.
    """
    size = len(signature_list)
    dmatrix = np.zeros(shape=(size, size))

    signature_to_idx = {
        signature: idx for idx, signature in enumerate(signature_list)
    }

    for distance in distances:
        row = signature_to_idx[distance['signatureId1']]
        column = signature_to_idx[distance['signatureId2']]
        d = distance['distance']
        # Write both mirrored cells directly. Adding the matrix to its
        # transpose would double any self-distance on the diagonal and sum
        # the values of a pair that is listed in both orders.
        dmatrix[row, column] = d
        dmatrix[column, row] = d

    return dmatrix, signature_to_idx


def get_features(sig_id, features):
    """Collect the feature vectors of every entry that involves ``sig_id``.

    An entry matches when ``sig_id`` equals its ``signatureId1`` or its
    ``signatureId2``; each matching entry contributes its ``features`` value,
    in input order.
    """
    return [
        entry['features']
        for entry in features
        if entry['signatureId1'] == sig_id or entry['signatureId2'] == sig_id
    ]

In [3]:
# Load the author-profile results and the pairwise distances, then arrange
# the distances into a matrix indexed by signature.
ap_results = read_jsonl(os.path.join(AUTHOR_PATH, 'results.json'))
distances = read_jsonl(os.path.join(AUTHOR_PATH, 'distances.json'))
signatures = get_signatures(ap_results)
dmatrix, signature_to_idx = fill_dmatrix(
    distances=distances, signature_list=signatures
)

In [7]:
# One profile in the results holds at least 6k signatures; let's find it.
MIN_TARGET_PROFILE_SIZE = 6_000

for entry in ap_results:
    if len(entry["signatureIds"]) >= MIN_TARGET_PROFILE_SIZE:
        # Bind the name once, directly as a set (fast membership checks),
        # instead of rebinding it from a list to a set afterwards.
        target_profile = set(entry["signatureIds"])
        break
else:
    # Fail loudly: otherwise the name would be unbound (NameError) or, in a
    # reused kernel, silently keep a stale value from an earlier run.
    raise ValueError(
        f"No profile with at least {MIN_TARGET_PROFILE_SIZE} signatures found"
    )

In [8]:
# Load the per-pair records stored in features.json.
distance_info = read_jsonl(os.path.join(AUTHOR_PATH, 'features.json'))

In [9]:
# Keep the calcType of every pair whose two signatures both belong to the
# target profile.
filterted_distances = [
    pair['calcType']
    for pair in distance_info
    if pair['signatureId1'] in target_profile
    and pair['signatureId2'] in target_profile
]

In [10]:
# Tally how many of the filtered pairs fall under each distinct calcType value.
np.unique(filterted_distances, return_counts=True)

(array(['orcid'], dtype='<U5'), array([23601885]))

As we can see, 100% of the approx. 23 million pairs have distances calculated with the orcId rule. We can therefore be certain that the problem occurs during the application of the orcId rule and that the ML model plays no part in the distance computation.