In [1]:
import os
import random
from random import sample
from datasets import load_dataset, Dataset, DatasetDict
from huggingface_hub import login
import pickle
from dotenv import load_dotenv
load_dotenv()

# Number of shard files the cited-negative triplets are read from; the value is also embedded in each shard's filename.
CITED_NEG_STRIDE = 10
# Total number of shard files for non-cited-negative triplets (easy and hard together); also embedded in each filename.
NON_CITED_NEG_STRIDE = 10
# Shards 0 .. NON_CITED_NEG_NUM_EASY-1 are read as easy negatives ("hard_False" files);
# shards NON_CITED_NEG_NUM_EASY .. NON_CITED_NEG_STRIDE-1 are read as hard negatives ("hard_True" files).
NON_CITED_NEG_NUM_EASY = 7

# Seed Python's `random` module, which is used for the sampling and shuffling in the cells below.
random.seed(42)

### Reading triplets from saved files

In [11]:
# Load every saved shard of triplets whose positive is a highly influential citation and
# whose negative is also cited by the query paper, but not as highly influential.
cited_neg_triplets = []
for i in range(CITED_NEG_STRIDE):
    shard_path = f"saved_triplets/cited_neg_start_{i}_stride_{CITED_NEG_STRIDE}.pkl"
    with open(shard_path, 'rb') as file:
        # Each pickle holds a dict; only its 'results' list is kept.
        cited_neg_triplets.extend(pickle.load(file)['results'])

print("Number of triplets:", len(cited_neg_triplets))

Number of triplets: 127180


In [8]:
# Load triplets whose positive is highly influential and whose negative is NOT cited by the
# query paper. For hard negatives, the negative is nevertheless cited by the positive paper.
non_cited_neg_triplets_easy = []
non_cited_neg_triplets_hard = []

# The first NON_CITED_NEG_NUM_EASY shards contain easy negatives ...
for i in range(NON_CITED_NEG_NUM_EASY):
    with open(f"saved_triplets/non_cited_neg_start_{i}_stride_{NON_CITED_NEG_STRIDE}_hard_False.pkl", 'rb') as file:
        non_cited_neg_triplets_easy.extend(pickle.load(file)['results'])

# ... and the remaining shards contain hard negatives.
for i in range(NON_CITED_NEG_NUM_EASY, NON_CITED_NEG_STRIDE):
    with open(f"saved_triplets/non_cited_neg_start_{i}_stride_{NON_CITED_NEG_STRIDE}_hard_True.pkl", 'rb') as file:
        non_cited_neg_triplets_hard.extend(pickle.load(file)['results'])

easy_len = len(non_cited_neg_triplets_easy)
hard_len = len(non_cited_neg_triplets_hard)
print(f"Number of triplets with easy negs: {easy_len} and hard negs: {hard_len}. Total: {easy_len + hard_len}")

def combine_non_cited(max_size, hard_rate, easy_pool=None, hard_pool=None):
    """Combine non-cited triplets into one shuffled list with a given share of hard negs.

    Args:
        max_size: Upper bound on the number of triplets to return. The actual
            size is reduced when either pool cannot supply its share.
        hard_rate: Fraction (between 0 and 1 inclusive) of the returned
            triplets that should come from the hard-negative pool.
        easy_pool: Optional list of easy-negative triplets. Defaults to the
            notebook-level `non_cited_neg_triplets_easy`.
        hard_pool: Optional list of hard-negative triplets. Defaults to the
            notebook-level `non_cited_neg_triplets_hard`.

    Returns:
        A new, shuffled list containing the sampled easy and hard triplets.

    Raises:
        ValueError: If `hard_rate` is outside [0, 1].
    """
    if not 0 <= hard_rate <= 1:
        raise ValueError(f"hard_rate must be between 0 and 1, got {hard_rate}")

    # Fall back to the lists loaded earlier in the notebook when no pools are passed in.
    if easy_pool is None:
        easy_pool = non_cited_neg_triplets_easy
    if hard_pool is None:
        hard_pool = non_cited_neg_triplets_hard

    # Largest total for which each pool can still provide its share. A pool that
    # contributes nothing (rate of 0 or 1) imposes no limit, but the other pool
    # still does -- this also avoids dividing by zero.
    limits = [max_size]
    if hard_rate > 0:
        limits.append(len(hard_pool) / hard_rate)
    if hard_rate < 1:
        limits.append(len(easy_pool) / (1 - hard_rate))
    size = int(min(limits))

    # Derive the easy count from the hard count so the two always add up to `size`
    # (computing both from floats independently can silently drop an element).
    num_hard = min(round(size * hard_rate), len(hard_pool))
    num_easy = min(size - num_hard, len(easy_pool))

    easy_triplets = random.sample(easy_pool, num_easy)
    hard_triplets = random.sample(hard_pool, num_hard)
    elen, hlen = len(easy_triplets), len(hard_triplets)
    print(f"Combining {elen} easy negs with {hlen} hard negs for total size of {elen + hlen}")
    non_cited_neg = easy_triplets + hard_triplets
    random.shuffle(non_cited_neg)
    return non_cited_neg

Number of triplets with easy negs: 68108 and hard negs: 26000. Total: 94108


### Publishing Datasets with different distributions of negative types

In [4]:
# Hugging Face credentials come from the environment (populated by load_dotenv() above).
HF_USER = os.environ.get('HF_USERNAME')
HF_ACCESS_TOKEN = os.environ.get('HF_ACCESS_TOKEN')
login(token=HF_ACCESS_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/andrewm/store/hf/huggingface/token
Login successful


In [5]:
# Load the "cite_prediction_new" configuration of allenai/scirepeval; publish() below samples rows from its 'train' split.
cite_prediction_new = load_dataset("allenai/scirepeval", "cite_prediction_new")
def publish(name, max_size, percent_cited_neg, percent_non_cited_neg, hard_rate=0.4, split_seed=42):
    """Build a mixed triplet dataset and push it to the Hugging Face Hub.

    The dataset mixes three sources: rows sampled from the 'train' split of
    `cite_prediction_new`, triplets sampled from `cited_neg_triplets`, and
    non-cited triplets produced by `combine_non_cited`. The mix is shuffled,
    split 90/10 into 'train' and 'validation', and uploaded as
    `{HF_USER}/{name}`.

    Args:
        name: Dataset name on the Hub (without the user prefix).
        max_size: Upper bound on the total number of rows. The actual size is
            reduced when any source cannot supply its share.
        percent_cited_neg: Percentage (0-100) of rows taken from
            `cited_neg_triplets`.
        percent_non_cited_neg: Percentage (0-100) of rows taken from the
            non-cited triplets.
        hard_rate: Share of hard negatives among the non-cited triplets;
            forwarded to `combine_non_cited`.
        split_seed: Seed for the train/validation split so that reruns
            produce the same split.

    Raises:
        ValueError: If a percentage is negative or the two percentages add up
            to more than 100.
    """
    # Work in whole percents: `1 - x/100` in floating point can land just below
    # the intended value (e.g. 1 - 0.9) and make int() drop a row.
    cite_pred_percent = 100 - percent_cited_neg - percent_non_cited_neg
    if percent_cited_neg < 0 or percent_non_cited_neg < 0 or cite_pred_percent < 0:
        raise ValueError(
            "percent_cited_neg and percent_non_cited_neg must be non-negative and sum to at most 100, "
            f"got {percent_cited_neg} and {percent_non_cited_neg}"
        )

    # Collect non-cited triplets
    if percent_non_cited_neg == 0:
        non_cited_neg_triplets = []
        size = max_size
    else:
        non_cited_neg_triplets = combine_non_cited((max_size * percent_non_cited_neg) // 100, hard_rate)
        size = min(max_size, int(len(non_cited_neg_triplets) / percent_non_cited_neg * 100))

    # Shrink the total further if the other two sources cannot supply their share,
    # instead of letting sample() fail on an oversized request.
    if percent_cited_neg > 0:
        size = min(size, int(len(cited_neg_triplets) * 100 // percent_cited_neg))
    train_rows = cite_prediction_new['train']
    if cite_pred_percent > 0:
        size = min(size, int(len(train_rows) * 100 // cite_pred_percent))

    num_cite_pred = int(size * cite_pred_percent // 100)
    num_non_cited = min(round(size * percent_non_cited_neg / 100), len(non_cited_neg_triplets))
    num_cited = min(round(size * percent_cited_neg / 100), len(cited_neg_triplets))

    # Combine all triplet types
    sampled_indices = sample(range(len(train_rows)), num_cite_pred)
    cite_pred_subset = train_rows.select(sampled_indices)
    mixed_dataset = list(cite_pred_subset) + sample(non_cited_neg_triplets, num_non_cited) \
                        + sample(cited_neg_triplets, num_cited)
    random.shuffle(mixed_dataset)

    # Publish to HF
    dataset = Dataset.from_list(mixed_dataset)
    # Seeded so the train/validation membership is the same on every run.
    split_dataset = dataset.train_test_split(test_size=0.1, seed=split_seed)
    combined_dataset = DatasetDict({
        'train': split_dataset['train'],
        'validation': split_dataset['test']
    })
    ds_name = f"{HF_USER}/{name}"
    print(f"Uploading dataset with name {ds_name} and length {len(mixed_dataset)} to Huggingface")
    combined_dataset.push_to_hub(ds_name)
    print('Dataset published')

In [6]:
# Note the parameters to 'publish' can be changed to create different datasets. Below are some we used
publish("NoInfluentials", 50000, 0, 0)
publish("Influential_CitedNegs_1_Percent", 50000, 1, 0)
publish("Influential_CitedNegs_5_Percent", 50000, 5, 0)
publish("Influential_CitedNegs_10_Percent", 50000, 10, 0)
publish("Influential_NonCitedNegs_10_Percent", 50000, 0, 10)
publish("Influential_NonCitedNegs_10_Percent_large", 100000, 0, 10)
publish("Influential_MixedNegTypes_10_Percent", 50000, 5, 5)


Uploading dataset with name cheafdevo56/NoInfluentials and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/849 [00:00<?, ?B/s]

Dataset published
Uploading dataset with name cheafdevo56/Influential_CitedNegs_1_Percent and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Dataset published
Uploading dataset with name cheafdevo56/Influential_CitedNegs_5_Percent and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/849 [00:00<?, ?B/s]

Dataset published
Uploading dataset with name cheafdevo56/Influential_CitedNegs_10_Percent and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Dataset published
Combining 3000 easy negs with 2000 hard negs for total size of 5000
Uploading dataset with name cheafdevo56/Influential_NonCitedNegs_10_Percent and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/849 [00:00<?, ?B/s]

Dataset published
Combining 6000 easy negs with 4000 hard negs for total size of 10000
Uploading dataset with name cheafdevo56/Influential_NonCitedNegs_10_Percent_large and length 100000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/90 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/850 [00:00<?, ?B/s]

Dataset published
Combining 1500 easy negs with 1000 hard negs for total size of 2500
Uploading dataset with name cheafdevo56/Influential_MixedNegTypes_10_Percent and length 50000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/45 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/849 [00:00<?, ?B/s]

Dataset published


In [10]:
# We also publish datasets of only our new types of triples
publish("All_Hard_HIC_Triplets", 176035, 100, 0)
publish("All_EASY_MEDIUM_HIC_Triplets", 100000, 0, 100)

Uploading dataset with name cheafdevo56/All_Hard_HIC_Triplets and length 176035 to Huggingface


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

Dataset published
Combining 39000 easy negs with 26000 hard negs for total size of 65000
Uploading dataset with name cheafdevo56/All_EASY_MEDIUM_HIC_Triplets and length 65000 to Huggingface


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/59 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Dataset published
