# Create the wikismall NSP dataset

1. Load the acloudfan/wikismall dataset
2. Build sentence pairs in a 50-50 split:
    - Pick two consecutive sentences from the same article and set label = 0 (is_next)
    - Pick two sentences from different articles and set label = 1 (is_not_next)
3. Create the dataset 'wikismall-nsp'
4. Save to disk
5. Upload to the Hugging Face Hub

In [1]:
import os
import random
from getpass import getpass

import pandas as pd
from datasets import load_dataset, Dataset

In [2]:
# Hub repository id of the source corpus.
dataset_name = 'acloudfan/wikismall'

# Fetch the corpus (served from the local cache after the first download).
dataset_wikismall = load_dataset(path=dataset_name)

# Last expression of the cell: show the split / feature summary.
dataset_wikismall

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 990449
    })
})

In [3]:
# Empty frame that declares the schema of the sentence-pair table.
dataframe = pd.DataFrame(
    columns=['id_1', 'sentence_1', 'id_2', 'sentence_2', 'label'],
)

# One less than the row count, so that `index + 1` stays in bounds when a
# random index below this value is used to fetch a row and its successor.
num_rows = len(dataset_wikismall['train']) - 1


def create_sentence_pair_is_next(rows=None):
    """Draw a random row together with the row that immediately follows it.

    Candidate generator for the is_next class (label 0). The two rows are
    adjacent in ``rows`` but are not guaranteed to carry the same ``id``;
    the caller is expected to reject pairs whose ids differ.

    Args:
        rows: Sized, integer-indexable sequence of row dicts. Defaults to
            ``dataset_wikismall['train']`` so existing zero-argument calls
            keep working; passing it explicitly removes the dependency on
            module-level state.

    Returns:
        Tuple ``(row, following_row)``.

    Raises:
        ValueError: Raised by ``random.randrange`` when ``rows`` holds fewer
            than two entries.
    """
    if rows is None:
        rows = dataset_wikismall['train']
    # Stop one short of the end so that `index + 1` is always a valid position.
    index = random.randrange(len(rows) - 1)
    return rows[index], rows[index + 1]
    

def create_sentence_pair_is_not_next(rows=None):
    """Draw two rows independently and uniformly at random.

    Candidate generator for the is_not_next class (label 1). Nothing here
    prevents both draws from carrying the same ``id``; the caller is expected
    to reject such pairs.

    Args:
        rows: Sized, integer-indexable sequence of row dicts. Defaults to
            ``dataset_wikismall['train']`` so existing zero-argument calls
            keep working.

    Returns:
        Tuple ``(row_1, row_2)``.

    Raises:
        ValueError: Raised by ``random.randrange`` when ``rows`` is empty.
    """
    if rows is None:
        rows = dataset_wikismall['train']
    # Sample over every row. No successor is needed here, so the last row must
    # stay eligible (bounding by `len - 1` would silently exclude it).
    total = len(rows)
    index_1 = random.randrange(total)
    index_2 = random.randrange(total)
    return rows[index_1], rows[index_2]


In [4]:
# Number of pairs generated PER LABEL; the final dataset holds twice as many rows.
max_num_nsp_pairs = 5000

# Seed the sampler so that re-running this cell regenerates the same pairs.
NSP_RANDOM_SEED = 42
random.seed(NSP_RANDOM_SEED)


def _draw_pair_record(draw_pair, ids_must_match, label):
    """Rejection-sample one sentence pair and return it as a flat record.

    Args:
        draw_pair: Zero-argument callable returning two row dicts, each with
            an 'id' and a 'text' entry.
        ids_must_match: True to accept only pairs whose ids are equal,
            False to accept only pairs whose ids differ.
        label: Value stored under the record's 'label' key.

    Returns:
        Dict with the keys id_1, sentence_1, id_2, sentence_2 and label.
    """
    while True:
        sentence_1, sentence_2 = draw_pair()
        if (sentence_1['id'] == sentence_2['id']) == ids_must_match:
            return {'id_1': sentence_1['id'],
                    'sentence_1': sentence_1['text'],
                    'id_2': sentence_2['id'],
                    'sentence_2': sentence_2['text'],
                    'label': label}


# Collect plain records and build the frame once: growing a DataFrame with
# pd.concat inside the loop copies every previous row on each iteration, and
# it made this cell append to the previous result whenever it was re-run.
nsp_records = []
for _ in range(max_num_nsp_pairs):
    # is_next pair (label = 0): consecutive rows that share an id
    nsp_records.append(_draw_pair_record(create_sentence_pair_is_next, True, 0))
    # is_not_next pair (label = 1): two rows whose ids differ
    nsp_records.append(_draw_pair_record(create_sentence_pair_is_not_next, False, 1))

dataframe = pd.DataFrame(
    nsp_records,
    columns=['id_1', 'sentence_1', 'id_2', 'sentence_2', 'label'],
)

# preserve_index=False keeps the pandas index out of the dataset, so no
# '__index_level_0__' column is created that would need removing afterwards.
dataset_wikismall_nsp = Dataset.from_pandas(dataframe, preserve_index=False)


In [5]:
# Spot-check row 0: the generation loop emits an is_next pair first, so this
# row is expected to have label 0 and matching ids.
dataset_wikismall_nsp[0]

{'id_1': '4839494',
 'sentence_1': 'In his first season in London, he made a few appearances, but never established himself as a first team player.',
 'id_2': '4839494',
 'sentence_2': 'In his second season, he became a fringe player.In January 2004 he joined the French club Lille on loan until the end of the season.',
 'label': 0}

In [6]:
# Spot-check row 1: the generation loop emits an is_not_next pair second, so
# this row is expected to have label 1 and differing ids.
dataset_wikismall_nsp[1]

{'id_1': '21436683',
 'sentence_1': 'The remaining Huigu forces scattered.',
 'id_2': '20149964',
 'sentence_2': 'Today, the two related snacks are often sold by the same stall."',
 'label': 1}

In [7]:
# From here on `dataset_name` refers to the target repository of the
# generated NSP dataset, not to the source corpus.
dataset_name = 'acloudfan/wikismall-nsp'

# Keep a local copy next to the notebook.
dataset_wikismall_nsp.save_to_disk(dataset_path='./wikismall-nsp')

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# SECURITY: never store an access token in the notebook. A token that was ever
# saved in this cell must be treated as leaked and revoked in the Hugging Face
# account settings. Read it from the HF_TOKEN environment variable instead,
# falling back to an interactive prompt that does not echo the value.
HF_TOKEN = os.environ.get('HF_TOKEN') or getpass('Hugging Face write token: ')

# Upload the generated dataset to the repository named by `dataset_name`.
dataset_wikismall_nsp.push_to_hub(dataset_name, token=HF_TOKEN)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/439 [00:00<?, ?B/s]