In [70]:
from datasets import load_dataset
import pandas as pd
import numpy as np

In [50]:
# Load the 'train' split of the text/label dataset used throughout this notebook.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally -- confirm the repo is trusted and that the flag is
# actually required.
DATASET_ID = 'Zhongxing0129/authorlist_train'
dataset = load_dataset(DATASET_ID, split='train', trust_remote_code=True)
dataset


Dataset({
    features: ['text', 'label'],
    num_rows: 5812
})

In [53]:
# Convert the loaded Dataset to pandas through its own exporter instead of
# handing it to the DataFrame constructor, which has to walk it row by row.
dataset_df = dataset.to_pandas()
dataset_df

Unnamed: 0,text,label
0,"“They will most likely be perfectly fitted,” s...",2
1,"“The wound is not here, it is there!” said Kut...",2
2,When he got home he could not sleep for a long...,2
3,"His efforts had not been in vain. The dinner, ...",2
4,An Austrian officer in a white uniform with gr...,2
...,...,...
5807,This was more than he could stand. How could a...,2
5808,As John Dashwood had no more pleasure in music...,0
5809,Dorian shook his head. “You must not ask me th...,1
5810,"Enscombe however was gracious, gracious in fac...",0


In [54]:
# Class balance: number of rows available for each label.
label_counts = dataset_df['label'].value_counts()
label_counts

label
2    2117
3    1486
0    1268
1     941
Name: count, dtype: int64

In [167]:
## Sample rows of two labels and pair every sampled row of the first label with
## every sampled row of the second (full cross join), tagging each pair Yes/No.
def function(df, label_a, label_b, sample_size = 50, drop_identical = False):
    """Build all pairings between a sample of ``label_a`` rows and a sample of ``label_b`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. Needs a ``'label'`` column; a ``'text'`` column is also
        required when ``drop_identical`` is true.
    label_a, label_b
        Label values selecting the left and right side of each pair.
    sample_size : int, default 50
        Number of rows drawn (without replacement) for each side.
    drop_identical : bool, default False
        Both sides are sampled with the same fixed seed, so when
        ``label_a == label_b`` the two samples are the same rows and the cross
        join contains every text paired with itself. Pass ``True`` to drop rows
        whose ``text_a`` equals ``text_b``. The default keeps them, matching
        the historical output.

    Returns
    -------
    pandas.DataFrame
        ``sample_size * sample_size`` rows (fewer if ``drop_identical`` removed
        any) on a fresh 0..n-1 index. Every column of ``df`` appears twice,
        suffixed ``_a`` (left side) and ``_b`` (right side), followed by
        ``'model_response'``: ``'Yes'`` where ``label_a == label_b`` for the
        row, otherwise ``'No'``.

    Raises
    ------
    ValueError
        From ``DataFrame.sample`` when a label has fewer than ``sample_size`` rows.
    KeyError
        When a required column is missing from ``df``.
    """
    # Fixed seed keeps the drawn rows reproducible across runs.
    side_a = df[df['label'] == label_a].sample(sample_size, random_state = 7)
    side_b = df[df['label'] == label_b].sample(sample_size, random_state = 7)

    # A native cross join needs no throw-away join column, so the sampled frames
    # are not mutated and a real column called 'key' in df is left intact.
    pairs = side_a.merge(side_b, how = 'cross', suffixes = ('_a', '_b'))

    if drop_identical:
        pairs = pairs[pairs['text_a'] != pairs['text_b']].reset_index(drop = True)

    return pairs.assign(
        model_response = np.where(pairs['label_a'] == pairs['label_b'], 'Yes', 'No')
    )

In [190]:
# Rows drawn per side of a pairing. The pair count is the square of these:
# 20 x 20 = 400 rows for a same-label pairing, 10 x 10 = 100 for a mixed one.
SAME_LABEL_SAMPLE_SIZE = 20
CROSS_LABEL_SAMPLE_SIZE = 10

# Take the label set from the data rather than assuming it is exactly 0..3.
labels = sorted(dataset_df['label'].unique())

# NOTE(review): function() samples both sides with the same seed, so
#   * for label_a == label_b both sides are the same rows and every sampled
#     text is also paired with itself (a trivially-'Yes' row per text);
#   * the (b, a) frame holds the same pairs as the (a, b) frame, mirrored.
# Filter rows where text_a == text_b downstream if self-pairs are unwanted.
sample_list = []

for label_a in labels:
    for label_b in labels:
        if label_a == label_b:
            sample_size = SAME_LABEL_SAMPLE_SIZE
        else:
            sample_size = CROSS_LABEL_SAMPLE_SIZE
        sample_list.append(function(dataset_df, label_a, label_b, sample_size))

In [191]:
# Sanity check: one frame was appended per ordered (label_a, label_b) combination.
n_pair_frames = len(sample_list)
n_pair_frames

16

In [193]:
# Stack the per-pairing frames into one table, renumbering rows 0..n-1 so the
# original per-frame indices are discarded.
sample = pd.concat(sample_list, axis = 0, ignore_index = True)

In [194]:
# Render each pair as a single instruction-style string: the question sits
# between [INST] and [/INST], followed by the expected Yes/No answer.
new_col = [
    f"<s>[INST] Author {author} wrote this: '{first_text}'. Did Author {author} also write this: '{second_text}'? [/INST] {answer} </s>"
    for author, first_text, second_text, answer in zip(
        sample['label_a'], sample['text_a'], sample['text_b'], sample['model_response']
    )
]

In [198]:
# Attach the rendered prompt strings as the 'text' column and preview the table.
sample = sample.assign(text = new_col)
sample

Unnamed: 0,text_a,label_a,text_b,label_b,model_response,text
0,The observations of her uncle and aunt now beg...,0,The observations of her uncle and aunt now beg...,0,Yes,<s>[INST] Author 0 wrote this: 'The observatio...
1,The observations of her uncle and aunt now beg...,0,When all of the house that was open to general...,0,Yes,<s>[INST] Author 0 wrote this: 'The observatio...
2,The observations of her uncle and aunt now beg...,0,“My objection is this; though I think very wel...,0,Yes,<s>[INST] Author 0 wrote this: 'The observatio...
3,The observations of her uncle and aunt now beg...,0,"“There, papa!—Now you must be satisfied—Our ow...",0,Yes,<s>[INST] Author 0 wrote this: 'The observatio...
4,The observations of her uncle and aunt now beg...,0,"“What!” said Mrs. Weston, “have not you finish...",0,Yes,<s>[INST] Author 0 wrote this: 'The observatio...
...,...,...,...,...,...,...
2795,"“See!” cried madame, pointing with her knife. ...",3,"‘That I am sure of,’ she answered, with uncomm...",3,Yes,<s>[INST] Author 3 wrote this: '“See!” cried m...
2796,"“See!” cried madame, pointing with her knife. ...",3,"“Well!” said Stryver, slapping the desk with h...",3,Yes,<s>[INST] Author 3 wrote this: '“See!” cried m...
2797,"“See!” cried madame, pointing with her knife. ...",3,"Ham was a boat-builder in these days, having i...",3,Yes,<s>[INST] Author 3 wrote this: '“See!” cried m...
2798,"“See!” cried madame, pointing with her knife. ...",3,"‘Mr. Spenlow’s in Court, ma’am,’ said the dry ...",3,Yes,<s>[INST] Author 3 wrote this: '“See!” cried m...


In [200]:
## prepare folder path
import os

## root directory for the generated dataset files
working_dir = './datasets'

## keep the stylometry prompt files in their own sub-directory
output_dir_prompt = os.path.join(working_dir, 'stylometry')

## create the whole directory chain: os.mkdir only creates the last component and
## fails when './datasets' itself is missing; exist_ok makes re-running the cell safe
os.makedirs(output_dir_prompt, exist_ok = True)

In [201]:
## write the prompt table into the stylometry folder, leaving out the row index
csv_path = f"{output_dir_prompt}/zhongxing0129-authorlist_train-v1.csv"
sample.to_csv(csv_path, index = False)