In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Colab-only step: attach Google Drive to the runtime filesystem at /content/drive/.
# force_remount=True asks for a fresh mount even if the drive is already attached.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [3]:
# Work from the project folder on the mounted drive so the relative ./data paths used in later cells resolve
%cd /content/drive/MyDrive/MGTD

/content/drive/MyDrive/MGTD


In [4]:
# Load the original train and dev splits; both files are JSON Lines
# (lines=True), read in the order train, then dev.
train_df, dev_df = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ("./data/subtaskB_train.jsonl", "./data/subtaskB_dev.jsonl")
)

In [5]:
# Values matched against the 'source' column (text domains) and the 'model'
# column (text generators) when moving samples from train to dev.
sources = [
    'wikihow',
    'reddit',
    'arxiv',
    'wikipedia',
]
labels = [
    'human',
    'chatGPT',
    'cohere',
    'bloomz',
    'davinci',
    'dolly',
]

In [6]:
def extract_and_move_samples(train_df, dev_df, source, label, num_samples, random_state=42):
    """
    Move a random sample of rows for one (source, label) pair from train to dev.

    Rows of ``train_df`` whose 'source' column equals ``source`` and whose
    'model' column equals ``label`` are sampled without replacement, removed
    from the training frame and appended to the end of the development frame.
    The input frames are not modified; new frames are returned.

    Calling this once per source/label combination with ``num_samples=500``
    moves 3000 rows per source (6 labels x 500), balanced across labels.

    Args:
        train_df: DataFrame with at least 'source' and 'model' columns.
        dev_df: DataFrame the sampled rows are appended to.
        source: Value to match in the 'source' column (e.g. 'wikihow').
        label: Value to match in the 'model' column (e.g. 'human').
        num_samples: Number of matching rows to move.
        random_state: Seed passed to ``DataFrame.sample`` (default 42, the
            value that was previously hard-coded).

    Returns:
        (train_df, dev_df): the training frame without the sampled rows, and
        the development frame with them appended (index reset to 0..n-1).

    Raises:
        ValueError: if fewer than ``num_samples`` rows match the pair.
    """
    # Candidate rows for this (source, label) pair
    mask = (train_df['source'] == source) & (train_df['model'] == label)
    candidates = train_df[mask]

    # Fail with context instead of pandas' generic "larger sample than population" error
    if len(candidates) < num_samples:
        raise ValueError(
            f"Cannot move {num_samples} samples for source={source!r}, label={label!r}: "
            f"only {len(candidates)} matching rows in the training set."
        )

    samples = candidates.sample(n=num_samples, random_state=random_state)

    # Remove samples from train and add to dev.
    # NOTE: drop() works on index labels, so this assumes train_df has a unique index
    # (true for a frame freshly loaded with pd.read_json).
    train_df = train_df.drop(samples.index)
    dev_df = pd.concat([dev_df, samples], ignore_index=True)

    return train_df, dev_df

In [7]:
# Shift 500 rows for every (source, label) combination out of the original
# train set and into the dev set. Each call rebinds train_df/dev_df, so running
# this cell a second time moves a further 500 rows per combination.
for source, label in ((src, lbl) for src in sources for lbl in labels):
    train_df, dev_df = extract_and_move_samples(
        train_df, dev_df, source=source, label=label, num_samples=500
    )

In [8]:
# Split the new dev set into validation set and test set (50/50), stratified on
# the combined "<model>_<source>" key so every model/source pair is divided
# evenly between the two halves.
# The key is kept as a standalone Series rather than written into dev_df, so
# dev_df is not left carrying a helper column and val_df/test_df need no cleanup.
stratify_key = dev_df['model'].astype(str) + "_" + dev_df['source'].astype(str)
val_df, test_df = train_test_split(dev_df, stratify=stratify_key, test_size=0.5, random_state=42)

In [9]:
# Report the size of each split
for caption, frame in (
    ("Shape of train_df:", train_df),
    ("Shape of val_df:", val_df),
    ("Shape of test_df:", test_df),
):
    print(caption, frame.shape)

# Report the relative frequency of each class (the 'label' column) per split
for caption, frame in (
    ("Train set class distribution:\n", train_df),
    ("Validation set class distribution:\n", val_df),
    ("Test set class distribution:\n", test_df),
):
    print(caption, frame['label'].value_counts(normalize=True))

# Report the relative frequency of each text source per split
for caption, frame in (
    ("Train set source distribution:\n", train_df),
    ("Validation set source distribution:\n", val_df),
    ("Test set source distribution:\n", test_df),
):
    print(caption, frame['source'].value_counts(normalize=True))

Shape of train_df: (59027, 5)
Shape of val_df: (7500, 5)
Shape of test_df: (7500, 5)
Train set class distribution:
 3    0.169397
4    0.169380
0    0.169363
1    0.169329
5    0.164365
2    0.158165
Name: label, dtype: float64
Validation set class distribution:
 2    0.166667
0    0.166667
1    0.166667
4    0.166667
5    0.166667
3    0.166667
Name: label, dtype: float64
Test set class distribution:
 3    0.166667
0    0.166667
2    0.166667
4    0.166667
1    0.166667
5    0.166667
Name: label, dtype: float64
Train set source distribution:
 wikihow      0.254104
reddit       0.254104
arxiv        0.254070
wikipedia    0.237722
Name: source, dtype: float64
Validation set source distribution:
 reddit       0.2
wikipedia    0.2
arxiv        0.2
peerread     0.2
wikihow      0.2
Name: source, dtype: float64
Test set source distribution:
 arxiv        0.2
wikihow      0.2
wikipedia    0.2
peerread     0.2
reddit       0.2
Name: source, dtype: float64


In [10]:
# Write the three new splits back out as JSON Lines (one record per line).
# In the recorded run the sizes are 59027 / 7500 / 7500 rows, i.e. roughly
# train:val:test = 80:10:10; val and test are exactly balanced over label and
# source, while train is only approximately balanced and has no 'peerread' rows.
for frame, out_path in (
    (train_df, "./data/updated_subtaskB_train.jsonl"),
    (val_df, "./data/updated_subtaskB_validation.jsonl"),
    (test_df, "./data/updated_subtaskB_test.jsonl"),
):
    frame.to_json(out_path, orient='records', lines=True)