Making a new dataset splits which can be also used

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json

In [2]:
# load the provided dataset files
train_file_path = "F:\\applied_deep_learning\\data\\SemEval2024-Task8\\SubtaskB\\subtaskB_train.jsonl"
dev_file_path = "F:\\applied_deep_learning\\data\\SemEval2024-Task8\\SubtaskB\\subtaskB_dev.jsonl"

# reading the into pandas dataframe
train_df = pd.read_json(train_file_path, lines=True)
dev_df = pd.read_json(dev_file_path, lines=True)

In [3]:
# find the row with the highest word count in train_df
max_word_count_row_train = train_df['text'].apply(lambda x: len(x.split())).idxmax()

# get the word count of the row in train_df
max_word_count_train = len(train_df['text'][max_word_count_row_train].split())

# find text with a token size above 512 and count it in train_df
words_above_train = train_df['text'].apply(lambda x: len(x.split())).loc[train_df['text'].apply(lambda x: len(x.split())) > 512].count()

# print the results for train_df
print("Number of words above 512 in train_df:", words_above_train)
print("Highest word count in train_df:", max_word_count_train)

Number of words above 512 in train_df: 9985
Highest word count in train_df: 16991


In [4]:
# find the row with the highest word count
max_word_count_row = dev_df['text'].apply(lambda x: len(x.split())).idxmax()

# get the word count of the row
max_word_count = len(dev_df['text'][max_word_count_row].split())

# find text with a token size above 512 and count it
words_above = dev_df['text'].apply(lambda x: len(x.split())).loc[dev_df['text'].apply(lambda x: len(x.split())) > 512].count()

# print
print("Number of words above 512:", words_above)

# print the count
print("Highest word count:", max_word_count)

Number of words above 512: 188
Highest word count: 1208


In [5]:
def process_json_file(file_path):
    # read the json into dataframe
    df_jsonl = pd.read_json(file_path, lines=True)

    # explode 'human_text' and assign 'human' to the 'model' column
    df_human = df_jsonl[['human_text']].explode('human_text')
    df_human.rename(columns={'human_text': 'text'}, inplace=True)
    df_human['model'] = 'human'

    # explode 'machine_text' and assign corresponding 'model' value
    df_machine = df_jsonl[['machine_text', 'model']].explode('machine_text')
    df_machine.rename(columns={'machine_text': 'text'}, inplace=True)

    # combine the two DataFrames
    df_combined = pd.concat([df_human, df_machine], ignore_index=True)

    # drop rows where 'text' is NaN (if any)
    df_combined.dropna(subset=['text'], inplace=True)

    return df_combined

In [6]:
# define the file paths
file_paths = [
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_bloomz.jsonl",
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_chatgpt.jsonl",
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_cohere.jsonl",
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_davinci.jsonl",
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_dolly.jsonl",
    "F:\\applied_deep_learning\\data\\original_M4_data\\peerread_llama.jsonl"
]

# load each file into a separate dataframe and name them according to the files
df_chatgpt = process_json_file(file_paths[1])
df_cohere = process_json_file(file_paths[2])
df_davinci = process_json_file(file_paths[3])
df_dolly = process_json_file(file_paths[4])


In [7]:
print(df_chatgpt.shape)
print(df_cohere.shape)
print(df_davinci.shape)
print(df_dolly.shape)

(8142, 2)
(9928, 2)
(8142, 2)
(9928, 2)


In [8]:
# lets do it for the bloomz file which has a slightly different format

df_bloomz = pd.read_json(file_paths[0], lines=True)
df_bloomz.head()

Unnamed: 0,source,id,title,abstract,human_reviews,davinci_reviews,chatgpt_reviews,prompts,bloom_reviews,score,probas,logits,model
0,PeerRead/acl_2017/,104,Bridge Text and Knowledge by Learning Multi-Pr...,Integrating text and knowledge into a unified ...,[- Strengths:\n* Outperforms ALIGN in supervis...,[\n\nThis paper presents a novel approach to b...,[Title: Bridge Text and Knowledge by Learning ...,[Please write a peer review for the paper give...,[The authors propose to learn multi-prototype ...,0.268284,"[0.4606933594, 0.1846923828, 0.1619873047, 0.3...","[19.46875, 17.359375, 18.515625, 22.15625, 18....",bigscience/bloomz
1,PeerRead/acl_2017/,105,Morphological Inflection Generation with Hard ...,We present a neural model for morphological in...,[- Strengths:\nThe idea of hard monotonic atte...,[\n\nThis paper presents a novel approach to m...,[Title: Morphological Inflection Generation wi...,[Please write a peer review for the paper give...,[The authors propose an attention mechanism to...,0.20811,"[0.4658203125, 0.2700195312, 0.1539306641, 0.2...","[19.125, 17.234375, 18.25, 20.1875, 16.625, 19...",bigscience/bloomz
2,PeerRead/acl_2017/,107,Weakly Supervised Cross-Lingual Named Entity R...,The state-of-the-art named entity recognition ...,[This paper presents several weakly supervised...,[\n\nThis paper presents a novel approach to w...,"[General Comments:\nThe paper titled ""Weakly S...",[Please write a peer review for the paper give...,[The authors propose an effective method to re...,0.254676,"[0.4738769531, 0.3132324219, 0.2371826172, 0.4...","[18.953125, 21.234375, 19.78125, 21.875, 20.57...",bigscience/bloomz
3,PeerRead/acl_2017/,108,A Multigraph-based Model for Overlapping Entit...,"In this paper, we propose a new model for pred...","[- Strengths: the paper is well-written, excep...",[\n\nThis paper proposes a multigraph-based mo...,[\n\nOverview:\nThe paper proposes a multigrap...,[Please write a peer review for the paper give...,[The authors present an interesting approach t...,0.299974,"[0.4094238281, 0.2763671875, 0.1287841797, 0.1...","[19.03125, 18.046875, 18.15625, 19.171875, 26....",bigscience/bloomz
4,PeerRead/acl_2017/,117,Improved Neural Relation Detection for Knowled...,Relation detection is a core component of many...,[- Strengths: The paper addresses a relevant t...,[\n\nThis paper presents a novel approach for ...,[\n\nTitle: Improved Neural Relation Detection...,[Please write a peer review for the paper give...,[The authors present an improved neural relati...,0.266504,"[0.4230957031, 0.3024902344, 0.1875, 0.3144531...","[19.28125, 17.953125, 18.6875, 22.21875, 18.40...",bigscience/bloomz


In [9]:
# explode 'bloom_reviews' and assign 'bloomz' to the 'model' column

df_bloomz = df_bloomz[['bloom_reviews']].explode('bloom_reviews')
df_bloomz.rename(columns={'bloom_reviews': 'text'}, inplace=True)
df_bloomz['model'] = 'bloomz'
df_bloomz.head()

Unnamed: 0,text,model
0,The authors propose to learn multi-prototype e...,bloomz
0,The main idea of this work is to learn multi-p...,bloomz
0,The authors present their work very clearly. ...,bloomz
0,The authors present their work very clearly. ...,bloomz
1,The authors propose an attention mechanism to ...,bloomz


In [10]:
# lets concat all the dataframes into one

df_merge = pd.concat([df_chatgpt, df_cohere, df_davinci, df_dolly, df_bloomz], ignore_index=True)

print(df_merge.shape)
print(df_merge.head())

(38480, 2)
                                                text  model
0  - Strengths:\n* Outperforms ALIGN in supervise...  human
1  This paper addresses the problem of disambigua...  human
2  - Strengths:\nGood ideas, simple neural learni...  human
3  - Strengths:\nThe idea of hard monotonic atten...  human
4  - Strengths: A new encoder-decoder model is pr...  human


In [11]:
# lets drop the duplicates in the text column

df_merge = df_merge.drop_duplicates(subset=['text'])
df_merge.shape

(14566, 2)

In [12]:
# lets check if the classes are named the same in all the dataframes

unique_classes_train = train_df['model'].unique()
unique_classes_dev = dev_df['model'].unique()
unique_classes_merge = df_merge['model'].unique()

print("Unique classes in train_df:", unique_classes_train)
print("Unique classes in dev_df:", unique_classes_dev)
print("Unique classes in df_merge:", unique_classes_merge)


Unique classes in train_df: ['chatGPT' 'human' 'cohere' 'davinci' 'bloomz' 'dolly']
Unique classes in dev_df: ['chatGPT' 'human' 'davinci' 'cohere' 'bloomz' 'dolly']
Unique classes in df_merge: ['human' 'chatgpt' 'cohere' 'davinci' 'dolly-v2' 'bloomz']


In [14]:
# lets map the classes to the same name
    
class_mapping = {
    'chatgpt': 'chatGPT',
    'davinci': 'davinci',
    'cohere': 'cohere',
    'human': 'human',
    'bloomz': 'bloomz',
    'dolly-v2': 'dolly'
}

df_merge['model'] = df_merge['model'].map(class_mapping)

print("Unique classes in df_merge:", df_merge['model'].unique())


Unique classes in df_merge: ['human' 'chatGPT' 'cohere' 'davinci' 'dolly' 'bloomz']


In [15]:
# group by 'model' and 'label' from the original test set to see the mappings of the labels and the models
model_label_mapping = dev_df.groupby(['model', 'label']).size()

print(model_label_mapping)

model    label
bloomz   4        500
chatGPT  1        500
cohere   2        500
davinci  3        500
dolly    5        500
human    0        500
dtype: int64


In [16]:
# define a dictionary mapping for labels
label_mapping = {
    'human': 0,
    'chatGPT': 1,
    'cohere': 2,
    'davinci': 3,
    'bloomz': 4,
    'dolly': 5
}

# map the 'model' column to labels using the dictionary mapping
df_merge['label'] = df_merge['model'].map(label_mapping)

In [17]:
# lets check the mapping

model_label_mapping_merge = df_merge.groupby(['model', 'label']).size()

print(model_label_mapping_merge)

model    label
bloomz   4        2334
chatGPT  1        2344
cohere   2        2342
davinci  3        2344
dolly    5        2344
human    0        2858
dtype: int64


In [18]:
# lets compare the dataframe 

print(df_merge.head())
print(dev_df.head())

                                                text  model  label
0  - Strengths:\n* Outperforms ALIGN in supervise...  human      0
1  This paper addresses the problem of disambigua...  human      0
2  - Strengths:\nGood ideas, simple neural learni...  human      0
3  - Strengths:\nThe idea of hard monotonic atten...  human      0
4  - Strengths: A new encoder-decoder model is pr...  human      0
                                                text    model    source  \
0  Overall, I found the paper "Machine Comprehens...  chatGPT  peerread   
1  This paper "Machine Comprehension Using Match-...  chatGPT  peerread   
2  The paper presents an end-to-end neural archit...  chatGPT  peerread   
3  This paper proposes an end-to-end neural archi...  chatGPT  peerread   
4  Title: Incorporating long-range consistency in...  chatGPT  peerread   

   label    id  
0      1  1844  
1      1  1845  
2      1  1846  
3      1  1847  
4      1  1848  


In [19]:
# lets head the the source column into the df_merge dataframe
df_merge['source'] = "peerread"

df_merge.head()

Unnamed: 0,text,model,label,source
0,- Strengths:\n* Outperforms ALIGN in supervise...,human,0,peerread
1,This paper addresses the problem of disambigua...,human,0,peerread
2,"- Strengths:\nGood ideas, simple neural learni...",human,0,peerread
3,- Strengths:\nThe idea of hard monotonic atten...,human,0,peerread
4,- Strengths: A new encoder-decoder model is pr...,human,0,peerread


#### As we now have the full dataset of the Peerread domain available, we can make new splits to use!

So for the first we will split the provided trainset into train data and validation data. we have 71k datapoint and split 90:10 train:validation. our test set whas a fixed size of 14.500 datapoints. so we get a ~ 75:8:17 split.

For the second new split we will mix all domains into train:validation:test with an equal distribuation of models and domains in every split with a 80:10:10 ratio.

In [20]:
# First dataset which is the one for the competition

# split the train data into train and validation sets
# as we want stratification along the domain an model columns we will combine them into one column as train_test_split function can only take one column as input
train_df['stratify_col'] = train_df['model'].astype(str) + "_" + train_df['source'].astype(str)

# split the data stratifying on the new column
train_set_comp, val_set_comp = train_test_split(train_df, stratify=train_df['stratify_col'], test_size=0.1, random_state=42)

# drop 'stratify_col' from both dataframes, no longer needed
train_set_comp = train_set_comp.drop('stratify_col', axis=1)
val_set_comp = val_set_comp.drop('stratify_col', axis=1)

# now we copy the new constructe "peerread" dataset as test set
test_set_comp = df_merge.copy()

# lets check the shape of the datasets
print("Shape of train_set_comp:", train_set_comp.shape)
print("Shape of val_set_comp:", val_set_comp.shape)
print("Shape of test_set_comp:", test_set_comp.shape)

# lets check the distribution of the model classes in the train, validation and test sets
print("Train set class distribution:\n", train_set_comp['label'].value_counts(normalize=True))
print("Validation set class distribution:\n", val_set_comp['label'].value_counts(normalize=True))
print("Test set class distribution:\n", test_set_comp['label'].value_counts(normalize=True))

# lets check the distribution of the source classes in the train, validation and test sets
print("Train set source distribution:\n", train_set_comp['source'].value_counts(normalize=True))
print("Validation set source distribution:\n", val_set_comp['source'].value_counts(normalize=True))
print("Test set source distribution:\n", test_set_comp['source'].value_counts(normalize=True))

Shape of train_set_comp: (63924, 5)
Shape of val_set_comp: (7103, 5)
Shape of test_set_comp: (14566, 4)
Train set class distribution:
 label
3    0.168935
4    0.168919
0    0.168904
1    0.168888
5    0.164758
2    0.159596
Name: proportion, dtype: float64
Validation set class distribution:
 label
4    0.168943
0    0.168943
3    0.168943
1    0.168802
5    0.164719
2    0.159651
Name: proportion, dtype: float64
Test set class distribution:
 label
0    0.196210
1    0.160923
3    0.160923
5    0.160923
2    0.160785
4    0.160236
Name: proportion, dtype: float64
Train set source distribution:
 source
reddit       0.253410
wikihow      0.253410
arxiv        0.253379
wikipedia    0.239800
Name: proportion, dtype: float64
Validation set source distribution:
 source
reddit       0.253414
arxiv        0.253414
wikihow      0.253414
wikipedia    0.239758
Name: proportion, dtype: float64
Test set source distribution:
 source
peerread    1.0
Name: proportion, dtype: float64


In [21]:
# lets check for nan values in the train, validation and test sets
print("Train set nan values:\n", train_set_comp.isna().sum())
print("Validation set nan values:\n", val_set_comp.isna().sum())
print("Test set nan values:\n", test_set_comp.isna().sum())

Train set nan values:
 text      0
model     0
source    0
label     0
id        0
dtype: int64
Validation set nan values:
 text      0
model     0
source    0
label     0
id        0
dtype: int64
Test set nan values:
 text      0
model     0
label     0
source    0
dtype: int64


In [22]:
# lets make the second dataset with different split an stratification
# first lets combine train and test set into one dataframe
train_test_set = pd.concat([train_df, df_merge], ignore_index=True)

# lets check the shape of the datasets
print("Shape of train_test_set:", train_test_set.shape)
print('Head the first rows of the train_test_set:\n', train_test_set.head())

Shape of train_test_set: (85593, 6)
Head the first rows of the train_test_set:
                                                 text    model   source  label  \
0  Forza Motorsport is a popular racing game that...  chatGPT  wikihow      1   
1  Buying Virtual Console games for your Nintendo...  chatGPT  wikihow      1   
2  Windows NT 4.0 was a popular operating system ...  chatGPT  wikihow      1   
3  How to Make Perfume\n\nPerfume is a great way ...  chatGPT  wikihow      1   
4  How to Convert Song Lyrics to a Song'\n\nConve...  chatGPT  wikihow      1   

    id     stratify_col  
0  0.0  chatGPT_wikihow  
1  1.0  chatGPT_wikihow  
2  2.0  chatGPT_wikihow  
3  3.0  chatGPT_wikihow  
4  4.0  chatGPT_wikihow  


In [23]:
# we have NAN values in the id and straify_col columns
# lets fill the id column with the index values and make a new column for the stratify_col
train_test_set['id'] = train_test_set.index
train_test_set['stratify_col'] = train_test_set['model'].astype(str) + "_" + train_test_set['source'].astype(str)

# lets check for nan values in the train test set  
print("Train test set nan values:\n", train_test_set.isna().sum())



Train test set nan values:
 text            0
model           0
source          0
label           0
id              0
stratify_col    0
dtype: int64


In [24]:
# lest split the train test set into the mixed domain 70:15:15 split
train_set_mix, val_set_mix = train_test_split(train_test_set, stratify=train_test_set['stratify_col'], test_size=0.20, random_state=42)
val_set_mix, test_set_mix = train_test_split(val_set_mix, stratify=val_set_mix['stratify_col'], test_size=0.50, random_state=42)

# drop 'stratify_col' from all dataframes, no longer needed
train_set_mix = train_set_mix.drop('stratify_col', axis=1)
val_set_mix = val_set_mix.drop('stratify_col', axis=1)
test_set_mix = test_set_mix.drop('stratify_col', axis=1)

# lets check the shape of the datasets
print("Shape of train_set_mix:", train_set_mix.shape)
print("Shape of val_set_mix:", val_set_mix.shape)
print("Shape of test_set_mix:", test_set_mix.shape)

# lets check the distribution of the model classes in the train, validation and test sets
print("Train set class distribution:\n", train_set_mix['label'].value_counts(normalize=True))
print("Validation set class distribution:\n", val_set_mix['label'].value_counts(normalize=True))
print("Test set class distribution:\n", test_set_mix['label'].value_counts(normalize=True))

# lets check the distribution of the source classes in the train, validation and test sets
print("Train set source distribution:\n", train_set_mix['source'].value_counts(normalize=True))
print("Validation set source distribution:\n", val_set_mix['source'].value_counts(normalize=True))
print("Test set source distribution:\n", test_set_mix['source'].value_counts(normalize=True))

# lets check for nan values in the train, validation and test sets
print("Train set nan values:\n", train_set_mix.isna().sum())
print("Validation set nan values:\n", val_set_mix.isna().sum())
print("Test set nan values:\n", test_set_mix.isna().sum())

Shape of train_set_mix: (68474, 5)
Shape of val_set_mix: (8559, 5)
Shape of test_set_mix: (8560, 5)
Train set class distribution:
 label
0    0.173555
3    0.167567
1    0.167523
4    0.167436
5    0.164106
2    0.159812
Name: proportion, dtype: float64
Validation set class distribution:
 label
0    0.173502
3    0.167660
4    0.167543
1    0.167426
5    0.164038
2    0.159832
Name: proportion, dtype: float64
Test set class distribution:
 label
0    0.173598
1    0.167640
3    0.167523
4    0.167407
5    0.164136
2    0.159696
Name: proportion, dtype: float64
Train set source distribution:
 source
wikihow      0.210284
reddit       0.210284
arxiv        0.210255
wikipedia    0.198995
peerread     0.170181
Name: proportion, dtype: float64
Validation set source distribution:
 source
reddit       0.210305
arxiv        0.210305
wikihow      0.210305
wikipedia    0.198972
peerread     0.170113
Name: proportion, dtype: float64
Test set source distribution:
 source
wikihow      0.210280
arxiv

In [25]:
# lets save the datasets as jsonl files
train_set_comp.to_json('data/train_set_comp.jsonl', orient='records', lines=True)
val_set_comp.to_json('data/val_set_comp.jsonl', orient='records', lines=True)
test_set_comp.to_json('data/test_set_comp.jsonl', orient='records', lines=True)

train_set_mix.to_json('data/train_set_mix.jsonl', orient='records', lines=True)
val_set_mix.to_json('data/val_set_mix.jsonl', orient='records', lines=True)
test_set_mix.to_json('data/test_set_mix.jsonl', orient='records', lines=True)