This competition is kernels-only, and each Kaggle kernel can run for nine hours. With BERT, that is enough time to cover approximately half of the training examples, so the dataset was split into:

1. Training part 1
2. Training part 2
3. Validation

Two forms of the data were saved: one that keeps the target as a float, and one that binarizes the target (a target of 0.5 or more becomes 1, anything else 0). Note that BERT uses binary targets out of the box, and the evaluation metric converts floats into binary labels prior to evaluation.

In [1]:
import pandas as pd

In [None]:
import pandas as pd
import pathlib
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data from ./data_raw/train.csv (relative to the current
# working directory) and preview the first rows.
raw_csv = pathlib.Path.cwd() / 'data_raw' / 'train.csv'
train = pd.read_csv(raw_csv)
train.head()

In [None]:
# 'alpha' is a dummy column for BERT (it shows that there are no sentence
# pairs 'a' and 'b'). A scalar is broadcast to every row, so no throwaway
# list of length n has to be built.
train['alpha'] = 'a'

# Keep only the columns that are written to the TSV files, in output order.
# The explicit .copy() makes the selection an independent frame, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
train = train[['id', 'target', 'alpha', 'comment_text']].copy()

# Strip trailing/leading whitespace first, then replace embedded newlines
# with a single space so each comment stays on one TSV line.
# NOTE(review): tabs and carriage returns inside a comment are left as-is;
# confirm the downstream TSV reader copes with them.
train['comment_text'] = (
    train['comment_text']
    .replace({r'\s+$': '', r'^\s+': ''}, regex=True)
    .replace(r'\n', ' ', regex=True)
)

In [None]:
# Hold out 5% of the rows as the validation (dev) set.
train_sub, dev = train_test_split(train, test_size=0.05, random_state=42)

# Shuffle the training rows and renumber the index from zero.
train_sub = train_sub.sample(frac=1, random_state=42).reset_index(drop=True)

# Cut the training rows at the midpoint into two parts; with an odd row
# count the second part receives the extra row.
midpoint = len(train_sub) // 2
train_sub_p1 = train_sub.iloc[:midpoint]
train_sub_p2 = train_sub.iloc[midpoint:]

In [None]:
# Save as TSV files for BERT, keeping the targets as floats - these files are
# used with a modified BERT that accepts floats for targets.
# Layout under ./data_float: 'complete' holds the full training set, 'p1' and
# 'p2' hold the two halves; every directory gets the same dev set.
float_root = pathlib.Path.cwd() / 'data_float'
float_outputs = [
    (train_sub, 'complete', 'train.tsv'),
    (dev, 'complete', 'dev.tsv'),
    (train_sub_p1, 'p1', 'train.tsv'),
    (train_sub_p2, 'p2', 'train.tsv'),
    (dev, 'p1', 'dev.tsv'),
    (dev, 'p2', 'dev.tsv'),
]
for frame, subdir, filename in float_outputs:
    out_dir = float_root / subdir
    # to_csv does not create missing directories, so make sure the target
    # directory exists instead of failing on a fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)
    # No header and no index column: the files contain data rows only.
    frame.to_csv(out_dir / filename, sep='\t', header=False, index=False)

In [None]:
# Also save files with binarized targets: a target of 0.5 or more becomes 1,
# anything else becomes 0. assign() returns a new frame with 'target'
# replaced in its original column position, so the float frames are left
# untouched.
train_bin, dev_bin, train_bin_p1, train_bin_p2 = (
    frame.assign(target=(frame['target'] >= 0.5).astype(int))
    for frame in (train_sub, dev, train_sub_p1, train_sub_p2)
)

# Layout under ./data_bin: 'complete' holds the full training set, 'p1' and
# 'p2' hold the two halves; every directory gets the same dev set.
bin_root = pathlib.Path.cwd() / 'data_bin'
bin_outputs = [
    (train_bin, 'complete', 'train.tsv'),
    (dev_bin, 'complete', 'dev.tsv'),
    (train_bin_p1, 'p1', 'train.tsv'),
    (train_bin_p2, 'p2', 'train.tsv'),
    (dev_bin, 'p1', 'dev.tsv'),
    (dev_bin, 'p2', 'dev.tsv'),
]
for bin_frame, bin_subdir, bin_filename in bin_outputs:
    bin_dir = bin_root / bin_subdir
    # to_csv does not create missing directories, so make sure the target
    # directory exists instead of failing on a fresh checkout.
    bin_dir.mkdir(parents=True, exist_ok=True)
    # No header and no index column: the files contain data rows only.
    bin_frame.to_csv(bin_dir / bin_filename, sep='\t', header=False, index=False)