# Bring the dataset into the format expected by the BERT evaluation code

In [None]:
import ast

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

# Maps a strategy name to the ordered list of columns whose text is
# concatenated into one input sentence. The letters in each key mirror the
# columns used: p = preceding locution, q = question locution,
# r = merged response locutions.
STRATEGY = {
    "q": [
        "processed_question_locution",
    ],
    "pq": [
        "processed_preceding_locution",
        "processed_question_locution",
    ],
    "qr": [
        "processed_question_locution",
        "processed_response_locution_merged",
    ],
    "pr": [
        "processed_preceding_locution",
        "processed_response_locution_merged",
    ],
    "pqr": [
        "processed_preceding_locution",
        "processed_question_locution",
        "processed_response_locution_merged",
    ],
}

def create_datasets(df, type='train'):
    """Write one two-column CSV per entry in ``STRATEGY``.

    For every strategy, the text of the configured columns is joined with a
    single space into ``sentence1`` and ``question_type`` is exported as
    ``label``. Each result is saved to ``data/ts_<strategy>_<type>.csv``
    without the index.

    The caller's frame is left untouched; all derived columns are added to
    an internal copy.

    Args:
        df: Frame holding ``processed_response_locutions`` (a string with the
            Python literal of a list of strings), ``question_type`` and the
            columns referenced by ``STRATEGY``.
        type: Split name used in the output file name (e.g. ``'train'`` or
            ``'test'``). The name shadows the builtin but is kept so that
            existing keyword callers continue to work.

    Raises:
        ValueError, SyntaxError: If a response cell is a string that is not
            a valid Python literal.
    """
    # Parse the stored list literal with ast.literal_eval rather than eval so
    # that cell contents can never execute code. Non-string cells (e.g. NaN
    # from an empty CSV field) are treated as "no response".
    # NOTE(review): the locutions are concatenated without a separator, as
    # before -- confirm that this is intended.
    merged = [
        ''.join(ast.literal_eval(raw)) if isinstance(raw, str) else ''
        for raw in df['processed_response_locutions']
    ]

    # assign() and fillna() both return new frames, so the input is not mutated.
    df = df.assign(processed_response_locution_merged=merged).fillna('')

    # create data set for each strategy
    for strategy, merges in STRATEGY.items():
        # One list of column texts per row, joined with a space.
        parts = pd.Series(df[merges].values.tolist(), index=df.index, dtype=object)
        df[strategy] = parts.str.join(' ')

        # rename to expected columns and save
        renamed = df.rename(columns={strategy: 'sentence1', 'question_type': 'label'})
        renamed[['sentence1', 'label']].to_csv(f'data/ts_{strategy}_{type}.csv', index=False)

In [None]:
# Load the train split (the "-over" file) and the test split.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './source/QT-Questions-train-over.csv',
        './source/QT-Questions-test.csv',
    )
)

In [None]:
# Write the per-strategy CSV files for both splits.
create_datasets(df=train, type='train')
create_datasets(df=test, type='test')

In [None]:
# Reload both splits from disk so the baseline export starts from the raw files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './source/QT-Questions-train-over.csv',
        './source/QT-Questions-test.csv',
    )
)

# Baseline files: the question locution as sentence1, the question type as label.
(
    train
    .rename(columns={'processed_question_locution': 'sentence1', 'question_type': 'label'})
    [['sentence1', 'label']]
    .to_csv('./data/ts_train.csv', index=False)
)
(
    test
    .rename(columns={'processed_question_locution': 'sentence1', 'question_type': 'label'})
    [['sentence1', 'label']]
    .to_csv('./data/ts_test.csv', index=False)
)