In [1]:
from collections import defaultdict
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [2]:
def load_data(file_path):
    """
    Read a tab-separated file into a DataFrame.

    :param file_path (str): Location of the .tsv file to read.

    :return pandas.DataFrame: The parsed contents of the file.
    """
    # Tab is the only non-default option; everything else is left to read_csv.
    return pd.read_csv(file_path, sep='\t')

In [3]:
def rename_column(data, column, new_column_name):
    """
    Give one column of a DataFrame a new name, modifying the frame in place.

    :param data (pandas.DataFrame): The dataset whose column is renamed.
    :param column (str): Current name of the column.
    :param new_column_name (str): Name the column should have afterwards.

    :return pandas.DataFrame: The same DataFrame object that was passed in.
    """
    name_map = {column: new_column_name}
    # inplace=True mutates the caller's frame; it is returned for convenience.
    data.rename(columns=name_map, inplace=True)
    return data

In [4]:
def sort_data_by_toxicity(data, toxicity_column='ref_tox'):
    """
    Return the rows of the dataset ordered by a toxicity column.

    The input frame is left untouched; a sorted copy (ascending, original
    index labels preserved) is returned.

    :param data (pandas.DataFrame): The dataset to order.
    :param toxicity_column (str): Name of the column holding the sort key.

    :return pandas.DataFrame: A sorted copy of the dataset.
    """
    ordered = data.sort_values(by=toxicity_column)
    return ordered

In [5]:
def remove_uncertain_toxicity_data(sorted_data, lower_bound, upper_bound):
    """
    Cut a positional band of rows out of an already sorted dataset.

    Rows at positions lower_bound up to (but not including) upper_bound are
    dropped; everything before and after that band is kept in order.

    :param sorted_data (pandas.DataFrame): The sorted dataset.
    :param lower_bound (int): Position of the first row to drop.
    :param upper_bound (int): Position of the first row to keep again.

    :return pandas.DataFrame: The head and tail of the dataset joined together.
    """
    kept_head = sorted_data.iloc[:lower_bound]
    kept_tail = sorted_data.iloc[upper_bound:]
    # Row-wise concatenation; the original index labels are preserved.
    return pd.concat([kept_head, kept_tail])

In [6]:
def swap_translation_and_reference(part_data):
    """
    Exchange the 'reference' and 'translation' texts together with their
    toxicity scores ('ref_tox' and 'trn_tox').

    The swap is performed on the passed DataFrame itself, which is also returned.

    :param part_data (pandas.DataFrame): The part of the dataset to manipulate.

    :return pandas.DataFrame: The same DataFrame with both column pairs swapped.
    """
    column_pairs = (('reference', 'translation'), ('ref_tox', 'trn_tox'))

    for first, second in column_pairs:
        # Hold a copy of the first column before it is overwritten.
        held = part_data[first].copy()
        part_data[first] = part_data[second]
        part_data[second] = held

    return part_data

In [7]:
def filter_by_translation_toxicity(data, threshold=0.3):
    """
    Keep only the rows whose 'trn_tox' value is strictly below a threshold.

    :param data (pandas.DataFrame): The dataset to filter.
    :param threshold (float): Exclusive upper limit for the translation toxicity.

    :return pandas.DataFrame: The rows that pass the filter.
    """
    is_below_threshold = data['trn_tox'] < threshold
    return data[is_below_threshold]

In [8]:
def save_data_to_csv(data, file_path):
    """
    Write a DataFrame to disk as CSV, leaving out the index column.

    :param data (pandas.DataFrame): The dataset to write.
    :param file_path (str): Destination path of the CSV file.
    """
    # index=False: only the actual columns are written, not the row labels.
    data.to_csv(path_or_buf=file_path, index=False)

In [9]:
def split_data(data, test_size=0.2, valid_size=0.1, random_state=42):
    """
    Split the dataset into training, validation, and test sets.

    Both sizes are fractions of the *whole* dataset. The test set is carved
    out first, then the validation set is taken from the remainder using a
    rescaled fraction so that it still amounts to valid_size of the original.

    :param data (pandas.DataFrame): The dataset to split.
    :param test_size (float): Fraction of the dataset for the test split, in (0, 1).
    :param valid_size (float): Fraction of the dataset for the validation split, in (0, 1).
    :param random_state (int): Seed passed to both train_test_split calls; the
        default of 42 reproduces the previously hard-coded behaviour.

    :return tuple: A tuple containing the train, validation, and test sets.

    :raises ValueError: If a size is not a fraction strictly between 0 and 1,
        or if the two sizes leave no room for a training set.
    """
    # Fail early with a clear message instead of a ZeroDivisionError or an
    # error about the rescaled (and therefore unfamiliar) validation fraction.
    if not (0 < test_size < 1 and 0 < valid_size < 1):
        raise ValueError(
            "test_size and valid_size must be fractions strictly between 0 and 1, "
            f"got test_size={test_size!r}, valid_size={valid_size!r}"
        )
    if test_size + valid_size >= 1:
        raise ValueError(
            "test_size + valid_size must be less than 1 to leave data for training, "
            f"got {test_size!r} + {valid_size!r}"
        )

    # First split to get the test set
    train_valid_data, test_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # valid_size refers to the full dataset, but the second split only sees
    # the (1 - test_size) remainder, so the fraction has to be scaled up.
    valid_size_adjusted = valid_size / (1 - test_size)

    # Now split the remaining data to get the validation set
    train_data, valid_data = train_test_split(
        train_valid_data, test_size=valid_size_adjusted, random_state=random_state
    )

    return train_data, valid_data, test_data

In [10]:
# Input: the raw tab-separated dataset.
file_path = '../data/raw/filtered.tsv'
# Output: the preprocessed (filtered) dataset as a single CSV.
sorted_file_path = '../data/internal/data.csv'
# Outputs: destinations for the train, validation, and test splits.
train_file_path = '../data/internal/train_data.csv'
valid_file_path = '../data/internal/valid_data.csv'
test_file_path = '../data/internal/test_data.csv'

In [11]:
# Read the raw tab-separated dataset
dataset = load_data(file_path)

# Rename the first column to 'id'
dataset = rename_column(dataset, dataset.columns[0], 'id')

# Sort rows by reference toxicity ('ref_tox' is the default sort column)
sorted_data = sort_data_by_toxicity(dataset)

# Drop the rows at sorted positions 200_000 up to (not including) 300_000,
# keeping both ends of the toxicity ranking.
# NOTE(review): the bounds are hard-coded positions, presumably chosen for this
# particular dataset's size and score distribution; confirm if the input changes.
cleaned_data = remove_uncertain_toxicity_data(sorted_data, 200_000, 300_000)

# Swap reference and translation (texts and toxicity scores) for every remaining row
cleaned_data = swap_translation_and_reference(cleaned_data)

# Keep only rows whose post-swap translation toxicity is below the default threshold (0.3)
filtered_data = filter_by_translation_toxicity(cleaned_data)

# Write the filtered dataset to CSV
save_data_to_csv(filtered_data, sorted_file_path)

In [None]:
# Split the filtered data into train, validation, and test sets
# (default proportions of split_data).
train_data, valid_data, test_data = split_data(filtered_data)

# Save each split to the destination configured with the other file paths,
# rather than repeating the path literals here.
save_data_to_csv(train_data, train_file_path)
save_data_to_csv(valid_data, valid_file_path)
save_data_to_csv(test_data, test_file_path)