In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the raw tab-separated corpus and tidy it in a single chain.
df = (
    pd.read_csv('../data/raw/filtered.tsv', sep='\t')
    .rename(columns={
        'Unnamed: 0': 'index',         # header-less first column becomes the row index
        'lenght_diff': 'length_diff',  # fix the misspelled column name
    })
    .set_index('index')
    # Rows are compared on their column values only; the index is not part of the check.
    .drop_duplicates()
)

df.head()

Unnamed: 0_level_0,reference,translation,similarity,length_diff,ref_tox,trn_tox
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


# Data Preprocessing

In [3]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        # Reading the corpus is comparatively expensive, so keep the result on
        # the function object instead of rebuilding it for every sentence.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text, stop_words=None):
    """Normalise one sentence for modelling.

    Steps: lowercase, keep only letters and whitespace, collapse runs of
    whitespace, tokenize with ``word_tokenize``, drop stop words, and join the
    remaining tokens with single spaces.

    Parameters
    ----------
    text : str, None or float NaN
        The raw sentence. ``None`` and NaN (how pandas represents an empty
        cell) are treated as "no text".
    stop_words : collection of str, optional
        Tokens to remove. Defaults to NLTK's English stop-word list, which is
        loaded once and reused across calls.

    Returns
    -------
    str
        The cleaned sentence; ``''`` when the input is missing or nothing
        survives the filtering.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()

    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and numbers, then collapse repeated whitespace
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    text = " ".join(text.split())

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    # NOTE(review): apostrophes were already stripped above, so contractions
    # arrive here as e.g. "youre" and will not match any stop-word entry that
    # contains an apostrophe -- confirm whether that is intended.
    tokens = [word for word in tokens if word not in stop_words]

    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Run the same cleaning routine over both sides of every sentence pair.
for text_column in ('reference', 'translation'):
    df[text_column] = df[text_column].apply(preprocess_text)

In [4]:
# Preview the first rows to check the cleaned 'reference' and 'translation' text.
df.head()

Unnamed: 0_level_0,reference,translation,similarity,length_diff,ref_tox,trn_tox
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,alkar flooding psychic waste explains high lev...,alkar floods mental waste would explain high l...,0.785171,0.010309,0.014195,0.981983
1,youre getting nasty,youre becoming disgusting,0.749687,0.071429,0.065473,0.999039
2,well could spare life one,well spare life,0.919051,0.268293,0.213313,0.985068
3,ah monkey youve got snap,monkey wake,0.664333,0.309524,0.053362,0.994215
4,ive got orders put,orders kill,0.726639,0.181818,0.009402,0.999348


# Data Building

We have filtered the dataset based on insights gathered from our Exploratory Data Analysis (EDA). The filtering criteria, derived from the EDA results, are as follows:

- **Maximum Length Difference**: Text pairs with a length difference less than 15 are included.
- **Maximum Translation Toxicity**: Translation texts with toxicity levels below 0.002 are included.
- **Minimum Reference Toxicity**: Text pairs whose reference text has a toxicity level above 0.95 are included.
- **Minimum Similarity**: Text pairs with a cosine similarity greater than 0.8 are included.

By applying these filters, we ensure that the dataset contains high-quality, meaningful data points for training, aligning with the insights obtained from our Exploratory Data Analysis.


In [5]:
# Filtering thresholds (all comparisons below are strict).
MAX_LENGTH_DIFFERENCE = 15
MAX_TRANSLATION_TOXICITY = 0.002
MIN_REFERENCE_TOXICITY = 0.95
MIN_SIMILARITY = 0.8

# NOTE(review): every length_diff value displayed in this notebook is below 1,
# so an upper bound of 15 may never exclude a row -- confirm the intended scale.
keep_rows = (
    df['length_diff'].lt(MAX_LENGTH_DIFFERENCE)
    & df['trn_tox'].lt(MAX_TRANSLATION_TOXICITY)
    & df['ref_tox'].gt(MIN_REFERENCE_TOXICITY)
    & df['similarity'].gt(MIN_SIMILARITY)
)

# Keep only the pairs that satisfy all four conditions at once.
filtered_df = df[keep_rows]

In [6]:
# Preview the first rows that passed every threshold.
filtered_df.head()

Unnamed: 0_level_0,reference,translation,similarity,length_diff,ref_tox,trn_tox
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22,real life starts first time fuck kid,boy real life starts first,0.866697,0.319149,0.998222,0.000114
29,hey leave poor bastard alone,leave poor man alone,0.857554,0.257143,0.999382,0.000578
41,told waste fucking time,told waste time,0.904062,0.183673,0.995877,0.000479
43,swear god best thing ever life save little son...,swear god best thing ive ever done life save l...,0.932305,0.022472,0.999071,0.0009
71,dont shit,dont anything,0.806763,0.137931,0.995474,4.7e-05


In [7]:
import os

INTERIM_DIR = "../data/interim"
DF_TSV_PATH = os.path.join(INTERIM_DIR, "filtered_df.tsv")

# to_csv does not create missing parent directories, so make sure the target
# folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(INTERIM_DIR, exist_ok=True)

# Write the filtered pairs as TSV; index=False leaves the row index out of the file.
filtered_df.to_csv(DF_TSV_PATH, sep='\t', index=False)