# Data exploration

## Imports

In [1]:
import pandas as pd
import warnings
import pickle
warnings.filterwarnings('ignore')

## Raw data exploration and visualization

In [2]:
dataset = pd.read_csv('../data/raw/filtered_paranmt/filtered.tsv', sep="\t", index_col=[0])

In [3]:
dataset.head()

Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 577777 entries, 0 to 577776
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   reference    577777 non-null  object 
 1   translation  577777 non-null  object 
 2   similarity   577777 non-null  float64
 3   lenght_diff  577777 non-null  float64
 4   ref_tox      577777 non-null  float64
 5   trn_tox      577777 non-null  float64
dtypes: float64(4), object(2)
memory usage: 30.9+ MB


In [5]:
dataset.describe()

Unnamed: 0,similarity,lenght_diff,ref_tox,trn_tox
count,577777.0,577777.0,577777.0,577777.0
mean,0.758469,0.157652,0.541372,0.43449
std,0.092695,0.108057,0.457571,0.458904
min,0.600001,0.0,3.3e-05,3.3e-05
25%,0.681105,0.066667,0.012171,0.000707
50%,0.754439,0.141791,0.806795,0.085133
75%,0.831244,0.238095,0.990469,0.973739
max,0.95,0.4,0.999724,0.99973


In [6]:
print(f"Avg. reference toxicity: {dataset['ref_tox'].mean()}")

Avg. reference toxicity: 0.5413717990275281


In [7]:
print(f"Avg. translation toxicity: {dataset['trn_tox'].mean()}")

Avg. translation toxicity: 0.4344898352213311


In [8]:
print((dataset['trn_tox'] >= dataset['ref_tox']).value_counts())

False    319142
True     258635
dtype: int64


### The data is inconsistent. Reference needs to be switched around pairwise for values and sentences because it's not always less harmful.

# Data preprocessing (the copy of the code from src.data.make_dataset)

## Imports

In [9]:
import os
import pickle
import requests
import argparse
import pandas as pd
from torch.utils.data import Dataset
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Class MyDataset

In [10]:
class MyDataset(Dataset):
    def __init__(self, data_path):
        self.raw_data = pd.read_csv(data_path, sep='\t', index_col=0)
        
        data = pd.DataFrame()
        data['toxic'] = pd.concat([self.raw_data[self.raw_data['ref_tox'] > self.raw_data['trn_tox']]['reference'], self.raw_data[self.raw_data['ref_tox'] < self.raw_data['trn_tox']]['translation']])
        data['normal'] = pd.concat([self.raw_data[self.raw_data['ref_tox'] > self.raw_data['trn_tox']]['translation'], self.raw_data[self.raw_data['ref_tox'] < self.raw_data['trn_tox']]['reference']])
        data['toxic_reduction'] = abs(self.raw_data['ref_tox'] - self.raw_data['trn_tox'])
        self.data = data


    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]

## Prepare tha data, create dataset, transform and save

In [11]:
def clean_text(text):
    
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text)
    
    # replace punctuation with spaces
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))
    
    # remove digits
    text = re.sub(r'\d+', '', text)
    
    # remove non-ascii characters
    text = text.encode("ascii", errors="ignore").decode()
    
    # remove urls
    text = re.sub(r'http\S+', '', text)
    
    # remove html tags
    text = re.sub(r'<.*?>', '', text)
    
    # lowercase
    text = text.lower()

    return text

In [12]:
def preprocess():
    
    data_dir = "../data/raw/filtered_paranmt/"
    dataset_path = "../data/interim/text_dataset.pkl"
    filename = "filtered.tsv"
    
    
    # create dataset
    dataset = MyDataset(os.path.join(data_dir, filename))
    

    # transform dataset
    # clear text
    dataset.data.toxic = dataset.data.toxic.apply(clean_text)
    dataset.data.normal = dataset.data.normal.apply(clean_text)

    # tokenize text
    dataset.data.toxic = dataset.data.toxic.apply(word_tokenize)
    dataset.data.normal = dataset.data.normal.apply(word_tokenize)

    # remove stopwords
    stop_words = set(stopwords.words('english'))
    dataset.data.toxic = dataset.data.toxic.apply(lambda text: [word for word in text if word not in stop_words])
    dataset.data.normal = dataset.data.normal.apply(lambda text: [word for word in text if word not in stop_words])
    
    # lemmatize text
    lemmatizer = WordNetLemmatizer()
    dataset.data.toxic = dataset.data.toxic.apply(lambda text: [lemmatizer.lemmatize(word) for word in text])
    dataset.data.normal = dataset.data.normal.apply(lambda text: [lemmatizer.lemmatize(word) for word in text])
    
    
    # save dataset
    if not os.path.exists(os.path.dirname(dataset_path)):
        os.makedirs(os.path.dirname(dataset_path))
    pickle.dump(dataset, open(dataset_path, 'wb'))

## Dataset exploration

In [13]:
df = pickle.load(open('../data/interim/text_dataset.pkl', 'rb')).data

In [14]:
df.head()

Unnamed: 0,toxic,normal,toxic_reduction
5,"[gon, na, child, genetic, disorder, gon, na, d...","[going, breed, kid, genetic, disorder, make, die]",0.915109
6,"[laughing, u, kick, as]","[laughing, u, show]",0.999361
7,"[maine, short, black, people, back]","[much, black, maine]",0.814971
11,"[spirit, cursed, walking, back, road, waterway...","[soul, cursed, guard, path, say, encounter, un...",0.698517
13,"[come, cal, leave, shit, alone]","[come, cal, put]",0.999357


### Let's explore the toxic and normal words in details

In [15]:
toxic_words = list(set([word for sentence in df['toxic'].values for word in sentence]).difference(set([word for sentence in df['normal'].values for word in sentence])))

In [16]:
toxic_words[:20]

['shirshu',
 'indio',
 'cornishman',
 'useg',
 'rosette',
 'torrturing',
 'godzillawill',
 'pervading',
 'joely',
 'nurtured',
 'schlecks',
 'swrong',
 'luminescent',
 'goif',
 'ekes',
 'seminiferous',
 'cvalda',
 'mirthless',
 'blp',
 'eidu']

In [17]:
with open('../data/interim/toxic_words.pkl', 'wb') as f:
    pickle.dump(toxic_words, f)

### Many of the words in this collection are simply misspelled words or phrases that don't belong in sentences.