# Code to make the dataset from the data

## Imports

In [1]:
import os
import pickle
import requests
import argparse
import pandas as pd
from torch.utils.data import Dataset
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Darya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Class MyDataset

In [2]:
class MyDataset(Dataset):
    """Dataset of (toxic, normal) sentence pairs built from a TSV file.

    The file is read as tab-separated with its first column used as the
    index. It must provide the columns ``reference``, ``translation``,
    ``ref_tox`` and ``trn_tox``.

    For every row, whichever of ``reference`` / ``translation`` has the
    higher toxicity score goes to the ``toxic`` column and the other one to
    the ``normal`` column. Rows whose two scores are equal match neither
    comparison and are therefore absent from ``self.data``.

    Attributes:
        raw_data: the DataFrame exactly as read from ``data_path``.
        data: DataFrame with columns ``toxic``, ``normal`` and
            ``toxic_reduction`` (absolute difference of the two scores).
    """

    def __init__(self, data_path):
        """Read ``data_path`` and build the toxic/normal pair table.

        Args:
            data_path: anything ``pandas.read_csv`` accepts (path or
                file-like object) pointing at the tab-separated data.
        """
        self.raw_data = pd.read_csv(data_path, sep='\t', index_col=0)
        raw = self.raw_data

        # Which side of each pair is the more toxic one.
        ref_more_toxic = raw['ref_tox'] > raw['trn_tox']
        trn_more_toxic = raw['ref_tox'] < raw['trn_tox']

        data = pd.DataFrame()
        data['toxic'] = pd.concat([
            raw.loc[ref_more_toxic, 'reference'],
            raw.loc[trn_more_toxic, 'translation'],
        ])
        data['normal'] = pd.concat([
            raw.loc[ref_more_toxic, 'translation'],
            raw.loc[trn_more_toxic, 'reference'],
        ])
        # Assignment aligns on the index, so only rows kept above get a value.
        data['toxic_reduction'] = (raw['ref_tox'] - raw['trn_tox']).abs()
        self.data = data

    def __len__(self):
        """Return the number of pairs that ``__getitem__`` can serve."""
        # Must be len(self.data), not len(self.raw_data): rows with equal
        # toxicity scores are dropped, so raw_data can be longer than data.
        return len(self.data)

    def __getitem__(self, idx):
        """Return the row of ``self.data`` at positional index ``idx``."""
        return self.data.iloc[idx]

## Main: prepare the data, create the dataset, transform it and save it

In [3]:
# Translation table mapping every ASCII punctuation character to a space;
# built once instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def clean_text(text):
    """Normalize a raw sentence into lowercase ASCII words separated by spaces.

    Steps, in order:
      1. remove HTML tags,
      2. remove URLs (any run of non-space characters starting with ``http``),
      3. replace punctuation with spaces,
      4. remove digits,
      5. drop non-ASCII characters,
      6. lowercase,
      7. collapse whitespace runs to one space and trim the ends.

    Tags and URLs are removed *before* punctuation because both patterns
    rely on punctuation characters (``<``, ``>``, ``:``, ``/``) that would
    otherwise already be gone. Tags go before URLs so that a URL inside an
    attribute does not swallow the text following the tag. Whitespace is
    collapsed last because the earlier steps introduce new spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # remove html tags
    text = re.sub(r'<.*?>', '', text)

    # remove urls
    text = re.sub(r'http\S+', '', text)

    # replace punctuation with spaces
    text = text.translate(_PUNCT_TO_SPACE)

    # remove digits
    text = re.sub(r'\d+', '', text)

    # remove non-ascii characters
    text = text.encode("ascii", errors="ignore").decode()

    # lowercase
    text = text.lower()

    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [4]:
if __name__ == '__main__':
    # Script entry point: build the dataset from the raw TSV, normalize both
    # text columns into lists of lemmatized tokens, and pickle the result.

    data_dir = "../../data/raw/filtered_paranmt/"
    dataset_path = "../../data/interim/text_dataset.pkl"
    filename = "filtered.tsv"

    # create dataset
    dataset = MyDataset(os.path.join(data_dir, filename))

    # transform dataset
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Both text columns get the same pipeline; they are independent of each
    # other, so processing them one after the other gives the same result as
    # interleaving the steps.
    for column in ('toxic', 'normal'):
        # clear text
        cleaned = dataset.data[column].apply(clean_text)

        # tokenize text
        tokens = cleaned.apply(word_tokenize)

        # remove stopwords
        tokens = tokens.apply(
            lambda words: [word for word in words if word not in stop_words]
        )

        # lemmatize text
        dataset.data[column] = tokens.apply(
            lambda words: [lemmatizer.lemmatize(word) for word in words]
        )

    # save dataset
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    # NOTE(review): the pickle stores a MyDataset instance, so whoever loads
    # it presumably needs the MyDataset class importable under the same
    # module name - confirm against the loading code.
    # The context manager guarantees the file is flushed and closed.
    with open(dataset_path, 'wb') as out_file:
        pickle.dump(dataset, out_file)
