In [2]:
import pandas as pd
import numpy as np
# import ujson as json
import time, datetime
import torch, nltk, re, random
from transformers import BertTokenizer
from nltk.stem.porter import *
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler


import warnings
warnings.filterwarnings("ignore", category=UserWarning) 


## Data Loading & Preprocessing

In [3]:
pd.set_option('display.max_columns', None)
df = pd.read_csv('datasets/reddit.csv')
stemmer = PorterStemmer()


# 22324 Posts
df = df[['text', 'hate_speech_idx']]
# NOTE: For neutral speech: hate_speech_idx == NaN (later replaced by '0')

# print(df.head(10))    

def _first_hate_idx(raw_idx):
    """Return the first entry of a '[a, b, ...]' string, or '0' when the value is missing (NaN)."""
    if pd.isnull(raw_idx):
        return '0'
    return raw_idx.strip('[]').split(',')[0]

# Expand intertwined rows: each 'text' cell holds several newline-separated lines.
# The columns are rebuilt explicitly instead of writing to the `row` objects yielded by
# df.iterrows(): pandas does not guarantee such writes reach the frame (they are
# silently lost when the row is a copy, e.g. under Copy-on-Write).
# NOTE(review): every exploded line inherits the *first* hate_speech_idx entry of its
# original row, so the column holds that index value rather than a per-line flag --
# confirm this is the intended labelling.
df = df.assign(
    text=df['text'].map(lambda conversation: conversation.strip().split('\n')),
    hate_speech_idx=df['hate_speech_idx'].map(_first_hate_idx),
)
df = df.explode('text', ignore_index=True) # https://stackoverflow.com/questions/39011511/pandas-expand-rows-from-list-data-available-in-column

def preprocess(text_string, stem_fn=None): # Ref: https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb
    """ Clean a text string and return its stemmed tokens.

    Steps, in order:
        1) collapse every run of whitespace into a single space
        2) remove urls
        3) remove @mentions
        4) lowercase, then treat . , ! ? : tab newline " and > as token separators
        5) stem each whitespace-separated token
        6) drop the first token

    @ param text_string
       The raw text to clean.
    @ param stem_fn
       Optional callable mapping one token to its stem. Defaults to the
       `stem` method of the module-level `stemmer`.
    @ return
       List of stemmed tokens, without the first token of the string.
       Empty list when the string yields zero or one token.
    """
    if stem_fn is None:
        stem_fn = stemmer.stem

    # Raw strings: '\s', '\(' and '\w' are regex escapes, not Python string escapes.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)

    parsed_text = " ".join(re.split("[.,!?:\t\n\">]", parsed_text.lower()))  # Doc: https://docs.python.org/3/library/re.html?highlight=split#re.split
    stemmed_text = [stem_fn(t) for t in parsed_text.split()]

    # NOTE(review): the first token is always discarded -- presumably the leading
    # "N." line index of each comment; confirm every input line carries one.
    return stemmed_text[1:]

# Tokenise/stem every text entry and cast the label column to int.
df = pd.concat([df['text'].apply(preprocess), df['hate_speech_idx'].astype(int)], axis = 1)
print(df.head(10))
print('Dataset Length: [{}]'.format(len(df)))

def _report_subclass(frame, mask, label):
    """Print the row count, percentage share and first five texts of the rows selected by `mask`.

    Returns the selected sub-frame.
    """
    subset = frame.loc[mask]
    # Scale to percent *before* rounding: round(x, 4) * 100 can reintroduce float
    # noise (e.g. 0.07 * 100 == 7.000000000000001).
    share = round(100 * len(subset) / len(frame), 2) if len(frame) else 0.0
    print('---- Number of [{}] tweets: {} ({}%)'.format(label, len(subset), share))
    print(subset['text'].head(5))
    return subset

# Rows whose label is 0 are reported as neutral, all other rows as hateful.
df_subclass = _report_subclass(df, df['hate_speech_idx'] == 0, 'neutral')
df_subclass = _report_subclass(df, df['hate_speech_idx'] != 0, 'HATEFUL')


                                                text  hate_speech_idx
0  [a, subsect, of, retard, hungarian, ohh, boy, ...                1
1  [hiii, just, got, off, work, 444, is, mainli, ...                1
2  [wow, i, guess, soyboy, are, the, same, in, ev...                1
3  [owen, benjamin', soyboy, song, goe, for, ever...                1
4  [y'all, hear, sumn, by, all, mean, i, live, in...                3
5                                        [[removed]]                3
6  [ah, a, liber, ha, slip, in, you, can, tell, b...                3
7  [wouldn't, the, defend, or, whatev, they, are,...                0
8                       ['inclusive', =, not, white]                0
9  [“harvard, is, work, to, be, more, inclus, ”, ...                0
Dataset Length: [22324]
---- Number of [neutral] tweets: 5335 (23.9%)
7     [wouldn't, the, defend, or, whatev, they, are,...
8                          ['inclusive', =, not, white]
9     [“harvard, is, work, to, be, more, inclus,