In [1]:
import os
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

In [2]:
dataset_dir = os.path.join('./dataset')
os.listdir(dataset_dir)

['balanced_polished_data.csv',
 'polished_data.csv',
 'raw_data.csv',
 'test.csv',
 'test_labels.csv',
 'test_set.csv',
 'train.csv',
 'train_dev_set.csv']

In [3]:
data = pd.read_csv(os.path.join(dataset_dir + '/raw_data.csv'))

In [4]:
data.head()

Unnamed: 0,story,category
0,"As a librarian, I've been threatened with stal...",0
1,"I worked as an office manager, and the only wo...",0
2,I used to work for a call center and the men t...,0
3,"When I was 13 or 14, I was babysitting two kid...",0
4,I work in law enforcement. I started out as a ...,0


In [5]:
data_nan_idx = data[pd.isnull(data['story'])].index.tolist()
data.loc[data_nan_idx, 'story'] = np.nan
data.loc[data_nan_idx, 'category'] = np.nan

In [6]:
data = data.dropna().reset_index(drop = True)

In [7]:
print(data.loc[data_nan_idx, 'story'])
print(data.loc[data_nan_idx, 'category'])

14303    what i wanna do im josh nanni i say fuck ameri...
14315    No, maybe you should get a life instead of bei...
14325    you people are pretty overzealous with this wh...
14329    Collectonian is a bitch. A low class prostitut...
14331    You're a stupid cunt \n\nFuck you dumb arse, y...
14615    And finally, the ONLY reason I created the art...
14629    Hey Jac16888, how very chickenshit of you to t...
14634    You're welcome and I'm gonna vandalize every s...
14650    sup niggaz can u unprotect so i can post like ...
14653    I'M PISSED! WIKIPEDIA ADMINS REALLY SUCK! I'M ...
14665    You are homosexual. You fuck people through th...
14673    Good grief have you nothing useful to do with ...
14705    Hanibal911You're a bastard Pro-Assad.Hanibal91...
14706    Cena = Gets too much shit from people for no r...
14715    , I hope you're on the next plane just so we c...
14718    How is ""Ooooh, I'll be blocked! IMAGINE THAT!...
14728    THE LINK IS THERE. I have an idea. Why not del.

In [8]:
data.loc[15289]

story       you fuck misterwiki.   you fuck misterwiki.   ...
category                                                    1
Name: 15289, dtype: object

In [9]:
data = data.sample(frac=1).reset_index(drop = True)

In [10]:
print(data.loc[15289].story)
print(data.iloc[15289].category)

Opinion is opinion and everyone has one of those. You really think all of these aliens were dropped into a volcano based on no evidence, that alien souls are in your body? Fucking idiots. My opinion is more correct than yours
1.0


In [11]:
data.head()

Unnamed: 0,story,category
0,"I stood at the bus stop, a man stepped on my t...",0.0
1,I was born with a craniofacial disease called ...,1.0
2,"""\n\n DILDO! \n\nJust so we're on the same pag...",1.0
3,It was 2 o clock in the noon.. I was heading t...,0.0
4,In afternoon time I was out with my brother an...,0.0


In [12]:
def count_categories(df, sexual_harassment = 0, bullying = 0, others = 0):
    for i in range(len(df)):
        if df.iloc[i].category == 0:
            sexual_harassment+=1
        elif df.iloc[i].category == 1:
            bullying+=1
        else:
            others+=1
    return sexual_harassment, bullying, others

sexual_harassment, bullying, others = count_categories(data, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

No. of Sexual Harassment Data:  12806
No. of Bullying Data:  9074
No. of Other Data:  0


#### Converting to lower case
#### Replacing regex, symbols, and remove x in text
#### Removing digits and stop words

In [13]:
REPLACE_REGX = re.compile('[/(){}\[\]\|@,;]')
REPLACE_NUMBERS_SYMBOLS = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = REPLACE_REGX.sub('', text)
    text = REPLACE_NUMBERS_SYMBOLS.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [14]:
data['story'] = data['story'].apply(clean_text)
data['story'] = data['story'].str.replace('\d+', '')

In [15]:
data.head()

Unnamed: 0,story,category
0,stood bus stop man stepped toes asked looked h...,0.0
1,born craniofacial disease called stickler synd...,1.0
2,dildo page youre less man marine corps shouldv...,1.0
3,clock noon heading classes change two buses r...,0.0
4,afternoon time brother sister guys passed us o...,0.0


In [16]:
df = data.copy()

In [17]:
len(df)

21880

In [18]:
counter = 1
for i in range(len(df)):
    
    if data.iloc[i].category == 0:
#         print(data.iloc[i].category)
        df.drop(index=i, inplace=True)
        counter += 1
        if counter == 3732:
            break
print(counter)

3732


In [19]:
sexual_harassment, bullying, others = count_categories(df, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

No. of Sexual Harassment Data:  9075
No. of Bullying Data:  9074
No. of Other Data:  0


In [20]:
df.to_csv(os.path.join(dataset_dir + '/balanced_polished_data.csv'), index=False)

In [21]:
len(df)

18149

In [22]:
df = df.sample(frac=1).reset_index(drop=True)

In [23]:
def prepare_train_test_set(df, test_percentage = .9):
    no_of_data = round(len(df) * test_percentage)
    
    print('Preparing {} dat for trainset'.format(no_of_data))
    train_set = df[:no_of_data]
    
    print('Preparing {} dat for test'.format(len(df)-no_of_data))
    test_set = df[no_of_data:]
    
    return train_set, test_set

In [24]:
train_set, test_set = prepare_train_test_set(df)

Preparing 16334 dat for trainset
Preparing 1815 dat for test


In [25]:
print('No of train data {}'.format(len(train_set)))
print('No of test data {}'.format(len(test_set)))

No of train data 16334
No of test data 1815


In [26]:
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [27]:
train_set.head()

Unnamed: 0,story,category
0,hello silly wubba wubba mitt romney dictator,1.0
1,man staring smiling near house tried cross pat...,0.0
2,happened road bus stop named gulabi bagh laksh...,0.0
3,old man promised buy touch screen phone accept...,0.0
4,contador gofuckyourselflove dc,1.0


In [28]:
test_set.head()

Unnamed: 0,story,category
0,young teenage girls clothes torn street young ...,0.0
1,somebody tires touch sensitive part,0.0
2,happened late evening travelling subjected var...,0.0
3,gotta love send noted racist antisemite kinda ...,1.0
4,yo nigga reverting edits use common sense,1.0


In [54]:
train_set.to_csv(os.path.join(dataset_dir + '/train_dev_set.csv'), index=False)
test_set.to_csv(os.path.join(dataset_dir + '/test_set.csv'), index=False)