In [1]:
import os
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

In [2]:
dataset_dir = os.path.join('./dataset')
os.listdir(dataset_dir)

['balanced_polished_data.csv',
 'polished_data.csv',
 'raw_data.csv',
 'test.csv',
 'test_labels.csv',
 'test_set.csv',
 'train.csv',
 'train_dev_set.csv']

In [3]:
data = pd.read_csv(os.path.join(dataset_dir + '/raw_data.csv'))

In [4]:
data.head()

Unnamed: 0,story,category
0,"As a librarian, I've been threatened with stal...",0
1,"I worked as an office manager, and the only wo...",0
2,I used to work for a call center and the men t...,0
3,"When I was 13 or 14, I was babysitting two kid...",0
4,I work in law enforcement. I started out as a ...,0


In [5]:
data_nan_idx = data[pd.isnull(data['story'])].index.tolist()
data.loc[data_nan_idx, 'story'] = np.nan
data.loc[data_nan_idx, 'category'] = np.nan

In [6]:
data = data.dropna().reset_index(drop = True)

In [7]:
print(data.loc[data_nan_idx, 'story'])
print(data.loc[data_nan_idx, 'category'])

14303    what i wanna do im josh nanni i say fuck ameri...
14315    No, maybe you should get a life instead of bei...
14325    you people are pretty overzealous with this wh...
14329    Collectonian is a bitch. A low class prostitut...
14331    You're a stupid cunt \n\nFuck you dumb arse, y...
14615    And finally, the ONLY reason I created the art...
14629    Hey Jac16888, how very chickenshit of you to t...
14634    You're welcome and I'm gonna vandalize every s...
14650    sup niggaz can u unprotect so i can post like ...
14653    I'M PISSED! WIKIPEDIA ADMINS REALLY SUCK! I'M ...
14665    You are homosexual. You fuck people through th...
14673    Good grief have you nothing useful to do with ...
14705    Hanibal911You're a bastard Pro-Assad.Hanibal91...
14706    Cena = Gets too much shit from people for no r...
14715    , I hope you're on the next plane just so we c...
14718    How is ""Ooooh, I'll be blocked! IMAGINE THAT!...
14728    THE LINK IS THERE. I have an idea. Why not del.

In [8]:
data.loc[15289]

story       you fuck misterwiki.   you fuck misterwiki.   ...
category                                                    1
Name: 15289, dtype: object

In [9]:
data = data.sample(frac=1).reset_index(drop = True)

In [10]:
print(data.loc[15289].story)
print(data.iloc[15289].category)

Opinion is opinion and everyone has one of those. You really think all of these aliens were dropped into a volcano based on no evidence, that alien souls are in your body? Fucking idiots. My opinion is more correct than yours
1.0


In [11]:
data.head()

Unnamed: 0,story,category
0,"I stood at the bus stop, a man stepped on my t...",0.0
1,I was born with a craniofacial disease called ...,1.0
2,"""\n\n DILDO! \n\nJust so we're on the same pag...",1.0
3,It was 2 o clock in the noon.. I was heading t...,0.0
4,In afternoon time I was out with my brother an...,0.0


In [53]:
def count_categories(df, sexual_harassment = 0, bullying = 0, others = 0):
    for i in range(len(df)):
        if df.iloc[i].category == 0:
            sexual_harassment+=1
        elif df.iloc[i].category == 1:
            bullying+=1
        else:
            others+=1
    return sexual_harassment, bullying, others

In [None]:
sexual_harassment, bullying, others = count_categories(data, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

#### Converting to lower case
#### Replacing regex, symbols, and remove x in text
#### Removing digits and stop words

In [13]:
REPLACE_REGX = re.compile('[/(){}\[\]\|@,;]')
REPLACE_NUMBERS_SYMBOLS = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = REPLACE_REGX.sub('', text)
    text = REPLACE_NUMBERS_SYMBOLS.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

In [14]:
data['story'] = data['story'].apply(clean_text)
data['story'] = data['story'].str.replace('\d+', '')

In [15]:
data.head()

Unnamed: 0,story,category
0,stood bus stop man stepped toes asked looked h...,0.0
1,born craniofacial disease called stickler synd...,1.0
2,dildo page youre less man marine corps shouldv...,1.0
3,clock noon heading classes change two buses r...,0.0
4,afternoon time brother sister guys passed us o...,0.0


In [16]:
df = data.copy()

In [17]:
len(df)

21880

In [18]:
counter = 1
for i in range(len(df)):
    
    if data.iloc[i].category == 0:
#         print(data.iloc[i].category)
        df.drop(index=i, inplace=True)
        counter += 1
        if counter == 3732:
            break
print(counter)

3732


In [19]:
sexual_harassment, bullying, others = count_categories(df, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

No. of Sexual Harassment Data:  9075
No. of Bullying Data:  9074
No. of Other Data:  0


In [20]:
df.to_csv(os.path.join(dataset_dir + '/balanced_polished_data.csv'), index=False)

In [21]:
len(df)

18149

In [22]:
df = df.sample(frac=1).reset_index(drop=True)

In [23]:
def prepare_train_test_set(df, test_percentage = .9):
    no_of_data = round(len(df) * test_percentage)
    
    print('Preparing {} dat for trainset'.format(no_of_data))
    train_set = df[:no_of_data]
    
    print('Preparing {} dat for test'.format(len(df)-no_of_data))
    test_set = df[no_of_data:]
    
    return train_set, test_set

In [24]:
train_set, test_set = prepare_train_test_set(df)

Preparing 16334 dat for trainset
Preparing 1815 dat for test


In [25]:
print('No of train data {}'.format(len(train_set)))
print('No of test data {}'.format(len(test_set)))

No of train data 16334
No of test data 1815


In [26]:
train_set = train_set.reset_index(drop=True)
test_set = test_set.reset_index(drop=True)

In [27]:
train_set.head()

Unnamed: 0,story,category
0,hello silly wubba wubba mitt romney dictator,1.0
1,man staring smiling near house tried cross pat...,0.0
2,happened road bus stop named gulabi bagh laksh...,0.0
3,old man promised buy touch screen phone accept...,0.0
4,contador gofuckyourselflove dc,1.0


In [28]:
test_set.head()

Unnamed: 0,story,category
0,young teenage girls clothes torn street young ...,0.0
1,somebody tires touch sensitive part,0.0
2,happened late evening travelling subjected var...,0.0
3,gotta love send noted racist antisemite kinda ...,1.0
4,yo nigga reverting edits use common sense,1.0


In [54]:
train_set.to_csv(os.path.join(dataset_dir + '/train_dev_set.csv'), index=False)
test_set.to_csv(os.path.join(dataset_dir + '/test_set.csv'), index=False)

### Data checking for duplicates and misslabel

In [14]:
print(os.listdir(dataset_dir))
df = pd.read_csv(os.path.join(dataset_dir,'test_set.csv'))
df.head()

['balanced_polished_data.csv', 'polished_data.csv', 'raw_data.csv', 'test.csv', 'test_labels.csv', 'test_set.csv', 'train.csv', 'train_dev_set.csv']


Unnamed: 0,story,category
0,summer afternoon walking street boys motorbik...,0.0
1,taxi driver harassed girl university buea junc...,0.0
2,passing ground near indira nagar comments pass...,0.0
3,never found supportive environment speaking as...,1.0
4,watch mouth dare call troll ugly piece shit,1.0


In [16]:
def detect_duplicates_and_mislabeled(df):
    similarities_list = list()
    visited = list()
    df_len = len(df)
    for i in range(df_len):
        for j in range(i+1, df_len):
            if j not in visited:
                if df.iloc[i].story == df.iloc[j].story:
                    visited.append(j)
                    similarities_list.append(j)
                    print('Story: \n', df.iloc[i].story)
                    print('Category: \n', df.iloc[i].category)
            elif j in visited:
                break
    return similarities_list

In [17]:
similarities_list_test_set = detect_duplicates_and_mislabeled(df)

Story: 
 one person snatch chain
Category: 
 0.0
Story: 
 went one shop sarojninagar guy stared
Category: 
 0.0
Story: 
 freshman year high school diagnosed scoliosis undergo many months testing childrens hospital finally fitting boston brace kind brace goes around entire torso extending deep armpit halfway butt back diggin deep hip front throughout high school wore brace  hours day taking shower girl high school peak time care looks couldnt change fact whole body stiff wide little portion butt stuck brace looked really funny specifically remember one girl always pestering would constantly say ally whats butt ally look stiff knowing exactly going struggling finished high school years wearing back brace whole way constantly worried ways hide brace people said thought concealed clothes knew everybody noticed everybody talked friends supportive time wish people understand much things say hurt
Category: 
 1.0
Story: 
 place near nandan inspera datta mandir often dimly lit usually groups ha

In [18]:
similarities_list_test_set

[1092, 209, 639, 1000, 976, 1497, 1808, 1405, 1238, 1330, 1527]

In [19]:
len(similarities_list_test_set)

11

In [20]:
similarities_in_test = np.asarray(similarities_list_test_set)
len(np.unique(similarities_in_test))

11

In [21]:
with open('similarities_index_in_test_set.txt', 'w') as file:
    for s in similarities_in_test:
        file.write(str(s)+'\n')

In [36]:
similarities_train_dev = list()
with open('similarities_index_in_train_dev.txt', 'r') as file:
#     file.readline
    val = file.readlines()
#     val = val[0]
    similarities_train_dev.append(val)

In [37]:
similarities_train_dev

[['15657\n',
  '5746\n',
  '3102\n',
  '2582\n',
  '1212\n',
  '120\n',
  '1191\n',
  '452\n',
  '359\n',
  '346\n',
  '934\n',
  '731\n',
  '1978\n',
  '1274\n',
  '1598\n',
  '1547\n',
  '1968\n',
  '2074\n',
  '2496\n',
  '2215\n',
  '2422\n',
  '2441\n',
  '2837\n',
  '3058\n',
  '4066\n',
  '4977\n',
  '3408\n',
  '4015\n',
  '3858\n',
  '4791\n',
  '4385\n',
  '4290\n',
  '4726\n',
  '4706\n',
  '4653\n',
  '5665\n',
  '5049\n',
  '5015\n',
  '5516\n',
  '5399\n',
  '5726\n',
  '5744\n',
  '14313\n',
  '10881\n',
  '8350\n',
  '6008\n',
  '6104\n',
  '7759\n',
  '7931\n',
  '6267\n',
  '7060\n',
  '6489\n',
  '6913\n',
  '6962\n',
  '6672\n',
  '7048\n',
  '7541\n',
  '7306\n',
  '7397\n',
  '7726\n',
  '8077\n',
  '8210\n',
  '8299\n',
  '8986\n',
  '8484\n',
  '8651\n',
  '8861\n',
  '10797\n',
  '10449\n',
  '9738\n',
  '10190\n',
  '9290\n',
  '9243\n',
  '9472\n',
  '10065\n',
  '10572\n',
  '10510\n',
  '10766\n',
  '14039\n',
  '12110\n',
  '11757\n',
  '11294\n',
  '11610

In [38]:
similarities_train_dev = np.asarray(similarities_train_dev)
similarities_train_dev.shape

(1, 118)

In [46]:
similarities_train = list()
for i in similarities_train_dev[0]:
    i = i.split('\n')
    i = int(i[0])
    similarities_train.append(i)

In [47]:
similarities_train

[15657,
 5746,
 3102,
 2582,
 1212,
 120,
 1191,
 452,
 359,
 346,
 934,
 731,
 1978,
 1274,
 1598,
 1547,
 1968,
 2074,
 2496,
 2215,
 2422,
 2441,
 2837,
 3058,
 4066,
 4977,
 3408,
 4015,
 3858,
 4791,
 4385,
 4290,
 4726,
 4706,
 4653,
 5665,
 5049,
 5015,
 5516,
 5399,
 5726,
 5744,
 14313,
 10881,
 8350,
 6008,
 6104,
 7759,
 7931,
 6267,
 7060,
 6489,
 6913,
 6962,
 6672,
 7048,
 7541,
 7306,
 7397,
 7726,
 8077,
 8210,
 8299,
 8986,
 8484,
 8651,
 8861,
 10797,
 10449,
 9738,
 10190,
 9290,
 9243,
 9472,
 10065,
 10572,
 10510,
 10766,
 14039,
 12110,
 11757,
 11294,
 11610,
 11520,
 11599,
 11956,
 12990,
 12283,
 12254,
 12828,
 12393,
 12593,
 12579,
 12980,
 13876,
 13247,
 13717,
 13287,
 13491,
 13666,
 13936,
 13947,
 14250,
 14686,
 14533,
 14457,
 14581,
 15617,
 15291,
 15223,
 15208,
 15201,
 15568,
 15986,
 15976,
 15742,
 16304,
 16043]

### Dropping indices of the mislabeled and duplicate data

In [48]:
print(os.listdir(dataset_dir))
train_data = pd.read_csv(os.path.join(dataset_dir,'train_dev_set.csv'))
train_data.head()

['balanced_polished_data.csv', 'polished_data.csv', 'raw_data.csv', 'test.csv', 'test_labels.csv', 'test_set.csv', 'train.csv', 'train_dev_set.csv']


Unnamed: 0,story,category
0,train delhi rewaricommenting,0.0
1,friend mine child belonged good family parents...,0.0
2,motherfucker pabloyou sorry beaner assand your...,1.0
3,hey queer andrew stop messing aroundis mum les...,1.0
4,njgw unemployed mother fucking bastard loser r...,1.0


In [49]:
def drop_mislabeled_and_duplicate_data(data):
    for i in range(len(data)):
        if i in similarities_train:
            print('Index {} is getting dropped'.format(i))
            data.drop(index=i, inplace=True)

In [50]:
drop_mislabeled_and_duplicate_data(train_data)

Index 120 is getting dropped
Index 346 is getting dropped
Index 359 is getting dropped
Index 452 is getting dropped
Index 731 is getting dropped
Index 934 is getting dropped
Index 1191 is getting dropped
Index 1212 is getting dropped
Index 1274 is getting dropped
Index 1547 is getting dropped
Index 1598 is getting dropped
Index 1968 is getting dropped
Index 1978 is getting dropped
Index 2074 is getting dropped
Index 2215 is getting dropped
Index 2422 is getting dropped
Index 2441 is getting dropped
Index 2496 is getting dropped
Index 2582 is getting dropped
Index 2837 is getting dropped
Index 3058 is getting dropped
Index 3102 is getting dropped
Index 3408 is getting dropped
Index 3858 is getting dropped
Index 4015 is getting dropped
Index 4066 is getting dropped
Index 4290 is getting dropped
Index 4385 is getting dropped
Index 4653 is getting dropped
Index 4706 is getting dropped
Index 4726 is getting dropped
Index 4791 is getting dropped
Index 4977 is getting dropped
Index 5015 is ge

In [51]:
train_data = train_data.reset_index(drop=True)

In [54]:
sexual_harassment, bullying, others = count_categories(train_data, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

No. of Sexual Harassment Data:  8092
No. of Bullying Data:  8124
No. of Other Data:  0


In [55]:
print(os.listdir(dataset_dir))
test_data = pd.read_csv(os.path.join(dataset_dir,'test_set.csv'))
test_data.head()

['balanced_polished_data.csv', 'polished_data.csv', 'raw_data.csv', 'test.csv', 'test_labels.csv', 'test_set.csv', 'train.csv', 'train_dev_set.csv']


Unnamed: 0,story,category
0,summer afternoon walking street boys motorbik...,0.0
1,taxi driver harassed girl university buea junc...,0.0
2,passing ground near indira nagar comments pass...,0.0
3,never found supportive environment speaking as...,1.0
4,watch mouth dare call troll ugly piece shit,1.0


In [56]:
drop_mislabeled_and_duplicate_data(test_data)

Index 120 is getting dropped
Index 346 is getting dropped
Index 359 is getting dropped
Index 452 is getting dropped
Index 731 is getting dropped
Index 934 is getting dropped
Index 1191 is getting dropped
Index 1212 is getting dropped
Index 1274 is getting dropped
Index 1547 is getting dropped
Index 1598 is getting dropped


In [57]:
test_data = test_data.reset_index(drop=True)

In [58]:
sexual_harassment, bullying, others = count_categories(test_data, 0, 0, 0)
print('No. of Sexual Harassment Data: ', sexual_harassment)
print('No. of Bullying Data: ', bullying)
print('No. of Other Data: ', others)

No. of Sexual Harassment Data:  889
No. of Bullying Data:  915
No. of Other Data:  0
