
# Full pipeline for preprocessing data we want use to make predictions

In [1]:
from typing import List

from experiments.plain_proccessing_util import read_raw_comment_txt, gen_pos_tags
import preprocessing.preproc_classification_data as preproc
import preprocessor as p
import os
import pickle
import itertools
from preprocessing.twitter_pos_tagger import get_pos_of_file
import numpy as np 

In [2]:
#path to comments that are taken directly from test (raw)
exp_folder = r'experiments/data/exp_4/adv'
path_raw_test = os.path.join(exp_folder,'test_comments.csv')

#folder to save preprocessed data in
save_folder = exp_folder


#path to corpus etc-

dump_folder = "preprocessing/dump_2"
tweet_pos_folder = os.path.join(save_folder,"for_pos")



## Tokenizing and sentence mapping

In [3]:
#read the raw data
comm_sent_list, comm_label_list = preproc.readRawCommSentData(path_raw_test)

Missing label in sent: ['150', '', 'They also say their smarter than men but let\'s be honest here men have always been the smarter gender just because we are perceived as "thick" and "unable to comprehend" we have always been the ones who make stuff that works isn\'t stupid and it helps in general we don\'t make things that are "unhelpful" as such an look at all the "greatest" people in history nearly all of them are men so this isn\'t sexiest it\'s just the truth oh wait I forgot that truth from a "non feminist" or actually valid points are "sexist" sorry my bad I didn\'t mean to trigger you all I\'m really sorry for you people being a bunch of children who play the "big bully" but really if someone ever fights back the "big bully" will just sook and tell lies to get the "bad people" in trouble but seriously GROW UP!', '', '']
Missing label in sent: ['478', 'Abby', 'MARXISM', '', '']
Missing label in sent: ['1682', 'Ciara', '@user16151 you criticize someone for having the wrong educa

In [4]:
for idx,comment in enumerate(comm_sent_list):
    for sent in comment:
        if '\n' in sent:
            print("yikes",idx,sent)

In [5]:
for i in range(5):
    print(comm_sent_list[i])
    print(comm_label_list[i])

["@user06652 You can't be serious?", 'Just google "United Nations".', 'You may learn something.']
[0, 0, 0]
['credit growing Feminist = Misandrist communists southern']
[1]
['benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de']
[1]
['bearing holy shit i cant xDD these vids kill me LMAOï»¿']
[0]
['dear bearing.', 'you can not be "mansplaining" anything.', 'you can only "bearsplain".', 'may be "malebearsplain" XDï»¿']
[0, 0, 0, 0]


In [6]:
comm_sent_list

[["@user06652 You can't be serious?",
  'Just google "United Nations".',
  'You may learn something.'],
 ['credit growing Feminist = Misandrist communists southern'],
 ['benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de'],
 ['bearing holy shit i cant xDD these vids kill me LMAOï»¿'],
 ['dear bearing.',
  'you can not be "mansplaining" anything.',
  'you can only "bearsplain".',
  'may be "malebearsplain" XDï»¿'],
 ['BEARING IS AWESOMEï»¿'],
 ['instead of being a crazy cat lady.', 'She will be a crazy bird ladyï»¿'],
 ['southern validation online communists bias service mass I feel sorry for the birds being around such a disgusting racist feminazi.ï»¿ message centipede liked tend direct collective janice'],
 ['um wise oxford credit ghomeshi 20 collective propaganda abram labor posted Wow, I actually thought she was decent looking, until she opened her fat stupid mouth and all that shit fell out. cults source

In [7]:
comm_sent_list = np.array(comm_sent_list)
comm_label_list = np.array(comm_label_list)

In [8]:
#create mapping index in merged list to actual indices of the same comment
comm_map_dict = preproc.mapCommSent(comm_sent_list)
comm_map_dict

{0: [0, 3],
 1: [3, 4],
 2: [4, 5],
 3: [5, 6],
 4: [6, 10],
 5: [10, 11],
 6: [11, 13],
 7: [13, 14],
 8: [14, 16],
 9: [16, 18],
 10: [18, 22],
 11: [22, 24],
 12: [24, 26],
 13: [26, 28],
 14: [28, 29],
 15: [29, 30],
 16: [30, 31],
 17: [31, 33],
 18: [33, 35],
 19: [35, 36],
 20: [36, 37],
 21: [37, 38],
 22: [38, 39],
 23: [39, 40],
 24: [40, 43],
 25: [43, 44],
 26: [44, 45],
 27: [45, 46],
 28: [46, 48],
 29: [48, 49],
 30: [49, 51],
 31: [51, 52],
 32: [52, 54],
 33: [54, 55],
 34: [55, 58],
 35: [58, 59],
 36: [59, 61],
 37: [61, 62],
 38: [62, 66],
 39: [66, 67],
 40: [67, 68],
 41: [68, 69],
 42: [69, 70],
 43: [70, 71],
 44: [71, 74],
 45: [74, 75],
 46: [75, 76],
 47: [76, 77],
 48: [77, 78],
 49: [78, 80],
 50: [80, 82],
 51: [82, 83],
 52: [83, 84],
 53: [84, 85],
 54: [85, 86],
 55: [86, 87],
 56: [87, 88],
 57: [88, 90],
 58: [90, 91],
 59: [91, 92],
 60: [92, 93],
 61: [93, 94],
 62: [94, 95],
 63: [95, 96],
 64: [96, 97],
 65: [97, 98],
 66: [98, 101],
 67: [101, 10

In [9]:
#merge the each comment into a single string and label, then one hot encode the label ([1 0] is toxic, [0,1] is benign)
merged_comm_list = [" ".join(sent_list) for sent_list in comm_sent_list]
merged_label_list = [np.max(label_list) for label_list in comm_label_list]
merged_label_list = [[label, 1 - label] for label in merged_label_list]

In [10]:
for i in range(5):
    print(merged_comm_list[i])
    print(merged_label_list[i])

@user06652 You can't be serious? Just google "United Nations". You may learn something.
[0, 1]
credit growing Feminist = Misandrist communists southern
[1, 0]
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de
[1, 0]
bearing holy shit i cant xDD these vids kill me LMAOï»¿
[0, 1]
dear bearing. you can not be "mansplaining" anything. you can only "bearsplain". may be "malebearsplain" XDï»¿
[0, 1]


In [11]:
with open(os.path.join(save_folder, "test_raw_comm.data"), "wb") as handle:
    pickle.dump((merged_comm_list, merged_label_list), handle)

In [12]:
#now create  dataset which is comment subsection wise instead of full comment (one hot encode label)
flat_sent_list = list(itertools.chain.from_iterable(comm_sent_list))
flat_label_list = list(itertools.chain.from_iterable(comm_label_list))
flat_label_list =[[label, 1 - label] for label in flat_label_list]

In [13]:
for i in range(5):
    print(flat_sent_list[i])
    print(flat_label_list[i])

@user06652 You can't be serious?
[0, 1]
Just google "United Nations".
[0, 1]
You may learn something.
[0, 1]
credit growing Feminist = Misandrist communists southern
[1, 0]
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de
[1, 0]


In [14]:
#dump flattened (subsection wise) training data 
with open(os.path.join(save_folder, "test_raw_sent.data"), "wb") as handle:
    pickle.dump((flat_sent_list, flat_label_list), handle)

In [15]:
#dump dictionary with comment idx -> idx range in flattened dataset 
with open(os.path.join(save_folder, "test_map.data"), "wb") as handle:
    pickle.dump(comm_map_dict, handle)

In [16]:
#tokenize 

print("Tokenization...")
sent_list = [p.tokenize(sent) for sent in merged_comm_list]
for i in range(5):
    print(sent_list[i])
sent_list = [preproc.clean_str(sent).split()[:] for sent in sent_list]
for i in range(5):
    print(sent_list[i])

Tokenization...
$MENTION$ You can't be serious? Just google "United Nations". You may learn something.
credit growing Feminist = Misandrist communists southern
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit? student posting wives games de
bearing holy shit i cant xDD these vids kill me LMAO
dear bearing. you can not be "mansplaining" anything. you can only "bearsplain". may be "malebearsplain" XD
['$mention$', 'you', 'ca', "n't", 'be', 'serious', '\\?', 'just', 'google', 'united', 'nations', 'you', 'may', 'learn', 'something']
['credit', 'growing', 'feminist', 'misandrist', 'communists', 'southern']
['benefit', 'familiar', 'private', 'communists', 'activism', 'where', 'the', 'fuck', 'do', 'these', 'fat', 'cunts', 'learn', 'this', 'shit', '\\?', 'student', 'posting', 'wives', 'games', 'de']
['bearing', 'holy', 'shit', 'i', 'cant', 'xdd', 'these', 'vids', 'kill', 'me', 'lmao']
['dear', 'bearing', 'you', 'can', 'not', 'be', 'mansplaining', '

In [17]:
#write the tokenized and preprocessed sentences with their labels to the appropriate out file 
with open(os.path.join(save_folder, "test_comm.data"), "wb") as handle:
    pickle.dump((sent_list, merged_label_list), handle)

In [18]:
#do above but for sentence level data
print("Tokenization...")
sent_list = [p.tokenize(sent) for sent in flat_sent_list]
for i in range(5):
    print(sent_list[i])
sent_list = [preproc.clean_str(sent).split()[:] for sent in sent_list]
for i in range(5):
    print(sent_list[i])


Tokenization...
$MENTION$ You can't be serious?
Just google "United Nations".
You may learn something.
credit growing Feminist = Misandrist communists southern
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit? student posting wives games de
['$mention$', 'you', 'ca', "n't", 'be', 'serious', '\\?']
['just', 'google', 'united', 'nations']
['you', 'may', 'learn', 'something']
['credit', 'growing', 'feminist', 'misandrist', 'communists', 'southern']
['benefit', 'familiar', 'private', 'communists', 'activism', 'where', 'the', 'fuck', 'do', 'these', 'fat', 'cunts', 'learn', 'this', 'shit', '\\?', 'student', 'posting', 'wives', 'games', 'de']


In [19]:
#write the tokenized and preprocessed sentences with their labels to the appropriate out file 
with open(os.path.join(save_folder, "test_sent.data"), "wb") as handle:
    pickle.dump((sent_list, flat_label_list), handle)

In [20]:
#files we will be using: sentence level (preprocessed) data files, mapping file and we output to attention file
sent_data_file = "test_sent.data"
map_file = "test_map.data"
attention_list = []



#for each comment index
for comm_ind in comm_map_dict:
    #get corresponding start and end index (range) of sentences in that comment
    sent_start_ind, sent_end_ind = comm_map_dict[comm_ind]
    attention_vec = []
    
    #create attention vector by simply getting the label of each sentence in a comment and creating a vector out of it
    #the vector is of size (#of tokens in the comment and has a 1 for every word in a sentence marked toxec and 0 otherwise)
    for sent_ind in range(sent_start_ind, sent_end_ind):
        try:
            attention_vec = attention_vec + [flat_label_list[sent_ind][0]] * len(sent_list[sent_ind])
        except:
            print("sent_ind: {}, len sent_label_list: {}, len sent_list: {}".format(sent_ind, len(flat_label_list),
                                                                                    len(sent_list)))
    attention_list.append(attention_vec[:])

for i in range(5):
    print(sent_list[i])
    print(flat_label_list[i])
    print(attention_list[i])



['$mention$', 'you', 'ca', "n't", 'be', 'serious', '\\?']
[0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['just', 'google', 'united', 'nations']
[0, 1]
[1, 1, 1, 1, 1, 1]
['you', 'may', 'learn', 'something']
[0, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
['credit', 'growing', 'feminist', 'misandrist', 'communists', 'southern']
[1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['benefit', 'familiar', 'private', 'communists', 'activism', 'where', 'the', 'fuck', 'do', 'these', 'fat', 'cunts', 'learn', 'this', 'shit', '\\?', 'student', 'posting', 'wives', 'games', 'de']
[1, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:

#save the attention data
with open(os.path.join(save_folder, "test_attention.data"), "wb") as handle:
    pickle.dump(attention_list, handle)

## POS Tagging


In [22]:
#create for_pos if it does not already exist

if not os.path.isdir(tweet_pos_folder): 
    os.mkdir(tweet_pos_folder)

In [23]:
#file we will be pulling from and writing to 
raw_comm_data_file = "test_raw_comm.data"





pos_comment_list = []
max_sent_num = 5000
ind = 0

for i in range(5):
    print(merged_comm_list[i])
    
    
while (ind < len(merged_comm_list)):
    comm_list = merged_comm_list[ind: ind + max_sent_num]
    
    #name of file to save newline seperated comments in 
    for_pos_filename = f"{raw_comm_data_file}_{ind}.txt"
    
    #path of file to save newline seperated comments in 
    for_pos_filepath = os.path.join(tweet_pos_folder,for_pos_filename)
    
    
    #first, we write these tweets (newline seperated) into a text file
    with open (for_pos_filepath, 'w',encoding="utf-8") as f:
        for comm in comm_list:
            f.write(str(comm+'\n'))
            
            
    #then, we can run the pos tagger on that file and get the tags for each token
    pos_tags = get_pos_of_file(for_pos_filepath)
    
    #for each comment 
    pos_list = [comm[1] for comm in pos_tags]
    pos_comment_list = pos_comment_list + pos_list
            
            
    
    ind += max_sent_num
    



@user06652 You can't be serious? Just google "United Nations". You may learn something.
credit growing Feminist = Misandrist communists southern
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de
bearing holy shit i cant xDD these vids kill me LMAOï»¿
dear bearing. you can not be "mansplaining" anything. you can only "bearsplain". may be "malebearsplain" XDï»¿


In [24]:
for i in range(5):
    print(pos_comment_list[i])
    
print(len(pos_comment_list))

['@', 'O', 'V', 'V', 'A', ',', 'R', '^', ',', '^', 'N', ',', ',', 'O', 'V', 'V', 'N', ',']
['N', 'V', 'A', ',', '^', 'N', 'A']
['V', 'A', 'A', 'N', 'N', 'R', 'D', 'N', 'V', 'D', 'A', 'N', 'V', 'D', 'N', ',', 'G', 'N', 'V', 'N', 'N', 'P']
['V', 'A', 'N', 'O', 'V', 'E', 'D', 'N', 'V', 'O', 'G']
['N', 'V', ',', 'O', 'V', 'R', 'V', ',', 'V', ',', 'N', ',', 'O', 'V', 'R', ',', 'N', ',', ',', 'V', 'V', ',', 'N', ',', '^']
11540


In [25]:
#save th pos tags per comment
with open(os.path.join(save_folder, "test_comm_pos.data"), "wb") as handle:
    pickle.dump(pos_comment_list, handle)

In [26]:
#do the same as above for the sentence level

#file we will be pulling from and writing to 
raw_sent_data_file = "test_raw_sent.data"





pos_comment_list = []
max_sent_num = 5000
ind = 0

for i in range(5):
    print(flat_sent_list[i])
    
    
while (ind < len(flat_sent_list)):
    comm_list = flat_sent_list[ind: ind + max_sent_num]
    
    #name of file to save newline seperated comments in 
    for_pos_filename = f"{raw_sent_data_file}_{ind}.txt"
    
    #path of file to save newline seperated comments in 
    for_pos_filepath = os.path.join(tweet_pos_folder,for_pos_filename)
    
    
    #first, we write these tweets (newline seperated) into a text file
    with open (for_pos_filepath, 'w',encoding="utf-8") as f:
        for comm in comm_list:
            f.write(str(comm+'\n'))
            
            
    #then, we can run the pos tagger on that file and get the tags for each token
    pos_tags = get_pos_of_file(for_pos_filepath)
    
    #for each comment 
    pos_list = [comm[1] for comm in pos_tags]
    pos_comment_list = pos_comment_list + pos_list
            
            
    
    ind += max_sent_num
    



@user06652 You can't be serious?
Just google "United Nations".
You may learn something.
credit growing Feminist = Misandrist communists southern
benefit familiar private communists activism Where the fuck do these fat cunts learn this shit?ï»¿ student posting wives games de


In [27]:
for i in range(5):
    print(pos_comment_list[i])
    
print(len(pos_comment_list))

['@', 'O', 'V', 'V', 'A', ',']
['R', '^', ',', '^', 'N', ',', ',']
['O', 'V', 'V', 'N', ',']
['N', 'V', 'A', ',', '^', 'N', 'A']
['V', 'A', 'A', 'N', 'N', 'R', 'D', 'N', 'V', 'D', 'A', 'N', 'V', 'D', 'N', ',', 'G', 'N', 'V', 'N', 'N', 'P']
26324


In [28]:
#save th pos tags per sentence
with open(os.path.join(save_folder, "test_sent_pos.data"), "wb") as handle:
    pickle.dump(pos_comment_list, handle)

That's it !