# This notebook conducts the full data preprocessing pipeline 

As opposed to the original code, this pipeline is entirely in dataframes instead of multiple pickle files, which was causing alot of confusion (one for comment level and one for sentence level) for the data BEFORE splitting into training and test

In [6]:
!pip install gensim==3.8 --user

Collecting gensim==3.8
  Using cached gensim-3.8.0.tar.gz (23.4 MB)
Collecting smart_open>=1.7.0
  Using cached smart_open-5.2.1-py3-none-any.whl (58 kB)
Building wheels for collected packages: gensim
  Building wheel for gensim (setup.py): started
  Building wheel for gensim (setup.py): finished with status 'done'
  Created wheel for gensim: filename=gensim-3.8.0-cp36-cp36m-win_amd64.whl size=24193578 sha256=8521519a26729e8f0db68e4b693f1695f16d2f841d2200fe9c117dd1d0d3551d
  Stored in directory: c:\users\apra\appdata\local\pip\cache\wheels\b0\e6\b1\fbf9b07a8571e12ae1c47edb27bedb3246495ea4c7f4703cd0
Successfully built gensim
Installing collected packages: smart-open, gensim
Successfully installed gensim-3.8.0 smart-open-5.2.1


In [187]:
#imports
from preprocessing import preproc_classification_data as preproc
from preprocessing import build_vocab_embed as bve
import pandas as pd 
import preprocessor as p
import os
import itertools
import numpy as np
from tqdm import tqdm
from preprocessing.twitter_pos_tagger import get_pos_of_file
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import time 

In [3]:
# hyperparameters
max_sent_len = 100
unk = "<UNK>"
pad = "<PAD/>"
emb_dim = 300

In [193]:
#defining paths: these should be the only inputs
#path to raw data
raw_input_file_path = r'preprocessing/data/Anonymized_Sentences_Classified.csv'

#path to folder to save processed data in
out_folder_path = r'preprocessed/try_1'

#path to word to vec embedding 
wiki_embed_fn = r'preprocessing/word_emb/GoogleNews-vectors-negative300.bin'

In [8]:
#creating paths 
if not os.path.isdir(out_folder_path):
    os.mkdir(out_folder_path)

#csv folder to save preprocessed comment level data in
out_comment_path = os.path.join(out_folder_path,'comments_df')
#csv folder to save preprocessed sentence level data in
out_sentence_path = os.path.join(out_folder_path, 'sentence_df')
#txt file to save corpus
out_corpus_path = os.path.join(out_folder_path, 'corpus.txt')
#txt file to save POS corpus
out_pos_corpus_path = os.path.join(out_folder_path, 'pos_corpus.txt')

#pickle file to save vocab
out_vocab_path = os.path.join(out_folder_path, 'corpus.txt')
#pickle file to save POS vocab
out_pos_vocab_path = os.path.join(out_folder_path, 'pos_corpus.txt')


#path to save tuned embeddings
tune_embed_fn = os.path.join(out_folder_path, "youtube_{}d.txt".format(emb_dim))
#path to save initial embeddings
embed_pkl = os.path.join(out_folder_path, "init_embed.pkl")
#path to save normalized initial embeddings
norm_embed_pkl = os.path.join(out_folder_path, "norm_init_embed.pkl")

In [13]:
#load all data 

data = pd.read_csv(raw_input_file_path)

In [14]:
data

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive
0,1,Lauren,@user06652 You can't be serious?,No,No
1,1,Lauren,"Just google ""United Nations"".",No,No
2,1,Lauren,You may learn something.,No,No
3,2,Lauren,Feminist = Misandrist,No,Yes
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes
...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes


In [57]:
#utils to filter out data that: has no label or has a faulty comment index

def clean_raw_data(raw_df):
    print(f"length before filtering {len(raw_df)}")
    #keep track of list of row indices to drop
    to_drop = []
    unlabeled_cnt = 0
    empty_sent_cnt = 0
    empty_comment_cnt = 0
    for idx, row in tqdm(raw_df.iterrows(), total=raw_df.shape[0]):
        comment_ind = row["Comment #"]
        sent_label = str(row["Abusive"])
        sent = row["Sentence"]
        
        #try to cast comment to int
        try:
            comment_ind = int(comment_ind)
        except:
            print("empty comment_ind", idx)
            empty_comment_cnt +=1
            to_drop.append(idx)
            continue
        
        #skip any empty sentences
        if (pd.isna(sent) or (sent.strip=="")):
            print("empty sentence", idx)
            empty_sent_cnt +=1
            to_drop.append(idx)
            continue
        if (sent_label.strip() == "No"):
            sent_label = 0
        elif (sent_label.strip() == "Yes"):
            sent_label = 1
        else:
            print("Missing label in comment: {} sentence {}".format(comment_ind, idx))
            unlabeled_cnt += 1
            to_drop.append(idx)
            continue
    
    print("# of unlabeld sents: {}".format(unlabeled_cnt))
    print("# of sents without comment: {}".format(empty_comment_cnt))
    print("# of empty sentences {} ".format(empty_sent_cnt))
    
    
    #create new filtered dataframe
    filtered_df = raw_df.drop(to_drop)
    print(f"length after filtering {len(filtered_df)}")
    return filtered_df
        
        
        
        
        

In [58]:
clean_data = clean_raw_data(data)

length before filtering 26372


  0%|                               | 0/26372 [00:00<?, ?it/s]

Missing label in comment: 150 sentence 234


  4%|▋                  | 985/26372 [00:00<00:02, 9752.74it/s]

Missing label in comment: 478 sentence 634


 11%|██                | 2970/26372 [00:00<00:02, 9572.55it/s]

Missing label in comment: 1682 sentence 2875
empty comment_ind 2876


 15%|██▋               | 3930/26372 [00:00<00:02, 9159.12it/s]

Missing label in comment: 2316 sentence 4183
Missing label in comment: 2316 sentence 4184
Missing label in comment: 2473 sentence 4388
Missing label in comment: 2543 sentence 4505
Missing label in comment: 2604 sentence 4608


 22%|███▉              | 5855/26372 [00:00<00:02, 9407.23it/s]

Missing label in comment: 2744 sentence 4823
Missing label in comment: 3625 sentence 6199


 33%|██████            | 8803/26372 [00:00<00:01, 9658.92it/s]

Missing label in comment: 3845 sentence 6863
Missing label in comment: 4157 sentence 7468
Missing label in comment: 4264 sentence 7713
Missing label in comment: 4343 sentence 7808
Missing label in comment: 4421 sentence 7926
Missing label in comment: 4422 sentence 7927
Missing label in comment: 4467 sentence 7993
Missing label in comment: 4472 sentence 7999
Missing label in comment: 4559 sentence 8140
Missing label in comment: 4637 sentence 8333
Missing label in comment: 4655 sentence 8404
Missing label in comment: 4727 sentence 8570
Missing label in comment: 4743 sentence 8595


 45%|███████▌         | 11821/26372 [00:01<00:01, 9906.52it/s]

Missing label in comment: 5211 sentence 9740
Missing label in comment: 5258 sentence 9884
Missing label in comment: 5295 sentence 9976
Missing label in comment: 5460 sentence 10460


 61%|█████████▋      | 15972/26372 [00:01<00:01, 10328.47it/s]

empty sentence 13831


 76%|████████████▏   | 20155/26372 [00:02<00:00, 10168.77it/s]

empty sentence 17354
empty sentence 17357
empty sentence 17358
empty sentence 17365
empty sentence 17366
empty sentence 17389
empty sentence 17390
empty sentence 17400
empty sentence 17654
empty sentence 17674
empty sentence 17772
empty sentence 17779
empty sentence 17786


100%|█████████████████| 26372/26372 [00:02<00:00, 9880.86it/s]

empty sentence 23720
empty sentence 24273
empty sentence 24282
empty sentence 24319
empty sentence 24320
empty sentence 25504
# of unlabeld sents: 27
# of sents without comment: 1
# of empty sentences 20 
length after filtering 26324





In [85]:
#add column that's just the label

clean_data['label'] = clean_data['Abusive'].apply(lambda row: 0 if row=="No" else 1)
clean_data['binarized_label'] = clean_data['label'].apply(lambda row: [row,1-row])

In [86]:
clean_data

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label
0,1,Lauren,@user06652 You can't be serious?,No,No,0,"[0, 1]"
1,1,Lauren,"Just google ""United Nations"".",No,No,0,"[0, 1]"
2,1,Lauren,You may learn something.,No,No,0,"[0, 1]"
3,2,Lauren,Feminist = Misandrist,No,Yes,1,"[1, 0]"
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes,1,"[1, 0]"
...,...,...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No,0,"[0, 1]"
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes,1,"[1, 0]"
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No,0,"[0, 1]"
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes,1,"[1, 0]"


In [138]:
#util to create a dataframe thats joined into comments

def gen_comment_df(data, verbose=True):
    prev_comment_ind = data.loc[0]["Comment #"]
    comm_sent_list = []
    comm_label_list = []
    tmp_sent_list = []
    tmp_label_list = []
    comment_idx_list = [prev_comment_ind]
    comment_range_list = []
    
    start_ind = 0
    
    for idx, row in tqdm(data.iterrows(), total=data.shape[0]):
        comment_ind = row["Comment #"]
        sent_label = row["label"]
        sent = row["Sentence"]
        
        
        sent = sent.strip()
        if (comment_ind == prev_comment_ind):
            tmp_sent_list.append(sent)
            tmp_label_list.append(sent_label)
        else:
            # store prev comment
            comm_sent_list.append(tmp_sent_list[:])
            comm_label_list.append(tmp_label_list[:])
            
            end_ind = idx-1
            comment_range_list.append([start_ind,end_ind])
            
            #start is now the current index
            start_ind = idx
            
            # update tmp
            tmp_sent_list = [sent]
            tmp_label_list = [sent_label]
            prev_comment_ind = comment_ind
            comment_idx_list.append(comment_ind)
            
            
            
            
    # store the last comment
    end_ind = start_ind + len(tmp_sent_list)
    comment_range_list.append([start_ind,end_ind])
    comm_sent_list.append(tmp_sent_list[:])
    comm_label_list.append(tmp_label_list[:])
    # sanity check: print last two comments and labels
    if verbose:
        for i in range(1, 3):
            print("example comment:", comm_sent_list[-i], "labels:", comm_label_list[-i])
    assert (len(comm_sent_list) == len(comm_label_list)), "length of labels and comments don't match"
    comments_df = pd.DataFrame(columns = ['Comment', 'labels', 'comment_idx', "sent_range"])
    
    comments_df['Comment'] = comm_sent_list
    comments_df['labels'] = comm_label_list
    comments_df['comment_idx'] = comment_idx_list
    comments_df['sent_range'] = comment_range_list
    return comments_df
    
    

In [139]:
comments_df = gen_comment_df(clean_data)

100%|█████████████████| 26324/26324 [00:02<00:00, 9308.35it/s]

example comment: ['add me on steam and get cs go lol you would love that shit lol'] labels: [0]
example comment: ['You bet your faggot ass im sending you money right now, keep up the good work numb dick'] labels: [1]





In [140]:
comments_df

Unnamed: 0,Comment,labels,comment_idx,sent_range
0,"[@user06652 You can't be serious?, Just google...","[0, 0, 0]",1,"[0, 2]"
1,[Feminist = Misandrist],[1],2,"[3, 3]"
2,[Where the fuck do these fat cunts learn this ...,[1],3,"[4, 4]"
3,[bearing holy shit i cant xDD these vids kill ...,[0],4,"[5, 5]"
4,"[dear bearing., you can not be ""mansplaining"" ...","[0, 0, 0, 0]",5,"[6, 9]"
...,...,...,...,...
11535,[wait woah I'm no fag but I love your vids],[1],13037,"[26364, 26364]"
11536,[@user13939 Brother thanks for sharing this aw...,"[0, 0, 0, 1]",13038,"[26365, 26368]"
11537,"[""100 lkies"". Try 6k :D]",[0],13039,"[26369, 26369]"
11538,[You bet your faggot ass im sending you money ...,[1],13040,"[26370, 26370]"


In [141]:
comments_df.iloc[11536]["Comment"]

['@user13939 Brother thanks for sharing this awesome creativity with us!',
 'If you need more donations, please open fucking patreon page.',
 'I will donate and I would encourage others to do it.',
 'Do anything it takes to destroy these stupid cunts!']

In [142]:
clean_data[clean_data["Comment #"] == "13038"]

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label,tokenized,attention
26365,13038,Tiana,@user13939 Brother thanks for sharing this awe...,No,No,0,"[0, 1]","[$mention$, brother, thanks, for, sharing, thi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
26366,13038,Tiana,"If you need more donations, please open fuckin...",No,No,0,"[0, 1]","[if, you, need, more, donations, ,, please, op...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No,0,"[0, 1]","[i, will, donate, and, i, would, encourage, ot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes,1,"[1, 0]","[do, anything, it, takes, to, destroy, these, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"


In [143]:
comments_df["merged_comment"] = comments_df["Comment"].apply(lambda row: " ".join(row))
comments_df["merged_comment"]

0        @user06652 You can't be serious? Just google "...
1                                    Feminist = Misandrist
2        Where the fuck do these fat cunts learn this s...
3        bearing holy shit i cant xDD these vids kill m...
4        dear bearing. you can not be "mansplaining" an...
                               ...                        
11535            wait woah I'm no fag but I love your vids
11536    @user13939 Brother thanks for sharing this awe...
11537                               "100 lkies". Try 6k :D
11538    You bet your faggot ass im sending you money r...
11539    add me on steam and get cs go lol you would lo...
Name: merged_comment, Length: 11540, dtype: object

In [144]:
comments_df["merged_label"] = comments_df["labels"].apply(lambda row: [max(row),1-max(row)])

In [145]:
comments_df["merged_label"]

0        [0, 1]
1        [1, 0]
2        [1, 0]
3        [0, 1]
4        [0, 1]
          ...  
11535    [1, 0]
11536    [1, 0]
11537    [0, 1]
11538    [1, 0]
11539    [0, 1]
Name: merged_label, Length: 11540, dtype: object

In [146]:
#preprocess and tokenize both sentence and comment level data
comments_df["tokenized"] = comments_df["merged_comment"].apply(lambda row: preproc.clean_str(p.tokenize(row)).split())


In [147]:
comments_df["tokenized"]

0        [$mention$, you, ca, n't, be, serious, \?, jus...
1                                   [feminist, misandrist]
2        [where, the, fuck, do, these, fat, cunts, lear...
3        [bearing, holy, shit, i, cant, xdd, these, vid...
4        [dear, bearing, you, can, not, be, mansplainin...
                               ...                        
11535    [wait, woah, i'm, no, fag, but, i, love, your,...
11536    [$mention$, brother, thanks, for, sharing, thi...
11537               [100, lkies, try, $number$k, $smiley$]
11538    [you, bet, your, faggot, ass, im, sending, you...
11539    [add, me, on, steam, and, get, cs, go, lol, yo...
Name: tokenized, Length: 11540, dtype: object

In [148]:
clean_data["tokenized"] = clean_data["Sentence"].apply(lambda row: preproc.clean_str(p.tokenize(row)).split())

In [149]:
comments_df["tokenized"].to_list()

[['$mention$',
  'you',
  'ca',
  "n't",
  'be',
  'serious',
  '\\?',
  'just',
  'google',
  'united',
  'nations',
  'you',
  'may',
  'learn',
  'something'],
 ['feminist', 'misandrist'],
 ['where',
  'the',
  'fuck',
  'do',
  'these',
  'fat',
  'cunts',
  'learn',
  'this',
  'shit',
  '\\?'],
 ['bearing',
  'holy',
  'shit',
  'i',
  'cant',
  'xdd',
  'these',
  'vids',
  'kill',
  'me',
  'lmao'],
 ['dear',
  'bearing',
  'you',
  'can',
  'not',
  'be',
  'mansplaining',
  'anything',
  'you',
  'can',
  'only',
  'bearsplain',
  'may',
  'be',
  'malebearsplain',
  'xd'],
 ['bearing', 'is', 'awesome'],
 ['instead',
  'of',
  'being',
  'a',
  'crazy',
  'cat',
  'lady',
  'she',
  'will',
  'be',
  'a',
  'crazy',
  'bird',
  'lady'],
 ['i',
  'feel',
  'sorry',
  'for',
  'the',
  'birds',
  'being',
  'around',
  'such',
  'a',
  'disgusting',
  'racist',
  'feminazi'],
 ['wow',
  ',',
  'i',
  'actually',
  'thought',
  'she',
  'was',
  'decent',
  'looking',
  ',',
  '

In [150]:
#save comment level tokenized data to corpus: newline seperated
def write_to_corpus(tokenized_comment_df,corpus_path):
    sent_list = tokenized_comment_df["tokenized"].to_list()
    corpus_file =  open(corpus_path, "w")
    corpus_file.write("\n".join([" ".join(sent) for sent in sent_list]))
    corpus_file.write("\n")

    #dont forget to close the corpus file!
    corpus_file.close()

    

In [151]:
write_to_corpus(comments_df,out_corpus_path)

In [152]:
clean_data

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label,tokenized,attention
0,1,Lauren,@user06652 You can't be serious?,No,No,0,"[0, 1]","[$mention$, you, ca, n't, be, serious, \?]","[0, 0, 0, 0, 0, 0, 0]"
1,1,Lauren,"Just google ""United Nations"".",No,No,0,"[0, 1]","[just, google, united, nations]","[0, 0, 0, 0]"
2,1,Lauren,You may learn something.,No,No,0,"[0, 1]","[you, may, learn, something]","[0, 0, 0, 0]"
3,2,Lauren,Feminist = Misandrist,No,Yes,1,"[1, 0]","[feminist, misandrist]","[1, 1]"
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes,1,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No,0,"[0, 1]","[i, will, donate, and, i, would, encourage, ot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes,1,"[1, 0]","[do, anything, it, takes, to, destroy, these, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No,0,"[0, 1]","[100, lkies, try, $number$k, $smiley$]","[0, 0, 0, 0, 0]"
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes,1,"[1, 0]","[you, bet, your, faggot, ass, im, sending, you...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [153]:
comments_df

Unnamed: 0,Comment,labels,comment_idx,sent_range,merged_comment,merged_label,tokenized
0,"[@user06652 You can't be serious?, Just google...","[0, 0, 0]",1,"[0, 2]","@user06652 You can't be serious? Just google ""...","[0, 1]","[$mention$, you, ca, n't, be, serious, \?, jus..."
1,[Feminist = Misandrist],[1],2,"[3, 3]",Feminist = Misandrist,"[1, 0]","[feminist, misandrist]"
2,[Where the fuck do these fat cunts learn this ...,[1],3,"[4, 4]",Where the fuck do these fat cunts learn this s...,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear..."
3,[bearing holy shit i cant xDD these vids kill ...,[0],4,"[5, 5]",bearing holy shit i cant xDD these vids kill m...,"[0, 1]","[bearing, holy, shit, i, cant, xdd, these, vid..."
4,"[dear bearing., you can not be ""mansplaining"" ...","[0, 0, 0, 0]",5,"[6, 9]","dear bearing. you can not be ""mansplaining"" an...","[0, 1]","[dear, bearing, you, can, not, be, mansplainin..."
...,...,...,...,...,...,...,...
11535,[wait woah I'm no fag but I love your vids],[1],13037,"[26364, 26364]",wait woah I'm no fag but I love your vids,"[1, 0]","[wait, woah, i'm, no, fag, but, i, love, your,..."
11536,[@user13939 Brother thanks for sharing this aw...,"[0, 0, 0, 1]",13038,"[26365, 26368]",@user13939 Brother thanks for sharing this awe...,"[1, 0]","[$mention$, brother, thanks, for, sharing, thi..."
11537,"[""100 lkies"". Try 6k :D]",[0],13039,"[26369, 26369]","""100 lkies"". Try 6k :D","[0, 1]","[100, lkies, try, $number$k, $smiley$]"
11538,[You bet your faggot ass im sending you money ...,[1],13040,"[26370, 26370]",You bet your faggot ass im sending you money r...,"[1, 0]","[you, bet, your, faggot, ass, im, sending, you..."


In [154]:
#add attention vector to sentence data
clean_data['attention'] = clean_data.apply(lambda row: [row['label']]*len(row['tokenized']),axis=1)

In [162]:
clean_data.head(30)

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label,tokenized,attention
0,1,Lauren,@user06652 You can't be serious?,No,No,0,"[0, 1]","[$mention$, you, ca, n't, be, serious, \?]","[0, 0, 0, 0, 0, 0, 0]"
1,1,Lauren,"Just google ""United Nations"".",No,No,0,"[0, 1]","[just, google, united, nations]","[0, 0, 0, 0]"
2,1,Lauren,You may learn something.,No,No,0,"[0, 1]","[you, may, learn, something]","[0, 0, 0, 0]"
3,2,Lauren,Feminist = Misandrist,No,Yes,1,"[1, 0]","[feminist, misandrist]","[1, 1]"
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes,1,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
5,4,Lauren,bearing holy shit i cant xDD these vids kill m...,No,No,0,"[0, 1]","[bearing, holy, shit, i, cant, xdd, these, vid...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,5,Lauren,dear bearing.,No,No,0,"[0, 1]","[dear, bearing]","[0, 0]"
7,5,Lauren,"you can not be ""mansplaining"" anything.",No,No,0,"[0, 1]","[you, can, not, be, mansplaining, anything]","[0, 0, 0, 0, 0, 0]"
8,5,Lauren,"you can only ""bearsplain"".",No,No,0,"[0, 1]","[you, can, only, bearsplain]","[0, 0, 0, 0]"
9,5,Lauren,"may be ""malebearsplain"" XDï»¿",No,No,0,"[0, 1]","[may, be, malebearsplain, xd]","[0, 0, 0, 0]"


In [156]:
#now add attention to comment level data simply by cross referencing the comment label
comments_df["attention"] = comments_df["sent_range"].apply(lambda row: list(itertools.chain.from_iterable(clean_data.iloc[row[0]:row[1]]['attention'].to_list())))

In [161]:
comments_df.head(30)

Unnamed: 0,Comment,labels,comment_idx,sent_range,merged_comment,merged_label,tokenized,attention
0,"[@user06652 You can't be serious?, Just google...","[0, 0, 0]",1,"[0, 2]","@user06652 You can't be serious? Just google ""...","[0, 1]","[$mention$, you, ca, n't, be, serious, \?, jus...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,[Feminist = Misandrist],[1],2,"[3, 3]",Feminist = Misandrist,"[1, 0]","[feminist, misandrist]","[1, 1]"
2,[Where the fuck do these fat cunts learn this ...,[1],3,"[4, 4]",Where the fuck do these fat cunts learn this s...,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
3,[bearing holy shit i cant xDD these vids kill ...,[0],4,"[5, 5]",bearing holy shit i cant xDD these vids kill m...,"[0, 1]","[bearing, holy, shit, i, cant, xdd, these, vid...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[dear bearing., you can not be ""mansplaining"" ...","[0, 0, 0, 0]",5,"[6, 9]","dear bearing. you can not be ""mansplaining"" an...","[0, 1]","[dear, bearing, you, can, not, be, mansplainin...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,[BEARING IS AWESOMEï»¿],[0],6,"[10, 10]",BEARING IS AWESOMEï»¿,"[0, 1]","[bearing, is, awesome]","[0, 0, 0]"
6,"[instead of being a crazy cat lady., She will ...","[0, 0]",7,"[11, 12]",instead of being a crazy cat lady. She will be...,"[0, 1]","[instead, of, being, a, crazy, cat, lady, she,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,[I feel sorry for the birds being around such ...,[1],8,"[13, 13]",I feel sorry for the birds being around such a...,"[1, 0]","[i, feel, sorry, for, the, birds, being, aroun...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
8,"[Wow, I actually thought she was decent lookin...","[1, 1]",9,"[14, 15]","Wow, I actually thought she was decent looking...","[1, 0]","[wow, ,, i, actually, thought, she, was, decen...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
9,"[I'm still laughing., *wipes tear from my eye*...","[0, 0]",10,"[16, 17]",I'm still laughing. *wipes tear from my eye* Y...,"[0, 1]","[i'm, still, laughing, wipes, tear, from, my, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [159]:
clean_data

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label,tokenized,attention
0,1,Lauren,@user06652 You can't be serious?,No,No,0,"[0, 1]","[$mention$, you, ca, n't, be, serious, \?]","[0, 0, 0, 0, 0, 0, 0]"
1,1,Lauren,"Just google ""United Nations"".",No,No,0,"[0, 1]","[just, google, united, nations]","[0, 0, 0, 0]"
2,1,Lauren,You may learn something.,No,No,0,"[0, 1]","[you, may, learn, something]","[0, 0, 0, 0]"
3,2,Lauren,Feminist = Misandrist,No,Yes,1,"[1, 0]","[feminist, misandrist]","[1, 1]"
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes,1,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
...,...,...,...,...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No,0,"[0, 1]","[i, will, donate, and, i, would, encourage, ot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes,1,"[1, 0]","[do, anything, it, takes, to, destroy, these, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No,0,"[0, 1]","[100, lkies, try, $number$k, $smiley$]","[0, 0, 0, 0, 0]"
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes,1,"[1, 0]","[you, bet, your, faggot, ass, im, sending, you...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [177]:
def gen_pos_tags(data_df,out_folder_path, raw_col):
    ##adds a row containing POS tags to the given dataframe pased on the column containing row data
    comment_list = data_df[raw_col].to_list()
    pos_comment_list = []
    max_sent_num = 5000
    ind = 0
    
    tweet_pos_folder = os.path.join(out_folder_path, "for_pos")
    if not os.path.isdir(tweet_pos_folder):
        os.mkdir(tweet_pos_folder)
    
    for i in range(5):
        print(comment_list[i])


    while (ind < len(comment_list)):
        comm_list = comment_list[ind: ind + max_sent_num]

        #name of file to save newline seperated comments in 
        for_pos_filename = f"{raw_col}_{ind}.txt"

        #path of file to save newline seperated comments in 
        for_pos_filepath = os.path.join(tweet_pos_folder,for_pos_filename)


        #first, we write these tweets (newline seperated) into a text file
        with open (for_pos_filepath, 'w',encoding="utf-8") as f:
            for comm in comm_list:
                f.write(str(comm+'\n'))


        #then, we can run the pos tagger on that file and get the tags for each token
        pos_tags = get_pos_of_file(for_pos_filepath)

        #for each comment 
        pos_list = [comm[1] for comm in pos_tags]
        pos_comment_list = pos_comment_list + pos_list



        ind += max_sent_num
    assert len(pos_comment_list) == len(data_df)
    data_df['pos_tags'] = pos_comment_list
    
    



In [181]:
gen_pos_tags(comments_df, out_folder_path, "merged_comment")

@user06652 You can't be serious? Just google "United Nations". You may learn something.
Feminist = Misandrist
Where the fuck do these fat cunts learn this shit?ï»¿
bearing holy shit i cant xDD these vids kill me LMAOï»¿
dear bearing. you can not be "mansplaining" anything. you can only "bearsplain". may be "malebearsplain" XDï»¿


In [182]:
comments_df

Unnamed: 0,Comment,labels,comment_idx,sent_range,merged_comment,merged_label,tokenized,attention,pos_tags
0,"[@user06652 You can't be serious?, Just google...","[0, 0, 0]",1,"[0, 2]","@user06652 You can't be serious? Just google ""...","[0, 1]","[$mention$, you, ca, n't, be, serious, \?, jus...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[@, O, V, V, A, ,, R, ^, ,, ^, N, ,, ,, O, V, ..."
1,[Feminist = Misandrist],[1],2,"[3, 3]",Feminist = Misandrist,"[1, 0]","[feminist, misandrist]","[1, 1]","[A, ,, ^]"
2,[Where the fuck do these fat cunts learn this ...,[1],3,"[4, 4]",Where the fuck do these fat cunts learn this s...,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[R, D, N, V, D, A, N, V, D, N, ,, G]"
3,[bearing holy shit i cant xDD these vids kill ...,[0],4,"[5, 5]",bearing holy shit i cant xDD these vids kill m...,"[0, 1]","[bearing, holy, shit, i, cant, xdd, these, vid...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[V, A, N, O, V, E, D, N, V, O, G]"
4,"[dear bearing., you can not be ""mansplaining"" ...","[0, 0, 0, 0]",5,"[6, 9]","dear bearing. you can not be ""mansplaining"" an...","[0, 1]","[dear, bearing, you, can, not, be, mansplainin...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[N, V, ,, O, V, R, V, ,, V, ,, N, ,, O, V, R, ..."
...,...,...,...,...,...,...,...,...,...
11535,[wait woah I'm no fag but I love your vids],[1],13037,"[26364, 26364]",wait woah I'm no fag but I love your vids,"[1, 0]","[wait, woah, i'm, no, fag, but, i, love, your,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[V, !, L, D, N, &, O, V, D, N]"
11536,[@user13939 Brother thanks for sharing this aw...,"[0, 0, 0, 1]",13038,"[26365, 26368]",@user13939 Brother thanks for sharing this awe...,"[1, 0]","[$mention$, brother, thanks, for, sharing, thi...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[@, N, N, P, V, D, A, N, P, O, ,, P, O, V, A, ..."
11537,"[""100 lkies"". Try 6k :D]",[0],13039,"[26369, 26369]","""100 lkies"". Try 6k :D","[0, 1]","[100, lkies, try, $number$k, $smiley$]","[0, 0, 0, 0, 0]","[,, $, N, ,, ,, V, $, E]"
11538,[You bet your faggot ass im sending you money ...,[1],13040,"[26370, 26370]",You bet your faggot ass im sending you money r...,"[1, 0]","[you, bet, your, faggot, ass, im, sending, you...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[O, V, D, N, N, L, V, O, N, R, R, ,, V, T, D, ..."


In [183]:
gen_pos_tags(clean_data, out_folder_path, "Sentence")

@user06652 You can't be serious?
Just google "United Nations".
You may learn something.
Feminist = Misandrist
Where the fuck do these fat cunts learn this shit?ï»¿


In [184]:
clean_data

Unnamed: 0,Comment #,Assigned to:,Sentence,Prosocial,Abusive,label,binarized_label,tokenized,attention,pos_tags
0,1,Lauren,@user06652 You can't be serious?,No,No,0,"[0, 1]","[$mention$, you, ca, n't, be, serious, \?]","[0, 0, 0, 0, 0, 0, 0]","[@, O, V, V, A, ,]"
1,1,Lauren,"Just google ""United Nations"".",No,No,0,"[0, 1]","[just, google, united, nations]","[0, 0, 0, 0]","[R, ^, ,, ^, N, ,, ,]"
2,1,Lauren,You may learn something.,No,No,0,"[0, 1]","[you, may, learn, something]","[0, 0, 0, 0]","[O, V, V, N, ,]"
3,2,Lauren,Feminist = Misandrist,No,Yes,1,"[1, 0]","[feminist, misandrist]","[1, 1]","[A, ,, ^]"
4,3,Lauren,Where the fuck do these fat cunts learn this s...,No,Yes,1,"[1, 0]","[where, the, fuck, do, these, fat, cunts, lear...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[R, D, N, V, D, A, N, V, D, N, ,, G]"
...,...,...,...,...,...,...,...,...,...,...
26367,13038,Tiana,I will donate and I would encourage others to ...,No,No,0,"[0, 1]","[i, will, donate, and, i, would, encourage, ot...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[O, V, V, &, O, V, V, N, P, V, O, ,]"
26368,13038,Tiana,Do anything it takes to destroy these stupid c...,No,Yes,1,"[1, 0]","[do, anything, it, takes, to, destroy, these, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[V, N, O, V, P, V, D, A, N, ,]"
26369,13039,Tiana,"""100 lkies"". Try 6k :D",No,No,0,"[0, 1]","[100, lkies, try, $number$k, $smiley$]","[0, 0, 0, 0, 0]","[,, $, N, ,, ,, V, $, E]"
26370,13040,Tiana,You bet your faggot ass im sending you money r...,No,Yes,1,"[1, 0]","[you, bet, your, faggot, ass, im, sending, you...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[O, V, D, N, N, L, V, O, N, R, R, ,, V, T, D, ..."


In [None]:
#embedding and POS vocab have already been formed, so we can omit
