In [9]:
import word2vec
import json
import os
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import math
import nltk
from nltk.corpus import stopwords
PROJ_NAME = "BRNN_TOXIC"
MAX_COMMENT_LENGTH = 1500
stops = stopwords.words('english')
LABELS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

In [10]:
with open("config.json",'r') as f:
    config_file = json.load(f)["BASE_CONFIG"]
with open(config_file,'r') as f:
    config = json.load(f)
data_dir=os.path.join(config["data_dir"],PROJ_NAME)
model_dir=os.path.join(config["model_dir"],PROJ_NAME)
out_dir=os.path.join(config["out_dir"],PROJ_NAME)

In [11]:
def adjust_class_balance(df: pd.DataFrame, interested_labels, thresh):
    dfs = {}
    null = df.copy()
    for name in interested_labels:
        dfs[name] = df.loc[(df[name] == 1)]
        null.drop(null[null[name]==1].index,axis=0,inplace=True)
        
    print("NULL:", 100*(len(null)/len(df)))
    for name, d in dfs.items():
        print("Initial percentage of DF for", name, "is", 100*(len(d)/len(df)))
    
    print("Each label will now have at least", thresh*100,"% of the origional df size")
    adjusted_df = null.sample(int(thresh*len(df))) # get a subsample of null cases
    

    for n, d in dfs.items():
        i=0
        for times in range(math.ceil((thresh/(len(d)/len(df))+1))):
            adjusted_df = adjusted_df.append(d)
            i+=1
        print(n,"upsampled",i,"times")
    return adjusted_df
      

generate raw text

In [19]:
i=0
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
a_df = adjust_class_balance(df, LABELS, 1/(len(LABELS)+1))  
lem = WordNetLemmatizer()
with open(os.path.join(data_dir,"comments.txt"),'w') as f:
    for comment in np.array(df["comment_text"].values):
        comment = ' '.join(' '.join(comment.split()).split('\n'))
        clean_comment = []
        if len(word_tokenize(comment))<MAX_COMMENT_LENGTH:
            for word in word_tokenize(comment):
                if word not in stops:
                    clean_comment.append(lem.lemmatize(word.lower()))
            clean_comment = ' '.join(clean_comment)    
            f.write(clean_comment+"\n")
        if i%1000 ==0:
            print((i/len(df))*100,'%')
        i+=1

NULL: 89.83211235124176
Initial percentage of DF for toxic is 9.584448302009765
Initial percentage of DF for severe_toxic is 0.9995550569965721
Initial percentage of DF for obscene is 5.2948217407925
Initial percentage of DF for threat is 0.2995531769557125
Initial percentage of DF for insult is 4.936360616904074
Initial percentage of DF for identity_hate is 0.8804858025581089
Each label will now have at least 14.285714285714285 % of the origional df size
toxic upsampled 3 times
severe_toxic upsampled 16 times
obscene upsampled 4 times
threat upsampled 49 times
insult upsampled 4 times
identity_hate upsampled 18 times
0.0 %
0.6266802865182269 %
1.2533605730364539 %
1.8800408595546811 %
2.5067211460729077 %
3.1334014325911346 %
3.7600817191093623 %
4.386762005627589 %
5.0134422921458155 %
5.640122578664044 %
6.266802865182269 %
6.893483151700497 %
7.520163438218725 %
8.14684372473695 %
8.773524011255178 %
9.400204297773405 %
10.026884584291631 %
10.65356487080986 %
11.280245157328087 %


In [20]:
word2vec.word2vec(os.path.join(data_dir,"comments.txt"), os.path.join(data_dir,"text.bin"), size=100, verbose=True)

Starting training using file /Users/AlexPowers/code/nlp-dl/data/BRNN_TOXIC/comments.txt
Vocab size: 41913
Words in train file: 8165946
Alpha: 0.000010  Progress: 100.00%  Words/thread/sec: 296.21k  

In [21]:
model = word2vec.load(os.path.join(data_dir,"text.bin"))