In [3]:
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing import text
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from tensorflow.keras import Sequential
import math
import nltk
import json
import os
from nltk.corpus import stopwords
PROJ_NAME = "BRNN_TOXIC"
MAX_COMMENT_LENGTH = 1500
stops = stopwords.words('english')
LABELS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
NUM_CLASSES = len(LABELS)

In [4]:
with open("config.json",'r') as f:
    config_file = json.load(f)["BASE_CONFIG"]
with open(config_file,'r') as f:
    config = json.load(f)
data_dir=os.path.join(config["data_dir"],PROJ_NAME)
model_dir=os.path.join(config["model_dir"],PROJ_NAME)
out_dir=os.path.join(config["out_dir"],PROJ_NAME)

In [5]:
def adjust_class_balance(df: pd.DataFrame, interested_labels, thresh):
    dfs = {}
    null = df.copy()
    for name in interested_labels:
        dfs[name] = df.loc[(df[name] == 1)]
        null.drop(null[null[name]==1].index,axis=0,inplace=True)
        
    print("NULL:", 100*(len(null)/len(df)))
    for name, d in dfs.items():
        print("Initial percentage of DF for", name, "is", 100*(len(d)/len(df)))
    
    print("Each label will now have at least", thresh*100,"% of the origional df size")
    adjusted_df = null.sample(int(thresh*len(df))) # get a subsample of null cases
    

    for n, d in dfs.items():
        i=0
        for times in range(math.ceil((thresh/(len(d)/len(df))+1))):
            adjusted_df = adjusted_df.append(d)
            i+=1
        print(n,"upsampled",i,"times")
    return adjusted_df

In [7]:
# df = pd.read_csv(os.path.join(data_dir, "train.csv"))
# a_df = adjust_class_balance(df, LABELS, 1/(len(LABELS)+1))  
# a_df.to_csv(os.path.join(data_dir, "CLASS_ADJUSTED.csv"))
a_df = pd.read_csv(os.path.join(data_dir, "CLASS_ADJUSTED.csv"))

NULL: 89.83211235124176
Initial percentage of DF for toxic is 9.584448302009765
Initial percentage of DF for severe_toxic is 0.9995550569965721
Initial percentage of DF for obscene is 5.2948217407925
Initial percentage of DF for threat is 0.2995531769557125
Initial percentage of DF for insult is 4.936360616904074
Initial percentage of DF for identity_hate is 0.8804858025581089
Each label will now have at least 14.285714285714285 % of the origional df size
toxic upsampled 3 times
severe_toxic upsampled 16 times
obscene upsampled 4 times
threat upsampled 49 times
insult upsampled 4 times
identity_hate upsampled 18 times


In [8]:
tokenizer = text.Tokenizer(num_words=10000)

In [9]:
tokenizer.fit_on_texts(a_df["comment_text"].values)

In [10]:
encoded_docs = tokenizer.texts_to_matrix(a_df["comment_text"].values, mode='count')

In [23]:
np.save(os.path.join(data_dir, "ENCODED_DOCS"),encoded_docs)

KeyboardInterrupt: 

In [60]:
model = Sequential()
model.add(tf.keras.layers.Dense(1000, input_shape=(10000,)))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(500))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(200))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(NUM_CLASSES))
model.add(tf.keras.layers.Activation('sigmoid'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
Y = np.array(a_df[LABELS].values)
np.save(os.path.join("ENCODED_LABELS"), Y)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(encoded_docs,Y,test_size=0.33)

In [63]:
model.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=50,
    verbose=1,
    shuffle=True,
)

Train on 335 samples, validate on 165 samples
Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x2b2540675a58>

In [64]:
print(model.evaluate(X_test,y_test))



[0.0, 0.27878787896849894]

In [67]:
model.save(os.path.join(model_dir,"IE_CNN"))
model.save_weights(os.path.join(model_dir,"IE_CNN_WEIGHTS"))