In [2]:
import sklearn as sk
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
from collections import Counter
from string import punctuation
import pandas as pd
import os
import math
import json
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from nltk import PorterStemmer
from nltk import word_tokenize
import multiprocessing as mp
PROJ_NAME = "SURVEY_TOXIC"
LABELS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
NUM_CLASSES = len(LABELS)

In [4]:
with open("config.json",'r') as f:
    config_file = json.load(f)["BASE_CONFIG"]
with open(config_file,'r') as f:
    config = json.load(f)
data_dir=os.path.join(config["data_dir"],PROJ_NAME)
model_dir=os.path.join(config["model_dir"],PROJ_NAME)
out_dir=os.path.join(config["out_dir"],PROJ_NAME)

In [82]:
# Models to evaluate
# RandomForest
# SVC, NuSVC -- support vector methods 
# Nearest Neighbor Classification
# Voting Classifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.cluster import DBSCAN, KMeans, MeanShift, AgglomerativeClustering
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
clfs = {
    'ensemble' : {
        'rfc':RandomForestClassifier(n_estimators=20)},
    'svm' : {
        'svc':SVC(), 
        'nsc':NuSVC()},
    'tree' : {
        'dtc':DecisionTreeClassifier()},
}

In [10]:
def adjust_class_balance(df: pd.DataFrame, interested_labels, thresh):
    dfs = {}
    null = df.copy()
    for name in interested_labels:
        dfs[name] = df.loc[(df[name] == 1)]
        null.drop(null[null[name]==1].index,axis=0,inplace=True)
        
    print("NULL:", 100*(len(null)/len(df)))
    for name, d in dfs.items():
        print("Initial percentage of DF for", name, "is", 100*(len(d)/len(df)))
    
    print("Each label will now have at least", thresh*100,"% of the origional df size")
    adjusted_df = null.sample(int(thresh*len(df))) # get a subsample of null cases
    

    for n, d in dfs.items():
        i=0
        for times in range(math.ceil((thresh/(len(d)/len(df))+1))):
            adjusted_df = adjusted_df.append(d)
            i+=1
        print(n,"upsampled",i,"times")
    return adjusted_df

In [11]:
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
a_df = adjust_class_balance(df, LABELS, 1/(len(LABELS)+1))

NULL: 89.83211235124176
Initial percentage of DF for toxic is 9.584448302009765
Initial percentage of DF for severe_toxic is 0.9995550569965721
Initial percentage of DF for obscene is 5.2948217407925
Initial percentage of DF for threat is 0.2995531769557125
Initial percentage of DF for insult is 4.936360616904074
Initial percentage of DF for identity_hate is 0.8804858025581089
Each label will now have at least 14.285714285714285 % of the origional df size
toxic upsampled 3 times
severe_toxic upsampled 16 times
obscene upsampled 4 times
threat upsampled 49 times
insult upsampled 4 times
identity_hate upsampled 18 times


In [12]:
df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
a_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,208213.0,208213.0,208213.0,208213.0,208213.0,208213.0
mean,0.854505,0.254446,0.671385,0.147719,0.653269,0.229448
std,0.3526,0.43555,0.469711,0.354822,0.475931,0.420479
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,1.0,0.0,1.0,0.0
75%,1.0,1.0,1.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
tokenizer = keras.preprocessing.text.Tokenizer()

In [36]:
def clean_data(data):
    i = 0
    l = len(data)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stops = [ stemmer.stem(lemmatizer.lemmatize(w)) for w in stopwords.words('english')]
    data = [ ' '.join(' '.join(phrase.split(punctuation)).split()) for phrase in data]
    cleaned = []
    for phrase in data:
        if i% 10000 == 0:
            print(i/l)
        i+=1
        temp = []
        for word in word_tokenize(phrase):
            c_w = stemmer.stem(lemmatizer.lemmatize(word))
            if c_w not in stops:
                temp.append(c_w)
        cleaned.append(temp)
    return cleaned

In [37]:
clean = clean_data(a_df["comment_text"])

0.0
0.04802774082309942
0.09605548164619884
0.14408322246929828
0.19211096329239769
0.24013870411549712
0.28816644493859656
0.33619418576169596
0.38422192658479537
0.4322496674078948
0.48027740823099424
0.5283051490540936
0.5763328898771931
0.6243606307002925
0.6723883715233919
0.7204161123464914
0.7684438531695907
0.8164715939926902
0.8644993348157896
0.912527075638889
0.9605548164619885


In [41]:
with open(os.path.join(data_dir, "CLEAN_ADJUSTED.txt"),'w') as f:
    for phrase in clean:
        f.write("%s\n" % ' '.join(phrase))
a_df.to_csv(os.path.join(data_dir, "ADJUSTED.csv"))

In [58]:
def bag_of_words(data:np.array):
    i=0
    l = len(data)
    a = []
    for sent in data:
            for word in sent:
                a.append(word)
    bag = list(dict(Counter(a).most_common(2000)).keys())
    
    vecs = []
    i=0
    for sent in data:
        if i % 10000 == 0:
            print(i/l)
        i+=1
        vec = []
        for word in bag:
            vec.append(int(word in sent))
        vecs.append(vec)
    return bag, np.array(vecs)

y = np.array(a_df[LABELS])
bag, X = bag_of_words(clean)

0.0
0.04802774082309942
0.09605548164619884
0.14408322246929828
0.19211096329239769
0.24013870411549712
0.28816644493859656
0.33619418576169596
0.38422192658479537
0.4322496674078948
0.48027740823099424
0.5283051490540936
0.5763328898771931
0.6243606307002925
0.6723883715233919
0.7204161123464914
0.7684438531695907
0.8164715939926902
0.8644993348157896
0.912527075638889
0.9605548164619885


In [3]:
np.save(os.path.join(data_dir, "X.npy"), X)
np.save(os.path.join(data_dir, "Y.npy"), y)
np.save(os.path.join(data_dir, "BAG.npy"), np.array(bag))

[0;31mSignature:[0m [0mnp[0m[0;34m.[0m[0msave[0m[0;34m([0m[0mfile[0m[0;34m,[0m [0marr[0m[0;34m,[0m [0mallow_pickle[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mfix_imports[0m[0;34m=[0m[0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Save an array to a binary file in NumPy ``.npy`` format.

Parameters
----------
file : file, str, or pathlib.Path
    File or filename to which the data is saved.  If file is a file-object,
    then the filename is unchanged.  If file is a string or Path, a ``.npy``
    extension will be appended to the file name if it does not already
    have one.
arr : array_like
    Array data to be saved.
allow_pickle : bool, optional
    Allow saving object arrays using Python pickles. Reasons for disallowing
    pickles include security (loading pickled data can execute arbitrary
    code) and portability (pickled objects may not be loadable on different
    Python installations, for example if the stored objects req

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X[:50000], y[:50000], test_size=0.33)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
for index, label in enumerate(LABELS):
    print("training for %s"%label)
    y_temp_train = y_train[:,index]
    print(y_temp_train.shape,max(y_temp_train),min(y_temp_train))
    y_temp_test = y_test[:,index]
    print(y_temp_test.shape,max(y_temp_test),min(y_temp_test))
    for classifier_type, dic in clfs.items():
        for name, clf in dic.items():
            print("training %s"%name)
            clfs[classifier_type][name].fit(X_train, y_temp_train)
            score = clfs[classifier_type][name].score(X_test, y_temp_test)
            print("%s has acc of: %d" %(name,score))

training for toxic
(33500,) 1 0
(16500,) 1 0
training rfc
rfc has acc of: 0
training svc
