In [16]:
import pandas as pd
import numpy as np
import random
import json
import pickle
from os import path, makedirs

from sklearn.metrics import roc_curve, auc

In [2]:
# Pretty print JSON objects
def pretty_print(data_dict):
    try:
        print(json.dumps(data_dict, indent=4))
    except TypeError:
        print(data_dict)
    except Exception as e:
        print(str(e))

# Create all required folder paths (recursively)
def create_paths(path_list):
    print('Creating all folder paths... ', end='', flush=True)
    for folder_path in path_list:
        if not path.exists(folder_path):
            makedirs(folder_path)
    print('Done.')

# Dump data to disk if not present
def dump_data(data, name, file_path, force=False):
    if force or not path.isfile(file_path):
        print('Dumping {}... '.format(name), end='', flush=True)
        pickle.dump(data, open(file_path, 'wb'))
        print('Done.')
    else:
        print('Did not dump {}: File already exists in "{}".'.format(name, file_path))

# Load all data sets
def load_data(data_cols, data_path='../data/', clean='_clean', os=''):
    print('Loading data... ', end='', flush=True)
    data_sets = {}
    
    for col in data_cols:
        data_sets[col] = pickle.load(open(data_path+'{}.pkl'.format(col),'rb'))

    print('Done.')
    return data_sets

# Get AUC-ROC valuess for all models and target columns
def get_auc_values(y_test, probabilities, model_list, target_cols, \
                          vec='countvec', features='_features'):
    aucs = {}
    for model in model_list:
        print('\tObtaining auc value for {}...'.format(model))
        aucs[model] = {}
        for target in target_cols:
            fpr, tpr, threshold = roc_curve(y_test[target], probabilities[model][target])
            auc_value = auc(fpr, tpr)
            aucs[model][target] = auc_value
        print('\tDone.')
    return aucs

# Generate mean column-wise AUC for all models
def get_mean_auc(aucs, model_list=None, target_cols=None, plot_type='model'):
    print('\tComputing mean AUCs... ', end='', flush=True)
    mean_aucs = {}
    # Compute mean auc by model
    columns = model_list if plot_type == 'model' else target_cols
    for col in columns:
        mean_aucs[col] = np.mean(list(aucs[col].values()))
    print('Done.')
    return mean_aucs

# Generate a summary AUCs dataframe for all models vs. all target columns
def get_aucs_df(aucs, model_list, target_cols, plot_type='model'):
    print('\tGenerating AUCs DataFrame... ', end='', flush=True)
    aucs_df = pd.DataFrame.from_dict(aucs)
    aucs_df['mean'] = np.mean(aucs_df, axis=1)
    aucs_df.loc['mean'] = np.mean(aucs_df, axis=0)
    print('Done.')
    return aucs_df

# Plot all ROC curves, dump all mean column-wise AUCs, generate summary AUCs dataframe, and return final predictions
def plot_and_dump_results(data_sets, best_refitted_models, model_list, vec, target_cols, plot_type='model', clean='_clean', \
                          os='', features='_features', plots_path='../plots/', pickle_path='../pickle_objects/', force=False):

    pretty_print(aucs)

    mean_aucs = get_mean_auc(aucs, model_list, target_cols, plot_type)

    pretty_print(mean_aucs)

    aucs_df = get_aucs_df(aucs, model_list, target_cols, plot_type)
    
    print('\tAUCs DataFrame for {}:'.format(vec))
    print(aucs_df)

    print('\t', end='', flush=True)
    dump_data(aucs_df, 'AUCs DataFrame', '{}aucs_{}{}.pkl'.format(pickle_path, vec, features), force)

    if plot_type == 'model':
        return probabilities, predictions

In [97]:
# Set random seed
random.seed(1337)

# Specify whether to use cleaned data or not
is_clean, is_os = 1, 0
clean = '_clean' if is_clean else ''
os = '_os' if is_os else ''

# Specify whether to use additional features
use_features = 0
features = '_features' if use_features else ''

# Set all folder paths
data_path = '../data/'
pickle_path = '../pickle_objects/'
model_path = pickle_path + 'models{}/'.format(features)
plots_path = '../plots{}/'.format(features)

# Specify initial variables
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
vectorizers = ['countvec', 'tfidf']
plot_types = ['model', 'target']
data_cols = ['data', 'X_train', 'X_val', 'X_train_val', 'X_test', 'y_train', 'y_val', 'y_train_val', 'y_test']
for i, col in enumerate(data_cols):
    data_cols[i] = col + clean + os

create_paths([data_path, pickle_path, plots_path])

Creating all folder paths... Done.


In [98]:
# Load all data sets
data_sets = load_data(data_cols, data_path, clean=clean, os=os)

Loading data... Done.


In [99]:
# List all models to be run
model_list = ['bnb', 'lrl1', 'lrl2', 'nbsvm', 'rf', 'xgb']

In [100]:
probabilities = pickle.load(open('../pickle_objects/probabilities{}.pkl'.format(features), 'rb'))
predictions = pickle.load(open('../pickle_objects/predictions{}.pkl'.format(features), 'rb'))

In [101]:
for vec in vectorizers:
    probabilities[vec]['ensemble_xgb_rf'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_xgb_rf'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_rf'][target] = \
            0.5*(probabilities[vec]['xgb'][target] + probabilities[vec]['rf'][target])

In [102]:
for vec in vectorizers:
    probabilities[vec]['ensemble_xgb_lrl1'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_xgb_lrl1'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_lrl1'][target] = \
            0.5*(probabilities[vec]['xgb'][target] + probabilities[vec]['lrl1'][target])

In [103]:
for vec in vectorizers:
    probabilities[vec]['ensemble_xgb_lrl2'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_xgb_lrl2'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_lrl2'][target] = \
            0.5*(probabilities[vec]['xgb'][target] + probabilities[vec]['lrl2'][target])

In [104]:
for vec in vectorizers:
    probabilities[vec]['ensemble_lrl1_rf'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_lrl1_rf'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_rf'][target] = \
            0.5*(probabilities[vec]['lrl1'][target] + probabilities[vec]['rf'][target])

In [105]:
for vec in vectorizers:
    probabilities[vec]['ensemble_lrl2_rf'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_lrl2_rf'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_lrl2_rf'][target] = \
            0.5*(probabilities[vec]['lrl2'][target] + probabilities[vec]['rf'][target])

In [106]:
for vec in vectorizers:
    probabilities[vec]['ensemble_lrl1_lrl2'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_lrl1_lrl2'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_lrl1_lrl2'][target] = \
            0.5*(probabilities[vec]['lrl1'][target] + probabilities[vec]['lrl2'][target])

In [107]:
for vec in vectorizers:
    probabilities[vec]['ensemble_not_bnb'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_not_bnb'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_not_bnb'][target] = \
            0.2*(probabilities[vec]['lrl1'][target] + probabilities[vec]['lrl2'][target] + \
                probabilities[vec]['rf'][target] + probabilities[vec]['xgb'][target] + \
                probabilities[vec]['nbsvm'][target])

In [108]:
for vec in vectorizers:
    probabilities[vec]['ensemble_tree_linear'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_tree_linear'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_tree_linear'][target] = \
            0.25*(probabilities[vec]['lrl1'][target] + probabilities[vec]['lrl2'][target] + \
                probabilities[vec]['rf'][target] + probabilities[vec]['xgb'][target])

In [109]:
for vec in vectorizers:
    probabilities[vec]['ensemble_xgb_nbsvm_lrl2'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_xgb_nbsvm_lrl2'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_nbsvm_lrl2'][target] = \
            (1./3.)*(probabilities[vec]['nbsvm'][target] + probabilities[vec]['lrl2'][target] + \
                probabilities[vec]['xgb'][target])

In [110]:
for vec in vectorizers:
    probabilities[vec]['ensemble_xgb_nbsvm'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_xgb_nbsvm'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_xgb_nbsvm'][target] = \
            0.5*(probabilities[vec]['nbsvm'][target] + probabilities[vec]['xgb'][target])

In [111]:
for vec in vectorizers:
    probabilities[vec]['ensemble_lrl2_nbsvm'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_lrl2_nbsvm'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_lrl2_nbsvm'][target] = \
            0.5*(probabilities[vec]['lrl2'][target] + probabilities[vec]['nbsvm'][target])

In [112]:
for vec in vectorizers:
    probabilities[vec]['ensemble_linear_nbsvm'] = {}
    for target in target_cols:
        probabilities[vec]['ensemble_linear_nbsvm'][target] = [0.]*len(data_sets['y_test_clean'])
        for model in model_list:
            probabilities[vec]['ensemble_linear_nbsvm'][target] = \
            (1./3.)*(probabilities[vec]['lrl2'][target] + probabilities[vec]['nbsvm'][target] + \
                probabilities[vec]['lrl1'][target])

In [113]:
aucs = {}
for vec in vectorizers:
    aucs[vec] = get_auc_values(data_sets['y_test_clean'], probabilities[vec], \
                               ['ensemble_xgb_rf', 'ensemble_xgb_lrl1', 'ensemble_xgb_lrl2', \
                               'ensemble_lrl1_rf', 'ensemble_lrl2_rf', 'ensemble_lrl1_lrl2', \
                               'ensemble_not_bnb', 'ensemble_tree_linear', 'ensemble_xgb_nbsvm_lrl2', \
                               'ensemble_xgb_nbsvm', 'ensemble_linear_nbsvm', 'ensemble_lrl2_nbsvm'], \
                               target_cols, vec, features)

	Obtaining auc value for ensemble_xgb_rf...
	Done.
	Obtaining auc value for ensemble_xgb_lrl1...
	Done.
	Obtaining auc value for ensemble_xgb_lrl2...
	Done.
	Obtaining auc value for ensemble_lrl1_rf...
	Done.
	Obtaining auc value for ensemble_lrl2_rf...
	Done.
	Obtaining auc value for ensemble_lrl1_lrl2...
	Done.
	Obtaining auc value for ensemble_not_bnb...
	Done.
	Obtaining auc value for ensemble_tree_linear...
	Done.
	Obtaining auc value for ensemble_xgb_nbsvm_lrl2...
	Done.
	Obtaining auc value for ensemble_xgb_nbsvm...
	Done.
	Obtaining auc value for ensemble_linear_nbsvm...
	Done.
	Obtaining auc value for ensemble_lrl2_nbsvm...
	Done.
	Obtaining auc value for ensemble_xgb_rf...
	Done.
	Obtaining auc value for ensemble_xgb_lrl1...
	Done.
	Obtaining auc value for ensemble_xgb_lrl2...
	Done.
	Obtaining auc value for ensemble_lrl1_rf...
	Done.
	Obtaining auc value for ensemble_lrl2_rf...
	Done.
	Obtaining auc value for ensemble_lrl1_lrl2...
	Done.
	Obtaining auc value for ensemble_not

In [114]:
t = pd.DataFrame.from_dict(aucs['tfidf'])
np.round(t.T, 3)

Unnamed: 0,identity_hate,insult,obscene,severe_toxic,threat,toxic
ensemble_linear_nbsvm,0.95,0.953,0.968,0.977,0.975,0.935
ensemble_lrl1_lrl2,0.95,0.953,0.969,0.977,0.974,0.934
ensemble_lrl1_rf,0.5,0.5,0.5,0.5,0.5,0.5
ensemble_lrl2_nbsvm,0.951,0.953,0.968,0.977,0.977,0.935
ensemble_lrl2_rf,0.952,0.956,0.97,0.974,0.973,0.934
ensemble_not_bnb,0.952,0.956,0.97,0.976,0.975,0.935
ensemble_tree_linear,0.952,0.956,0.97,0.975,0.974,0.935
ensemble_xgb_lrl1,0.949,0.954,0.97,0.976,0.971,0.934
ensemble_xgb_lrl2,0.953,0.955,0.971,0.976,0.975,0.934
ensemble_xgb_nbsvm,0.951,0.954,0.97,0.976,0.976,0.934


In [115]:
np.mean(t)

ensemble_linear_nbsvm      0.959580
ensemble_lrl1_lrl2         0.959497
ensemble_lrl1_rf           0.500000
ensemble_lrl2_nbsvm        0.960037
ensemble_lrl2_rf           0.959589
ensemble_not_bnb           0.960798
ensemble_tree_linear       0.960431
ensemble_xgb_lrl1          0.959103
ensemble_xgb_lrl2          0.960679
ensemble_xgb_nbsvm         0.960113
ensemble_xgb_nbsvm_lrl2    0.961454
ensemble_xgb_rf            0.957275
dtype: float64

In [125]:
pd.DataFrame(probabilities['tfidf']['rf']['severe_toxic']).describe()

Unnamed: 0,0
count,31915.0
mean,0.010039
std,0.008026
min,0.00101
25%,0.007567
50%,0.00893
75%,0.009967
max,0.150447


In [127]:
np.mean(predictions['tfidf']['rf']['severe_toxic'])

0.0

In [129]:
data_sets.keys()

dict_keys(['data_clean', 'X_train_clean', 'X_val_clean', 'X_train_val_clean', 'X_test_clean', 'y_train_clean', 'y_val_clean', 'y_train_val_clean', 'y_test_clean'])

In [135]:
data_sets['X_train_val_clean_tfidf_features'] = pickle.load(open('../data/X_train_val_clean_tfidf_features.pkl', 'rb'))
data_sets['y_train_val_clean'] = pickle.load(open('../data/X_train_val_clean.pkl', 'rb'))

In [130]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier()
clf.fit(data_sets['X_train_val_clean_tfidf_features'], data_sets['y_train_val_clean'])