In [1]:
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import pickle
from os import path, makedirs
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [3]:
# Set random seed
random.seed(1337)

In [4]:
def load_data(is_clean=0, is_os=0):
    clean = '_clean' if is_clean else ''
    os = '_os' if is_os else ''
    data_sets = {}
    data_cols = [
        'data', 'X_train', 'X_val', 'X_train_val', 'X_test', \
        'y_train', 'y_val', 'y_train_val', 'y_test'
    ]
    
    for i, col in enumerate(data_cols):
        data_cols[i] = col + clean + os
    
    for col in data_cols:
        data_sets[col] = pickle.load(open('../data/{}.pkl'.format(col),'rb'))
    
    return data_sets

In [5]:
is_clean, is_os = 1, 0
clean, os = '_clean', ''

In [6]:
data_sets = load_data(is_clean=1, is_os=0)

In [7]:
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [8]:
data_sets['data'+clean+os].head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit username hardcore metallica f...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww match background colour -pron- seemingly...,0,0,0,0,0,0
2,000113f07ec002fd,hey man -pron- not try edit war -pron- guy con...,0,0,0,0,0,0
3,0001b41b1c6bb37e,not real suggestion improvement wonder section...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
5,00025465d4725e87,congratulations good use tool good · talk,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker piss work,1,1,1,0,1,0
7,00031b1e95af7921,vandalism matt shirvington article revert not ban,0,0,0,0,0,0
8,00037261f536c51d,sorry word nonsense offensive -pron- not inten...,0,0,0,0,0,0
9,00040093b2687caa,alignment subject contrary dulithgow,0,0,0,0,0,0


In [9]:
data_sets['X_train'+clean+os].head()

Unnamed: 0,id,comment_text
101689,202d64bd4c2bd9a4,champion hurdle delighted use 1 2_3 image uplo...
26429,4601831cd03ec679,edit war currently appear engage edit war note...
62499,a73c6f5b69d963be,-pron- wrong bud ready use taran boi verdict n...
10145,1ad558921b36c611,attempt change las vegas season las vegas augu...
16254,2adc0c27c1165eef,people arab homosexual bad thing d**n f***in h...


In [10]:
# Print value counts for each target
for col in target_cols:
    print('{}:'.format(col))
    for data in data_sets:
        if 'y' in data:
            value_counts = data_sets[data][col].value_counts()
            print('{}: {:.3f}%\t'.format(data, 100*value_counts[1]/sum(value_counts)), end='')
    print('\n')

toxic:
y_train_clean: 9.528%	y_val_clean: 9.551%	y_train_val_clean: 9.533%	y_test_clean: 9.789%	

severe_toxic:
y_train_clean: 1.005%	y_val_clean: 0.993%	y_train_val_clean: 1.002%	y_test_clean: 0.990%	

obscene:
y_train_clean: 5.321%	y_val_clean: 5.274%	y_train_val_clean: 5.309%	y_test_clean: 5.239%	

threat:
y_train_clean: 0.287%	y_val_clean: 0.295%	y_train_val_clean: 0.289%	y_test_clean: 0.342%	

insult:
y_train_clean: 4.960%	y_val_clean: 4.860%	y_train_val_clean: 4.935%	y_test_clean: 4.941%	

identity_hate:
y_train_clean: 0.902%	y_val_clean: 0.833%	y_train_val_clean: 0.885%	y_test_clean: 0.862%	



In [11]:
pickle_path = '../pickle_objects/'
if not path.exists(pickle_path):
    makedirs(pickle_path)

In [12]:
num_feats = 1000
n_grams = 2
ngram_range = list(map(lambda x: x+1,range(n_grams)))

In [23]:
def load_ngrams(data, num_feats, ngram_range, is_clean=1, is_os=0):
    clean = '_clean' if is_clean else ''
    os = '_os' if is_os else ''
    ngrams = {}
    vec_params = {'analyzer': 'word', 'lowercase': True,'max_features': num_feats, 'ngram_range': ngram_range}
    
    for vec in ['countvec', 'tfidf']:
        # Load vectorizer if present
        file_name = '{}{}_{}_ngrams_{}{}.pkl'.format(pickle_path, vec, num_feats, n_grams, clean+os)
        if path.isfile(file_name):
            ngrams[vec] = pickle.load(open(file_name, 'rb'))
        else:
            # Fit, store, and load vectorizer
            ngrams_vec = CountVectorizer(**vec_params) if vec == 'countvec' else TfidfVectorizer(**vec_params)
            ngrams_vec.fit(data['comment_text'])
            ngrams[vec] = ngrams_vec
            pickle.dump(ngrams_vec, open(file_name, 'wb'))
    return ngrams

In [24]:
ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range, is_clean, is_os)

In [25]:
def transform_to_ngrams(data_set, data_cols, ngrams):
    for data in data_cols:
        for vec in ['countvec', 'tfidf']:
            data_sets[data+'_'+vec] = ngrams[vec].transform(data_sets[data]['comment_text'])
    return data_sets

In [26]:
data_sets = transform_to_ngrams(data_sets, ['X_train_val'+clean+os, 'X_test'+clean+os], ngrams)

In [27]:
def normalize_data(X_train, X_test):
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test

In [36]:
def fit_model(model, X, y, params, scoring='roc_auc', cv=None):
    print(model)
    models = BinaryRelevance(model)
    if cv:
        models = GridSearchCV(models, params, cv=cv, scoring=scoring, n_jobs=-1)
        models.fit(X, y)
        return models.best_model_, models.best_params_, models.best_score_
    else:
        models.fit(X, y)
        return models

In [34]:
def fit_all_models(data_sets, data_cols, model_list, param_grids, cv=None):
    best_models, best_params, best_scores = {}, {}, {}
    for (X, y) in data_cols:
        for model in model_list:
            if cv:
                best_models[X][model], best_params[X][model], best_scores[X][model] = \
                fit_model(model_list[model], data_sets[X], data_sets[y], param_grids[model], \
                          scoring='roc_auc', cv=cv)
            else:
                best_models[X][model] = \
                fit_model(model_list[model], data_sets[X], data_sets[y], param_grids[model], \
                          scoring='roc_auc', cv=cv)
    if cv:
        return best_models, best_params, best_scores
    return best_models

In [30]:
def predict_labels_and_probas(model, X):
    predictions = model.predict_proba(X)
    probabilities = np.squeeze(np.asarray(predictions.todense()))
    return predictions, probabilities

In [31]:
model_list = {
    'bnb': BernoulliNB(),
    'gnb': GaussianNB(),
    'lrl1': LogisticRegression(penalty='l1'),
    'lrl2': LogisticRegression(penalty='l2'),
    'rf': RandomForestClassifier(),
    'xgb': XGBClassifier(),
    'svm': SVC(kernel='linear')
}
    
param_grids = {
    'bnb': {},
    'gnb': {},
    'lrl1': {'C': np.concatenate((np.reciprocal(np.arange(1, 13, 3)), np.logspace(1, 6, num=6, endpoint=True, base=10)))},
    'lrl2': {'C': np.concatenate((np.reciprocal(np.arange(1, 13, 3)), np.logspace(1, 6, num=6, endpoint=True, base=10)))},
    'rf': {
        'n_estimators': np.arange(50, 250, 50),
        'max_features': ['auto', 'log2'],
        'max_depth': np.arange(3, 13, 2)
    },
    'xgb': {'n_estimators': [1]},
    'svm': {'C': np.concatenate((np.arange(1, 13, 3), np.logspace(1, 6, num=6, endpoint=True, base=10)))},
}

In [32]:
# Set predefined split for CV
# 0 corresponds to val, -1 to train
val_fold = [-1]*len(data_sets['X_train'+clean+os]) + [0]*len(data_sets['X_val'+clean+os])
predefined_split = PredefinedSplit(test_fold=val_fold)

In [37]:
model_list = {
    'lrl1': LogisticRegression(penalty='l1')
}
    
param_grids = {
    'lrl1': {'C': np.concatenate((np.reciprocal(np.arange(1, 4, 3)), np.logspace(1, 2, num=2, \
                                                                                 endpoint=True, base=10)))}
}

fit_all_models(data_sets, [('X_train'+clean+os, 'y_train'+clean+os)], model_list, \
               param_grids, cv=predefined_split)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/usr/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/dist-packages/ipykernel_launcher.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/usr/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x7f211cd99270, file "/...3.6/dist-packages/ipykernel_launcher.py", line 5>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.6/dist-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/dist-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py'>, ...}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/dist-packages/ipykernel_launcher.py'), pkg_name='', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x7f211cd99270, file "/...3.6/dist-packages/ipykernel_launcher.py", line 5>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/usr/local/lib/python3.6/dist-packages/__pycache__/ipykernel_launcher.cpython-36.pyc', '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': '/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': '', '__spec__': ModuleSpec(name='ipykernel_launcher', loader=<_f...b/python3.6/dist-packages/ipykernel_launcher.py'), 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py'>, ...}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()

...........................................................................
/usr/local/lib/python3.6/dist-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    481         if self.poller is not None:
    482             self.poller.start()
    483         self.kernel.start()
    484         self.io_loop = ioloop.IOLoop.current()
    485         try:
--> 486             self.io_loop.start()
        self.io_loop.start = <bound method BaseAsyncIOLoop.start of <tornado.platform.asyncio.AsyncIOMainLoop object>>
    487         except KeyboardInterrupt:
    488             pass
    489 
    490 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py in start(self=<tornado.platform.asyncio.AsyncIOMainLoop object>)
    107         except (RuntimeError, AssertionError):
    108             old_loop = None
    109         try:
    110             self._setup_logging()
    111             asyncio.set_event_loop(self.asyncio_loop)
--> 112             self.asyncio_loop.run_forever()
        self.asyncio_loop.run_forever = <bound method BaseEventLoop.run_forever of <_Uni...EventLoop running=True closed=False debug=False>>
    113         finally:
    114             asyncio.set_event_loop(old_loop)
    115 
    116     def stop(self):

...........................................................................
/usr/lib/python3.6/asyncio/base_events.py in run_forever(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
    416             sys.set_asyncgen_hooks(firstiter=self._asyncgen_firstiter_hook,
    417                                    finalizer=self._asyncgen_finalizer_hook)
    418         try:
    419             events._set_running_loop(self)
    420             while True:
--> 421                 self._run_once()
        self._run_once = <bound method BaseEventLoop._run_once of <_UnixS...EventLoop running=True closed=False debug=False>>
    422                 if self._stopping:
    423                     break
    424         finally:
    425             self._stopping = False

...........................................................................
/usr/lib/python3.6/asyncio/base_events.py in _run_once(self=<_UnixSelectorEventLoop running=True closed=False debug=False>)
   1421                         logger.warning('Executing %s took %.3f seconds',
   1422                                        _format_handle(handle), dt)
   1423                 finally:
   1424                     self._current_handle = None
   1425             else:
-> 1426                 handle._run()
        handle._run = <bound method Handle._run of <Handle BaseAsyncIOLoop._handle_events(13, 1)>>
   1427         handle = None  # Needed to break cycles when an exception occurs.
   1428 
   1429     def _set_coroutine_wrapper(self, enabled):
   1430         try:

...........................................................................
/usr/lib/python3.6/asyncio/events.py in _run(self=<Handle BaseAsyncIOLoop._handle_events(13, 1)>)
    122             self._callback = None
    123             self._args = None
    124 
    125     def _run(self):
    126         try:
--> 127             self._callback(*self._args)
        self._callback = <bound method BaseAsyncIOLoop._handle_events of <tornado.platform.asyncio.AsyncIOMainLoop object>>
        self._args = (13, 1)
    128         except Exception as exc:
    129             cb = _format_callback_source(self._callback, self._args)
    130             msg = 'Exception in callback {}'.format(cb)
    131             context = {

...........................................................................
/usr/local/lib/python3.6/dist-packages/tornado/platform/asyncio.py in _handle_events(self=<tornado.platform.asyncio.AsyncIOMainLoop object>, fd=13, events=1)
     97             self.writers.remove(fd)
     98         del self.handlers[fd]
     99 
    100     def _handle_events(self, fd, events):
    101         fileobj, handler_func = self.handlers[fd]
--> 102         handler_func(fileobj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fileobj = <zmq.sugar.socket.Socket object>
        events = 1
    103 
    104     def start(self):
    105         try:
    106             old_loop = asyncio.get_event_loop()

...........................................................................
/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    445             return
    446         zmq_events = self.socket.EVENTS
    447         try:
    448             # dispatch events:
    449             if zmq_events & zmq.POLLIN and self.receiving():
--> 450                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    451                 if not self.socket:
    452                     return
    453             if zmq_events & zmq.POLLOUT and self.sending():
    454                 self._handle_send()

...........................................................................
/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    475             else:
    476                 raise
    477         else:
    478             if self._recv_callback:
    479                 callback = self._recv_callback
--> 480                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    481         
    482 
    483     def _handle_send(self):
    484         """Handle a send event."""

...........................................................................
/usr/local/lib/python3.6/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    427         close our socket."""
    428         try:
    429             # Use a NullContext to ensure that all StackContexts are run
    430             # inside our blanket exception handler rather than outside.
    431             with stack_context.NullContext():
--> 432                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    433         except:
    434             gen_log.error("Uncaught exception in ZMQStream callback",
    435                           exc_info=True)
    436             # Re-raise the exception so that IOLoop.handle_callback_exception

...........................................................................
/usr/local/lib/python3.6/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    271         # Fast path when there are no active contexts.
    272         def null_wrapper(*args, **kwargs):
    273             try:
    274                 current_state = _state.contexts
    275                 _state.contexts = cap_contexts[0]
--> 276                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    277             finally:
    278                 _state.contexts = current_state
    279         null_wrapper._wrapped = True
    280         return null_wrapper

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 2, 1, 57, 33, 682589, tzinfo=tzutc()), 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'session': '16413b7e67c34e4c913bd760ce4a4bdc', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'parent_header': {}})
    228             self.log.warn("Unknown message type: %r", msg_type)
    229         else:
    230             self.log.debug("%s: %s", msg_type, msg)
    231             self.pre_handler_hook()
    232             try:
--> 233                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'16413b7e67c34e4c913bd760ce4a4bdc']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 2, 1, 57, 33, 682589, tzinfo=tzutc()), 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'session': '16413b7e67c34e4c913bd760ce4a4bdc', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'parent_header': {}}
    234             except Exception:
    235                 self.log.error("Exception in message handler:", exc_info=True)
    236             finally:
    237                 self.post_handler_hook()

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'16413b7e67c34e4c913bd760ce4a4bdc'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': datetime.datetime(2018, 5, 2, 1, 57, 33, 682589, tzinfo=tzutc()), 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'session': '16413b7e67c34e4c913bd760ce4a4bdc', 'username': 'username', 'version': '5.2'}, 'metadata': {}, 'msg_id': '58c02700189c4711b7c27349ecf83d1f', 'msg_type': 'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    203 
    204         self._forward_input(allow_stdin)
    205 
    206         reply_content = {}
    207         try:
--> 208             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)"
        store_history = True
        silent = False
    209         finally:
    210             self._restore_input()
    211 
    212         if res.error_before_exec is not None:

...........................................................................
/usr/local/lib/python3.6/dist-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)",), **kwargs={'silent': False, 'store_history': True})
    532             )
    533         self.payload_manager.write_payload(payload)
    534 
    535     def run_cell(self, *args, **kwargs):
    536         self._last_traceback = None
--> 537         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)",)
        kwargs = {'silent': False, 'store_history': True}
    538 
    539     def _showtraceback(self, etype, evalue, stb):
    540         # try to preserve ordering of tracebacks and print statements
    541         sys.stdout.flush()

...........................................................................
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="model_list = {\n    'lrl1': LogisticRegression(pe...\n               param_grids, cv=predefined_split)", store_history=True, silent=False, shell_futures=True)
   2723                 self.displayhook.exec_result = result
   2724 
   2725                 # Execute the user code
   2726                 interactivity = "none" if silent else self.ast_node_interactivity
   2727                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2728                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2729                 
   2730                 self.last_execution_succeeded = not has_raised
   2731                 self.last_execution_result = result
   2732 

...........................................................................
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-37-292e8c033f66>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 7f20b407f0f0, executi..._before_exec=None error_in_exec=None result=None>)
   2851                     return True
   2852 
   2853             for i, node in enumerate(to_run_interactive):
   2854                 mod = ast.Interactive([node])
   2855                 code = compiler(mod, cell_name, "single")
-> 2856                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f211484c300, file "<ipython-input-37-292e8c033f66>", line 10>
        result = <ExecutionResult object at 7f20b407f0f0, executi..._before_exec=None error_in_exec=None result=None>
   2857                     return True
   2858 
   2859             # Flush softspace
   2860             if softspace(sys.stdout, 0):

...........................................................................
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f211484c300, file "<ipython-input-37-292e8c033f66>", line 10>, result=<ExecutionResult object at 7f20b407f0f0, executi..._before_exec=None error_in_exec=None result=None>)
   2905         outflag = True  # happens in more places, so it's easier as default
   2906         try:
   2907             try:
   2908                 self.hooks.pre_run_code_hook()
   2909                 #rprint('Running code', repr(code_obj)) # dbg
-> 2910                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f211484c300, file "<ipython-input-37-292e8c033f66>", line 10>
        self.user_global_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'BinaryRelevance': <class 'skmultilearn.problem_transform.br.BinaryRelevance'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "import pandas as pd\nimport numpy as np\nimport ra..._ipython().run_line_magic('matplotlib', 'inline')", 'from sklearn.model_selection import train_test_s...ssifier\nfrom xgboost.sklearn import XGBClassifier', '# Set random seed\nrandom.seed(1337)', "def load_data(is_clean=0, is_os=0):\n    clean = ...pkl'.format(col),'rb'))\n    \n    return data_sets", "is_clean, is_os = 1, 0\nclean, os = '_clean', ''", 'data_sets = load_data(is_clean=1, is_os=0)', "target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']", "data_sets['data'+clean+os].head(10)", "data_sets['X_train'+clean+os].head()", "# Print value counts for each target\nfor col in ...ts[1]/sum(value_counts)), end='')\n    print('\\n')", "pickle_path = '../pickle_objects/'\nif not path.exists(pickle_path):\n    makedirs(pickle_path)", 'num_feats = 1000\nn_grams = 2\nngram_range = list(map(lambda x: x+1,range(n_grams)))', "def load_ngrams(data, num_feats, ngram_range):\n ...vec, num_feats,n_grams), 'wb'))\n    return ngrams", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def transform_to_ngrams(data_set, data_cols, ngr..._sets[data]['comment_text'])\n    return data_sets", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def load_ngrams(data, num_feats, ngram_range):\n ...vec, num_feats,n_grams), 'wb'))\n    return ngrams", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def transform_to_ngrams(data_set, data_cols, ngr..._sets[data]['comment_text'])\n    return data_sets", ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {8:                  id                             ...      0        0       0       0              0  , 9:                       id                        ...people arab homosexual bad thing d**n f***in h...}, 'PredefinedSplit': <class 'sklearn.model_selection._split.PredefinedSplit'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, ...}
        self.user_ns = {'BernoulliNB': <class 'sklearn.naive_bayes.BernoulliNB'>, 'BinaryRelevance': <class 'skmultilearn.problem_transform.br.BinaryRelevance'>, 'CountVectorizer': <class 'sklearn.feature_extraction.text.CountVectorizer'>, 'GaussianNB': <class 'sklearn.naive_bayes.GaussianNB'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', "import pandas as pd\nimport numpy as np\nimport ra..._ipython().run_line_magic('matplotlib', 'inline')", 'from sklearn.model_selection import train_test_s...ssifier\nfrom xgboost.sklearn import XGBClassifier', '# Set random seed\nrandom.seed(1337)', "def load_data(is_clean=0, is_os=0):\n    clean = ...pkl'.format(col),'rb'))\n    \n    return data_sets", "is_clean, is_os = 1, 0\nclean, os = '_clean', ''", 'data_sets = load_data(is_clean=1, is_os=0)', "target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']", "data_sets['data'+clean+os].head(10)", "data_sets['X_train'+clean+os].head()", "# Print value counts for each target\nfor col in ...ts[1]/sum(value_counts)), end='')\n    print('\\n')", "pickle_path = '../pickle_objects/'\nif not path.exists(pickle_path):\n    makedirs(pickle_path)", 'num_feats = 1000\nn_grams = 2\nngram_range = list(map(lambda x: x+1,range(n_grams)))', "def load_ngrams(data, num_feats, ngram_range):\n ...vec, num_feats,n_grams), 'wb'))\n    return ngrams", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def transform_to_ngrams(data_set, data_cols, ngr..._sets[data]['comment_text'])\n    return data_sets", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def load_ngrams(data, num_feats, ngram_range):\n ...vec, num_feats,n_grams), 'wb'))\n    return ngrams", "ngrams = load_ngrams(data_sets['X_train'+clean+os], num_feats, ngram_range)", "def transform_to_ngrams(data_set, data_cols, ngr..._sets[data]['comment_text'])\n    return data_sets", ...], 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, 'Out': {8:                  id                             ...      0        0       0       0              0  , 9:                       id                        ...people arab homosexual bad thing d**n f***in h...}, 'PredefinedSplit': <class 'sklearn.model_selection._split.PredefinedSplit'>, 'RandomForestClassifier': <class 'sklearn.ensemble.forest.RandomForestClassifier'>, ...}
   2911             finally:
   2912                 # Reset our crash handler in place
   2913                 sys.excepthook = old_excepthook
   2914         except SystemExit as e:

...........................................................................
/home/mihir/Desktop/GitHub/nyu/ml_project/src/<ipython-input-37-292e8c033f66> in <module>()
      5 param_grids = {
      6     'lrl1': {'C': np.concatenate((np.reciprocal(np.arange(1, 4, 3)), np.logspace(1, 2, num=2, \
      7                                                                                  endpoint=True, base=10)))}
      8 }
      9 
---> 10 fit_all_models(data_sets, [('X_train'+clean+os, 'y_train'+clean+os)], model_list,                param_grids, cv=predefined_split)

...........................................................................
/home/mihir/Desktop/GitHub/nyu/ml_project/src/<ipython-input-34-38d68be09502> in fit_all_models(data_sets={'X_test_clean':                       id                        ...ove content talk pag...

[31915 rows x 2 columns], 'X_test_clean_countvec': <31915x1000 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>, 'X_test_clean_tfidf': <31915x1000 sparse matrix of type '<class 'numpy... stored elements in Compressed Sparse Row format>, 'X_train_clean':                       id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns], 'X_train_val_clean':                       id                        ...ounty texas|of head...

[127656 rows x 2 columns], 'X_train_val_clean_countvec': <127656x1000 sparse matrix of type '<class 'nump... stored elements in Compressed Sparse Row format>, 'X_train_val_clean_tfidf': <127656x1000 sparse matrix of type '<class 'nump... stored elements in Compressed Sparse Row format>, 'X_val_clean':                       id                        ...    -pron- right remove

[31914 rows x 2 columns], 'data_clean':                       id                        ...    0              0  

[159571 rows x 8 columns], 'y_test_clean':         toxic  severe_toxic  obscene  threat  in...       0              0

[31915 rows x 6 columns], ...}, data_cols=[('X_train_clean', 'y_train_clean')], model_list={'lrl1': LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False)}, param_grids={'lrl1': {'C': array([  1.,  10., 100.])}}, cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])))
      1 def fit_all_models(data_sets, data_cols, model_list, param_grids, cv=None):
      2     best_models, best_params, best_scores = {}, {}, {}
      3     for (X, y) in data_cols:
      4         for model in model_list:
      5             if cv:
----> 6                 best_models[X][model], best_params[X][model], best_scores[X][model] =                 fit_model(model_list[model], data_sets[X], data_sets[y], param_grids[model],                           scoring='roc_auc', cv=cv)
      7             else:
      8                 best_models[X][model] =                 fit_model(model_list[model], data_sets[X], data_sets[y], param_grids[model],                           scoring='roc_auc', cv=cv)
      9     if cv:
     10         return best_models, best_params, best_scores

...........................................................................
/home/mihir/Desktop/GitHub/nyu/ml_project/src/<ipython-input-36-c365f5705cf9> in fit_model(model=LogisticRegression(C=1.0, class_weight=None, dua...ol=0.0001,
          verbose=0, warm_start=False), X=                      id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns], y=        toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns], params={'C': array([  1.,  10., 100.])}, scoring='roc_auc', cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])))
      1 def fit_model(model, X, y, params, scoring='roc_auc', cv=None):
      2     print(model)
      3     models = BinaryRelevance(model)
      4     if cv:
      5         models = GridSearchCV(models, params, cv=cv, scoring=scoring, n_jobs=-1)
----> 6         models.fit(X, y)
      7         return models.best_model_, models.best_params_, models.best_score_
      8     else:
      9         models.fit(X, y)
     10         return models

...........................................................................
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=PredefinedSplit(test_fold=array(...core='warn',
       scoring='roc_auc', verbose=0), X=                      id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns], y=        toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns], groups=None, **fit_params={})
    634                                   return_train_score=self.return_train_score,
    635                                   return_n_test_samples=True,
    636                                   return_times=True, return_parameters=False,
    637                                   error_score=self.error_score)
    638           for parameters, (train, test) in product(candidate_params,
--> 639                                                    cv.split(X, y, groups)))
        cv.split = <bound method PredefinedSplit.split of PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0]))>
        X =                       id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns]
        y =         toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns]
        groups = None
    640 
    641         # if one choose to see train score, "out" will contain train score info
    642         if self.return_train_score:
    643             (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,

...........................................................................
/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=-1), iterable=<generator object BaseSearchCV.fit.<locals>.<genexpr>>)
    784             if pre_dispatch == "all" or n_jobs == 1:
    785                 # The iterable was consumed all at once by the above for loop.
    786                 # No need to wait for async callbacks to trigger to
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-1)>
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time
    792             self._print('Done %3i out of %3i | elapsed: %s finished',
    793                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Tue May  1 21:57:34 2018
PID: 24283                             Python 3.6.3: /usr/local/bin/python3
...........................................................................
/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (BinaryRelevance(classifier=LogisticRegression(C=...start=False),
        require_dense=[True, True]),                       id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns],         toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns], {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([    0,     1,     2, ..., 95739, 95740, 95741]), array([ 95742,  95743,  95744, ..., 127653, 127654, 127655]), 0, {'C': 1.0}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/lib/python3.6/dist-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (BinaryRelevance(classifier=LogisticRegression(C=...start=False),
        require_dense=[True, True]),                       id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns],         toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns], {'score': make_scorer(roc_auc_score, needs_threshold=True)}, array([    0,     1,     2, ..., 95739, 95740, 95741]), array([ 95742,  95743,  95744, ..., 127653, 127654, 127655]), 0, {'C': 1.0})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': False, 'return_times': True, 'return_train_score': 'warn'}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/usr/local/lib/python3.6/dist-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=BinaryRelevance(classifier=LogisticRegression(C=...start=False),
        require_dense=[True, True]), X=                      id                        ...e bill gates1.jpg va...

[95742 rows x 2 columns], y=        toxic  severe_toxic  obscene  threat  in...       0              0

[95742 rows x 6 columns], scorer={'score': make_scorer(roc_auc_score, needs_threshold=True)}, train=array([    0,     1,     2, ..., 95739, 95740, 95741]), test=array([ 95742,  95743,  95744, ..., 127653, 127654, 127655]), verbose=0, parameters={'C': 1.0}, fit_params={}, return_train_score='warn', return_parameters=False, return_n_test_samples=True, return_times=True, error_score='raise')
    439                       for k, v in fit_params.items()])
    440 
    441     test_scores = {}
    442     train_scores = {}
    443     if parameters is not None:
--> 444         estimator.set_params(**parameters)
        estimator.set_params = <bound method MLClassifierBase.set_params of Bin...tart=False),
        require_dense=[True, True])>
        parameters = {'C': 1.0}
    445 
    446     start_time = time.time()
    447 
    448     X_train, y_train = _safe_split(estimator, X, y, train)

...........................................................................
/usr/local/lib/python3.6/dist-packages/skmultilearn/base/base.py in set_params(self=BinaryRelevance(classifier=LogisticRegression(C=...start=False),
        require_dense=[True, True]), **parameters={'C': 1.0})
    228                 setattr(self, parameter, value)
    229             else:
    230                 raise ValueError('Invalid parameter %s for estimator %s. '
    231                                  'Check the list of available parameters '
    232                                  'with `estimator.get_params().keys()`.' %
--> 233                                  (parameter, self))
        parameter = 'C'
        self = BinaryRelevance(classifier=LogisticRegression(C=...start=False),
        require_dense=[True, True])
    234 
    235 
    236         parameters_below_current_level = [x for x in parameters if '__' in x]
    237         parameters_grouped_by_current_level = {object : {} for object in valid_params}

ValueError: Invalid parameter C for estimator BinaryRelevance(classifier=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        require_dense=[True, True]). Check the list of available parameters with `estimator.get_params().keys()`.
___________________________________________________________________________

In [None]:
# Fit individual BernoulliNB models for each target with Count Vectorizer
bnb_countvec = BinaryRelevance(BernoulliNB())
bnb_countvec.fit(X_train_val_countvec, y_train_val)
predictions_bnb_countvec = bnb_countvec.predict_proba(X_test_countvec)

# Fit individual GaussianNB models for each target with Count Vectorizer
gnb_countvec = BinaryRelevance(GaussianNB())
gnb_countvec.fit(X_train_val_countvec, y_train_val)
predictions_gnb_countvec = gnb_countvec.predict_proba(X_test_countvec)

# Fit individual GaussianNB models for each target with TF-IDF Vectorizer
gnb_tfidf = BinaryRelevance(GaussianNB())
gnb_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_gnb_tfidf = gnb_tfidf.predict_proba(X_test_tfidf)

# Fit individual Logistic Regression+l1 models for each target with TF-IDF Vectorizer
lr1_tfidf = BinaryRelevance(LogisticRegression(penalty='l1'))
lr1_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_lr1_tfidf = lr1_tfidf.predict_proba(X_test_tfidf)

# Fit individual Logistic Regression+l2 models for each target with TF-IDF Vectorizer
lr2_tfidf = BinaryRelevance(LogisticRegression(penalty='l2'))
lr2_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_lr2_tfidf = lr2_tfidf.predict_proba(X_test_tfidf)

# Fit RandomForestClassifier for each target with Count Vectorizer
rf_countvec = RandomForestClassifier(n_estimators=100)
rf_countvec.fit(X_train_val_countvec, y_train_val)
probabilities_rf_countvec = rf_countvec.predict_proba(X_test_countvec)

# Fit RandomForestClassifier for each target with TF-IDF Vectorizer
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train_val_tfidf, y_train_val)
probabilities_rf_tfidf = rf_tfidf.predict_proba(X_test_tfidf)

# # Fit XGBClassifier for each target with Count Vectorizer
# xgb_countvec = BinaryRelevance(XGBClassifier(n_estimators=1))
# xgb_countvec.fit(X_train_val_countvec, y_train_val)
# predictions_xgb_countvec = xgb_countvec.predict_proba(X_test_countvec)

# # Fit XGBClassifier for each target with TF-IDF Vectorizer
# xgb_tfidf = BinaryRelevance(XGBClassifier(n_estimators=1))
# xgb_tfidf.fit(X_train_val_tfidf, y_train_val)
# predictions_xgb_tfidf = xgb_tfidf.predict_proba(X_test_tfidf)


# # Fit individual GaussianNB models for each target with TF-IDF Vectorizer
# scaler = preprocessing.StandardScaler().fit(X_train_val_tfidf)
# X_train_val_tfidf = scaler.transform(X_train_val_tfidf)
# X_test_tfidf = scaler.transform(X_test_tfidf)
# svc_tfidf = BinaryRelevance(SVC(kernel='linear'))
# svc_tfidf.fit(X_train_val_tfidf, y_train_val)
# predictions_svc_tfidf = svc_tfidf.predict_proba(X_test_tfidf)

In [None]:
# Predict probabilities of each class for Count Vectorizer (BernoulliNB)
probabilities_bnb_countvec = np.squeeze(np.asarray(predictions_bnb_countvec.todense()))

# Predict probabilities of each class for Count Vectorizer and TF-IDF Vectorizer (GaussianNB)
probabilities_gnb_countvec = np.squeeze(np.asarray(predictions_gnb_countvec.todense()))
probabilities_gnb_tfidf = np.squeeze(np.asarray(predictions_gnb_tfidf.todense()))


# Predict probabilities of each class for TF-IDF Vectorizer (LogisticRegression+l1)
probabilities_lr1_tfidf = np.squeeze(np.asarray(predictions_lr1_tfidf.todense()))

# Predict probabilities of each class for TF-IDF Vectorizer (LogisticRegression+l2)
probabilities_lr2_tfidf = np.squeeze(np.asarray(predictions_lr2_tfidf.todense()))

# # Predict probabilities of each class for TF-IDF Vectorizer (SVC(linear kernel))
# probabilities_svc_tfidf = np.squeeze(np.asarray(predictions_svc_tfidf.todense()))

In [None]:
# Compute ROC-AUC values of each class for Count Vectorizer (BernoulliNB)
auc_bnb_countvec = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_bnb_countvec[:,i])
    auc_value = auc(fpr, tpr)
    auc_bnb_countvec.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for BernoulliNB with Count Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for Count Vectorizer (GaussianNB)
auc_gnb_countvec = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_gnb_countvec[:,i])
    auc_value = auc(fpr, tpr)
    auc_gnb_countvec.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for GaussianNB with Count Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for TfIdf Vectorizer (GaussianNB)
auc_gnb_tfidf = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_gnb_tfidf[:,i])
    auc_value = auc(fpr, tpr)
    auc_gnb_tfidf.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for GaussianNB with TfIdf Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for TfIdf Vectorizer (LogisticRegression+l1)
auc_lr1_tfidf = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_lr1_tfidf[:,i])
    auc_value = auc(fpr, tpr)
    auc_lr1_tfidf.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for LogisticRegression+l1 with TfIdf Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for TfIdf Vectorizer (LogisticRegression+l2)
auc_lr2_tfidf = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_lr2_tfidf[:,i])
    auc_value = auc(fpr, tpr)
    auc_lr2_tfidf.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for LogisticRegression+l2 with TfIdf Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for Count Vectorizer (RandomForestClassifier)
auc_rf_countvec = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_rf_countvec[i][:,1])
    auc_value = auc(fpr, tpr)
    auc_rf_countvec.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for Random Forest with TfIdf Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# Compute ROC-AUC values of each class for TfIdf Vectorizer (RandomForestClassifier)
auc_rf_tfidf = []
plt.figure(figsize=(10,8))
for i, col in enumerate(target_cols):
    fpr, tpr, threshold = roc_curve(y_test[col], probabilities_rf_tfidf[i][:,1])
    auc_value = auc(fpr, tpr)
    auc_rf_tfidf.append(auc_value)
    plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('ROC Curve for Random Forest with TfIdf Vectorizer')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
plt.show()

# # Compute ROC-AUC values of each class for Count Vectorizer (XGBClassifier)
# auc_xgb_countvec = []
# plt.figure(figsize=(10,8))
# for i, col in enumerate(target_cols):
#     fpr, tpr, threshold = roc_curve(y_test[col], probabilities_xgb_countvec[:,i])
#     auc_value = auc(fpr, tpr)
#     auc_xgb_countvec.append(auc_value)
#     plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
# plt.xlabel('fpr')
# plt.ylabel('tpr')
# plt.title('ROC Curve for XGBoost with Count Vectorizer')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.show()

# # Compute ROC-AUC values of each class for TfIdf Vectorizer (XGBClassifier)
# auc_xgb_tfidf = []
# plt.figure(figsize=(10,8))
# for i, col in enumerate(target_cols):
#     fpr, tpr, threshold = roc_curve(y_test[col], probabilities_xgb_tfidf[:,i])
#     auc_value = auc(fpr, tpr)
#     auc_xgb_tfidf.append(auc_value)
#     plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
# plt.xlabel('fpr')
# plt.ylabel('tpr')
# plt.title('ROC Curve for XGBoost with TfIdf Vectorizer')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.show()

# # Compute ROC-AUC values of each class for TfIdf Vectorizer (SVC(linear kernel))
# auc_svc_tfidf = []
# plt.figure(figsize=(10,8))
# for i, col in enumerate(target_cols):
#     fpr, tpr, threshold = roc_curve(y_test[col], probabilities_svc_tfidf[:,i])
#     auc_value = auc(fpr, tpr)
#     auc_svc_tfidf.append(auc_value)
#     plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
# plt.xlabel('fpr')
# plt.ylabel('tpr')
# plt.title('ROC Curve for LogisticRegression with TfIdf Vectorizer')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
# plt.show()

In [None]:
# Get mean column-wise ROC-AUC values
print('ROC-AUC for BernoulliNB with Count Vectorizer = {:.4f}'.format(np.mean(auc_bnb_countvec)))
print('ROC-AUC for GaussianNB with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_gnb_tfidf)))
print('ROC-AUC for GaussianNB with Count Vectorizer = {:.4f}'.format(np.mean(auc_gnb_countvec)))
print('ROC-AUC for RandomForest with Count Vectorizer = {:.4f}'.format(np.mean(auc_rf_countvec)))
print('ROC-AUC for RandomForest with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_rf_tfidf)))
print('ROC-AUC for LogisticRegression+l1 with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_lr1_tfidf)))
print('ROC-AUC for LogisticRegression+l2 with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_lr2_tfidf)))
# print('ROC-AUC for XGBoost with Count Vectorizer = {:.4f}'.format(np.mean(auc_xgb_countvec)))
# print('ROC-AUC for XGBoost with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_xgb_tfidf)))
# print('ROC-AUC for SVC with TfIdf Vectorizer = {:.4f}'.format(np.mean(auc_svc_tfidf)))

In [None]:
aucs = pd.DataFrame()
aucs['bnb_countvec'] = auc_bnb_countvec
aucs['gnb_tfidfvec'] = auc_gnb_tfidf
aucs['gnb_countvec'] = auc_gnb_countvec
aucs['rf_countvec'] = auc_rf_countvec
aucs['rf_tfidfvec'] = auc_rf_tfidf
aucs['lr1_tfidfvec'] = auc_lr1_tfidf
aucs['lr2_tfidfvec'] = auc_lr2_tfidf
# aucs['xgb_countvec'] = auc_xgb_countvec
# aucs['xgb_tfidfvec'] = auc_xgb_tfidf
# aucs['svc_tfidfvec'] = auc_svc_tfidf

aucs = aucs.T
aucs.columns = target_cols
aucs['mean'] = np.mean(aucs[target_cols], axis=1)
aucs