# Final Experiments - Multi-label MNB - CPU

## Utilities and Imports

In [1]:
%reload_ext autoreload
%autoreload 2

import itertools
from collections import Counter
import numpy as np
import pickle
from operator import itemgetter
import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline
# matplotlib.rcParams['figure.figsize'] = [5, 10]

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, hamming_loss
from sklearn.svm import LinearSVC, SVC
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from fastai import text as ft
from fastai import dataloader as fd
from fastai import dataset as fs
from fastai import learner as fl
from fastai import core as fc
from fastai import metrics as fm


from skai.runner import TextRunner, Adam_lambda
from skai.mwrapper import MWrapper, SKModel
from skai.utils import multi_to_text_out, vote_pred
from skai.utils import get_classification_type, weights_init, multilabel_prediction
from skai.dataset import TokenDataset, SimpleDataset


def mapt(f, *iters):
    """Apply ``f`` across ``iters`` exactly as ``map`` would and return the results as a tuple."""
    return (*map(f, *iters),)

def mapl(f, *iters):
    """Apply ``f`` across ``iters`` exactly as ``map`` would and return the results as a list."""
    return [*map(f, *iters)]

def manually_remove_problems(data, remove=('*special',)):
    """Drop problems that carry an excluded tag or whose text is empty.

    Args:
        data: dict mapping a problem key to a record in which ``record[0][0]``
            is the problem text and ``record[1][0]`` is the iterable of tags.
        remove: a tag, or an iterable of tags, that disqualifies a problem.
            Defaults to ``('*special',)``, the value that used to be hard-coded.

    Returns:
        A new dict with only the records whose tags share nothing with
        ``remove`` and whose text is not the empty string. ``data`` itself is
        not modified.
    """
    # A bare string would otherwise be split into single characters by set().
    banned = {remove} if isinstance(remove, str) else set(remove)
    return {
        key: record
        for key, record in data.items()
        if banned.isdisjoint(record[1][0]) and record[0][0] != ''
    }

def get_single_label_problems(data):
    """Return the subset of ``data`` whose tag list (``record[1][0]``) has exactly one entry."""
    return {key: data[key] for key in data if len(data[key][1][0]) == 1}

def get_classwise_distribution(data):
    """Count how many problems carry each tag.

    Every entry of ``record[1][0]`` is tallied once per occurrence. The result
    is a plain dict ``{tag: count}`` with keys in first-seen order.
    """
    tally = Counter(tag for key in data for tag in data[key][1][0])
    return dict(tally)


def get_topk_single_label_problems(data, k):
    """Keep the problems that carry at least one of the ``k`` most frequent tags.

    Prints the full tag-count dict and then the set of selected tags before
    returning the filtered dict. Problems are kept whenever any of their tags
    is among the selected ones, regardless of how many tags they have.
    """
    tag_counts = get_classwise_distribution(data)
    print(tag_counts)
    ranked = sorted(tag_counts.items(), key=itemgetter(1), reverse=True)
    top_counts = dict(ranked[:k])
    top_tags = set(top_counts.keys())
    print(top_tags)

    return {
        key: data[key]
        for key in data
        if not top_tags.isdisjoint(data[key][1][0])
    }

def make_text_dataset(rdata):
    """Build parallel lists of problem texts and their first tag.

    For each record, ``record[0][0]`` is the text and ``record[1][0][0]`` the
    label. Records whose label lookup raises ``IndexError`` (e.g. an empty tag
    list) are skipped.

    Returns:
        ``(texts, labels)`` as two lists of equal length.
    """
    texts, labels = [], []
    for record in rdata.values():
        try:
            first_tag = record[1][0][0]
        except IndexError:
            continue
        labels.append(first_tag)
        texts.append(record[0][0])
    return texts, labels

def make_multi_text_dataset(rdata):
    """Build parallel lists of problem texts and their complete tag collections.

    For each record, ``record[0][0]`` is the text and ``record[1][0]`` the tag
    collection, kept as-is. Records whose tag lookup raises ``IndexError`` are
    skipped.

    Returns:
        ``(texts, tag_lists)`` as two lists of equal length.
    """
    texts, tag_lists = [], []
    for record in rdata.values():
        try:
            tags = record[1][0]
        except IndexError:
            continue
        tag_lists.append(tags)
        texts.append(record[0][0])
    return texts, tag_lists

def make_statement_dataset(rdata):
    """Build parallel lists from field 2 of each record's text tuple and its first tag.

    For each record, ``record[0][2]`` is the text and ``record[1][0][0]`` the
    label. Records whose label lookup raises ``IndexError`` are skipped.

    Returns:
        ``(texts, labels)`` as two lists of equal length.
    """
    texts, labels = [], []
    for record in rdata.values():
        try:
            first_tag = record[1][0][0]
        except IndexError:
            continue
        labels.append(first_tag)
        texts.append(record[0][2])
    return texts, labels

def make_non_statement_dataset(rdata):
    """Build parallel lists from fields 3-5 of each record's text tuple and its first tag.

    The text of a sample is ``record[0][3]``, ``record[0][4]`` and
    ``record[0][5]`` joined by newlines; the label is ``record[1][0][0]``.
    Records whose label lookup raises ``IndexError`` are skipped.

    Returns:
        ``(texts, labels)`` as two lists of equal length.
    """
    texts, labels = [], []
    for record in rdata.values():
        try:
            first_tag = record[1][0][0]
        except IndexError:
            continue
        labels.append(first_tag)
        fields = record[0]
        texts.append(f'{fields[3]}\n{fields[4]}\n{fields[5]}')
    return texts, labels

def get_class_list(labels):
    """Return the distinct values in ``labels`` as a list (set iteration order)."""
    return [*set(labels)]

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth labels, passed to ``confusion_matrix``.
        y_pred: predicted labels, passed to ``confusion_matrix``.
        classes: label order used for the matrix rows/columns and tick labels.
        normalize: when True each row is divided by its sum, so cells are
            fractions of the true class; when False raw counts are shown.
        title: accepted for interface compatibility; the title call is
            disabled, so no title is drawn.
        cmap: matplotlib colormap used by ``imshow``.

    The figure is drawn on the current pyplot figure (resized to 22x16 in);
    nothing is returned.
    """
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has an all-zero row; keep it at 0
        # instead of letting 0/0 turn the row into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
        vmax = 1.0
    else:
        print('Confusion matrix, without normalization')
        # Raw counts need a colour scale that reaches the largest count;
        # a fixed [0, 1] range would saturate every non-zero cell.
        vmax = max(cm.max(), 1)

    print(cm)
    fig = plt.gcf()
    fig.set_size_inches(22, 16)
    plt.imshow(cm, interpolation='nearest', cmap=cmap, vmin=0.0, vmax=vmax)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=32)
    plt.yticks(tick_marks, classes, fontsize=32)

    print(cm.max())
    fmt = '.2f' if normalize else 'd'
    # Switch to white text on the dark half of the colour scale
    # (0.5 for normalized matrices, half the largest count otherwise).
    thresh = vmax / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=32)

    plt.tight_layout()
    plt.ylabel('True label', fontsize=32)
    plt.xlabel('Predicted label', fontsize=32)

  from numpy.core.umath_tests import inner1d


## Load data

In [2]:
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# Context managers make sure both file handles are closed after reading.
with open('data/10multi_26aug.pkl', 'rb') as f:
    top10m = pickle.load(f)
with open('data/20multi_26aug.pkl', 'rb') as f:
    top20m = pickle.load(f)

# Turn each raw dict into a (texts, tag_lists) pair.
top10m, top20m = mapt(make_multi_text_dataset, [top10m, top20m])

In [3]:
# Sanity check: tag collection of the first sample (index 1 of the pair holds the labels).
print(top10m[1][0])

['binary search', 'data structures', 'brute force', 'dp']


## MNB Definitions

In [4]:
# Pipeline: token counts (NLTK English stop words removed) feeding a
# one-vs-rest wrapper around MultinomialNB.
# The dict is a parameter grid keyed by pipeline step: 3 max_df values x
# 3 n-gram ranges x 4 alpha values = 36 combinations.
# NOTE(review): presumably SKModel/TextRunner run a grid search over it -- confirm in skai.
mnb = SKModel(Pipeline(
    [('countvec', CountVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB()))]),
    {'countvec__max_df': (0.25, 0.5, 0.75),
     'countvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
     'clf__estimator__alpha': [0.25, 0.5, 0.75, 1]}) 
# Rebinds `mnb` to the MWrapper around the SKModel, registered under the name 'mnb'.
mnb = MWrapper(mnb, 'mnb')

Note: Model directory for mnb exists.


## Experiments

### 10-class experiments

In [5]:
# Runner for the 10-tag data set: texts are top10m[0], tag collections top10m[1].
# NOTE(review): make_pyt_data=False presumably skips building PyTorch data -- confirm in skai.
trunner = TextRunner([mnb], top10m[0], top10m[1], 'top10m', make_pyt_data=False)
# NumPy arrays so the CV loop can select rows with fold index arrays.
Xall, yall = np.array(trunner.rdata), np.array(trunner.labels)

Checkpoint reached: raw data cleaned.
multilabel classification.


In [35]:
# Number of repetitions of the whole 10-fold procedure.
runs = 1
# Not referenced again in the visible cells.
out_dim = 10

# One prediction array and one encoded-target array per test fold.
preds_txt, targs_txt = [], []

for i in range(runs):
    # Shuffled 10-fold split; the seed depends on the run index so repeated runs differ.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i+42)
    
    # The return value (number of splits) is discarded.
    outer_cv.get_n_splits(Xall, yall)
    for j, (nontest_i, test_i) in enumerate(outer_cv.split(Xall, yall)):
        X_train, y_train = Xall[nontest_i], yall[nontest_i]
        X_test, y_test = Xall[test_i], yall[test_i]
        
        # Fit on the non-test part; the recorded output shows a 3-fold search
        # over 36 candidates per call. `score` is not used afterwards.
        clf, score = trunner.get_clf_sk(mnb, X_train, y_train)
        preds = clf.predict(X_test)
        # Encode the raw test labels with the runner's label vectorizer
        # (0/1 indicator rows, judging by the printed targs_txt[0]).
        y_test = trunner.alldata.ovectorizer.transform(y_test)
        
        preds_txt.append(preds)
        targs_txt.append(y_test)
        # Running accuracy over every fold evaluated so far (for multilabel
        # indicator input, accuracy_score is exact-match accuracy).
        print(accuracy_score(np.concatenate(targs_txt),
                             np.concatenate(preds_txt)))

{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.6s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.11764705882352941
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.1s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12032085561497326
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.6s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12566844919786097
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.2s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12366310160427807
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   27.1s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.2min finished


0.11978609625668449
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   27.6s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.2min finished


0.12076648841354724
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.7s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12337662337662338
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   28.1s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12571046472751587
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   27.9s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.12990487514863258
{'math', 'implementation', 'constructive algorithms', 'dp', 'data structures', 'greedy', 'dfs and similar', 'binary search', 'sortings', 'brute force'}
10
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   27.5s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.3min finished


0.13005084292213004


In [36]:
# Merge the per-fold arrays into one array each. The isinstance guard keeps
# this cell idempotent: calling np.concatenate again on an already merged
# 2-D array would flatten its rows into a single 1-D vector.
if isinstance(preds_txt, list):
    preds_txt = np.concatenate(preds_txt)
if isinstance(targs_txt, list):
    targs_txt = np.concatenate(targs_txt)

In [37]:
# Sanity check: encoded target row of the first evaluated sample.
print(targs_txt[0])

[0 1 0 1 0 0 0 0 1 0]


In [6]:
# To refresh the cached results after re-running the CV loop, uncomment:
# with open('data/results/mnb_10m.pkl', 'wb') as f:
#     pickle.dump([preds_txt, targs_txt], f)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle once the results are read.
with open('data/results/mnb_10m.pkl', 'rb') as f:
    preds_txt, targs_txt = pickle.load(f)

In [7]:
# Multilabel summary metrics over all pooled test folds.
hl = hamming_loss(targs_txt, preds_txt)
micro_f1 = f1_score(targs_txt, preds_txt, average='micro')
macro_f1 = f1_score(targs_txt, preds_txt, average='macro')

# The format string used to carry a stray "l" right after the micro-F1 value.
print(f'Hamming loss = {hl}\nMicro_F1 = {micro_f1}\nMacro_F1 = {macro_f1}')
# plot_confusion_matrix(trunner.y_test, predictions_sv,
#                       get_class_list(trunner.y_test))

Hamming loss = 0.1706181428953706
Micro_F1 = 0.3057491289198606l
Macro_F1 = 0.25736132541303536


### 20-class experiments

In [8]:
# Runner for the 20-tag data set: texts are top20m[0], tag collections top20m[1].
# This rebinds `trunner`, `Xall` and `yall` from the 10-class section.
# NOTE(review): make_pyt_data=False presumably skips building PyTorch data -- confirm in skai.
trunner = TextRunner([mnb], top20m[0], top20m[1], 'top20m', make_pyt_data=False)
# NumPy arrays so the CV loop can select rows with fold index arrays.
Xall, yall = np.array(trunner.rdata), np.array(trunner.labels)

Checkpoint reached: raw data cleaned.
multilabel classification.


In [41]:
# Number of repetitions of the whole 10-fold procedure.
runs = 1
# Not referenced again in the visible cells.
out_dim = 20

# One prediction array and one encoded-target array per test fold.
preds_txt, targs_txt = [], []

for i in range(runs):
    # Shuffled 10-fold split; the seed depends on the run index so repeated runs differ.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i+42)
    
    # The return value (number of splits) is discarded.
    outer_cv.get_n_splits(Xall, yall)
    for j, (nontest_i, test_i) in enumerate(outer_cv.split(Xall, yall)):
        X_train, y_train = Xall[nontest_i], yall[nontest_i]
        X_test, y_test = Xall[test_i], yall[test_i]
        
        # Fit on the non-test part; the recorded output shows a 3-fold search
        # over 36 candidates per call. `score` is not used afterwards.
        clf, score = trunner.get_clf_sk(mnb, X_train, y_train)
        preds = clf.predict(X_test)
        # Encode the raw test labels with the runner's label vectorizer
        # (0/1 indicator rows, judging by the printed targs_txt[0]).
        y_test = trunner.alldata.ovectorizer.transform(y_test)
        
        preds_txt.append(preds)
        targs_txt.append(y_test)
        # Running accuracy over every fold evaluated so far (for multilabel
        # indicator input, accuracy_score is exact-match accuracy).
        print(accuracy_score(np.concatenate(targs_txt),
                             np.concatenate(preds_txt)))

{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   35.4s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.7min finished


0.08838383838383838
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   35.3s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.07702020202020202
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   34.5s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.07912457912457913
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   35.7s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.07828282828282829
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   38.9s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.08181818181818182
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'sortings', 'brute force', 'bitmasks', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   40.4s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.7min finished


0.08038720538720538
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   36.1s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.08008658008658008
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   33.4s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.5min finished


0.07954545454545454
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   33.8s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.6min finished


0.07856341189674523
{'math', 'strings', 'implementation', 'graphs', 'constructive algorithms', 'geometry', 'dp', 'dsu', 'data structures', 'trees', 'greedy', 'dfs and similar', 'two pointers', 'binary search', 'bitmasks', 'brute force', 'sortings', 'number theory', 'probabilities', 'combinatorics'}
20
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:   35.0s
[Parallel(n_jobs=5)]: Done 108 out of 108 | elapsed:  1.5min finished


0.07929292929292929


In [42]:
# Merge the per-fold arrays into one array each. The isinstance guard keeps
# this cell idempotent: calling np.concatenate again on an already merged
# 2-D array would flatten its rows into a single 1-D vector.
if isinstance(preds_txt, list):
    preds_txt = np.concatenate(preds_txt)
if isinstance(targs_txt, list):
    targs_txt = np.concatenate(targs_txt)

In [43]:
# Sanity check: encoded target row of the first evaluated sample.
print(targs_txt[0])

[0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]


In [9]:
# To refresh the cached results after re-running the CV loop, uncomment:
# with open('data/results/mnb_20m.pkl', 'wb') as f:
#     pickle.dump([preds_txt, targs_txt], f)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# The context manager closes the file handle once the results are read.
with open('data/results/mnb_20m.pkl', 'rb') as f:
    preds_txt, targs_txt = pickle.load(f)

In [10]:
# Multilabel summary metrics over all pooled test folds.
hl = hamming_loss(targs_txt, preds_txt)
micro_f1 = f1_score(targs_txt, preds_txt, average='micro')
macro_f1 = f1_score(targs_txt, preds_txt, average='macro')

# The format string used to carry a stray "l" right after the micro-F1 value.
print(f'Hamming loss = {hl}\nMicro_F1 = {micro_f1}\nMacro_F1 = {macro_f1}')
# plot_confusion_matrix(trunner.y_test, predictions_sv,
#                       get_class_list(trunner.y_test))

Hamming loss = 0.10669191919191919
Micro_F1 = 0.2966539037789246l
Macro_F1 = 0.23411972853832436
