# Final Experiments - Multi-label SVM - Problem Statement

## Utilities and Imports

In [1]:
%reload_ext autoreload
%autoreload 2

import itertools
from collections import Counter
import numpy as np
import pickle
from operator import itemgetter
import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline
# matplotlib.rcParams['figure.figsize'] = [5, 10]

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import hamming_loss, make_scorer, confusion_matrix
from sklearn.svm import LinearSVC, SVC
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from fastai import text as ft
from fastai import dataloader as fd
from fastai import dataset as fs
from fastai import learner as fl
from fastai import core as fc
from fastai import metrics as fm


from skai.runner import TextRunner, Adam_lambda
from skai.mwrapper import MWrapper, SKModel
from skai.utils import multi_to_text_out, vote_pred
from skai.utils import get_classification_type, weights_init, multilabel_prediction
from skai.dataset import TokenDataset, SimpleDataset


def mapt(f, *iters):
    """Apply ``f`` over ``iters`` exactly as the built-in ``map`` does and return the results as a tuple."""
    mapped = map(f, *iters)
    return tuple(mapped)

def mapl(f, *iters):
    """Apply ``f`` over ``iters`` exactly as the built-in ``map`` does and return the results as a list."""
    mapped = map(f, *iters)
    return list(mapped)

def manually_remove_problems(data):
    """Return a new dict without the entries that carry an excluded tag.

    An entry is kept only when ``entry[1][0]`` (its tag collection) shares
    nothing with the exclusion list AND ``entry[0][0]`` is not the empty
    string. The input mapping is not modified.
    """
    excluded = {'*special'}
    return {
        key: entry
        for key, entry in data.items()
        if set(entry[1][0]).isdisjoint(excluded) and entry[0][0] != ''
    }

def get_single_label_problems(data):
    '''Return a new dict holding only the entries whose label collection
    (``entry[1][0]``) contains exactly one label.'''
    return {
        key: entry
        for key, entry in data.items()
        if len(entry[1][0]) == 1
    }

def get_classwise_distribution(data):
    """Count, for every class label, how many times it occurs across all entries.

    Labels are read from ``entry[1][0]`` of each value in ``data``. The result
    is a plain dict ``{label: count}`` whose keys appear in first-seen order.
    """
    tally = Counter(
        label
        for entry in data.values()
        for label in entry[1][0]
    )
    # Hand back a plain dict so callers (and printed output) see the same type as before.
    return dict(tally)


def get_topk_single_label_problems(data,k):
    """Keep the entries tagged with at least one of the ``k`` most frequent classes.

    Class frequencies come from ``get_classwise_distribution(data)``. Both the
    full frequency table and the selected class set are printed. Note that,
    despite the function name, no filtering on the number of labels happens
    here: any entry whose ``entry[1][0]`` overlaps the top-k classes is kept.
    """
    frequencies = get_classwise_distribution(data)
    print(frequencies)
    ranked = sorted(frequencies.items(), key=itemgetter(1), reverse=True)
    top_classes = dict(ranked[:k])
    print(set(top_classes.keys()))

    wanted = set(top_classes.keys())
    return {
        key: entry
        for key, entry in data.items()
        if not set(entry[1][0]).isdisjoint(wanted)
    }

def make_text_dataset(rdata):
    """Build parallel ``(texts, labels)`` lists with one label per entry.

    For every value in ``rdata`` the label is ``entry[1][0][0]`` (the first
    label) and the text is ``entry[0][0]``. Entries whose label lookup raises
    ``IndexError`` (e.g. an empty label list) are skipped.
    """
    texts, labels = [], []
    for _, entry in rdata.items():
        try:
            first_label = entry[1][0][0]
        except IndexError:
            # No label available: leave this entry out of both lists.
            continue
        text = entry[0][0]
        labels.append(first_label)
        texts.append(text)
    return texts, labels

def make_multi_text_dataset(rdata):
    """Build parallel ``(texts, label_lists)`` lists for multi-label use.

    For every value in ``rdata`` the target is the whole ``entry[1][0]``
    collection and the text is ``entry[0][0]``. Entries whose target lookup
    raises ``IndexError`` are skipped.
    """
    texts, label_lists = [], []
    for _, entry in rdata.items():
        try:
            entry_labels = entry[1][0]
        except IndexError:
            # Nothing at entry[1][0]: leave this entry out of both lists.
            continue
        text = entry[0][0]
        label_lists.append(entry_labels)
        texts.append(text)
    return texts, label_lists

def make_statement_dataset(rdata):
    """Build parallel ``(texts, label_lists)`` lists using field ``entry[0][2]`` as text.

    The target of each entry is the whole ``entry[1][0]`` collection; entries
    whose target lookup raises ``IndexError`` are skipped. Going by the
    function name, ``entry[0][2]`` is presumably the problem statement --
    confirm against the layout of the pickled data.
    """
    texts, label_lists = [], []
    for _, entry in rdata.items():
        try:
            entry_labels = entry[1][0]
        except IndexError:
            # Nothing at entry[1][0]: leave this entry out of both lists.
            continue
        text = entry[0][2]
        label_lists.append(entry_labels)
        texts.append(text)
    return texts, label_lists

def make_non_statement_dataset(rdata):
    """Build parallel ``(texts, labels)`` lists from fields 3-5 of each entry.

    The text is ``entry[0][3]``, ``entry[0][4]`` and ``entry[0][5]`` joined by
    newlines; the label is ``entry[1][0][0]`` (the first label). Entries whose
    label lookup raises ``IndexError`` are skipped.
    """
    texts, labels = [], []
    for _, entry in rdata.items():
        try:
            first_label = entry[1][0][0]
        except IndexError:
            # No label available: leave this entry out of both lists.
            continue
        fields = entry[0]
        text = f'{fields[3]}\n{fields[4]}\n{fields[5]}'
        labels.append(first_label)
        texts.append(text)
    return texts, labels

def get_class_list(labels):
    """Return the distinct values of ``labels`` as a list (order not guaranteed)."""
    distinct = set(labels)
    return list(distinct)

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred
        Passed unchanged to ``sklearn.metrics.confusion_matrix``.
    classes
        Label order used for the matrix rows/columns and for the tick labels.
    normalize
        If True, each row is divided by its row sum. Rows that sum to zero
        are left as zeros instead of becoming NaN.
    title
        Kept in the signature for existing callers; no title is drawn.
    cmap
        Colormap handed to ``plt.imshow``.

    The plot is drawn on the current pyplot figure, which is resized to
    22x16 inches. Nothing is returned.
    """
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class missing from y_true yields an all-zero row; dividing it
        # would give 0/0 = NaN, so those rows are skipped and stay at zero.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    peak = cm.max()
    # Normalised cells live in [0, 1]; raw counts need the colour scale to
    # reach the largest count, otherwise every non-zero cell saturates.
    vmax = 1.0 if normalize else float(max(peak, 1))
    fig = plt.gcf()
    fig.set_size_inches(22, 16)
    plt.imshow(cm, interpolation='nearest', cmap=cmap, vmin=0.0, vmax=vmax)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, fontsize=32)
    plt.yticks(tick_marks, classes, fontsize=32)

    print(peak)
    fmt = '.2f' if normalize else 'd'
    # Switch to white text on the darker half of the colour scale.
    thresh = vmax / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=32)

    plt.tight_layout()
    plt.ylabel('True label', fontsize=32)
    plt.xlabel('Predicted label', fontsize=32)

  from numpy.core.umath_tests import inner1d


## Load data

In [2]:
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# pickles that come from a trusted source.
# `with` closes each file handle as soon as the payload has been read.
with open('data/10multi_26aug.pkl', 'rb') as pkl_file:
    top10m = pickle.load(pkl_file)
with open('data/20multi_26aug.pkl', 'rb') as pkl_file:
    top20m = pickle.load(pkl_file)

# Each result is the (texts, label_lists) pair returned by make_statement_dataset.
top10pm, top20pm = mapt(make_statement_dataset, [top10m, top20m])

In [3]:
# Sanity check: first text of the top10m dataset (index 0 of the pair = texts).
print(top10pm[0][0])


You've got array a[1], a[2], ..., a[n], consisting of n integers. Count the number of ways to split all the elements of the array into three contiguous parts so that the sum of elements in each part is the same.
More formally, you need to find the number of such pairs of indices i, j (2 ≤ i ≤ j ≤ n - 1), that .





In [4]:
# Sanity check: label list of the first top20m entry (index 1 of the pair = labels).
print(top20pm[1][0])

['binary search', 'implementation', 'data structures']


## SVM Definitions

In [5]:
# Count-vectorised text fed into one LinearSVC per label (one-vs-rest).
svm_cv_pipeline = Pipeline([
    ('countvec', CountVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
# Parameter name -> candidate values; presumably the search space SKModel
# tunes over -- confirm in skai.mwrapper.
svm_cv_param_grid = {
    'countvec__max_df': (0.25, 0.5, 0.75),
    'countvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
svm_cv = MWrapper(SKModel(svm_cv_pipeline, svm_cv_param_grid), 'svm_cv')

Note: Model directory for svm_cv exists.


In [21]:
# NOTE(review): in the visible notebook `cv_clf` is only assigned inside the
# outer cross-validation loop cell further down, so on a fresh kernel this
# cell raises NameError unless that loop has been run first.
cv_clf.get_params()

{'memory': None,
 'steps': [('countvec',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.5, max_features=None, min_df=1,
           ngram_range=(1, 3), preprocessor=None,
           stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
           strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=None, vocabulary=None)),
  ('clf',
   OneVsRestClassifier(estimator=LinearSVC(C=0.01, class_weight='balanced', dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge

In [23]:
# NOTE(review): `tf_clf` is not assigned anywhere in the visible part of this
# notebook -- presumably left over from a TF-IDF run; confirm where it comes
# from before relying on this cell.
tf_clf.get_params()

# Parameter grid kept for reference (not executed):
# {'countvec__max_df': (0.25, 0.5, 0.75),
#     'countvec__ngram_range': [(1, 1), (1, 2), (1, 3)],
#     "clf__estimator__C": [0.01, 0.1, 1],
#     "clf__estimator__class_weight": ['balanced', None]}

{'memory': None,
 'steps': [('countvec',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=0.5, max_features=None, min_df=1,
           ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
           strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('clf',
   OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True

In [10]:
# Same pipeline shape as the earlier svm_cv cell, with max_df fixed at 0.5.
svm_cv_pipeline = Pipeline([
    ('countvec', CountVectorizer(stop_words=stop_words, max_df=0.5)),
    ('clf', OneVsRestClassifier(LinearSVC())),
])
# Single candidate per parameter, i.e. one fixed configuration.
svm_cv_param_grid = {
    'countvec__ngram_range': [(1, 3)],
    "clf__estimator__C": [0.01],
    "clf__estimator__class_weight": ['balanced'],
}
svm_cv = MWrapper(SKModel(svm_cv_pipeline, svm_cv_param_grid), 'svm_cv')

Note: Model directory for svm_cv exists.
Note: Checkpoints directory for svm_cv exists.


### 20-class experiments

In [6]:
# Runner over the top20m texts (top20pm[0]) and their label lists (top20pm[1]).
trunner = TextRunner([svm_cv], top20pm[0], top20pm[1], 'top20pm', make_pyt_data=False)
# Array copies of the runner's data and labels, so folds can be selected by index arrays.
Xall = np.array(trunner.rdata)
yall = np.array(trunner.labels)

Checkpoint reached: raw data cleaned.
multilabel classification.


In [17]:
# Outer cross-validation: fit on 9 folds, predict the held-out fold, and
# collect predictions and targets across folds for later scoring.
runs = 1
out_dim = 20  # not referenced anywhere in this cell

# preds_txt_tf is initialised here but never appended to in this cell.
preds_txt_cv, preds_txt_tf = [], []
targs_txt = []

for i in range(runs):
    # 10 shuffled folds; the seed depends on the run index so each run splits differently.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i+42)
    
    # The return value of this call is discarded.
    outer_cv.get_n_splits(Xall, yall)
    for j, (nontest_i, test_i) in enumerate(outer_cv.split(Xall, yall)):
        print(f'Outer split no. {j}')
        X_train, y_train = Xall[nontest_i], yall[nontest_i]
        X_test, y_test = Xall[test_i], yall[test_i]
        
        # Returns a fitted classifier and a score; cv_score is not used below.
        cv_clf, cv_score = trunner.get_clf_sk(svm_cv, X_train, y_train)
        
        preds = cv_clf.predict(X_test)
        preds_txt_cv.append(preds)
        
        # Convert this fold's targets with the runner's output vectorizer before
        # storing them (presumably into the same format as `preds` -- confirm).
        y_test = trunner.alldata.ovectorizer.transform(y_test)
        targs_txt.append(y_test)

        # Running score over every fold processed so far, not just this fold.
        print(accuracy_score(np.concatenate(targs_txt),
                             np.concatenate(preds_txt_cv)))

Outer split no. 0
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.1s finished


0.09090909090909091
Outer split no. 1
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    8.1s finished


0.07954545454545454
Outer split no. 2
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'sortings', 'combinatorics', 'data structures', 'graphs', 'constructive algorithms', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.3s finished


0.07744107744107744
Outer split no. 3
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    8.1s finished


0.07765151515151515
Outer split no. 4
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.3s finished


0.07727272727272727
Outer split no. 5
{'number theory', 'geometry', 'greedy', 'implementation', 'two pointers', 'bitmasks', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.9s finished


0.0787037037037037
Outer split no. 6
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'combinatorics', 'data structures', 'graphs', 'sortings', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    8.0s finished


0.07611832611832611
Outer split no. 7
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'constructive algorithms', 'brute force', 'dfs and similar', 'sortings', 'dsu', 'data structures', 'combinatorics', 'graphs', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.1s finished


0.07512626262626262
Outer split no. 8
{'number theory', 'geometry', 'bitmasks', 'implementation', 'two pointers', 'greedy', 'binary search', 'math', 'dsu', 'brute force', 'dfs and similar', 'constructive algorithms', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.0s finished


0.07463524130190796
Outer split no. 9
{'number theory', 'geometry', 'bitmasks', 'implementation', 'greedy', 'two pointers', 'binary search', 'math', 'dsu', 'dfs and similar', 'constructive algorithms', 'brute force', 'sortings', 'data structures', 'graphs', 'combinatorics', 'dp', 'probabilities', 'trees', 'strings'}
20
Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    6.5s finished


0.0755050505050505


In [18]:
# Stack the per-fold arrays into one array each. The isinstance guard keeps
# the cell idempotent: once the names hold arrays, re-running the cell would
# otherwise concatenate the rows of the 2-D result into a flat vector.
if isinstance(preds_txt_cv, list):
    preds_txt_cv = np.concatenate(preds_txt_cv)
if isinstance(targs_txt, list):
    targs_txt = np.concatenate(targs_txt)

In [7]:
# To regenerate the cached results, run the dump below once after the CV loop:
# pickle.dump([preds_txt_cv, targs_txt], open('data/results/svm-ps_20m.pkl', 'wb'))
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load
# result files you produced yourself.
with open('data/results/svm-ps_20m.pkl', 'rb') as results_file:
    preds_txt_cv, targs_txt = pickle.load(results_file)

In [8]:
# Multi-label metrics over the pooled predictions of all folds.
hl = hamming_loss(targs_txt, preds_txt_cv)
micro_f1 = f1_score(targs_txt, preds_txt_cv, average='micro')
macro_f1 = f1_score(targs_txt, preds_txt_cv, average='macro')

# The stray 'l' that used to follow the micro-F1 value has been removed from the format string.
print(f'Hamming loss = {hl}\nMicro_F1 = {micro_f1}\nMacro_F1 = {macro_f1}')

Hamming loss = 0.10957070707070707
Micro_F1 = 0.2974417098445596l
Macro_F1 = 0.2556668833871532
