# Final Experiments - Multi-label MLP-GPU

## Utilities and Imports

In [1]:
%reload_ext autoreload
%autoreload 2

import itertools
from collections import Counter
import random
import numpy as np
import pickle
from operator import itemgetter
import matplotlib
from matplotlib import pyplot as plt

%matplotlib inline
# matplotlib.rcParams['figure.figsize'] = [5, 10]

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.metrics import hamming_loss, make_scorer, confusion_matrix
from sklearn.svm import LinearSVC, SVC
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

from fastai import text as ft
from fastai import dataloader as fd
from fastai import dataset as fs
from fastai import learner as fl
from fastai import core as fc
from fastai import metrics as fm


from skai.runner import TextRunner, Adam_lambda
from skai.mwrapper import MWrapper, SKModel
from skai.utils import multi_to_text_out, multilabel_prediction, weights_init
from skai.dataset import SimpleDataset, SimpleBoWDataset
from skai.metrics import f1_micro_skai

def mapt(f, *iters):
    """Map ``f`` over ``iters`` (built-in ``map`` semantics) and return a tuple."""
    return (*map(f, *iters),)

def mapl(f, *iters):
    """Map ``f`` over ``iters`` (built-in ``map`` semantics) and return a list."""
    return [*map(f, *iters)]

def manually_remove_problems(data, remove=('*special',)):
    """Drop problems that carry an excluded tag or whose text is empty.

    Args:
        data: mapping of key -> record. ``record[1][0]`` is read as the
            iterable of tags and ``record[0][0]`` is compared against ``''``.
        remove: tag (or iterable of tags) that disqualifies a problem.
            Defaults to ``('*special',)``, the previously hard-coded value.

    Returns:
        A new dict holding only the records that have none of the ``remove``
        tags and a non-empty ``record[0][0]``. ``data`` is not modified.
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(remove, str):
        remove = (remove,)
    excluded = set(remove)

    return {
        key: record
        for key, record in data.items()
        if excluded.isdisjoint(record[1][0]) and record[0][0] != ''
    }

def get_single_label_problems(data):
    '''Return the entries of ``data`` whose label list (``entry[1][0]``) has exactly one item.'''
    return {key: data[key] for key in data if len(data[key][1][0]) == 1}

def get_classwise_distribution(data):
    """Count how many times each label occurs across all entries of ``data``.

    Labels are read from ``data[key][1][0]``. The result is a plain dict
    (label -> count) whose keys appear in first-seen order.
    """
    tally = Counter(label for key in data for label in data[key][1][0])
    return dict(tally)


def get_topk_single_label_problems(data,k):
    """Keep the problems tagged with at least one of the ``k`` most frequent labels.

    Prints the full label histogram and then the set of selected labels.
    NOTE(review): despite the name, nothing here restricts the result to
    single-label problems; any problem sharing a label with the top-k set is kept.
    """
    histogram = get_classwise_distribution(data)
    print(histogram)

    # Stable sort by count, highest first; ties keep their first-seen order.
    ranked = sorted(histogram.items(), key=lambda pair: pair[1], reverse=True)
    top_labels = set(dict(ranked[:k]).keys())
    print(top_labels)

    return {key: data[key] for key in data if set(data[key][1][0]) & top_labels}

def make_text_dataset(rdata):
    """Build parallel ``(texts, labels)`` lists with one label per record.

    For every record the text is ``record[0][0]`` and the label is the first
    item of ``record[1][0]``. Records where that lookup raises ``IndexError``
    (e.g. an empty label list) are skipped.
    """
    texts, labels = [], []
    for record in rdata.values():
        try:
            first_label = record[1][0][0]
        except IndexError:
            continue
        labels.append(first_label)
        texts.append(record[0][0])
    return texts, labels

def make_multi_text_dataset(rdata):
    """Build parallel ``(texts, label_lists)`` lists for multi-label use.

    For every record the text is ``record[0][0]`` and the labels are the whole
    ``record[1][0]`` object. Records where that lookup raises ``IndexError``
    (e.g. ``record[1]`` is empty) are skipped.
    """
    texts, label_lists = [], []
    for record in rdata.values():
        try:
            record_labels = record[1][0]
        except IndexError:
            continue
        label_lists.append(record_labels)
        texts.append(record[0][0])
    return texts, label_lists

def get_class_list(labels):
    """Return the distinct values of ``labels`` as a list (order is that of a set, i.e. unspecified)."""
    distinct = set(labels)
    return [*distinct]

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Args:
        y_true: ground-truth labels, forwarded to sklearn's ``confusion_matrix``.
        y_pred: predicted labels, forwarded to sklearn's ``confusion_matrix``.
        classes: label values; passed as ``labels=`` (fixing row/column order)
            and used as the tick labels on both axes.
        normalize: when True, each row is divided by its row total. Rows with
            no true samples are left as all zeros instead of becoming NaN.
        title: title drawn above the plot.
        cmap: matplotlib colormap used by ``imshow``.

    Side effects:
        Prints the matrix to stdout and draws on the current matplotlib
        figure, which is resized to 22x16 inches. Returns ``None``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row has samples; a class listed in `classes`
        # but absent from y_true would otherwise yield 0/0 = NaN cells and a
        # NaN colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)
    fig = plt.gcf()
    fig.set_size_inches(22,16)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell text switches to white on dark cells (above half of the max value).
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are included in it.
    plt.tight_layout()

  from numpy.core.umath_tests import inner1d


## Load data

In [2]:
# Load the pickled raw problem collections and convert each one into a
# (texts, label_lists) pair via make_multi_text_dataset.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that were produced by a trusted source.
with open('data/10multi_26aug.pkl', 'rb') as f:
    top10m_raw = pickle.load(f)
with open('data/20multi_26aug.pkl', 'rb') as f:
    top20m_raw = pickle.load(f)

# Separate *_raw names keep the loaded objects distinct from the derived
# (texts, label_lists) tuples instead of rebinding one name to both.
top10m, top20m = mapt(make_multi_text_dataset, [top10m_raw, top20m_raw])

In [3]:
# Number of texts in the 10-label dataset (top10m[0] is the text list).
print(len(top10m[0]))

3737


In [4]:
# Labels attached to the first problem (top10m[1] is the list of per-problem label lists).
print(top10m[1][0])

['binary search', 'data structures', 'brute force', 'dp']


## MLP Experiments

In [5]:
class MLP(nn.Module):
    """Fully connected network: two ReLU hidden layers and a sigmoid output layer.

    Args:
        vocab_size: width of the input feature vector.
        class_num: number of output units (one sigmoid score per class).
        hidden_size: width of both hidden layers (default 200).
    """

    def __init__(self, vocab_size, class_num, hidden_size=200):
        super().__init__()
        self.fc1 = nn.Linear(vocab_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, class_num)

    def forward(self, x):
        """Return per-class sigmoid scores in (0, 1) for the input batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return torch.sigmoid(self.fc3(hidden))

### 10-multi

In [6]:
# Build the project-level text runner for the 10-label texts and label lists.
# NOTE(review): TextRunner comes from skai.runner; the meaning of the first
# argument ([None]) is not visible in this notebook — confirm against skai.
trunner = TextRunner([None], top10m[0], top10m[1], 'top10m')
# Input width for the MLP: number of entries in the vectorizer's `itos` table
# (presumably the vocabulary size — verify).
in_dim = len(trunner.alldata.tvectorizer.itos)
# Features and targets that the cross-validation loop below indexes by fold.
Xall, yall = trunner.dataset

Checkpoint reached: raw data cleaned.
multilabel classification.


In [7]:
# Input dimensionality that will be passed to MLP as vocab_size.
print(in_dim)

13580


In [16]:
# 10-fold cross-validation of the MLP on the 10-label dataset.
# Per fold: train a fresh model, reload its best checkpoint, predict the held
# out fold, and print the micro-F1 over all folds predicted so far.
runs = 1
out_dim = 10

all_preds, all_targs = [], []

for i in range(runs):
    # The shuffle seed depends on the run index, so repeated runs use
    # different but reproducible splits.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i+42)

    for j, (nontest_i, test_i) in enumerate(outer_cv.split(Xall, yall)):
        print(f'{j+1}-th Fold.')
        X_train, y_train = Xall[nontest_i], yall[nontest_i]
        X_test, y_test = Xall[test_i], yall[test_i]

        # Fresh model and weights for every fold.
        textmlp = MWrapper(MLP(in_dim, out_dim),
                           f'{i}_mlp_gpu{j}')
        textmlp.model.apply(weights_init)

        dl_train = fd.DataLoader(SimpleBoWDataset(X_train, y_train, in_dim),
                                 batch_size=32, num_workers=2,
                                 pad_idx=1, transpose=False)
        # NOTE(review): the validation loader is built from the test fold, and
        # the 'best' checkpoint reloaded below is presumably chosen on it. If
        # so, the reported scores are optimistic; consider carving a validation
        # split out of the training indices instead.
        dl_val = fd.DataLoader(SimpleBoWDataset(X_test, y_test, in_dim),
                               batch_size=32, num_workers=2,
                               pad_idx=1, transpose=False)
        modeldata = fs.ModelData(str(textmlp.path), dl_train, dl_val)
        learner = fl.Learner.from_model_data(textmlp.model,
                                             modeldata,
                                             opt_fn=Adam_lambda())
        learner.metrics = [f1_micro_skai]
        learner.fit(5e-4, 10, best_save_name='best')

        dl_test = fd.DataLoader(SimpleBoWDataset(X_test, y_test, in_dim),
                                batch_size=32, num_workers=2,
                                pad_idx=1, transpose=False)
        learner.load('best')
        preds, targs = learner.predict_dl(dl_test)
        # Binarise the model scores at a 0.5 threshold.
        preds = multilabel_prediction(preds, 0.5)

        all_preds.append(preds)
        all_targs.append(targs)

        # Concatenate the fold lists directly: folds can differ in length by
        # one sample, and np.array() on such a ragged list raises ValueError
        # on NumPy >= 1.24.
        print(f1_score(np.concatenate(all_targs, axis=0),
                       np.concatenate(all_preds, axis=0), average='micro'))

1-th Fold.
Note: Model directory for mlp_gpu0 exists.
Note: Checkpoints directory for mlp_gpu0 exists.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                
    0      0.133656   0.123855   0.173739  
    1      0.101464   0.123159   0.340613                     
    2      0.068831   0.127369   0.367896                      
    3      0.050323   0.141769   0.32142                       
    4      0.042831   0.153103   0.358759                      
    5      0.035586   0.149015   0.378039                      
    6      0.027181   0.152278   0.362421                     
    7      0.02152    0.164009   0.368968                      
    8      0.016777   0.151592   0.327226                     
    9      0.012315   0.147863   0.320118                     

0.37928007023705007
2-th Fold.
Note: Model directory for mlp_gpu1 exists.
Note: Checkpoints directory for mlp_gpu1 exists.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.132528   0.130356   0.140543  
    1      0.100953   0.125768   0.297193                     
    2      0.068866   0.129882   0.323839                     
    3      0.048128   0.140387   0.317073                      
    4      0.041373   0.159427   0.344739                     
    5      0.034913   0.148388   0.380945                     
    6      0.025803   0.166292   0.361987                     
    7      0.019906   0.155543   0.361672                      
    8      0.015195   0.162507   0.347014                      
    9      0.011426   0.164993   0.314129                    

0.38190517616354935
3-th Fold.
Note: Model directory for mlp_gpu2 exists.
Note: Checkpoints directory for mlp_gpu2 exists.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                
    0      0.132054   0.130825   0.130331  
    1      0.10049    0.125557   0.296521                    
    2      0.070161   0.130863   0.300328                      
    3      0.049767   0.144435   0.300775                     
    4      0.040082   0.143516   0.350424                     
    5      0.030856   0.154066   0.33902                      
    6      0.026763   0.15819    0.340212                      
    7      0.02294    0.177889   0.345209                     
    8      0.017784   0.16154    0.335007                     
    9      0.012039   0.157985   0.289127                     

0.3724504877327816
4-th Fold.
Note: Model directory for mlp_gpu3 exists.
Note: Checkpoints directory for mlp_gpu3 exists.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.134236   0.124963   0.195349  
    1      0.103142   0.118612   0.34024                     
    2      0.073069   0.12246    0.360292                      
    3      0.054786   0.137721   0.369605                      
    4      0.043833   0.141897   0.354921                      
    5      0.0344     0.137382   0.392144                      
    6      0.025027   0.135313   0.397811                      
    7      0.019108   0.137577   0.410065                      
    8      0.013238   0.147894   0.373691                     
    9      0.010736   0.149705   0.368268                      

0.38222222222222224
5-th Fold.
Note: Model directory for mlp_gpu4 exists.
Note: Checkpoints directory for mlp_gpu4 exists.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.131539   0.132491   0.138925  
    1      0.101369   0.129498   0.295701                    
    2      0.070883   0.133455   0.350476                      
    3      0.050082   0.141444   0.376095                      
    4      0.041048   0.146142   0.341252                     
    5      0.032065   0.154627   0.343142                     
    6      0.025099   0.169444   0.340833                      
    7      0.021047   0.162055   0.312292                     
    8      0.017266   0.158069   0.361169                     
    9      0.01405    0.161161   0.327963                     

0.38096922532720195
6-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.133119   0.118955   0.191502  
    1      0.100611   0.116793   0.332722                    
    2      0.068995   0.121522   0.370443                     
    3      0.049106   0.130021   0.379673                      
    4      0.038991   0.157601   0.334649                     
    5      0.033161   0.142031   0.363854                      
    6      0.031781   0.140614   0.407898                     
    7      0.023298   0.154061   0.369175                     
    8      0.017123   0.146169   0.374326                     
    9      0.013754   0.149588   0.381218                      

0.38522660388463803
7-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                
    0      0.135807   0.123702   0.171951  
    1      0.104481   0.117317   0.32873                     
    2      0.074178   0.124623   0.347045                     
    3      0.055383   0.146495   0.3333                       
    4      0.044061   0.130336   0.394129                      
    5      0.030799   0.138042   0.344262                      
    6      0.021097   0.145809   0.330482                     
    7      0.016578   0.151428   0.392133                     
    8      0.013762   0.15719    0.382694                      
    9      0.014443   0.155172   0.348256                     

0.3863434445995685
8-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                
    0      0.133721   0.129273   0.182047  
    1      0.10457    0.124549   0.331474                    
    2      0.074789   0.127294   0.339184                     
    3      0.050839   0.138336   0.341963                     
    4      0.044106   0.141485   0.373623                     
    5      0.034286   0.141971   0.410463                     
    6      0.024203   0.15954    0.341025                     
    7      0.018414   0.168159   0.360424                     
    8      0.015421   0.160686   0.353421                     
    9      0.012165   0.153621   0.350168                      

0.3895560207116889
9-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.132076   0.126692   0.131628  
    1      0.101647   0.123613   0.27347                      
    2      0.069592   0.133422   0.335191                     
    3      0.047905   0.148093   0.359538                     
    4      0.038856   0.147122   0.411888                     
    5      0.031889   0.146245   0.389301                     
    6      0.023506   0.143046   0.383762                     
    7      0.019451   0.145601   0.382234                     
    8      0.01541    0.158734   0.334545                     
    9      0.012068   0.158282   0.336603                      

0.3925324047204488
10-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai               
    0      0.134421   0.12965    0.071183  
  0%|          | 0/106 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


    1      0.103927   0.125368   0.279047                     
    2      0.072042   0.132858   0.328218                      
    3      0.051691   0.146659   0.30795                       
    4      0.042555   0.156581   0.343581                      
    5      0.035291   0.143799   0.342388                     
    6      0.029574   0.1481     0.360839                     
    7      0.026538   0.154848   0.3513                        
    8      0.018324   0.161595   0.380917                     
    9      0.013191   0.157548   0.319551                     

0.39126289329981806


In [17]:
# Merge the per-fold results into one array each, along axis 0.
# np.concatenate takes the fold lists directly; wrapping them in np.array()
# first fails on NumPy >= 1.24 when folds differ in length (ragged input).
# The isinstance guards make the cell safe to re-run: concatenating an
# already-merged 2-D array again would flatten it row by row.
if isinstance(all_preds, list):
    all_preds = np.concatenate(all_preds, axis=0)
if isinstance(all_targs, list):
    all_targs = np.concatenate(all_targs, axis=0)

In [8]:
# To regenerate the cached results after a fresh training run, uncomment:
# with open('data/results/mlp-gpu_10m.pkl', 'wb') as f:
#     pickle.dump([all_preds, all_targs], f)

# Load the cached predictions/targets; the context manager closes the file.
# NOTE(review): only unpickle files produced by a trusted source.
with open('data/results/mlp-gpu_10m.pkl', 'rb') as f:
    all_preds, all_targs = pickle.load(f)

In [9]:
# Spot-check a single prediction row (index 7) of the loaded results.
print(all_preds[7])

[0 0 0 0 0 1 0 0 0 0]


In [10]:
# Summary metrics over all loaded predictions: Hamming loss plus micro- and
# macro-averaged F1.
hl = hamming_loss(all_targs, all_preds)
micro_f1, macro_f1 = (
    f1_score(all_targs, all_preds, average=mode) for mode in ('micro', 'macro')
)

In [11]:
# Report the three summary metrics, one per line. (A stray 'l' after the
# micro-F1 value was removed from the format string.)
print(f'Hamming loss = {hl}\nMicro_F1 = {micro_f1}\nMacro_F1 = {macro_f1}')

Hamming loss = 0.18793149585228794
Micro_F1 = 0.39126289329981806l
Macro_F1 = 0.34918186964295644


### 20-multi

In [12]:
# Build the project-level text runner for the 20-label texts and label lists.
# NOTE(review): TextRunner comes from skai.runner; the meaning of the first
# argument ([None]) is not visible in this notebook — confirm against skai.
trunner = TextRunner([None], top20m[0], top20m[1], 'top20m')
# Input width for the MLP: number of entries in the vectorizer's `itos` table
# (presumably the vocabulary size — verify).
in_dim = len(trunner.alldata.tvectorizer.itos)
# Features and targets that the cross-validation loop below indexes by fold.
Xall, yall = trunner.dataset

Checkpoint reached: raw data cleaned.
multilabel classification.


In [13]:
# Input dimensionality that will be passed to MLP as vocab_size.
print(in_dim)

14061


In [18]:
# 10-fold cross-validation of the MLP on the 20-label dataset.
# Per fold: train a fresh model, reload its best checkpoint, predict the held
# out fold, and print the micro-F1 over all folds predicted so far.
runs = 1
out_dim = 20

all_preds, all_targs = [], []

for i in range(runs):
    # The shuffle seed depends on the run index, so repeated runs use
    # different but reproducible splits.
    outer_cv = KFold(n_splits=10, shuffle=True, random_state=i+42)

    for j, (nontest_i, test_i) in enumerate(outer_cv.split(Xall, yall)):
        print(f'{j+1}-th Fold.')
        X_train, y_train = Xall[nontest_i], yall[nontest_i]
        X_test, y_test = Xall[test_i], yall[test_i]

        # Fresh model and weights for every fold.
        textmlp = MWrapper(MLP(in_dim, out_dim),
                           f'{i}mlp_20_gpu{j}')
        textmlp.model.apply(weights_init)

        dl_train = fd.DataLoader(SimpleBoWDataset(X_train, y_train, in_dim),
                                 batch_size=32, num_workers=2,
                                 pad_idx=1, transpose=False)
        # NOTE(review): the validation loader is built from the test fold, and
        # the 'best' checkpoint reloaded below is presumably chosen on it. If
        # so, the reported scores are optimistic; consider carving a validation
        # split out of the training indices instead.
        dl_val = fd.DataLoader(SimpleBoWDataset(X_test, y_test, in_dim),
                               batch_size=32, num_workers=2,
                               pad_idx=1, transpose=False)
        modeldata = fs.ModelData(str(textmlp.path), dl_train, dl_val)
        learner = fl.Learner.from_model_data(textmlp.model,
                                             modeldata,
                                             opt_fn=Adam_lambda())
        learner.metrics = [f1_micro_skai]
        learner.fit(5e-4, 10, best_save_name='best')

        dl_test = fd.DataLoader(SimpleBoWDataset(X_test, y_test, in_dim),
                                batch_size=32, num_workers=2,
                                pad_idx=1, transpose=False)
        learner.load('best')
        preds, targs = learner.predict_dl(dl_test)
        # Binarise the model scores at a 0.5 threshold.
        preds = multilabel_prediction(preds, 0.5)

        all_preds.append(preds)
        all_targs.append(targs)

        # Concatenate the fold lists directly: folds can differ in length by
        # one sample, and np.array() on such a ragged list raises ValueError
        # on NumPy >= 1.24.
        print(f1_score(np.concatenate(all_targs, axis=0),
                       np.concatenate(all_preds, axis=0), average='micro'))

1-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                 
    0      0.08945    0.086641   0.106773  
    1      0.071075   0.080633   0.238814                      
    2      0.054172   0.079652   0.309555                      
    3      0.041553   0.082986   0.361689                      
    4      0.035716   0.08946    0.377158                      
    5      0.029183   0.093995   0.359542                      
    6      0.023957   0.087165   0.348648                      
    7      0.01986    0.090831   0.35024                       
    8      0.014189   0.089929   0.396724                      
    9      0.011043   0.090714   0.395725                      

0.3972125435540069
2-th Fold.


HBox(children=(IntProgress(value=0, description='Epoch', max=10), HTML(value='')))

epoch      trn_loss   val_loss   f1_micro_skai                 
    0      0.090391   0.08207    0.125658  
    1      0.071848   0.07677    0.296632                      
    2      0.054092   0.077789   0.341522                      
 52%|█████▏    | 58/112 [00:00<00:00, 110.29it/s, loss=0.0461]


KeyboardInterrupt: 

In [26]:
# Merge the per-fold results into one array each, along axis 0.
# np.concatenate takes the fold lists directly; wrapping them in np.array()
# first fails on NumPy >= 1.24 when folds differ in length (ragged input).
# The isinstance guards make the cell safe to re-run: concatenating an
# already-merged 2-D array again would flatten it row by row.
if isinstance(all_preds, list):
    all_preds = np.concatenate(all_preds, axis=0)
if isinstance(all_targs, list):
    all_targs = np.concatenate(all_targs, axis=0)

In [14]:
# To regenerate the cached results after a fresh training run, uncomment:
# with open('data/results/mlp-gpu_20m.pkl', 'wb') as f:
#     pickle.dump([all_preds, all_targs], f)

# Load the cached predictions/targets; the context manager closes the file.
# NOTE(review): only unpickle files produced by a trusted source.
with open('data/results/mlp-gpu_20m.pkl', 'rb') as f:
    all_preds, all_targs = pickle.load(f)

In [15]:
# Spot-check a single prediction row (index 7) of the loaded results.
print(all_preds[7])

[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [16]:
# Summary metrics over all loaded predictions: Hamming loss plus micro- and
# macro-averaged F1.
hl = hamming_loss(all_targs, all_preds)
micro_f1, macro_f1 = (
    f1_score(all_targs, all_preds, average=mode) for mode in ('micro', 'macro')
)

In [17]:
# Report the three summary metrics, one per line. (A stray 'l' after the
# micro-F1 value was removed from the format string.)
print(f'Hamming loss = {hl}\nMicro_F1 = {micro_f1}\nMacro_F1 = {macro_f1}')

Hamming loss = 0.1167550505050505
Micro_F1 = 0.3811818242655424l
Macro_F1 = 0.31369342335343253
