In [None]:
%matplotlib inline

from __future__ import absolute_import, print_function, unicode_literals

import numpy as np
import os
import pandas as pd

from itertools import product
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, cohen_kappa_score
from tqdm import tqdm_notebook

## Feature Selection

First we check how much feature selection affects the final results.
Since the MLP obliges us to do feature selection, we need this comparison
to detect any possible inconsistency later on.

For this we need the results of handcrafted features with and without
feature selection for the classifiers:
+ Decision Tree
+ Logistic Regression
+ Naive Bayes
+ SVM

We collect the following metrics, both including monoclass
lemmas and filtering them out:
+ Accuracy
+ Macro Precision
+ Macro Recall

After that we also collect Cohen's kappa score of each classifier
vs. the ground truth, but only for lemmas with more than one class.

In [None]:
# Inputs for the feature-selection comparison. Result files are looked up as
# <classifier>_<representation>_<corpus>.csv inside `results_path`.
corpora = ['sensem', 'semeval']
classifiers = [
    'decision_tree',
    'log',
    'naive_bayes',
    'svm',
]
representations = [
    'handcrafted',
    'handcrafted_feature_selection',
]
results_path = '../../results/experiment0/'

In [None]:
# Per-lemma metrics for the handcrafted representation with and without
# feature selection: one output row per (classifier, representation,
# corpus split, lemma), written to a CSV at the end of the cell.
hand_fs_rows = []
hand_fs_columns = ['classifier', 'representation', 'corpus', 'lemma', 'num_classes',
                   'accuracy', 'macro_precision', 'macro_recall', 'kappa_score']

for classifier, representation, corpus in\
    tqdm_notebook(product(*(classifiers, representations, corpora)),
                  total=len(classifiers)*len(representations)*len(corpora)):
    path = os.path.join(results_path, '%s.csv' % ('_'.join([classifier, representation, corpus])))
    df = pd.read_csv(path)

    # Label inventory of every lemma, read from its train rows before the main
    # loop. Looking it up per lemma means the metrics no longer depend on the
    # train split being iterated right before the other splits of the same
    # lemma (otherwise the labels of the preceding lemma would be reused).
    train_labels = {
        lemma: np.unique(train_rows['true'])
        for lemma, train_rows in df[df['corpus'] == 'train'].groupby('lemma', sort=False)
    }

    for (lemma, corpus_split), lcdf in df.groupby(['lemma', 'corpus'], sort=False):
        labels = train_labels.get(lemma)
        if labels is None:
            # Lemma without train rows: fall back to the labels seen in this split.
            labels = np.unique(lcdf['true'])

        rdf = {'classifier': classifier,
               'representation': representation,
               'corpus': '%s.%s' % (corpus, corpus_split),
               'lemma': lemma,
               'num_classes': labels.shape[0],
              }
        rdf['accuracy'] = accuracy_score(lcdf.true, lcdf.prediction)
        # Macro averages are taken over the train label set, so a label that is
        # absent from this split still counts towards the average.
        rdf['macro_precision'], rdf['macro_recall'], _, _ =\
            precision_recall_fscore_support(lcdf.true, lcdf.prediction, average='macro', labels=labels)
        rdf['kappa_score'] = cohen_kappa_score(lcdf.true, lcdf.prediction, labels=labels)
        hand_fs_rows.append(rdf)

hand_fs = pd.DataFrame(hand_fs_rows, columns=hand_fs_columns)
hand_fs.to_csv('./data/handcrafted_vs_feature_selection.csv', index=False, float_format='%.2e')

## Representations

After finding out there is no difference using Feature Selection, we show the general metrics
results for all the classifiers, that is:
+ Baseline
+ Decision Tree
+ Logistic Regression
+ MLP
+ Naive Bayes
+ SVM

For this we use the following representations:
+ Feature selection of handcrafted features
+ Hashed features with only positive values
+ Hashed features with positive and negative values (not valid with Naive Bayes)

The first boxplot will have the representations as columns and the classifiers as rows
with the following metrics:
+ Accuracy
+ Macro Precision
+ Macro Recall
+ PMFC
+ RMLFC

This plot will show which is the best representation (or whether there is any difference at all).
We then use that visual information to select the representation and to discard those
algorithms that visibly perform worse.

In [None]:
# Inputs for the representation comparison. Result files are looked up as
# <classifier>_<representation>_<corpus>.csv inside `results_path`.
corpora = ['sensem', 'semeval']
representations = [
    'handcrafted_feature_selection',
    'hashed',
    'negative_hashed',
]
classifiers = [
    'baseline',
    'decision_tree',
    'log',
    'mlp_5000',
    'naive_bayes',
    'svm',
]
results_path = '../../results/experiment0'

In [None]:
# General per-lemma metrics for every available (classifier, representation,
# corpus) result file: one output row per (classifier, representation,
# corpus split, lemma), written to a CSV at the end of the cell.
metrics_rows = []
columns = ['classifier', 'representation', 'lemma', 'num_classes', 'corpus', 'accuracy',
           'macro_precision', 'macro_recall', 'pmfc', 'rmlfc', 'kappa_score', 'micro_precision', 'micro_recall',
           'weighted_precision', 'weighted_recall']

for classifier, representation, corpus in\
    tqdm_notebook(product(*(classifiers, representations, corpora)),
                  total=len(classifiers)*len(representations)*len(corpora)):
    try:
        path = os.path.join(results_path, '%s.csv' % ('_'.join([classifier, representation, corpus])))
        df = pd.read_csv(path)
    except (IOError, OSError):
        # Combinations without a result file are skipped on purpose. IOError is
        # listed too because on Python 2 it is not a subclass of OSError.
        continue

    # Label inventory of every lemma, read from its train rows before the main
    # loop. Looking it up per lemma means the metrics no longer depend on the
    # train split being iterated right before the other splits of the same
    # lemma (otherwise the labels of the preceding lemma would be reused).
    train_labels = {
        lemma: np.unique(train_rows['true'])
        for lemma, train_rows in df[df['corpus'] == 'train'].groupby('lemma', sort=False)
    }

    for (lemma, corpus_split), lcdf in df.groupby(['lemma', 'corpus'], sort=False):
        labels = train_labels.get(lemma)
        if labels is None:
            # Lemma without train rows: fall back to the labels seen in this split.
            labels = np.unique(lcdf['true'])

        rdf = {'classifier': classifier,
               'representation': representation,
               'corpus': '%s.%s' % (corpus, corpus_split),
               'lemma': lemma,
               'num_classes': labels.shape[0],
              }
        rdf['accuracy'] = accuracy_score(lcdf.true, lcdf.prediction)
        rdf['macro_precision'], rdf['macro_recall'], _, _ =\
            precision_recall_fscore_support(lcdf.true, lcdf.prediction, average='macro', labels=labels)
        rdf['micro_precision'], rdf['micro_recall'], _, _ =\
            precision_recall_fscore_support(lcdf.true, lcdf.prediction, average='micro', labels=labels)
        rdf['weighted_precision'], rdf['weighted_recall'], _, _ =\
            precision_recall_fscore_support(lcdf.true, lcdf.prediction, average='weighted', labels=labels)
        rdf['kappa_score'] = cohen_kappa_score(lcdf.true, lcdf.prediction, labels=labels)

        if labels.shape[0] > 1:
            precision, recall, _, _ =\
                precision_recall_fscore_support(lcdf.true, lcdf.prediction, average=None, labels=labels)
            # NOTE(review): the reference class is the one with the highest
            # recall, not the one with the highest train frequency. Confirm
            # this matches the intended definition of PMFC / RMLFC.
            top_class = np.argmax(recall)
            # Built-in `bool`: the `np.bool` alias no longer exists in recent NumPy.
            other_classes = np.ones(recall.shape, dtype=bool)
            other_classes[top_class] = False
            rdf['pmfc'] = precision[top_class]
            rdf['rmlfc'] = recall[other_classes].mean()
        else:  # Ill defined metrics for such case (only label available make the metrics always 1)
            rdf['pmfc'] = 1.0
            rdf['rmlfc'] = 1.0

        metrics_rows.append(rdf)

metrics_df = pd.DataFrame(metrics_rows, columns=columns)
metrics_df.to_csv('./data/experiment0_general_metrics.csv', index=False, float_format='%.2e')

## Representation selection and classifiers comparison

Once the representations have been compared, we check whether any of them is visually better than the others. If
no representation shows a real improvement, we go with the one that simplifies everything (for now),
that is, Hashed All Positive Features.

After selecting the final representation we need to do a classifier comparison to select the classifiers
to work with. Before that, we filter out those classifiers that show visually worse performance
in the previous plots (baseline and naive_bayes).

This is done using two graphics:
+ A boxplot showing the different metrics of each classifier side by side.
+ A heatmap showing the average kappa values comparing each classifier against all the others.

In [None]:
# Inputs for the classifier comparison on the selected representation. Result
# files are looked up as <classifier>_<representation>_<corpus>.csv inside
# `results_path`.
corpora = ['sensem', 'semeval']
representations = ['hashed']
classifiers = [
    'decision_tree',
    'log',
    'mlp_5000',
    'svm',
]
results_path = '../../results/experiment0'

In [None]:
def _tidy_predictions(split_df, value_column, corpus_name, tagger_name):
    """Return the rows of one corpus split in long format.

    `value_column` ('true' or 'prediction') is renamed to 'value', the file's
    'corpus' column (which holds the split name) is renamed to 'corpus_split',
    and the corpus and tagger names are attached as constant columns.
    """
    # Explicit copy: the columns are renamed and extended below, which must not
    # be done on a slice of the frame that was read from disk.
    tidy = split_df[['lemma', 'corpus', value_column]].copy()
    tidy.columns = ['lemma', 'corpus_split', 'value']
    tidy['corpus'] = corpus_name
    tidy['classifier'] = tagger_name
    return tidy[['classifier', 'corpus', 'corpus_split', 'lemma', 'value']]


# Long-format frames keyed by (corpus, corpus_split, tagger); the tagger is
# either a classifier name or 'ground_truth'.
predictions_by_key = {}

for classifier, representation, corpus in\
    tqdm_notebook(product(*(classifiers, representations, corpora)),
                  total=len(classifiers)*len(representations)*len(corpora)):
    try:
        path = os.path.join(results_path, '%s.csv' % ('_'.join([classifier, representation, corpus])))
        df = pd.read_csv(path)
    except (IOError, OSError):
        # Combinations without a result file are skipped on purpose. IOError is
        # listed too because on Python 2 it is not a subclass of OSError.
        continue

    # Grouping by a scalar key keeps `corpus_split` a plain value; a one-element
    # list would yield 1-tuples as keys on recent pandas versions.
    for corpus_split, cdf in df.groupby('corpus', sort=False):
        truth_key = (corpus, corpus_split, 'ground_truth')
        if truth_key not in predictions_by_key:
            # The ground truth is taken from the first file that provides this split.
            predictions_by_key[truth_key] = _tidy_predictions(cdf, 'true', corpus, 'ground_truth')

        predictions_by_key[(corpus, corpus_split, classifier)] =\
            _tidy_predictions(cdf, 'prediction', corpus, classifier)

predictions_df = pd.concat(list(predictions_by_key.values()))

In [None]:
# Average pairwise Cohen's kappa between every pair of taggers (the classifiers
# plus the ground truth) on the test split, per corpus. Only lemmas whose train
# ground truth has more than one label are taken into account.
columns = ['corpus', 'lemma', 't1', 't2', 'kappa_score']

ground_truth_df = predictions_df[
    (predictions_df['classifier'] == 'ground_truth') &
    (predictions_df['corpus_split'] == 'train')]

# Number of distinct train labels of each (corpus, lemma).
num_labels = {}
for (corpus, lemma), cldf in ground_truth_df.groupby(['corpus', 'lemma']):
    num_labels[(corpus, lemma)] = cldf['value'].unique().shape[0]

# The test rows get their own name instead of overwriting `predictions_df`, so
# this cell can be re-run without the train rows having been thrown away.
test_predictions_df = predictions_df[predictions_df['corpus_split'] == 'test']
test_groups = test_predictions_df.groupby(['classifier', 'corpus', 'lemma'])

# Values of every (tagger, corpus, lemma) slice, indexed once so the inner loop
# is a dictionary lookup rather than a scan of the whole frame.
test_values = {key: group['value'].values for key, group in test_groups}
taggers = test_predictions_df['classifier'].unique()

kappa_rows = []
for (t1, corpus, lemma), kdf1 in tqdm_notebook(test_groups):
    # Lemmas with a single train label (or without train ground truth) are left out.
    if num_labels.get((corpus, lemma), 0) <= 1:
        continue

    for t2 in taggers:
        t2_values = test_values.get((t2, corpus, lemma))
        if t2_values is None:
            # This tagger has no test rows for the lemma (e.g. its result file
            # was missing), so there is nothing to compare against.
            continue

        # NOTE(review): assumes both taggers list the instances of a lemma in
        # the same order -- confirm against the code writing the result files.
        kappa_rows.append({
                'corpus': corpus,
                'lemma': lemma,
                't1': t1,
                't2': t2,
                'kappa_score': cohen_kappa_score(kdf1['value'].values, t2_values)
            })

kappa_heatmap = pd.DataFrame(kappa_rows, columns=columns)
kappa_heatmap = kappa_heatmap.groupby(['corpus', 't1', 't2'])\
    .agg({'kappa_score': 'mean'}).reset_index()

kappa_heatmap.to_csv('./data/experiment0_kappa_interclassifier.csv', index=False, float_format='%.2e')