In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

### Environment variables and configuration

In [None]:
# Probability cut-off: scores at or above this value are treated as a positive prediction.
P_THRESHOLD = 0.5

In [None]:
# Root data directory, taken from the DATADIR environment variable.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    # Fail early with a clear message instead of a TypeError from os.path.join(None, ...).
    raise EnvironmentError("The DATADIR environment variable must be set before running this notebook")

# Dated sub-directory of DATADIR that the result files below are read from.
RESULTS_DIR = os.path.join(DATADIR, "2018-03-05")
RESULTS_DIR

### Get some data about taxons/content

In [None]:
# Load the labelled content table (gzip-compressed CSV), reading every column as object dtype.
labelled_level2_path = os.path.join(DATADIR, 'labelled_level2.csv.gz')
labelled_level2 = pd.read_csv(labelled_level2_path, compression='gzip', dtype=object)

In [None]:
# Rows whose level-1 taxon is 'World' get a dedicated level-2 label ('world_level1'),
# in case any such items were not identified through doc type in clean_content.
is_world = labelled_level2['level1taxon'] == 'World'
labelled_level2.loc[is_world, 'level2taxon'] = 'world_level1'

In [None]:
# Turn the level-2 taxon strings into a pandas categorical.
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Category codes are zero-based; shift by one so the numerical targets run from 1
# to the number of level-2 taxons. (pandas gives missing values the code -1, i.e. 0 after the shift.)
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from numerical taxon code to its string label, used in model evaluation.
labels_index = {
    code: label
    for code, label in zip(labelled_level2['level2taxon_code'], labelled_level2['level2taxon'])
}

### Load in data

In [None]:
# True labels and predicted probabilities for the training set (gzip-compressed CSVs read as floats).
true_train = pd.read_csv(os.path.join(RESULTS_DIR, 'true_train_1010_0803_.csv.gz'), dtype=float, compression='gzip')
# Read with the same dtype/compression options as the other result files.
prob_train = pd.read_csv(os.path.join(RESULTS_DIR, 'train_results_1010_0803_.csv.gz'), dtype=float, compression='gzip')

In [None]:
# Development-set counterparts: true labels and predicted probabilities, read as floats.
dev_read_options = dict(dtype=float, compression='gzip')
true_dev = pd.read_csv(os.path.join(RESULTS_DIR, 'true_dev_1010_0803_.csv.gz'), **dev_read_options)
prob_dev = pd.read_csv(os.path.join(RESULTS_DIR, 'dev_results_1010_0803_.csv.gz'), **dev_read_options)

In [None]:
# Open the .npz archive of arrays saved for the training set.
train_arrays_path = os.path.join(RESULTS_DIR, 'train_arrays.npz')
train = np.load(train_arrays_path)

In [None]:
# Open the .npz archive of arrays saved for the development set.
dev_arrays_path = os.path.join(RESULTS_DIR, 'dev_arrays.npz')
dev = np.load(dev_arrays_path)

In [None]:
# List the names of the arrays stored in the training archive.
train.files

In [None]:
# Inspect the content_id array from the training archive.
train['content_id']

In [None]:
# Shape of the content_id array, to compare with the row count of true_train in the next cell.
train['content_id'].shape

In [None]:
# Number of rows in the true-label frame for the training set.
len(true_train)

### Merge results array with labelled_level2 values
Attach `content_id` and the other metadata columns from `labelled_level2` to the predicted probabilities.

In [None]:
# Start the results frame with the content ids, but only when there is exactly
# one id per row of true_train; otherwise leave it empty and warn.
n_content_ids = train['content_id'].shape[0]
n_true_rows = true_train.shape[0]

df = pd.DataFrame()
if n_content_ids != n_true_rows:
    print("warning: true_train and content_id may not originate from same data")
else:
    df['content_id'] = train['content_id']

In [None]:
# Preview the frame after adding the content ids.
df.head()

In [None]:
# Place the predicted probabilities next to the content ids (column-wise),
# keeping only the index labels present in both frames.
# NOTE: re-running this cell concatenates the probability columns again.
df = pd.concat(objs=[df, prob_train], join='inner', axis=1)

In [None]:
# Preview the frame after adding the probability columns.
df.head()

In [None]:
# Left-join the columns of labelled_level2 onto the results by content_id.
# indicator=True adds a `_merge` column recording whether each row found a match;
# validate='m:m' is accepted by pandas but performs no uniqueness check.
merged_probs = df.merge(
    labelled_level2,
    how='left',
    on='content_id',
    validate='m:m',
    indicator=True,
)

In [None]:
# Preview the merged frame.
merged_probs.head()

### Evaluate model

#### Training metrics

In [None]:
# Work on an independent (deep) copy so that prob_train keeps the raw probabilities.
y_pred = prob_train.copy()

In [None]:
# Binarise in place: probabilities at or above the threshold become 1 ...
at_or_above = y_pred >= P_THRESHOLD
y_pred[at_or_above] = 1
# ... and whatever is still below the threshold afterwards becomes 0.
below = y_pred < P_THRESHOLD
y_pred[below] = 0

In [None]:
# Print (precision, recall, f-score, support) on the training set for each averaging mode.
for heading, averaging in (('micro', 'micro'), ('macro', 'macro'), ('weightedmacro', 'weighted')):
    scores = precision_recall_fscore_support(true_train, y_pred, average=averaging, sample_weight=None)
    print('{}: {}'.format(heading, scores))

In [None]:
# average=None returns the metrics per label rather than a single averaged figure.
eval_metrics = precision_recall_fscore_support(
    true_train,
    y_pred,
    average=None,
    sample_weight=None,
)
# One row per returned array (precision, recall, f-score, support), one column per label.
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
# Flip to one row per label and one column per metric.
train_metrics = eval_metrics_df.T
train_metrics.columns = ['precision', 'recall', 'f1', 'support']
# Row positions are zero-based; add 1 to line up with the 1-based taxon codes in labels_index.
# NOTE(review): assumes the label columns are ordered by taxon code — confirm.
train_metrics['level2code'] = train_metrics.index + 1
train_metrics['level2label'] = train_metrics['level2code'].map(labels_index)

#### Development set metrics

In [None]:
# Work on an independent (deep) copy so that prob_dev keeps the raw probabilities.
pred_dev = prob_dev.copy(deep=True)

In [None]:
# Binarise in place: probabilities at or above the threshold become 1 ...
dev_at_or_above = pred_dev >= P_THRESHOLD
pred_dev[dev_at_or_above] = 1
# ... and whatever is still below the threshold afterwards becomes 0.
dev_below = pred_dev < P_THRESHOLD
pred_dev[dev_below] = 0

In [None]:
# Print (precision, recall, f-score, support) on the development set for each averaging mode.
for heading, averaging in (('micro', 'micro'), ('macro', 'macro'), ('weightedmacro', 'weighted')):
    scores = precision_recall_fscore_support(true_dev, pred_dev, average=averaging, sample_weight=None)
    print('{}: {}'.format(heading, scores))

In [None]:
# average=None returns the metrics per label rather than a single averaged figure.
# NOTE: this reuses (overwrites) the eval_metrics / eval_metrics_df names from the training section.
eval_metrics = precision_recall_fscore_support(
    true_dev,
    pred_dev,
    average=None,
    sample_weight=None,
)
# One row per returned array (precision, recall, f-score, support), one column per label.
eval_metrics_df = pd.DataFrame(list(eval_metrics))

In [None]:
# Flip to one row per label and one column per metric.
dev_metrics = eval_metrics_df.T
dev_metrics.columns = ['precision', 'recall', 'f1', 'support']
# Row positions are zero-based; add 1 to line up with the 1-based taxon codes in labels_index.
# NOTE(review): assumes the label columns are ordered by taxon code — confirm.
dev_metrics['taxon2code'] = dev_metrics.index + 1
dev_metrics['taxon2label'] = dev_metrics['taxon2code'].map(labels_index)

In [None]:
# How many labels have a dev-set F1 of exactly 0?
len(dev_metrics.loc[dev_metrics['f1'] == 0])

In [None]:
# How many labels have a dev-set F1 of exactly 1?
len(dev_metrics.loc[dev_metrics['f1'] == 1])

In [None]:
# Show the labels with a dev-set F1 of exactly 1.
has_perfect_f1 = dev_metrics['f1'] == 1
dev_metrics.loc[has_perfect_f1]

In [None]:
# Labels whose dev-set F1 is above 0.9.
high = dev_metrics.loc[dev_metrics['f1'] > 0.9]

In [None]:
# Labels whose dev-set F1 is above 0.8 (plotted with annotations further down).
morethan80 = dev_metrics.loc[dev_metrics['f1'] > 0.8]

In [None]:
# How many labels have a dev-set F1 below 0.6?
len(dev_metrics.loc[dev_metrics['f1'] < 0.6])

In [None]:
# NOTE(review): `keep_perfect_performing_taxons` was never assigned anywhere in this notebook,
# so this cell raised NameError on a fresh kernel. It is rebuilt here as the thresholded dev
# predictions restricted to the label columns whose dev-set F1 is exactly 1. That is an
# assumption based on the variable name — confirm against the original intent.
perfect_positions = dev_metrics.index[dev_metrics['f1'] == 1]
keep_perfect_performing_taxons = pred_dev.iloc[:, perfect_positions].copy()

# Attach the content ids only when there is exactly one id per row of the dev predictions.
if dev['content_id'].shape[0] == keep_perfect_performing_taxons.shape[0]:
    keep_perfect_performing_taxons['content_id'] = dev['content_id']
else:
    # The message previously named true_train although this check is about the dev set.
    print("warning: dev predictions and content_id may not originate from same data")

#### F1 scores

In [None]:
# Micro average on the training set: calculated globally by counting the total
# true positives, false negatives and false positives.
# Uses true_train, the training labels loaded above (y_train is not defined in this notebook).
precision_recall_fscore_support(true_train, y_pred, average='micro', sample_weight=None)

In [None]:
# Macro average on the development set: calculate metrics for each label and take their
# unweighted mean. This does not take label imbalance into account.
# Uses true_dev / pred_dev, the names defined above (y_dev / y_pred_dev are not defined in this notebook).
precision_recall_fscore_support(true_dev, pred_dev, average='macro', sample_weight=None)

In [None]:
# Weighted average on the development set: calculate metrics for each label and average them
# weighted by support (the number of true instances per label), which accounts for label imbalance.
# Uses true_dev / pred_dev, the names defined above (y_dev / y_pred_dev are not defined in this notebook).
precision_recall_fscore_support(true_dev, pred_dev, average='weighted', sample_weight=None)

## Support and performance

The support is the number of occurrences of each class in y_true

In [None]:
# Scatter of dev-set F1 against support for every label.
dev_metrics.plot(kind='scatter', x='support', y='f1', marker='o', alpha=.5, figsize=(20, 20))


In [None]:
# Scatter of F1 against support for the labels with F1 above 0.8,
# with each point annotated by its taxon label.
ax = morethan80.plot.scatter(x='support', y='f1', figsize=(20, 10))

point_data = zip(morethan80['taxon2label'], morethan80['support'], morethan80['f1'])
for taxon_label, support_value, f1_value in point_data:
    ax.annotate(
        taxon_label,
        xy=(support_value, f1_value),
        xytext=(-1, 1),
        textcoords='offset points',
        ha='right',
        va='bottom',
    )
plt.show()

#### Predictions

In [None]:
# Load the predictions-with-metadata table (gzip-compressed CSV), every column as object dtype.
predictions_path = os.path.join(RESULTS_DIR, 'predictions_meta.csv.gz')
predictions = pd.read_csv(predictions_path, compression='gzip', dtype=object)

In [None]:
# Preview the first rows of the predictions table.
predictions.head()

In [None]:
# List the available columns before selecting a subset of them.
predictions.columns

In [None]:
# Keep only the columns of interest, in this order.
prediction_columns = [
    'content_id', 'prob', 'taxon2label', 'base_path', 'title', 'description',
    'combined_text', 'document_type', 'first_published_at',
    'primary_publishing_organisation', 'publishing_app',
]
predictions = predictions[prediction_columns]

In [None]:
# Peek at rows from position 200 onwards (head() shows only the first five of the 200:250 slice).
predictions.iloc[200:250].head()

In [None]:
# Distinct taxon labels present in the predictions, in order of first appearance.
taxon_labels = predictions['taxon2label']
taxons = taxon_labels.unique()

In [None]:
# Show the distinct taxon labels.
taxons

In [None]:
# Empty frame with the same columns as predictions, to collect the sampled rows.
subsample = pd.DataFrame(columns=predictions.columns)

In [None]:
# Draw a random sample of about 10% of the predictions for every taxon.
#
# The per-taxon samples are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every iteration. Rebuilding `subsample` from scratch also makes
# the cell safe to re-run without duplicating rows.
SAMPLE_FRACTION = 0.1  # share of each taxon's rows to sample
SAMPLE_SEED = 0        # fixed seed so the exported sample is reproducible

sampled_frames = []
for taxon in taxons:
    taxon_spec = predictions.loc[predictions['taxon2label'] == taxon]
    sample_size = int(round(taxon_spec.shape[0] * SAMPLE_FRACTION))
    # Skip taxons too small to yield at least one sampled row.
    if 0 < sample_size < taxon_spec.shape[0]:
        print(taxon, ": SAMPLING AT:", sample_size)
        sampled_frames.append(taxon_spec.sample(n=sample_size, random_state=SAMPLE_SEED))

if sampled_frames:
    subsample = pd.concat(sampled_frames, ignore_index=True)
else:
    # Nothing sampled: keep an empty frame with the expected columns.
    subsample = pd.DataFrame(columns=predictions.columns)

In [None]:
# Display the label, title, text and path columns of the sample.
subsample[['taxon2label','title','combined_text','base_path']]

In [None]:
# Write the selected columns of the sample to a CSV in the results directory, without the index.
export_columns = ['taxon2label', 'title', 'combined_text', 'base_path']
export_path = os.path.join(RESULTS_DIR, "subsampled_predictions_10percent.csv")
subsample[export_columns].to_csv(export_path, index=False)