In [None]:
import pandas as pd
import numpy as np
import os
import operator

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score 
from sklearn.metrics import precision_recall_fscore_support, classification_report

import matplotlib.pyplot as plt
%matplotlib inline

import functools

import h5py

## Environmental vars

In [None]:
# Probability cut-off: a score at or above this value counts as a positive prediction.
P_THRESHOLD = 0.5

In [None]:
# Root data directory comes from the environment; fail early with a clear message
# instead of letting os.path.join(None, ...) raise an opaque TypeError.
DATADIR = os.getenv('DATADIR')
if DATADIR is None:
    raise RuntimeError("The DATADIR environment variable must be set before running this notebook.")

# All inputs and outputs of this notebook live in the dated results folder.
RESULTS_DIR = os.path.join(DATADIR, "2018-03-12")
RESULTS_DIR

## Get some data about taxons/content

In [None]:
# Labelled content table; dtype=object keeps every column as read from the file
# (no numeric or date inference).
labelled_level2 = pd.read_csv(os.path.join(RESULTS_DIR, 'labelled_level2.csv.gz'),
                              compression='gzip',
                              dtype=object)

In [None]:
# Rows whose level-1 taxon is 'World' get the placeholder level-2 label 'world_level1',
# in case any items not identified through doc type in clean_content are still present.
labelled_level2.loc[
    labelled_level2['level1taxon'].eq('World'),
    'level2taxon',
] = 'world_level1'

In [None]:
# Store level2taxon as a pandas categorical so each distinct label has an integer code.
labelled_level2['level2taxon'] = labelled_level2['level2taxon'].astype('category')

# Category codes are zero-based; shift by one so the numeric targets start at 1.
labelled_level2['level2taxon_code'] = labelled_level2['level2taxon'].cat.codes + 1

# Lookup from numeric taxon code to its string label, for use in model evaluation.
labels_index = dict(
    zip(labelled_level2['level2taxon_code'], labelled_level2['level2taxon'])
)

## Load in data

### 1. Probability and y arrays.

In [None]:
# True labels and predicted probabilities for the training split, read as floats.
true_train, prob_train = (
    pd.read_csv(os.path.join(RESULTS_DIR, csv_name), dtype=float, compression='gzip')
    for csv_name in ('true_train_1329_1203_.csv.gz', 'train_results_1329_1203_.csv.gz')
)

In [None]:
# True labels and predicted probabilities for the dev split, read as floats.
true_dev, prob_dev = (
    pd.read_csv(os.path.join(RESULTS_DIR, csv_name), dtype=float, compression='gzip')
    for csv_name in ('true_dev_1329_1203_.csv.gz', 'dev_results_1329_1203_.csv.gz')
)

### 2. Metadata, content_id, x, y arrays.
A bit redundant: the y arrays are also loaded from the CSV files in section 1.

In [None]:
# Train-split archive of named arrays; individual arrays are looked up by key below.
# NOTE(review): if any stored array has object dtype, recent NumPy versions need
# allow_pickle=True here — confirm against how the archive was written.
train = np.load(os.path.join(RESULTS_DIR, 'train_arrays.npz'))

In [None]:
# Dev-split counterpart of the archive above.
dev = np.load(os.path.join(RESULTS_DIR, 'dev_arrays.npz'))

In [None]:
# Names of the arrays stored in the train archive.
train.files

In [None]:
# Peek at the content ids of the training rows.
train['content_id']

In [None]:
# Number of training content ids ...
train['content_id'].shape

In [None]:
# ... compared with the number of rows in the true-label frame. The frames are later
# joined column-wise on their row index with join='inner', so the two counts should match.
true_train.shape[0]

## Merge results array with labelled_level2 values
content_id, metadata etc

In [None]:
def create_df_from_array(npz_array, val):
    """Return a one-column DataFrame holding ``npz_array[val]``.

    Parameters
    ----------
    npz_array : mapping-like
        Object indexable by ``val`` (e.g. a loaded ``.npz`` archive or a dict of arrays).
    val : hashable
        Key to look up in ``npz_array``; also used as the name of the resulting column.

    Returns
    -------
    pandas.DataFrame
        New frame whose only column, named ``val``, contains the looked-up values.
    """
    frame = pd.DataFrame()
    column_values = npz_array[val]
    frame[val] = column_values
    return frame

In [None]:
# Columns of labelled_level2 that get attached to the result frames and used to slice F1 scores.
metadata = [
    'document_type',
    'publishing_app',
]

In [None]:
def add_meta_to_df(meta_vars, df):
    """Attach metadata columns from ``labelled_level2`` to ``df``, in place.

    For every name in ``meta_vars`` a ``content_id -> value`` lookup is built from the
    module-level ``labelled_level2`` frame and mapped onto ``df['content_id']``.
    Content ids missing from the lookup end up as NaN in the new column. If a
    content_id occurs more than once in ``labelled_level2``, the value from its last
    row is used (later pairs overwrite earlier ones when the dict is built).

    Parameters
    ----------
    meta_vars : iterable of str
        Column names of ``labelled_level2`` to copy over.
    df : pandas.DataFrame
        Frame with a ``content_id`` column; it is modified in place.

    Returns
    -------
    None
    """
    for column_name in meta_vars:
        print("Working on:",column_name)
        id_to_value = dict(zip(labelled_level2['content_id'], labelled_level2[column_name]))
        df[column_name] = df['content_id'].map(id_to_value)

In [None]:
df_true = create_df_from_array(train,'content_id')
df_true = pd.concat([df_true, true_train], axis=1, join='inner')

In [None]:
add_meta_to_df(metadata,df_true)

In [None]:
df_prob = create_df_from_array(train,'content_id')
df_prob = pd.concat([df_prob, prob_train], axis=1, join='inner')

In [None]:
add_meta_to_df(metadata,df_prob)

In [None]:
df_true.head()

In [None]:
df_prob.head()

## Evaluate model

1. Loop over results and then compute filtered (in terms of metadata) f1 micro score. 
2. F1 score should correspond to overall taxon performance, for a specific metadata value.
3. Correlation between metadata and F1?

In [None]:
# Probability columns only: the label columns are named '1' .. '218'.
df_prob.loc[:, [str(label_code) for label_code in range(1, 219)]]

In [None]:
def filtered_f1(meta_vars, metadata, true, probs, label_cols=None):
    """Compute the micro-averaged F1 score separately for each metadata value.

    For every value in ``meta_vars`` the rows of ``probs`` and ``true`` whose
    ``metadata`` column equals that value are selected, the probabilities are
    binarised at the module-level ``P_THRESHOLD`` and a micro F1 over all label
    columns is computed.

    Parameters
    ----------
    meta_vars : sequence
        Metadata values to evaluate (e.g. the distinct document types).
    metadata : str
        Name of the metadata column present in both ``true`` and ``probs``.
    true : pandas.DataFrame
        True label indicators; must contain ``metadata`` and every label column.
    probs : pandas.DataFrame
        Predicted probabilities; must contain ``metadata`` and every label column.
    label_cols : sequence of str, optional
        Names of the label columns. Defaults to ``'1'`` .. ``'218'``, the range the
        original implementation hard-coded.

    Returns
    -------
    dict
        Maps each metadata value that has at least one row in ``probs`` to its micro
        F1 score. Values with no rows are reported via ``print`` and left out.
    """
    # Build the column list once instead of three times per loop iteration.
    if label_cols is None:
        label_cols = [str(label_code) for label_code in range(1, 219)]
    else:
        label_cols = list(label_cols)

    f1_scores = {}
    total = len(meta_vars)
    for position, meta in enumerate(meta_vars, start=1):

        print(position,"out of",total,":",meta)
        prob_rows = probs.loc[probs[metadata]==meta, label_cols]

        if prob_rows.shape[0] == 0:
            print("Metadata value",meta,"from",metadata,"not found in set.")
            continue

        # Binarise in a single comparison. The previous two-step in-place assignment
        # relied on P_THRESHOLD <= 1 (values just set to 1 had to stay >= threshold).
        # NaN probabilities compare False and therefore count as negative predictions.
        filtered_prob_array = (prob_rows.values >= P_THRESHOLD).astype(float)

        filtered_true_array = true.loc[true[metadata]==meta, label_cols].values

        print("Equal size true and pred",(len(filtered_prob_array)==len(filtered_true_array)))

        # Index 2 of the returned (precision, recall, fscore, support) tuple is the F-score.
        f1_scores[meta] = precision_recall_fscore_support(
            filtered_true_array, filtered_prob_array,
            average='micro', sample_weight=None)[2]

    return f1_scores

### F1 scores in terms of meta data

In [None]:
def get_meta_lists(metadata_var):
    """Summarise one metadata column of the module-level ``labelled_level2`` frame.

    Prints the number of distinct values, the first five (value, count) pairs and the
    first four distinct values.

    Parameters
    ----------
    metadata_var : str
        Column of ``labelled_level2`` to summarise.

    Returns
    -------
    tuple
        ``(value_counts, unique_values)``: the ``value_counts()`` Series and the array
        returned by ``unique()`` for that column.
    """
    column = labelled_level2[metadata_var]

    print("Getting values for:",metadata_var)
    print(column.nunique())

    counts = labelled_level2[metadata_var].value_counts()
    print("Frequency dict:",list(counts.items())[:5])

    distinct_values = labelled_level2[metadata_var].unique()
    print("Value array:",distinct_values[:4])

    return counts, distinct_values

In [None]:
_,doc_types = get_meta_lists("document_type")

In [None]:
f1_train_doc_type = filtered_f1(doc_types, "document_type", df_true, df_prob)

In [None]:
sorted_f1 = sorted(f1_scores.items(), key=operator.itemgetter(1))
sorted_f1

In [None]:
len(sorted_f1)

### Training metrics

### Dev set metrics

In [None]:
df_true_dev = create_df_from_array(dev,'content_id')
df_true_dev = pd.concat([df_true_dev, true_dev], axis=1, join='inner')

In [None]:
add_meta_to_df(metadata,df_true_dev)

In [None]:
# Quick look at the merged dev frame.
df_true_dev.head()

In [None]:
# Content id of the third dev row.
df_true_dev['content_id'].iloc[2]

In [None]:
# Spot check: look up one hard-coded content id in the labelled data.
# NOTE(review): presumably this is the id printed by the previous cell — confirm.
labelled_level2.loc[labelled_level2['content_id']=="33582c0d-57a3-4dc3-a601-6ef316d997af"]

In [None]:
df_prob_dev = create_df_from_array(dev,'content_id')
df_prob_dev = pd.concat([df_prob_dev, prob_dev], axis=1, join='inner')

In [None]:
add_meta_to_df(metadata,df_prob_dev)

In [None]:
freq_doc, doc_list = get_meta_lists("document_type")
freq_pub_app, pub_app_list = get_meta_lists("publishing_app")

print(doc_list[0:2],pub_app_list[0:2])

In [None]:
f1_dev_doc = filtered_f1(doc_list, "document_type", df_true_dev, df_prob_dev)
f1_dev_pub = filtered_f1(pub_app_list, "publishing_app", df_true_dev, df_prob_dev)

In [None]:
sorted(f1_dev_doc.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# One row per document type with its dev micro F1. The dict keys become the
# 'document_type' column and the single value column is named 'F1_micro'.
results_doc = (
    pd.DataFrame.from_dict(data=f1_dev_doc, orient='index')
    .rename(columns={0: 'F1_micro'})
    .reset_index()
    .rename(columns={'index': 'document_type'})
)

In [None]:
training_supp = create_df_from_array(train,'content_id')
add_meta_to_df(metadata,training_supp)

In [None]:
support_doc = training_supp['document_type'].value_counts()
support_pub = training_supp['publishing_app'].value_counts()

In [None]:
results_doc['support_doc_type'] = results_doc['document_type'].map(lambda x: support_doc[x] if x in support_doc else 0)

In [None]:
results_doc.head()

In [None]:
results_doc['F1_micro'].sort_values().plot( kind = 'barh', figsize=(20, 20))

In [None]:
results_doc['support_doc_type'].max()

In [None]:
# Dev-set micro F1 against training support, one point per document type.
doc_f1 = results_doc['F1_micro']
doc_support = results_doc['support_doc_type']

# Derive the x-axis extent from the data (rounded up to the next tick) rather than
# hard-coding it, so the plot stays correct if the support counts change.
x_step = 1000
max_support = doc_support.max() if len(doc_support) else 0
x_limit = (int(max_support) // x_step + 1) * x_step

fig, axes = plt.subplots(figsize=(30, 20))
axes.set_xlim([0, x_limit])
axes.set_ylim([0, 1.1])
axes.set_xlabel('Support Per Document Type')
axes.set_ylabel('F1 micro')

# Mutually exclusive colour groups. Previously F1 == 1 was drawn in both blue and
# yellow, and F1 == 0 in both grey and red, so the translucent markers blended.
colour_groups = [
    ((doc_f1 >= 0.8) & (doc_f1 < 1), 'b'),      # good
    ((doc_f1 > 0) & (doc_f1 < 0.8), 'grey'),    # less than 0.8
    (doc_f1 == 0, 'red'),                       # nothing predicted correctly
    (doc_f1 == 1, 'yellow'),                    # perfect score
]
for group_mask, colour in colour_groups:
    axes.scatter(doc_support[group_mask], doc_f1[group_mask], c=colour, alpha=.4)

# Label every point except the perfect scores (those were annotated with an empty string).
for label, x, y in zip(results_doc['document_type'], doc_support, doc_f1):
    if y < 1:
        axes.annotate(
            label,
            xy=(x, y), xytext=(-1, 1),
            textcoords='offset points', ha='right', va='bottom')

# linspace includes the 1.0 tick, which np.arange(0, 1, 0.1) left out.
axes.set_yticks(np.linspace(0, 1, 11))
axes.set_xticks(np.arange(0, x_limit + x_step, x_step))
plt.show()

## F1 scores

## Support and performance

## Predictions

In [None]:
predictions = pd.read_csv(os.path.join(RESULTS_DIR, 'predictions_meta.csv.gz'), dtype=object, compression='gzip')

In [None]:
predictions.head()

In [None]:
predictions.columns

In [None]:
predictions = predictions[['content_id', 'prob', 'taxon2label', 'base_path', 'title', 'description',
       'combined_text', 'document_type', 'first_published_at', 'primary_publishing_organisation', 'publishing_app']]

In [None]:
predictions[200:250].head()

In [None]:
taxons = predictions['taxon2label'].unique()

In [None]:
taxons

In [None]:
subsample = pd.DataFrame(columns = predictions.columns)

In [None]:
for taxon in taxons:
    taxon_spec = predictions.loc[predictions['taxon2label']==taxon]
    sample_size = int(round(taxon_spec.shape[0]*0.1))
    if not sample_size == 0 and taxon_spec.shape[0] > sample_size:
        print(taxon,": SAMPLING AT:",sample_size)
        subsample = subsample.append(taxon_spec.sample(n=sample_size), ignore_index=True)

In [None]:
subsample[['taxon2label','title','combined_text','base_path']]

In [None]:
subsample[['taxon2label','title','combined_text','base_path']].to_csv(os.path.join(RESULTS_DIR,"subsampled_predictions_10percent.csv"),index=False)