In [None]:
%load_ext autoreload
%autoreload 2

In [19]:
import numpy as np
import spacy
import logging
import pandas as pd
import en_core_web_sm
import plotly.express as px

from tqdm.auto import tqdm
from IPython.display import display
from transformers import AutoTokenizer
from utilities.data_processing.processing_utils import generate_word_cloud, get_labelled, get_agreement
from utilities.utils import load_annotations, input_dir, figures_dir, shared_dir, get_positive, display_image, \
    get_negative, \
    calculate_dataset_similarity, select_annotation, plot_confusion_matrix, kappa, annotations_dir, word_clouds_dir

In [20]:
# Enable tqdm's pandas integration (progress-bar variants of pandas apply-style methods).
tqdm.pandas()
# Ask spaCy to use a GPU when one is available; issued before the pipeline is loaded below.
spacy.prefer_gpu()
# English spaCy pipeline; handed to `generate_word_cloud` as `sp` further down.
sp = en_core_web_sm.load()
logging.basicConfig(level=logging.INFO)
# Show full cell contents (no column-width truncation) when frames are displayed.
pd.set_option('display.max_colwidth', None)

### Load annotations

In [None]:
# One annotation export per annotator.
annotations_one = load_annotations(filename=f'{input_dir}/geospatial_annotations_1.json')
annotations_two = load_annotations(filename=f'{input_dir}/geospatial_annotations_2.json')
annotations_three = load_annotations(filename=f'{input_dir}/geospatial_annotations_3.json')

# Preview each annotator's frame.
for annotator_frame in (annotations_one, annotations_two, annotations_three):
    display(annotator_frame.head())

# Every merge is an inner join on the index, so only rows present for all
# three annotators survive.
index_join = dict(how='inner', left_index=True, right_index=True)
annotations_df = (
    annotations_one
    .merge(annotations_two, suffixes=('_one', '_two'), **index_join)
    # After the first merge the left-hand columns already carry suffixes, so the
    # third annotator's columns are expected to arrive un-suffixed
    # (assumes each frame has exactly `text` and `label` columns - TODO confirm).
    .merge(annotations_three, suffixes=('', '_three'), **index_join)
    # Keep a single copy of the text and give the third label its own name.
    .drop(columns=['text_two', 'text'])
    .rename(columns={'text_one': 'text', 'label': 'label_three'})
)

annotations_df.head()

### Calculate annotator agreement

In [22]:
# One independent, initially empty list per agreement outcome.
# NOTE(review): nothing in the visible notebook reads or appends to this dict - confirm it is still needed.
validation_agreement = {outcome: [] for outcome in ('pp', 'nn', 'pn', 'np')}

In [None]:
def report_pairwise_agreement(df, first, second, tag):
    """Plot and print the agreement between two annotators.

    Replaces three copy-pasted cells that differed only in the pair of label
    columns compared and in the output filename suffix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one label column per annotator, named ``label_<annotator>``.
    first, second : str
        Annotator suffixes, e.g. ``'one'`` and ``'two'``. In the counts below the
        first letter refers to ``first`` and the second letter to ``second``
        (``pn`` = ``first`` said 1, ``second`` said 0).
    tag : str
        Suffix appended to ``annotator_agreement_validation_`` in the output filename.

    Returns
    -------
    tuple of int
        ``(pp, nn, pn, np)`` cell counts, in the order they are passed to ``kappa``.
    """
    first_labels = df[f'label_{first}']
    second_labels = df[f'label_{second}']

    def count(first_value, second_value):
        # Number of rows where the two annotators gave exactly this pair of labels.
        return int(((first_labels == first_value) & (second_labels == second_value)).sum())

    n_pp, n_nn, n_pn, n_np = count(1, 1), count(0, 0), count(1, 0), count(0, 1)

    # The counts are arranged as [[nn, pn], [np, pp]], in the same order as group_names.
    plot_confusion_matrix(np.asarray([n_nn, n_pn, n_np, n_pp]).reshape((2, 2)),
                          filename=f'{annotations_dir}/annotator_agreement_validation_{tag}',
                          group_names=['TN', 'FP', 'FN', 'TP'])
    print(kappa(n_pp, n_nn, n_pn, n_np))
    return n_pp, n_nn, n_pn, n_np


# Compare every pair of annotators; the cross_* names keep the values of the last pair,
# as they did when the three comparisons were written out by hand.
for first, second, tag in (('one', 'two', '1_2'), ('one', 'three', '1_3'), ('two', 'three', '2_3')):
    cross_pp, cross_nn, cross_pn, cross_np = report_pairwise_agreement(annotations_df, first, second, tag)

In [None]:
def _resolve_label(row):
    """Reduce one row's three annotator labels to a single label via `select_annotation`."""
    votes = [row['label_one'], row['label_two'], row['label_three']]
    return select_annotation(votes)


# Row-wise consolidation of the three annotators' labels into the final `label` column.
annotations_df['label'] = annotations_df.apply(_resolve_label, axis=1)

annotations_df.head()

In [None]:
# Divisor applied to each summed agreement count
# (presumably the number of annotators - TODO confirm against `get_agreement`).
ANNOTATOR_COUNT = 3


def _mean_agreement(y_hat, y):
    """Sum what `get_agreement` yields for this (y_hat, y) pair and divide by ANNOTATOR_COUNT."""
    return sum(get_agreement(annotations_df, y_hat=y_hat, y=y)) / ANNOTATOR_COUNT


cross_tp = _mean_agreement(1, 1)
cross_tn = _mean_agreement(0, 0)
cross_fp = _mean_agreement(1, 0)
cross_fn = _mean_agreement(0, 1)

# 2x2 layout [[tn, fp], [fn, tp]], in the same order as group_names.
agreement_matrix = np.asarray([cross_tn, cross_fp, cross_fn, cross_tp]).reshape((2, 2))
plot_confusion_matrix(agreement_matrix,
                      filename=f'{annotations_dir}/annotator_agreement_validation',
                      group_names=['TN', 'FP', 'FN', 'TP'])
print(kappa(cross_tp, cross_tn, cross_fp, cross_fn))

In [28]:
# The per-annotator labels are no longer needed once the consolidated `label` exists.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises a KeyError.
annotations_df = annotations_df.drop(columns=['label_one', 'label_two', 'label_three'], errors='ignore')

### Display the class distribution

In [None]:
# Class balance: rows labelled 0 versus all remaining rows.
total_size = annotations_df.shape[0]
negative_size = int((annotations_df['label'] == 0).sum())
positive_size = total_size - negative_size
print(f'Ratio of labels: {negative_size :.0f}:{positive_size :.0f}')

label_font = dict(family='verdana', size=26, color='black')
fig = (
    px.histogram(annotations_df, x='label')
    .update_layout(template='plotly', font=label_font, xaxis_title='class')
    # Treat the label values as categories rather than a numeric axis.
    .update_xaxes(type='category')
    # NOTE(review): the y range is fixed at 0-2200; a class with more rows would be clipped.
    .update_yaxes(range=[0, 2200], type='linear')
)

# Save the figure as HTML and as plotly JSON, then render it inline.
fig.write_html(f'{annotations_dir}/annotations.html')
fig.write_json(f'{annotations_dir}/annotations.json')
fig.show()

### Merge annotations with geospatial data

In [None]:
# NOTE(review): unpickling executes arbitrary code - only load this file from a trusted location.
geo_raw = pd.read_pickle(f'{shared_dir}/geospatial.pickle')

# Keep the first row for every repeated index value so the index is unique.
repeated_index = geo_raw.index.duplicated(keep='first')
geo_df = geo_raw.loc[~repeated_index].copy()
geo_df.head()

In [None]:
# Left join on the index keeps every geospatial row; rows without annotations get missing values.
# Overlapping annotation columns receive the `_annotation` suffix, and the duplicated text copy is dropped.
geo_df = (
    geo_df
    .merge(annotations_df, how='left', left_index=True, right_index=True, suffixes=(None, '_annotation'))
    .drop(columns='text_annotation')
)
geo_df.head()

### Build word clouds for both classes

In [None]:
# Build the cloud from every positive sample. The previous `.head(50)` truncation made the
# figure depend on row order and left it incomparable with the negative-class cloud,
# which is generated from the whole class.
word_cloud = generate_word_cloud(get_positive(geo_df), sp=sp)
display_image(word_cloud, filename=f'{word_clouds_dir}/positive_wordcloud.png')

In [None]:
# Word cloud over all negative-class rows, rendered and written to the word-cloud directory.
negative_rows = get_negative(geo_df)
word_cloud = generate_word_cloud(negative_rows, sp=sp)
display_image(
    word_cloud,
    filename=f'{word_clouds_dir}/negative_wordcloud.png',
)

### Calculate Jaccard similarity between the classes' text

In [None]:
# Tokenizer is loaded from the shared directory and used only for this comparison.
tokenizer = AutoTokenizer.from_pretrained(f'{shared_dir}/tokenizer/')

positive_rows = get_positive(geo_df)
negative_rows = get_negative(geo_df)
similarity = calculate_dataset_similarity(tokenizer, positive_rows, negative_rows)
print(f'Jaccard Similarity between positive and negative classes: {similarity}')

# Drop the notebook's reference to the tokenizer now that it is no longer needed.
del tokenizer

### Save labelled geospatial data to file

In [None]:
# Subset of the merged frame returned by `get_labelled` (presumably the rows that carry a label).
labelled = get_labelled(geo_df)
labelled_size = len(labelled)
print(f'Size of Labelled Dataset: {labelled_size}')

In [None]:
# Persist the labelled subset as a pickle in the shared directory.
labelled_path = f'{shared_dir}/labelled.pickle'
print('Saving Labelled Dataset')
labelled.to_pickle(labelled_path)