In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's `utilities` package) are reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
# Shell escape: print the NVIDIA GPU status for this machine (needs `nvidia-smi` on PATH).
!nvidia-smi

In [3]:
import logging
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from transformers import AutoTokenizer
from IPython.core.display_functions import display
from utilities.classifier.nlp_model import NLPModel
from utilities.data_processing.preprocessing import PreProcessing
from utilities.classifier.model_utils import run_graph, preprocessing_graph, augmentation_graph, \
    run_cross_validation, display_cross_validation, update_key
from utilities.utils import shared_dir, read_json, annotated_dir, read_jsonl, \
    calculate_dataset_similarity, get_positive, get_negative, plot_confusion_matrix, hashtags_dir, \
    input_dir, kappa, get_cuda_availability, load_raw_annotations, annotations_dir, classifier_tuning_dir, \
    final_model_dir

In [4]:
# Register tqdm's `progress_apply` on pandas objects (used when deriving labels below).
tqdm.pandas()
# Never truncate cell text when displaying frames, so full tweets are visible.
pd.set_option('display.max_colwidth', None)

# Only let CRITICAL messages through, both globally and for the `transformers` logger.
logging.basicConfig(level=logging.CRITICAL)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.CRITICAL)

# Device handle passed as `device=` to every training/evaluation helper in this notebook.
# NOTE(review): presumably 'cuda' when a GPU is available, else CPU — confirm in utilities.utils.
DEVICE = get_cuda_availability()

### Load and pre-process labelled dataset

In [None]:
# NOTE: unpickling can execute arbitrary code — only load pickles produced by this project.
# The `label` column is coerced to int as part of the load.
labelled = (
    pd.read_pickle(f'{shared_dir}/labelled.pickle')
    .assign(label=lambda frame: frame['label'].astype(int))
)
# Snapshot taken before any text pre-processing; the pre-processing and
# augmentation tuning cells further down are fed this raw copy.
labelled_raw = labelled.copy()

print(f'Size of labelled data: {len(labelled)}')
labelled.head()

In [None]:
# Run the project's pre-processing pipeline over the labelled tweets:
# strip newlines -> contextualise hashtags (cached unigram file) -> emojis.
preprocessing = PreProcessing(labelled, word_source=f'{hashtags_dir}/50_000_words.txt')
(
    preprocessing
    .strip_newlines()
    .contextualise_hashtags(
        cache_source=f'{hashtags_dir}/unigram_hashtags_50_000.json',
        use_frequencies=True,
    )
    .emojis()
)
# The processed frame is read back from the pipeline object.
labelled = preprocessing.df
labelled.head()

### Load and pre-process external test set

In [None]:
# Tweets that were sampled for annotation (JSON Lines file); their `text`
# column is attached to the annotations in the next cell.
sample_tweets_path = f'{annotated_dir}/annotations/10k_sample_tweets.jsonl'
original_data = pd.DataFrame(read_jsonl(sample_tweets_path))
original_data.head()

In [None]:
# Load the annotation export and discard rows the annotator marked 'ignore'.
annotated_data = pd.DataFrame(read_json(f'{annotated_dir}/annotations/annotated_10k_sample_tweets.json'))
# .copy() detaches the filtered rows from the parent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
annotated_data = annotated_data[~(annotated_data['answer'] == 'ignore')].copy()

# Index-aligned assignment: each annotation picks up the text of the sample
# tweet that shares its row index.
# NOTE(review): assumes both files list the tweets in the same order — confirm.
annotated_data['text'] = original_data['text']
# Binary label: 'reject' -> 0, every other remaining answer -> 1.
annotated_data['label'] = annotated_data['answer'].progress_apply(lambda x: 0 if x == 'reject' else 1)

# Drop the annotation-tool bookkeeping columns in a single call.
annotated_data = annotated_data.drop(columns=['id', 'accept', 'spans', 'answer'])
# Snapshot taken before text pre-processing; used by the pre-processing and
# augmentation tuning cells further down.
annotated_data_raw = annotated_data.copy()

annotated_data.head()

In [None]:
# Apply the same pipeline used for the labelled data to the external test set:
# strip newlines -> contextualise hashtags (cached unigram file) -> emojis.
preprocessor = PreProcessing(annotated_data, word_source=f'{hashtags_dir}/50_000_words.txt')
(
    preprocessor
    .strip_newlines()
    .contextualise_hashtags(
        cache_source=f'{hashtags_dir}/unigram_hashtags_50_000.json',
        use_frequencies=True,
    )
    .emojis()
)
# The processed frame is read back from the pipeline object.
annotated_data = preprocessor.df
annotated_data.head()

### Calculate Jaccard similarity between the classes of the labelled and external datasets

In [None]:
# The tokenizer is loaded from disk. This directory is written by the
# "Save model and tokenizer" cell near the end of the notebook, so it must
# already exist from a previous run.
tokenizer = AutoTokenizer.from_pretrained(f'{shared_dir}/tokenizer/')

# Whole-dataset similarity, then one score per (labelled class, external class) pair.
# Naming: first letter = labelled-data class, second letter = external-data class.
overall_similarity = calculate_dataset_similarity(tokenizer, labelled, annotated_data)
pp_similarity = calculate_dataset_similarity(tokenizer, get_positive(labelled), get_positive(annotated_data))
nn_similarity = calculate_dataset_similarity(tokenizer, get_negative(labelled), get_negative(annotated_data))
pn_similarity = calculate_dataset_similarity(tokenizer, get_positive(labelled), get_negative(annotated_data))
np_similarity = calculate_dataset_similarity(tokenizer, get_negative(labelled), get_positive(annotated_data))

print(f'Overall similarity of training and testing data: {overall_similarity}')

# 2x2 grid laid out row-major as NN, PN / NP, PP to match `group_names`.
similarity_grid = np.asarray([
    [nn_similarity, pn_similarity],
    [np_similarity, pp_similarity],
])
plot_confusion_matrix(
    similarity_grid,
    filename=f'{annotations_dir}/jaccard_similarity',
    format_labels=False,
    group_names=['NN', 'PN', 'NP', 'PP'],
)

# The tokenizer is not needed again in this notebook; drop the reference.
del tokenizer

### Calculate annotator agreement and Cohen's kappa
$$ CK = \frac{2 \times (TP \times TN - FN \times FP)}{(TP + FP) \times (FP + TN) + (TP + FN) \times (FN + TN)} $$

In [None]:
# Second annotator's labels, returned as parallel sequences: ids, texts, labels.
unzipped_annotations = load_raw_annotations(filename=f'{input_dir}/cross_annotations.json')

cross_df = pd.DataFrame(
    {'id': unzipped_annotations[0], 'text': unzipped_annotations[1], 'label': unzipped_annotations[2]})
cross_df.set_index('id', inplace=True)
# `astype` returns a new Series, so the result has to be assigned back;
# otherwise the cast is silently discarded and the `== 0` / `== 1`
# comparisons below run against the uncast values.
cross_df['label'] = cross_df['label'].astype(int)

# Right join on the index keeps every cross-annotated tweet. Columns coming
# from `cross_df` receive the '_annotation' suffix where names collide.
# NOTE(review): assumes the index of `annotated_data` matches the annotation ids — confirm.
cross_df = annotated_data.merge(cross_df, how='right', left_index=True, right_index=True, suffixes=(None, '_annotation'))
cross_df.drop(['text_annotation'], axis=1, inplace=True)
cross_df.rename(columns={'label': 'annotator_two', 'label_annotation': 'annotator_one'}, inplace=True)

# Agreement counts; in each suffix the first letter is annotator one's label
# and the second letter is annotator two's.
cross_pp = len(cross_df[(cross_df.annotator_one == 1) & (cross_df.annotator_two == 1)])
cross_nn = len(cross_df[(cross_df.annotator_one == 0) & (cross_df.annotator_two == 0)])
cross_pn = len(cross_df[(cross_df.annotator_one == 1) & (cross_df.annotator_two == 0)])
cross_np = len(cross_df[(cross_df.annotator_one == 0) & (cross_df.annotator_two == 1)])
plot_confusion_matrix(np.asarray([cross_nn, cross_pn, cross_np, cross_pp]).reshape((2, 2)),
                      filename=f'{annotations_dir}/annotator_agreement_test', group_names=['TN', 'FP', 'FN', 'TP'])
print(kappa(cross_pp, cross_nn, cross_pn, cross_np))

### Save processed datasets for future use/visualisation purposes

In [15]:
# Persist the pre-processed external test set for later use/visualisation.
annotated_data.to_pickle(f'{shared_dir}/tedf.pickle')

# Hyperparameter tuning

### Load hyperparameters

In [28]:
# Passed as `k` to every tuning helper and to run_cross_validation below.
# NOTE(review): presumably the number of cross-validation folds — confirm in model_utils.
k = 5
# Candidate values for each sweep, keyed by name (e.g. 'initial_lrs', 'batch_sizes').
hyperparameters = read_json(f'{input_dir}/hyperparameters.json')

### Tune initial learning rate

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
lr_start_results_path = f'{classifier_tuning_dir}/lr_start/lr_start_results.pickle'

# Sweep the initial learning rate (log-scaled x axis).
# This is the only run_graph call in the notebook that passes `update_key_`
# explicitly. NOTE(review): confirm the default used by the other sweeps matches.
run_graph(
    k=k,
    xs=hyperparameters['initial_lrs'],
    data=labelled,
    tedf=annotated_data,
    update_key_=update_key,
    x_key='lr_start',
    x_title='Initial Learning Rate',
    log_x=True,
    device=DEVICE,
    df=pd.read_pickle(lr_start_results_path),
    show_all=False,
)

### Tune final learning rate

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
lr_end_results_path = f'{classifier_tuning_dir}/lr_end/lr_end_results.pickle'

# Sweep the final learning rate (log-scaled x axis).
run_graph(
    k=k,
    xs=hyperparameters['final_lrs'],
    data=labelled,
    tedf=annotated_data,
    x_key='lr_end',
    x_title='Final Learning Rate',
    log_x=True,
    device=DEVICE,
    df=pd.read_pickle(lr_end_results_path),
    show_all=False,
)

### Tune batch size

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
batch_size_results_path = f'{classifier_tuning_dir}/batch_size/batch_size_results.pickle'

# Sweep the batch size (linear x axis).
run_graph(
    k=k,
    xs=hyperparameters['batch_sizes'],
    data=labelled,
    tedf=annotated_data,
    x_key='batch_size',
    x_title='Batch Size',
    log_x=False,
    device=DEVICE,
    df=pd.read_pickle(batch_size_results_path),
    show_all=False,
)

### Tune model name

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
model_name_results_path = f'{classifier_tuning_dir}/model_name/model_name_results.pickle'

# Compare the candidate pretrained model types.
run_graph(
    k=k,
    xs=hyperparameters['model_types'],
    data=labelled,
    tedf=annotated_data,
    x_key='model_name',
    x_title='Model Name',
    log_x=False,
    device=DEVICE,
    df=pd.read_pickle(model_name_results_path),
    show_all=False,
)
# Re-read from disk after the sweep and show the first rows of the stored results.
display(pd.read_pickle(model_name_results_path).head())

### Tune number of epochs

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
epochs_results_path = f'{classifier_tuning_dir}/epochs/epochs_results.pickle'

# Sweep the number of training epochs.
run_graph(
    k=k,
    xs=hyperparameters['epochs'],
    data=labelled,
    tedf=annotated_data,
    x_key='epochs',
    x_title='No. Epochs',
    log_x=False,
    device=DEVICE,
    df=pd.read_pickle(epochs_results_path),
    show_all=False,
)

### Tune scheduler type

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
scheduler_results_path = f'{classifier_tuning_dir}/scheduler_type/scheduler_type_results.pickle'

# Compare the candidate learning-rate scheduler types.
run_graph(
    k=k,
    xs=hyperparameters['scheduler_types'],
    data=labelled,
    tedf=annotated_data,
    x_key='scheduler_type',
    x_title='Scheduler Type',
    log_x=False,
    device=DEVICE,
    df=pd.read_pickle(scheduler_results_path),
    show_all=False,
)
# Re-read from disk after the sweep and show the first rows of the stored results.
display(pd.read_pickle(scheduler_results_path).head())

### Tune use of downsampling

In [None]:
# Previously saved results for this sweep, handed to run_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
downsample_results_path = f'{classifier_tuning_dir}/downsample/downsample_results.pickle'

# Compare training with and without downsampling.
run_graph(
    k=k,
    xs=hyperparameters['downsampling'],
    data=labelled,
    tedf=annotated_data,
    x_key='downsample',
    x_title='Use Downsampling',
    log_x=False,
    device=DEVICE,
    df=pd.read_pickle(downsample_results_path),
    show_all=False,
)
# Re-read from disk after the sweep and show the first rows of the stored results.
display(pd.read_pickle(downsample_results_path).head())

### Tune pre-processing tasks

In [None]:
# Stored results for the pre-processing sweep and for the baseline it is compared against.
preprocessing_results_path = f'{classifier_tuning_dir}/preprocessing/preprocessing_results.pickle'
baseline_results_path = f'{classifier_tuning_dir}/baseline/baseline_results.pickle'

# This sweep is given the raw (not yet pre-processed) copies of both datasets.
preprocessing_graph(
    k=k,
    data=labelled_raw,
    testing=annotated_data_raw,
    device=DEVICE,
    df=pd.read_pickle(preprocessing_results_path),
    baseline=pd.read_pickle(baseline_results_path),
    show_all=False,
)
# Re-read from disk after the sweep: first rows of the baseline, then the full pre-processing table.
display(pd.read_pickle(baseline_results_path).head())
display(pd.read_pickle(preprocessing_results_path))

### Tune use of augmentation

In [None]:
# Previously saved results for this sweep, handed to augmentation_graph through `df=`.
# NOTE(review): presumably a cache so completed runs are not retrained — confirm.
augmentation_results_path = f'{classifier_tuning_dir}/augmentation/augmentation_results.pickle'

# This sweep is given the raw (not yet pre-processed) copies of both datasets.
augmentation_graph(
    k=k,
    augmentation=hyperparameters['augmentation'],
    data=labelled_raw,
    tedf=annotated_data_raw,
    device=DEVICE,
    df=pd.read_pickle(augmentation_results_path),
    show_all=False,
)

# Model evaluation

### Run cross-validation

In [34]:
# Stored cross-validation results, handed to run_cross_validation through `cache=`.
# NOTE(review): presumably lets the call reuse finished folds — confirm in model_utils.
cv_cache = read_json(f'{final_model_dir}/cross_validation_results.json')

# `result` is consumed by every display_cross_validation cell below.
result = run_cross_validation(
    k,
    labelled,
    annotated_data,
    DEVICE,
    key=None,
    to_display=True,
    cache=cv_cache,
)

### Display cross-validation results for all datasets

Datasets:
<ul>
    <li>Validation Dataset</li>
    <li>Short Validation Dataset</li>
    <li>Long Validation Dataset</li>
    <li>Testing Dataset</li>
    <li>Short Testing Dataset</li>
    <li>Long Testing Dataset</li>
</ul>

In [None]:
# Validation dataset ('eval' split). This is the only call that also sets display_training.
subset_dir = 'validation'
display_cross_validation(
    labelled,
    result,
    'eval',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
    display_training=True,
)

In [None]:
# Short validation dataset ('short_eval' split).
subset_dir = 'short_validation'
display_cross_validation(
    labelled,
    result,
    'short_eval',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
)

In [None]:
# Long validation dataset ('long_eval' split).
subset_dir = 'long_validation'
display_cross_validation(
    labelled,
    result,
    'long_eval',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
)

In [None]:
# Testing dataset ('test' split).
subset_dir = 'testing'
display_cross_validation(
    labelled,
    result,
    'test',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
)

In [None]:
# Short testing dataset ('short_test' split).
subset_dir = 'short_testing'
display_cross_validation(
    labelled,
    result,
    'short_test',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
)

In [None]:
# Long testing dataset ('long_test' split).
subset_dir = 'long_testing'
display_cross_validation(
    labelled,
    result,
    'long_test',
    cv_filename=f'{subset_dir}/cross_validation',
    cfm_filename=f'{final_model_dir}/{subset_dir}/cfm.png',
    loss_filename=f'{subset_dir}/losses',
    lr_filename=f'{subset_dir}/lrs',
    predictions_filename=f'{final_model_dir}/{subset_dir}/predictions.txt',
)

# Label all data

### Build and train final model

In [13]:
# The final model trains on all labelled data; the validation set is an
# empty frame that only carries the 'text' and 'label' columns.
empty_validation = pd.DataFrame({'text': [], 'label': []})

# Hyperparameters for the final model.
# NOTE(review): presumably the winners of the tuning sweeps above — confirm against the saved results.
final_model_settings = {
    'use_downsampling': True,
    'batch_size': 32,
    'gradient_accumulation_steps': 1,
    'epochs': 2,
    'scheduler_type': 'linear',
    'model_name': 'roberta-base',
}

model = NLPModel(
    training_data=labelled,
    validation_data=empty_validation,
    device=DEVICE,
    **final_model_settings,
)

In [None]:
# Train the final model with the project's log level set to 'critical'.
model.train(log_level='critical')

### Save model and tokenizer to file for future use

In [None]:
# Write the trained weights and the tokenizer under the shared directory.
# The Jaccard-similarity cell earlier in this notebook loads the tokenizer from
# this same `{shared_dir}/tokenizer/` path, so on a fresh checkout this cell
# has to have been run once before that one can succeed.
model.model.save_pretrained(f'{shared_dir}/model/')
model.tokenizer.save_pretrained(f'{shared_dir}/tokenizer/')

### Load and pre-process unlabelled geospatial data

In [None]:
# NOTE: unpickling can execute arbitrary code — only load pickles produced by this project.
geospatial_path = f'{shared_dir}/geospatial.pickle'
geo_df = pd.read_pickle(geospatial_path)

print(f'Size of Geospatial Dataset: {len(geo_df)}')
geo_df.head()

In [None]:
# Apply the same pipeline used for the training data to the unlabelled tweets:
# strip newlines -> contextualise hashtags (cached unigram file) -> emojis.
preprocessor = PreProcessing(geo_df, word_source=f'{hashtags_dir}/50_000_words.txt')
(
    preprocessor
    .strip_newlines()
    .contextualise_hashtags(
        cache_source=f'{hashtags_dir}/unigram_hashtags_50_000.json',
        use_frequencies=True,
    )
    .emojis()
)
# The processed frame is read back from the pipeline object.
geo_df = preprocessor.df

### Label and format data for clustering

In [None]:
# Run the trained model over the geospatial tweets and show the labelled frame.
# NOTE(review): despite its name, `fit` is used here as the prediction step —
# its result is filtered on a `label` column below; confirm in NLPModel.
cluster_df = model.fit(geo_df)
display(cluster_df)

# Keep only the rows labelled 1 for clustering.
is_positive = cluster_df['label'] == 1
cluster_df = cluster_df.loc[is_positive].copy()

# Expose the index as an `id` column, then keep the first row for each id.
cluster_df['id'] = cluster_df.index
cluster_df = cluster_df.drop_duplicates(subset='id')
cluster_df.head()

### Save clustering dataset to file

In [None]:
# Persist the positively-labelled, de-duplicated tweets for the clustering stage.
cluster_df.to_pickle(f'{shared_dir}/clusters.pickle')