In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Shell out to `nvidia-smi` to see which GPUs (if any) are visible before training.
!nvidia-smi

In [9]:
import logging
import itertools
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from tqdm.auto import tqdm
from utilities.classifier.nlp_model import NLPModel
from utilities.data_processing.preprocessing import PreProcessing
from utilities.utils import shared_dir, split_dataset, write_json, figures_dir, \
    hashtags_dir, get_cuda_availability, get_positive

In [4]:
# --- Notebook-wide configuration ---

# Never truncate long cell contents when a frame is displayed.
pd.options.display.max_colwidth = None
# Register tqdm's pandas integration.
tqdm.pandas()

# Let only CRITICAL records through: configure the root logger first, then set
# the same level on the "transformers" logger explicitly.
logging.basicConfig(level=logging.CRITICAL)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.CRITICAL)

# Device handed to the model further down; the value is whatever the project
# helper returns.
DEVICE = get_cuda_availability()

### Load labelled data

In [None]:
# Read the labelled dataset and cast its `label` column to int in one step.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file here.
labelled = pd.read_pickle(f'{shared_dir}/labelled.pickle').astype({'label': int})
labelled.head()

### Get size of each of the three sources of location data

In [None]:
# One sub-frame per location source, selected on the `location_type` column.
user_df, coordinates_df, entity_df = (
    labelled[labelled['location_type'] == source]
    for source in ('user', 'coordinates', 'entity')
)

# Report the number of rows contributed by each source (one line per source).
print('\n'.join(
    f'size of {name} dataset: {len(subset)}'
    for name, subset in (
        ('user', user_df),
        ('coordinates', coordinates_df),
        ('entity', entity_df),
    )
))

### Pre-process data and split into training and validation sets

In [None]:
# Resources used by the pre-processing steps.
word_list_path = f'{hashtags_dir}/50_000_words.txt'
hashtag_cache_path = f'{hashtags_dir}/unigram_hashtags_50_000.json'

preprocessing = PreProcessing(labelled, word_source=word_list_path)

# Run strip_newlines -> contextualise_hashtags -> emojis. The value returned by
# the chain is discarded; the result is read back from `preprocessing.df`, so
# this relies on the steps updating `preprocessing` itself.
(
    preprocessing
    .strip_newlines()
    .contextualise_hashtags(cache_source=hashtag_cache_path, use_frequencies=True)
    .emojis()
)

# From here on `labelled` refers to the pre-processed frame.
labelled = preprocessing.df
labelled.head()

In [None]:
# Split the pre-processed data into two frames: `trdf` is later passed to the
# model as training data and `tvdf` as validation data.
trdf, tvdf = split_dataset(labelled)
# Preview the validation split.
tvdf.head()

In [None]:
# Augment the training split only; the validation split (`tvdf`) is not touched here.
# NOTE(review): `n=2` presumably controls how many augmented variants are produced —
# confirm against PreProcessing.augment_dataset.
preprocessing = PreProcessing(trdf).augment_dataset(n=2, reset_index=False)
# `trdf` is overwritten with the augmented frame, so re-running this cell without
# re-running the split feeds already-augmented data back into the augmentation.
trdf = preprocessing.df
# Random 5-row preview; no random_state is passed, so the rows differ between runs.
trdf.sample(n=5)

### Load and train model

In [13]:
# Data wiring and hyper-parameters for the classifier, gathered in one mapping
# so they can be read (and tweaked) at a glance.
model_config = {
    'training_data': trdf,
    'validation_data': tvdf,
    'device': DEVICE,
    'use_downsampling': True,
    'batch_size': 32,
    'gradient_accumulation_steps': 1,
    'epochs': 2,
    'scheduler_type': 'linear',
    'model_name': 'roberta-base',
}

model = NLPModel(**model_config)

In [None]:
# Train the model built in the previous cell; `log_level='critical'` is forwarded
# to the project's train method.
model.train(log_level='critical')

### Generate baseline performance

In [None]:
# Evaluate once on the full validation set (all location sources included).
model.test(tvdf, predictions_filename=None, to_display=False)

# `model.report` is read three times after that single evaluation, so the three
# rows below are expected to be identical. The ablation graph later plots these
# columns against the three location types, i.e. one baseline point per type.
reports = [model.report for _ in range(3)]

baseline_report = pd.DataFrame({
    'accuracy': [report['accuracy'] for report in reports],
    'negative_f1': [report['negative']['f1-score'] for report in reports],
    'positive_f1': [report['positive']['f1-score'] for report in reports],
})
baseline_report.head()

### Ablate location data source, save and display results

In [None]:
# Every way of keeping two of the three location sources; the source missing
# from a pair is the one being ablated in that run.
options = ['user', 'entity', 'coordinates']
kept_pairs = list(itertools.combinations(options, 2))

# Filled with (ablated source, report) tuples, in iteration order.
reports = []

for kept_sources in kept_pairs:
    ablated_source = next(source for source in options if source not in kept_sources)
    print(f'Ablating {ablated_source}')

    # Evaluate on validation rows coming from the two remaining sources only,
    # saving the confusion matrix figure for this run.
    remaining_rows = tvdf[tvdf.location_type.isin(kept_sources)]
    model.test(
        remaining_rows,
        predictions_filename=None,
        cfm_filename=f'{figures_dir}/ablation/ablating_{ablated_source}_cfm.png',
    )

    reports.append((ablated_source, model.report))
    write_json(model.report, f'{figures_dir}/ablation/ablating_{ablated_source}_report.json')

### Generate ablation graph

In [None]:
# One row per ablation run: the removed source plus its headline metrics.
data = pd.DataFrame([
    {
        'key': ablated_source,
        'accuracy': report['accuracy'],
        'negative_f1': report['negative']['f1-score'],
        'positive_f1': report['positive']['f1-score'],
    }
    for ablated_source, report in reports
])

metric_columns = ['accuracy', 'negative_f1', 'positive_f1']
fig = px.line(data, x='key', y=metric_columns, template='plotly')

# Dashed reference lines: the baseline value of each metric, drawn across all
# location types.
fig.add_traces([
    go.Scatter(
        x=options,
        y=baseline_report[metric].tolist(),
        name=f'{metric}_baseline',
        line={'color': colour, 'dash': 'dash'},
    )
    for metric, colour in zip(metric_columns, ('blue', 'red', 'green'))
])

fig.update_layout(
    xaxis_title='location type ablated',
    yaxis_title='performance',
    template='plotly',
    font=dict(family='verdana', size=26, color='black'),
)

# Persist the figure in both formats before showing it inline.
fig.write_json(f'{figures_dir}/ablation/ablation_graph.json')
fig.write_html(f'{figures_dir}/ablation/ablation_graph.html')
fig.show()

### Generate class distribution graph for each location source

In [22]:
# Presumably the positively-labelled rows of `labelled` (project helper) — not
# used by the counts below.
labelled_positive = get_positive(labelled)

# Row counts per location source for label 0 (negative) and label 1 (positive),
# taken from the per-source frames built earlier in the notebook.
user_negative, user_positive = (
    len(user_df[user_df.label == label]) for label in (0, 1)
)

coordinates_negative, coordinates_positive = (
    len(coordinates_df[coordinates_df.label == label]) for label in (0, 1)
)

entity_negative, entity_positive = (
    len(entity_df[entity_df.label == label]) for label in (0, 1)
)

In [23]:
# Class counts per location source, one row per source, laid out column-wise.
df = pd.DataFrame({
    'key': ['user', 'entity', 'coordinates'],
    'positive': [user_positive, entity_positive, coordinates_positive],
    'negative': [user_negative, entity_negative, coordinates_negative],
})

In [None]:
# Bar chart of positive and negative counts for each location source.
fig = px.bar(df, x='key', y=['positive', 'negative'], template='plotly')

fig.update_layout(
    template='plotly',
    xaxis_title='location source',
    yaxis_title='size',
    font=dict(family='verdana', size=26, color='black'),
)

# Persist the figure in both formats before showing it inline.
fig.write_json(f'{figures_dir}/ablation/location_distribution_graph.json')
fig.write_html(f'{figures_dir}/ablation/location_distribution_graph.html')

fig.show()