In [6]:
# general data science/utilization/visualization imports
import json
import os
import random

# progress bar
from tqdm import tqdm


# data manipulation and data reading
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt
import plotly.express as px

# pre-defined evaluation metrics  
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score, matthews_corrcoef)

from sklearn.model_selection import train_test_split
# torch imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
# huggingface imports
import transformers
from datasets import load_metric
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, 
                          Trainer, TrainingArguments)

# ray tune imports for hyperparameter optimization
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from ray.tune.suggest.hyperopt import HyperOptSearch



  defaults = yaml.load(f)


In [7]:
def seed_all(seed):
    """Seed the global RNGs of PyTorch, Python's ``random`` and NumPy.

    Args:
        seed: Integer handed unchanged to each library's seeding function.
    """
    # The three generators are independent of one another, so a single loop
    # over their seeding entry points is equivalent to three explicit calls.
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)


# Single source of truth for every seeded operation in this notebook.
SEED = 42
seed_all(SEED)

In [8]:
class TextClassificationDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class ids.

    Args:
        dataframe: Frame exposing a ``label`` column (class names) and a
            ``content`` column (raw texts); both are copied into plain lists.
        labels_to_idx: Optional mapping from class name to integer id. When
            omitted, the notebook-level ``labels_dict`` is copied, which is
            what this class previously always did.
    """

    def __init__(self, dataframe, labels_to_idx=None):
        self.labels = dataframe.label.to_list()
        self.inputs = dataframe.content.to_list()
        if labels_to_idx is None:
            # Fall back to the global built in the data-loading cell.
            labels_to_idx = labels_dict
        # Copy so later edits to the caller's mapping cannot change this dataset.
        self.labels_to_idx = dict(labels_to_idx)

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the sample(s) at ``idx``.

        Args:
            idx: An integer, a tensor holding one or several integer
                positions, or a list/tuple of integers.

        Returns:
            ``{'text': ..., 'label': ...}``. The values are scalars for a
            single position and parallel lists for several positions.
        """
        if isinstance(idx, torch.Tensor):
            # ``list(tensor)`` fails on 0-d tensors and yields tensor elements
            # otherwise; ``tolist()`` gives a plain int or a list of ints.
            idx = idx.tolist()

        if isinstance(idx, (list, tuple)):
            return {
                'text': [self.inputs[i] for i in idx],
                'label': [self.labels_to_idx[self.labels[i]] for i in idx],
            }

        return {'text': self.inputs[idx],
                'label': self.labels_to_idx[self.labels[idx]]}

In [9]:
# Hugging Face Hub repository id of the pretrained checkpoint; the same id is
# used for the tokenizer below and for the model inside `model_init`.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
# The search optimizes model accuracy, so load the accuracy metric first;
# `compute_metrics` uses this object to score predictions.
metric = load_metric("accuracy")


# The tokenizer only depends on the model name, so it can be loaded once here.
# The model itself is loaded in `model_init`, because every search run needs a
# freshly initialized copy.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def model_init():
    """Build a fresh sequence-classification model from ``model_name``.

    Each hyperparameter-search run has to start from a newly initialized
    model, so the trainer is given this factory instead of a model instance.

    Returns:
        The pretrained model loaded with a 4-label classification
        configuration.
    """
    load_options = dict(
        num_labels=4,  # the notebook's dataset has four categories
        return_dict=True,
        # NOTE(review): presumably needed because the checkpoint's own
        # classification head has a different number of labels - confirm.
        ignore_mismatched_sizes=True,
    )
    return AutoModelForSequenceClassification.from_pretrained(model_name, **load_options)

# The metric to optimize is chosen above; this function defines how it is
# derived from raw model outputs.
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the position of the largest score on the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [10]:
# pick a dataset
file_name = "dataset-11.csv"
dataset_path = os.path.join('./data/processed', file_name)
dataset = pd.read_csv(dataset_path)
print('Number of samples: %d'%len(dataset))
display(dataset.sample(5))

# Class ids follow the order in which labels first appear in the file.
idx_to_label = dict(enumerate(dataset.label.unique()))
labels_dict = {v:k for k,v in idx_to_label.items()}

print("Label to index dictionary:", labels_dict)

# Take both the slice names and the slice sizes from the same value_counts()
# result. `label.unique()` is ordered by first appearance while
# `value_counts()` is ordered by frequency, so pairing those two attaches
# counts to the wrong categories.
label_counts = dataset.label.value_counts()
px.pie(names=label_counts.index.tolist(), values=label_counts.tolist(), title='Category distribution of dataset')

Number of samples: 346


Unnamed: 0,id,externalId,skipped,status,label,content
108,ckyd73h600h5r0l1t7irg2gmm,218,False,REVIEWED,INTERFACE,not able to scroll the code snippet inside the...
158,ckyd73qb903ef0lyq9sos8d3h,447,False,LABELED,INTERFACE,"latest update forces me to link twitter, and t..."
278,ckyd747hk0hns0l0uhsp4hrlw,895,False,LABELED,SUBSCRIPTION,have to pay to read news articles? no thanks.
209,ckyd73zbb0gqq0l0e7p2q6ze2,664,False,LABELED,SUBSCRIPTION,they don't give a single article without premi...
320,ckyd74gp20hcp0l164ens459v,1060,False,REVIEWED,SUBSCRIPTION,"what is this app, even two or four words are n..."


Label to index dictionary: {'CONTENT': 0, 'USER_EXPERIENCE': 1, 'SUBSCRIPTION': 2, 'INTERFACE': 3}


## Automatic hyperparameter search with Ray Tune backend.

I've prepared ASHA scheduler and Population Based Training to perform the search. Just pick the one you want from the dropdown menu.

Note: Population Based Training cannot be combined with a search algorithm, so pick `None` as the `search_algorithm` whenever you select it as the scheduler.

In [11]:
# Available trial schedulers, keyed by the name used to select them below.
schedulers = {
    'ASHA Scheduler': ASHAScheduler(metric='objective', mode='max'),
    'Population Based Training': PopulationBasedTraining(metric='objective', mode='max')
}

# Available search algorithms; 'None' means no search algorithm is passed.
search_algos = {
    'None': None,
    'HyperOpt':HyperOptSearch(metric='objective', mode='max', random_state_seed=SEED),
}

# User choices. The selected names are kept in their own variables so that
# `scheduler` / `search_algorithm` only ever hold the resolved objects.
scheduler_name = "ASHA Scheduler"
search_algorithm_name = "HyperOpt"
n_trials =  40

# Enforce the constraint stated in the note above this cell instead of
# failing somewhere inside the search.
if (scheduler_name == 'Population Based Training'
        and search_algos[search_algorithm_name] is not None):
    raise ValueError(
        "Population Based Training cannot be combined with a search algorithm; "
        "set search_algorithm_name to 'None'."
    )

scheduler = schedulers[scheduler_name]
search_algorithm = search_algos[search_algorithm_name]

#### Utility functions

In [12]:
# The tokenizer is bound as a default argument on purpose: Ray Tune runs the
# search in a black-box setting, so everything this function needs must be
# attached to it beforehand, otherwise the search throws an error.
def tokenize(sample, tokenizer=tokenizer):
    """Tokenize one sample and carry its label over.

    Args:
        sample: Mapping with a ``'text'`` entry to encode and a ``'label'``
            entry to keep.
        tokenizer: Callable tokenizer; defaults to the notebook-level one.

    Returns:
        The tokenizer output with an added ``'label'`` entry.
    """
    encoded = tokenizer(sample['text'], padding=True, truncation=True)
    encoded['label'] = sample['label']
    return encoded

def prepare_datasets(dataset_df, test_size=.2, val_size=.2):
    """Split a labelled frame into tokenized train / validation / test sets.

    The test split is carved out of the full frame first; the validation
    split is then carved out of what remains, so ``val_size`` is a fraction
    of the remainder rather than of the whole frame. Both splits are
    stratified on ``label`` so each part keeps almost the same class ratio,
    which guards against over-fitting the dominant class or under-fitting
    rare ones.

    Args:
        dataset_df: Frame with a ``label`` column, plus whatever
            ``TextClassificationDataset`` reads from it.
        test_size: Share of ``dataset_df`` held out for testing.
        val_size: Share of the non-test rows held out for validation.

    Returns:
        Tuple ``(train, validation, test)`` of tokenized datasets.
    """
    remainder_df, test_df = train_test_split(
        dataset_df, test_size=test_size,
        stratify=dataset_df.label, random_state=SEED)

    train_df, val_df = train_test_split(
        remainder_df, test_size=val_size,
        stratify=remainder_df.label, random_state=SEED)

    tokenized_splits = []
    for split_df in (train_df, val_df, test_df):
        # Shuffle each frame once more with the shared seed before wrapping it.
        shuffled_df = split_df.sample(frac=1, random_state=SEED)
        torch_dataset = TextClassificationDataset(shuffled_df)
        # NOTE(review): relies on the dataset object exposing `.map`; confirm
        # that the installed torch version provides it.
        tokenized_splits.append(torch_dataset.map(tokenize))

    return tuple(tokenized_splits)

def calculate_metrics(labels, preds, index):
    """Collect classification scores into a one-row report.

    Args:
        labels: Reference class ids.
        preds: Predicted class ids, aligned with ``labels``.
        index: Row label for the report (the notebook passes the dataset
            file name).

    Returns:
        A single-row DataFrame. Its columns are Recall / Precision / F1, each
        under Weighted / Macro / Micro averaging, followed by ``Accuracy``,
        ``Matthews Corr. Coeff.`` and ``n_sample``.
    """
    averaged_scorers = [('Recall', recall_score),
                        ('Precision', precision_score),
                        ('F1', f1_score)]
    averaging_modes = ['Weighted', 'Macro', 'Micro']

    # Pre-declare the two-level header so the averaged scores keep this
    # column order regardless of when they are filled in.
    header = pd.MultiIndex.from_product(
        [[score_name for score_name, _ in averaged_scorers], averaging_modes])
    report = pd.DataFrame(columns=header, index=[index])

    report['Accuracy'] = accuracy_score(labels, preds)

    for score_name, scorer in averaged_scorers:
        for mode in averaging_modes:
            # sklearn takes the averaging mode in lower case.
            report[(score_name, mode)] = scorer(labels, preds, average=mode.lower())

    report['Matthews Corr. Coeff.'] = matthews_corrcoef(labels, preds)
    report['n_sample'] = len(labels)

    return report

In [None]:
# Build the three tokenized splits from the loaded frame and report their sizes.
tokenized_train_set, tokenized_val_set, tokenized_test_set = prepare_datasets(dataset)

print('Training set has %d samples'%len(tokenized_train_set))
print('Validation set has %d samples' %len(tokenized_val_set))
print('Test set has %d samples'%len(tokenized_test_set))

# Training arguments shared by every search trial:
# 'trial_results': first positional argument (presumably the output directory).
# skip_memory_metrics: whether to skip adding memory profiler reports to metrics.
#       Passed True since the profiler slows down training and evaluation.
# evaluation_strategy: evaluation strategy to adopt during training.
#       "steps" means evaluation is done every 500 steps by default
#       (this can be overridden by passing the eval_steps parameter).
# disable_tqdm: passed True to switch off the progress bars.
# NOTE(review): if a trial runs for fewer steps than the evaluation interval,
#       it presumably is never evaluated mid-training, so the scheduler would
#       get no intermediate results - confirm and consider setting eval_steps.
training_args = TrainingArguments(
    'trial_results',
    evaluation_strategy="steps",
    disable_tqdm=True,
    skip_memory_metrics=True,
)

# `model_init` is passed instead of a model instance so that each search run
# can start from a freshly initialized model (see its docstring).
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_set,
    eval_dataset=tokenized_val_set,
    model_init=model_init,
    compute_metrics=compute_metrics
    )

# Run `n_trials` trials on the Ray backend with the scheduler and search
# algorithm selected earlier, maximizing the objective.
best_run = trainer.hyperparameter_search(
                direction="maximize", 
                n_trials=n_trials,
                backend="ray",
                search_alg=search_algorithm,
                scheduler=scheduler
                )

In [None]:
# Copy the winning hyperparameters onto the trainer's arguments. The seed is
# floored and cast to an integer before being stored.
best_run_params = best_run.hyperparameters
integer_seed = int(np.floor(best_run_params['seed']))
final_params = {**best_run_params, 'seed': integer_seed}
for param_name, param_value in final_params.items():
    setattr(trainer.args, param_name, param_value)

# Retrain once with the selected configuration.
trainer.train()

In [23]:
# Put the trained model into evaluation mode and run it over the test split.
trainer.model.eval()
outputs = trainer.predict(tokenized_test_set)

# Unpack the prediction output field by field.
logits = outputs.predictions
labels = outputs.label_ids
metrics = outputs.metrics

# The predicted class is the index of the largest logit.
preds = np.argmax(logits, axis=-1)

# One-row report labelled with the dataset file name, saved next to the notebook.
results = calculate_metrics(labels, preds, file_name)
results.to_csv('model_result_eval_%d_%s'%(n_trials,file_name))
results

***** Running Prediction *****
  Num examples = 70
  Batch size = 8


Unnamed: 0_level_0,Recall,Recall,Recall,Precision,Precision,Precision,F1,F1,F1,Accuracy,Matthews Corr. Coeff.,n_sample
Unnamed: 0_level_1,Weighted,Macro,Micro,Weighted,Macro,Micro,Weighted,Macro,Micro,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
dataset-11.csv,0.885714,0.882759,0.885714,0.882976,0.88125,0.885714,0.881766,0.878936,0.885714,0.885714,0.840447,70


In [16]:
# Persist the retrained model; the directory name records how many search
# trials produced its hyperparameters.
trainer.save_model('review-classification-roberta-%d-trials'%n_trials)

Saving model checkpoint to review-classification-roberta-40-trials
Configuration saved in review-classification-roberta-40-trials/config.json
Model weights saved in review-classification-roberta-40-trials/pytorch_model.bin
tokenizer config file saved in review-classification-roberta-40-trials/tokenizer_config.json
Special tokens file saved in review-classification-roberta-40-trials/special_tokens_map.json
