In [1]:
import time
import json
import torch
import numpy as np
import pandas as pd
from numpy import argmax
from datasets import load_metric
from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from seqeval.metrics import f1_score, accuracy_score, classification_report
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup

In [2]:
# identify and specify the GPU as the device, later in training loop we will load data into device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
N_GPU = torch.cuda.device_count()
GPU_NAME = torch.cuda.get_device_name(0)

## IMDB

In [3]:
from datasets import load_dataset

In [4]:
imdb_datasets = load_dataset("imdb")
imdb_datasets

Reusing dataset imdb (/home/biolab/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
with open('../data/imdb_train_beautify_html.json') as file:
    imdb_train = json.load(file)
with open('../data/imdb_test_beautify_html.json') as file:
    imdb_test = json.load(file)

In [6]:
imdb = imdb_train + imdb_test

In [7]:
imdb_label = imdb_datasets['train']['label'] + imdb_datasets['test']['label']

## Read training data

In [8]:
train_dataset = pd.read_csv('../data/train.csv', header=0)
train_dataset.head()

Unnamed: 0,ID,review,sentiment
0,41411,I watched this film because I'm a big fan of R...,0
1,37586,It does not seem that this movie managed to pl...,1
2,6017,"Enough is not a bad movie , just mediocre .",0
3,44656,my friend and i rented this one a few nights a...,0
4,38711,"Just about everything in this movie is wrong, ...",0


In [9]:
with open('../data/sentiments_beautify_html.json') as file:
    views = json.load(file)

In [10]:
predict_dataset = pd.read_csv('../data/test.csv', header=0)
predict_dataset.head()

Unnamed: 0,ID,review
0,22622,Robert Lansing plays a scientist experimenting...
1,10162,"Well I've enjoy this movie, even though someti..."
2,17468,First things first - though I believe Joel Sch...
3,42579,I watched this movie on the grounds that Amber...
4,701,A certain sexiness underlines even the dullest...


In [11]:
with open('../data/predicts_beautify_html.json') as file:
    predict_reviews = json.load(file)

In [12]:
training_views = imdb + views
labels = imdb_label + list(train_dataset['sentiment'])

In [13]:
train_texts, val_texts, train_labels, val_labels = train_test_split(training_views, labels, random_state=42, test_size=.3)

In [14]:
MAX_LENGTH = 256
BATCH_SIZE = 8
EPOCHS = 4

## Tokenize

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [16]:
train_encodings = tokenizer(train_texts, is_split_into_words=False, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt")
val_encodings = tokenizer(val_texts, is_split_into_words=False, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt")

In [17]:
train_inputs = torch.tensor(train_encodings.input_ids)
train_masks = torch.tensor(train_encodings.attention_mask)
train_labels = torch.tensor(train_labels)
training_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(training_data, shuffle=False, batch_size=BATCH_SIZE)

  """Entry point for launching an IPython kernel.
  


In [18]:
val_inputs = torch.tensor(val_encodings.input_ids)
val_masks = torch.tensor(val_encodings.attention_mask)
val_labels = torch.tensor(val_labels)
validate_data = TensorDataset(val_inputs, val_masks, val_labels)

  """Entry point for launching an IPython kernel.
  


### Data collator

In [19]:
def data_collator(dataset):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in dataset])
    batch['attention_mask'] = torch.stack([f[1] for f in dataset])
    batch['labels'] = torch.stack([f[2] for f in dataset])
    return batch

### Compute metrics

In [20]:
METRIC = load_metric("f1")

In [21]:
def compute_metrics(p):
    predictions, labels = p
    predictions = argmax(predictions, axis=1)
    return METRIC.compute(predictions=predictions, references=labels)

## Training

In [22]:
configuration = BertConfig.from_pretrained(
    'bert-base-cased',
    num_labels=2,
)

In [23]:
model = BertForSequenceClassification.from_pretrained('bert-base-cased', config=configuration).to(DEVICE)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [24]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (28996, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [25]:
training_args = TrainingArguments(
    output_dir='./outputs',          # output directory
    num_train_epochs=EPOCHS,              # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    evaluation_strategy="epoch",
    load_best_model_at_end=True
)

In [26]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [27]:
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,
    eps=1e-8
)

In [28]:
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0, # Default value in run_glue.py
    num_training_steps = total_steps
)

In [29]:
optimizers = optimizer, scheduler

In [30]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=training_data,
    eval_dataset=validate_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=optimizers
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.258167,0.215078,0.93855
2,0.142072,0.257556,0.948122
3,0.080197,0.24205,0.95765
4,0.037711,0.286389,0.959702


TrainOutput(global_step=27772, training_loss=0.13823489572444533)

## Predict

In [32]:
def predictReview(review):
    inputs = tokenizer.encode_plus(review, is_split_into_words=False, max_length=MAX_LENGTH, padding=True, truncation=True, return_tensors="pt")
    inputs.to(DEVICE)
    outputs = model(**inputs)
    logits = np.array(outputs.logits.tolist())
    result = argmax(logits, axis=1)[0]
    return result

In [33]:
predict_sentiments = []
for review in predict_reviews:
    predict_sentiments.append(predictReview(review))

In [34]:
predict_dataset['sentiment'] = predict_sentiments
predict_dataset.head()

Unnamed: 0,ID,review,sentiment
0,22622,Robert Lansing plays a scientist experimenting...,0
1,10162,"Well I've enjoy this movie, even though someti...",1
2,17468,First things first - though I believe Joel Sch...,0
3,42579,I watched this movie on the grounds that Amber...,0
4,701,A certain sexiness underlines even the dullest...,0


In [65]:
predict_dataset[['ID', 'sentiment']].to_csv('../results/submission_{}.csv'.format(int(time.time())), index=False)