In [None]:
!pip install transformers==2.8.0 -q

[K     |████████████████████████████████| 573kB 15.4MB/s 
[K     |████████████████████████████████| 133kB 48.2MB/s 
[K     |████████████████████████████████| 890kB 53.0MB/s 
[K     |████████████████████████████████| 3.7MB 39.3MB/s 
[K     |████████████████████████████████| 1.1MB 49.4MB/s 
[K     |████████████████████████████████| 71kB 10.4MB/s 
[K     |████████████████████████████████| 6.9MB 48.2MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[31mERROR: botocore 1.19.25 has requirement urllib3<1.27,>=1.25.4; python_version != "3.4", but you'll have urllib3 1.24.3 which is incompatible.[0m


In [None]:
import numpy as np
import pandas as pd
from transformers import *
import torch

In [None]:
# Select the compute device once; later cells move each batch to `device`.
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# `device` is deliberately not reassigned after the branch: forcing it to
# "cuda" here would undo the CPU fallback chosen above.

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
# Maps each task name to three file-name prefixes, one per split.
# get_sets pairs the i-th prefix with the i-th entry of `sets`.
task_dict = {
    # Same prefix for all three splits
    'MDS-A': ["all"] * 3,
    'SDS-H': ["hotels"] * 3,
    'SDS-M': ["medicine"] * 3,
    'SDS-P': ["products"] * 3,
    'SDS-R': ["reviews"] * 3,
    # First two prefixes are "shuffled.except.<domain>", the third is "<domain>"
    'DOS-H': ["shuffled.except.hotels"] * 2 + ["hotels"],
    'DOS-M': ["shuffled.except.medicine"] * 2 + ["medicine"],
    'DOS-P': ["shuffled.except.products"] * 2 + ["products"],
    'DOS-R': ["shuffled.except.reviews"] * 2 + ["reviews"],
}

In [None]:
# Split names, in the same order as the prefixes stored in task_dict
sets = ("train", "dev", "test")

def get_sets(task):
    """Return the corpus file names for `task`, one per entry of `sets`.

    Each name has the form '<prefix>.sentence.<split>.txt', where <prefix> is
    the element of task_dict[task] at the same position as <split> in `sets`.
    Raises KeyError when `task` is not a key of task_dict.
    """
    file_names = []
    for position, split in enumerate(sets):
        prefix = task_dict[task][position]
        file_names.append('%s.sentence.%s.txt' % (prefix, split))
    return file_names

**Hotels**

In [None]:
import os
import subprocess

TASK = 'SDS-H'

# Download the corpus files for TASK into the working directory.
# Files that are already present are skipped. check=True makes a failed
# download raise CalledProcessError here instead of surfacing later as a
# confusing read_csv error (os.system discarded wget's exit status).
for set_ in get_sets(TASK):
  if not os.path.exists(set_):
    subprocess.run(['wget', 'https://wothub-data.s3.amazonaws.com/Corpus/%s' % set_, '-nc'], check=True)

In [None]:
# File names of the train/dev/test files for TASK, in the order of `sets`
train_path, dev_path, test_path = get_sets(TASK)

In [None]:
# Read every split into a two-column frame: "text" (sentence) and "label".
# The files have no header row; "__label__" separates sentence from label.
train_data, dev_data, test_data = [
    pd.read_csv(path, sep="__label__", header=None, names=["text", "label"], engine="python")
    for path in (train_path, dev_path, test_path)
]

In [None]:
# Discard the rows labelled 'z_amb' from every split
train_data, dev_data, test_data = [
    frame[frame.label != 'z_amb']
    for frame in (train_data, dev_data, test_data)
]

In [None]:
# Column 0 holds the sentences, column 1 the labels.
# These are still pandas Series; remove_nulls converts them to numpy arrays.
train_sentences, dev_sentences, test_sentences = (
    frame.iloc[:, 0] for frame in (train_data, dev_data, test_data)
)
train_labels, dev_labels, test_labels = (
    frame.iloc[:, 1] for frame in (train_data, dev_data, test_data)
)

In [None]:
# Drop every (sentence, label) pair in which either element is missing
def remove_nulls(sentences, labels):
  """Remove the positions where the sentence or the label is null.

  Both inputs are wrapped in a pandas Series. A position is kept only when
  neither its sentence nor its label is null.

  Returns a two-element list [sentences, labels] of numpy arrays.
  """
  sentence_series = pd.Series(sentences)
  label_series = pd.Series(labels)
  has_null = label_series.isna() | sentence_series.isna()
  keep = ~has_null
  kept_labels = label_series.loc[keep].to_numpy()
  kept_sentences = sentence_series.loc[keep].to_numpy()
  return [kept_sentences, kept_labels]

In [None]:
# Drop incomplete examples from each split; remove_nulls returns numpy arrays
train_sentences, train_labels = remove_nulls(train_sentences, train_labels)
dev_sentences, dev_labels = remove_nulls(dev_sentences, dev_labels)
test_sentences, test_labels = remove_nulls(test_sentences, test_labels)

In [None]:
len(np.concatenate((train_labels, dev_labels, test_labels)))

21805

In [None]:
np.unique(np.concatenate((train_labels, dev_labels, test_labels)), return_counts=True)

(array([0, 1, 2]), array([10226,  7343,  4236]))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# The mapping is fitted on the training labels only and then reused for the
# test and dev labels.
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)
# Displayed so the class behind each integer can be read off by position
labelencoder.classes_

array(['z_minus_m', 'z_plus_m', 'z_zero'], dtype=object)

In [None]:
len(train_sentences)  * (1-0.875)

2177.25

In [None]:
# Move the first 12.5% of the training examples (rounded up) to the dev set.
# The count is derived from the data instead of being hard-coded (it was 2178
# for this corpus), so the cell stays correct if the training set changes.
# Not idempotent: re-running this cell moves a further 12.5%.
n_to_dev = int(np.ceil(len(train_sentences) * (1 - 0.875)))
dev_sentences = np.append(dev_sentences, train_sentences[:n_to_dev])
dev_labels = np.append(dev_labels, train_labels[:n_to_dev])
train_sentences = train_sentences[n_to_dev:]
train_labels = train_labels[n_to_dev:]

In [None]:
# Print the share of all examples that falls into each split
total_examples = len(train_sentences) + len(dev_sentences) + len(test_sentences)
for title, split in (
    ('Train data', train_sentences),
    ('Dev data', dev_sentences),
    ('Test data', test_sentences),
):
  print(title)
  print(len(split) / total_examples)

Train data
0.6989222655354277
Dev data
0.20027516624627378
Test data
0.10080256821829855


In [None]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build one frame per split: the two arrays are stacked as rows and then
# transposed, giving one example per row with column 0 = sentence and
# column 1 = label.
dev_polemo_data = pd.DataFrame([dev_sentences,dev_labels]).T
test_polemo_data = pd.DataFrame([test_sentences,test_labels]).T

In [None]:
# Write the dev and test frames as CSV files on the mounted Google Drive
dev_polemo_data.to_csv('/content/drive/My Drive/dev_polemo_hotels_data_preprocessed.csv')
test_polemo_data.to_csv('/content/drive/My Drive/test_polemo_hotels_data_preprocessed.csv')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# NOTE: from this cell on, dev_* and test_* are subsets of the current
# training data. The dev/test arrays built earlier are overwritten here
# (they were written to CSV in an earlier cell).

# First split: 80% stays in train, 20% is held out (stratified by label).
X = train_sentences
y = train_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) index pair
train_index, test_index = next(sss.split(X, y))
train_sentences, test_sentences = X[train_index], X[test_index]
train_labels, test_labels = y[train_index], y[test_index]

# Second split: the held-out part is divided evenly into dev and test.
X = test_sentences
y = test_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
train_index, test_index = next(sss.split(X, y))
dev_sentences, test_sentences = X[train_index], X[test_index]
dev_labels, test_labels = y[train_index], y[test_index]

In [None]:
# Remove long sentences.
# TO-DO Possible cut?
def remove_big(sentences, labels):
  """Drop every example whose encoded sentence has more than MAX_LEN ids.

  Each sentence is encoded with the notebook-level `tokenizer`
  (add_special_tokens=True), so the special tokens count toward the length.
  Prints how many examples were dropped.

  Returns the filtered (sentences, labels) pair.
  """
  too_long = [
      position
      for position, sentence in enumerate(sentences)
      if len(tokenizer.encode(sentence, add_special_tokens=True)) > MAX_LEN  # TO-DO: add_special_tokens
  ]

  kept_sentences = np.delete(sentences, too_long)
  kept_labels = np.delete(labels, too_long)

  print('{} samples removed.'.format(len(too_long)))

  return kept_sentences, kept_labels

In [None]:
# Downloading tokenizer
# From Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=494801.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2.0, style=ProgressStyle(description_wi…




In [None]:
# Maximum number of token ids per sentence. remove_big counts the ids with
# special tokens included, and tensor_dataset passes the same value as
# max_length.
MAX_LEN = 128

# Drop over-long examples from every split; each call prints how many it removed
train_sentences, train_labels = remove_big(train_sentences, train_labels)
test_sentences, test_labels = remove_big(test_sentences, test_labels)
dev_sentences, dev_labels = remove_big(dev_sentences, dev_labels)

25 samples removed.
6 samples removed.
3 samples removed.


In [None]:
from torch.utils.data import TensorDataset
# Create TensorDatasets for train/dev/test sets
def tensor_dataset(sentences, labels):
  """Encode `sentences` and bundle them with `labels` into a TensorDataset.

  Every sentence is encoded with the notebook-level `tokenizer`, with special
  tokens, max_length=MAX_LEN and pad_to_max_length=True. The per-sentence
  tensors are concatenated along dim 0.

  Returns a TensorDataset of (input_ids, attention_masks, labels).
  """
  encodings = [
      tokenizer.encode_plus(
          sent,
          add_special_tokens = True,
          max_length = MAX_LEN,
          pad_to_max_length = True,
          return_attention_mask = True,
          return_tensors = 'pt',     # Return pytorch tensors.
      )
      for sent in sentences
  ]

  # Stack the per-sentence tensors into one tensor per field.
  all_input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
  all_attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
  label_tensor = torch.tensor(labels)
  return TensorDataset(all_input_ids, all_attention_masks, label_tensor)

In [None]:
BATCH_SIZE = 8  # Examples per batch, shared by all three loaders

# Encode each split and wrap it in a TensorDataset (see tensor_dataset)
train_dataset = tensor_dataset(train_sentences, train_labels)
test_dataset = tensor_dataset(test_sentences, test_labels)
dev_dataset = tensor_dataset(dev_sentences, dev_labels)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Create the DataLoaders for train/dev/test sets.
# Training batches are drawn with RandomSampler; dev and test use
# SequentialSampler.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = BATCH_SIZE)
validation_dataloader = DataLoader(dev_dataset, sampler = SequentialSampler(dev_dataset), batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = BATCH_SIZE)

In [None]:
# NOTE(review): `batch_size` is not referenced by any cell visible here; the
# DataLoaders are built with BATCH_SIZE (= 8). Confirm whether this cell can
# be removed.
batch_size = 32

In [None]:
# Load model with a sequence classification head
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-uncased-v1", # Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the device selected at the top of the notebook instead of
# hard-coding CUDA, so the model and the batches (moved with .to(device))
# always live on the same device. Module.to returns the module, so the cell
# still displays the model.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=459.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=531146902.0, style=ProgressStyle(descri…




BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import time, datetime
import numpy as np
from tqdm import tqdm
from transformers.optimization import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose argmax equals the label.

    `preds` is reduced with argmax over axis 1; `labels` is flattened before
    the element-wise comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

# Takes a time in seconds and returns a string h:mm:ss
def format_time(elapsed):
  """Round `elapsed` (seconds) to whole seconds and format it via timedelta."""
  whole_seconds = int(round(elapsed))
  delta = datetime.timedelta(seconds=whole_seconds)
  return str(delta)

# Parameters:
epochs = 3
#lr = 1e-3 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
lr = 5e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
adam_epsilon = 1e-8
WARM_UP = 0  # Number of warm-up steps passed to the scheduler

optimizer = AdamW(model.parameters(), lr = lr, eps = adam_epsilon)

from transformers import get_linear_schedule_with_warmup
# scheduler.step() is called once per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = WARM_UP, num_training_steps = total_steps)

train_loss_values = []  # Mean training loss, one entry per epoch
dev_acc_values = []     # Dev-set accuracy, one entry per epoch

model.zero_grad()

t0 = time.time()
for epoch_i in range(0, epochs):
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  # https://github.com/huggingface/transformers/blob/master/examples/run_glue.py
  # lines 168-183
  epoch_train_loss = 0 # Cumulative loss
  batch_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches.
    if step % 40 == 0 and not step == 0:
      # Time elapsed since training started, formatted by format_time.
      elapsed = format_time(time.time() - t0)
      # Report progress; batch_loss still holds the previous batch's loss here.
      print('  Batch {:>5,}  of  {:>5,}. Loss: {:.3f}  Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # clear any previously calculated gradients before backward pass
    optimizer.zero_grad()

    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    # Labels were passed to the model; its first output is used as the loss.
    loss = outputs[0]
    batch_loss = loss.item()
    epoch_train_loss += batch_loss
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()  # Update learning rate schedule

  epoch_train_loss = epoch_train_loss / len(train_dataloader)
  train_loss_values.append(epoch_train_loss)

  print('Average training loss: {0:.2f}'.format(epoch_train_loss))

  # Evaluation
  total_eval_accuracy = 0
  model.eval()

  for batch in validation_dataloader:

    input_ids = batch[0].to(device)
    attention_masks = batch[1].to(device)
    labels = batch[2].to('cpu').numpy()

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

    # No labels were passed, so the first output is taken as the logits.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    total_eval_accuracy += flat_accuracy(logits, labels)

  # Mean of the per-batch accuracies: a smaller final batch carries the same
  # weight as a full one.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  # Record the per-epoch dev accuracy (the list was previously never filled).
  dev_acc_values.append(avg_val_accuracy)
  print("  Accuracy: {0:.4f}".format(avg_val_accuracy))



Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


  Batch    40  of  1,521. Loss: 1.166  Elapsed: 0:00:09.
  Batch    80  of  1,521. Loss: 0.430  Elapsed: 0:00:17.
  Batch   120  of  1,521. Loss: 0.412  Elapsed: 0:00:26.
  Batch   160  of  1,521. Loss: 0.635  Elapsed: 0:00:35.
  Batch   200  of  1,521. Loss: 1.897  Elapsed: 0:00:43.
  Batch   240  of  1,521. Loss: 0.429  Elapsed: 0:00:52.
  Batch   280  of  1,521. Loss: 0.347  Elapsed: 0:01:01.
  Batch   320  of  1,521. Loss: 0.097  Elapsed: 0:01:11.
  Batch   360  of  1,521. Loss: 1.731  Elapsed: 0:01:20.
  Batch   400  of  1,521. Loss: 0.180  Elapsed: 0:01:29.
  Batch   440  of  1,521. Loss: 0.970  Elapsed: 0:01:38.
  Batch   480  of  1,521. Loss: 0.151  Elapsed: 0:01:48.
  Batch   520  of  1,521. Loss: 0.106  Elapsed: 0:01:57.
  Batch   560  of  1,521. Loss: 0.286  Elapsed: 0:02:06.
  Batch   600  of  1,521. Loss: 0.551  Elapsed: 0:02:15.
  Batch   640  of  1,521. Loss: 1.340  Elapsed: 0:02:24.
  Batch   680  of  1,521. Loss: 0.446  Elapsed: 0:02:33.
  Batch   720  of  1,521. Loss:

In [None]:
predicted_labels = [] ; true_labels = []; logits_list = []

# Switch to evaluation mode explicitly (disables dropout) instead of relying
# on the training cell having left the model in that state.
model.eval()

for batch in test_dataloader:

  input_ids = batch[0].to(device)
  attention_masks = batch[1].to(device)
  labels = batch[2]

  # Inference only: no gradients are needed.
  with torch.no_grad():
      outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

  # No labels were passed, so the first output is taken as the logits.
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  logits_list.append(logits)

  # Predicted class = index of the largest logit in each row
  predictions = np.argmax(logits, axis=1).flatten()
  labels = labels.numpy().flatten()

  predicted_labels.extend( predictions )
  true_labels.extend( labels )
  

In [None]:
def inverse_logit(x):
  """Logistic sigmoid: map logit(s) `x` to 1 / (1 + exp(-x)).

  Evaluated as exp(-log(1 + exp(-x))) through np.logaddexp so that large |x|
  cannot overflow. The direct form exp(x) / (1 + exp(x)) yields inf / inf =
  nan for large positive x.

  Accepts a scalar or an array-like; returns a value of the same shape.
  """
  return np.exp(-np.logaddexp(0, np.negative(x)))

In [None]:
# Parameters:
#epochs = 2
#lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
#adam_epsilon = 1e-8
#WARM_UP = 0
#87 86 87
from sklearn.metrics import classification_report 
print( classification_report(y_true=true_labels, y_pred=predicted_labels, zero_division=0) )

              precision    recall  f1-score   support

           0       0.92      0.93      0.92       706
           1       0.89      0.92      0.91       514
           2       0.84      0.76      0.80       298

    accuracy                           0.89      1518
   macro avg       0.88      0.87      0.88      1518
weighted avg       0.89      0.89      0.89      1518



In [None]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = "/content/drive/My Drive/model_bert_finetuned_1_1"

# Everything below writes into output_dir; create it first so that saving
# into a directory that does not exist yet does not fail.
os.makedirs(output_dir, exist_ok=True)

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)

model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_pretrained(output_dir)

('/content/drive/My Drive/model_bert_finetuned_1_1/vocab.txt',
 '/content/drive/My Drive/model_bert_finetuned_1_1/special_tokens_map.json',
 '/content/drive/My Drive/model_bert_finetuned_1_1/added_tokens.json')

In [None]:
# Step 2: Re-load the saved model and vocabulary
# NOTE(review): the reloaded model is not moved to `device` here — presumably
# it stays on the CPU; call model.to(device) before feeding it batches that
# were moved to `device`.
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

**Medicine**

In [None]:
import os
import subprocess

TASK = 'SDS-M'

# Download the corpus files for TASK into the working directory.
# Files that are already present are skipped. check=True makes a failed
# download raise CalledProcessError here instead of surfacing later as a
# confusing read_csv error (os.system discarded wget's exit status).
for set_ in get_sets(TASK):
  if not os.path.exists(set_):
    subprocess.run(['wget', 'https://wothub-data.s3.amazonaws.com/Corpus/%s' % set_, '-nc'], check=True)

In [None]:
# File names of the train/dev/test files for TASK, in the order of `sets`
train_path, dev_path, test_path = get_sets(TASK)

In [None]:
# Read every split into a two-column frame: "text" (sentence) and "label".
# The files have no header row; "__label__" separates sentence from label.
train_data, dev_data, test_data = [
    pd.read_csv(path, sep="__label__", header=None, names=["text", "label"], engine="python")
    for path in (train_path, dev_path, test_path)
]

In [None]:
# Discard the rows labelled 'z_amb' from every split
train_data, dev_data, test_data = [
    frame[frame.label != 'z_amb']
    for frame in (train_data, dev_data, test_data)
]

In [None]:
# Column 0 holds the sentences, column 1 the labels.
# These are still pandas Series; remove_nulls converts them to numpy arrays.
train_sentences, dev_sentences, test_sentences = (
    frame.iloc[:, 0] for frame in (train_data, dev_data, test_data)
)
train_labels, dev_labels, test_labels = (
    frame.iloc[:, 1] for frame in (train_data, dev_data, test_data)
)

In [None]:
# Drop every (sentence, label) pair in which either element is missing
def remove_nulls(sentences, labels):
  """Remove the positions where the sentence or the label is null.

  Both inputs are wrapped in a pandas Series. A position is kept only when
  neither its sentence nor its label is null.

  Returns a two-element list [sentences, labels] of numpy arrays.
  """
  sentence_series = pd.Series(sentences)
  label_series = pd.Series(labels)
  has_null = label_series.isna() | sentence_series.isna()
  keep = ~has_null
  kept_labels = label_series.loc[keep].to_numpy()
  kept_sentences = sentence_series.loc[keep].to_numpy()
  return [kept_sentences, kept_labels]

In [None]:
# Drop incomplete examples from each split; remove_nulls returns numpy arrays
train_sentences, train_labels = remove_nulls(train_sentences, train_labels)
dev_sentences, dev_labels = remove_nulls(dev_sentences, dev_labels)
test_sentences, test_labels = remove_nulls(test_sentences, test_labels)

In [None]:
np.unique(np.concatenate((train_labels, dev_labels, test_labels)), return_counts=True)

(array([0, 1, 2]), array([7041, 5253, 8945]))

In [None]:
len(train_sentences)  * (1-0.875)

2126.375

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# The mapping is fitted on the training labels only and then reused for the
# test and dev labels.
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)
# Displayed so the class behind each integer can be read off by position
labelencoder.classes_

array(['z_minus_m', 'z_plus_m', 'z_zero'], dtype=object)

In [None]:
# Move the first 12.5% of the training examples (rounded up) to the dev set.
# The count is derived from the data instead of being hard-coded (it was 2127
# for this corpus), so the cell stays correct if the training set changes.
# Not idempotent: re-running this cell moves a further 12.5%.
n_to_dev = int(np.ceil(len(train_sentences) * (1 - 0.875)))
dev_sentences = np.append(dev_sentences, train_sentences[:n_to_dev])
dev_labels = np.append(dev_labels, train_labels[:n_to_dev])
train_sentences = train_sentences[n_to_dev:]
train_labels = train_labels[n_to_dev:]

In [None]:
# Print the share of all examples that falls into each split
total_examples = len(train_sentences) + len(dev_sentences) + len(test_sentences)
for title, split in (
    ('Train data', train_sentences),
    ('Dev data', dev_sentences),
    ('Test data', test_sentences),
):
  print(title)
  print(len(split) / total_examples)

Train data
0.7007862893733227
Dev data
0.19977400065916476
Test data
0.0994397099675126


In [None]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Build one frame per split: the two arrays are stacked as rows and then
# transposed, giving one example per row with column 0 = sentence and
# column 1 = label.
dev_polemo_data = pd.DataFrame([dev_sentences,dev_labels]).T
test_polemo_data = pd.DataFrame([test_sentences,test_labels]).T

In [None]:
# Write the dev and test frames as CSV files on the mounted Google Drive
dev_polemo_data.to_csv('/content/drive/My Drive/dev_polemo_medicine_data_preprocessed.csv')
test_polemo_data.to_csv('/content/drive/My Drive/test_polemo_medicine_data_preprocessed.csv')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# NOTE: from this cell on, dev_* and test_* are subsets of the current
# training data. The dev/test arrays built earlier are overwritten here
# (they were written to CSV in an earlier cell).

# First split: 80% stays in train, 20% is held out (stratified by label).
X = train_sentences
y = train_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) index pair
train_index, test_index = next(sss.split(X, y))
train_sentences, test_sentences = X[train_index], X[test_index]
train_labels, test_labels = y[train_index], y[test_index]

# Second split: the held-out part is divided evenly into dev and test.
X = test_sentences
y = test_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
train_index, test_index = next(sss.split(X, y))
dev_sentences, test_sentences = X[train_index], X[test_index]
dev_labels, test_labels = y[train_index], y[test_index]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# NOTE(review): in this section the labels were already encoded by an earlier
# LabelEncoder cell, so this fits a second encoder on already-encoded values
# and replaces `labelencoder`; its classes_ will presumably no longer hold
# the original label strings. Confirm whether this cell is needed.
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)

In [None]:
# Remove long sentences.
# TO-DO Possible cut?
def remove_big(sentences, labels):
  """Drop every example whose encoded sentence has more than MAX_LEN ids.

  Each sentence is encoded with the notebook-level `tokenizer`
  (add_special_tokens=True), so the special tokens count toward the length.
  Prints how many examples were dropped.

  Returns the filtered (sentences, labels) pair.
  """
  too_long = [
      position
      for position, sentence in enumerate(sentences)
      if len(tokenizer.encode(sentence, add_special_tokens=True)) > MAX_LEN  # TO-DO: add_special_tokens
  ]

  kept_sentences = np.delete(sentences, too_long)
  kept_labels = np.delete(labels, too_long)

  print('{} samples removed.'.format(len(too_long)))

  return kept_sentences, kept_labels

In [None]:
# Downloading tokenizer
# From Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")

In [None]:
# Maximum number of token ids per sentence. remove_big counts the ids with
# special tokens included, and tensor_dataset passes the same value as
# max_length.
MAX_LEN = 128

# Drop over-long examples from every split; each call prints how many it removed
train_sentences, train_labels = remove_big(train_sentences, train_labels)
test_sentences, test_labels = remove_big(test_sentences, test_labels)
dev_sentences, dev_labels = remove_big(dev_sentences, dev_labels)

14 samples removed.
3 samples removed.
0 samples removed.


In [None]:
from torch.utils.data import TensorDataset
# Create TensorDatasets for train/dev/test sets
def tensor_dataset(sentences, labels):
  """Encode `sentences` and bundle them with `labels` into a TensorDataset.

  Every sentence is encoded with the notebook-level `tokenizer`, with special
  tokens, max_length=MAX_LEN and pad_to_max_length=True. The per-sentence
  tensors are concatenated along dim 0.

  Returns a TensorDataset of (input_ids, attention_masks, labels).
  """
  encodings = [
      tokenizer.encode_plus(
          sent,
          add_special_tokens = True,
          max_length = MAX_LEN,
          pad_to_max_length = True,
          return_attention_mask = True,
          return_tensors = 'pt',     # Return pytorch tensors.
      )
      for sent in sentences
  ]

  # Stack the per-sentence tensors into one tensor per field.
  all_input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
  all_attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
  label_tensor = torch.tensor(labels)
  return TensorDataset(all_input_ids, all_attention_masks, label_tensor)

In [None]:
BATCH_SIZE = 8  # Examples per batch, shared by all three loaders

# Encode each split and wrap it in a TensorDataset (see tensor_dataset)
train_dataset = tensor_dataset(train_sentences, train_labels)
test_dataset = tensor_dataset(test_sentences, test_labels)
dev_dataset = tensor_dataset(dev_sentences, dev_labels)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Create the DataLoaders for train/dev/test sets.
# Training batches are drawn with RandomSampler; dev and test use
# SequentialSampler.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = BATCH_SIZE)
validation_dataloader = DataLoader(dev_dataset, sampler = SequentialSampler(dev_dataset), batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = BATCH_SIZE)

In [None]:
# Load model with a sequence classification head
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-uncased-v1", # Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the device selected at the top of the notebook instead of
# hard-coding CUDA, so the model and the batches (moved with .to(device))
# always live on the same device. Module.to returns the module, so the cell
# still displays the model.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import time, datetime
import numpy as np
from tqdm import tqdm
from transformers.optimization import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows of `preds` whose argmax equals the label.

    `preds` is reduced with argmax over axis 1; `labels` is flattened before
    the element-wise comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

# Takes a time in seconds and returns a string h:mm:ss
def format_time(elapsed):
  """Round `elapsed` (seconds) to whole seconds and format it via timedelta."""
  whole_seconds = int(round(elapsed))
  delta = datetime.timedelta(seconds=whole_seconds)
  return str(delta)

# Parameters:
epochs = 3
#lr = 1e-3 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
lr = 5e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
adam_epsilon = 1e-8
WARM_UP = 0  # Number of warm-up steps passed to the scheduler

optimizer = AdamW(model.parameters(), lr = lr, eps = adam_epsilon)

from transformers import get_linear_schedule_with_warmup
# scheduler.step() is called once per training batch, so the schedule spans
# every batch of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = WARM_UP, num_training_steps = total_steps)

train_loss_values = []  # Mean training loss, one entry per epoch
dev_acc_values = []     # Dev-set accuracy, one entry per epoch

model.zero_grad()

t0 = time.time()
for epoch_i in range(0, epochs):
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  # https://github.com/huggingface/transformers/blob/master/examples/run_glue.py
  # lines 168-183
  epoch_train_loss = 0 # Cumulative loss
  batch_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches.
    if step % 40 == 0 and not step == 0:
      # Time elapsed since training started, formatted by format_time.
      elapsed = format_time(time.time() - t0)
      # Report progress; batch_loss still holds the previous batch's loss here.
      print('  Batch {:>5,}  of  {:>5,}. Loss: {:.3f}  Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # clear any previously calculated gradients before backward pass
    optimizer.zero_grad()

    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    # Labels were passed to the model; its first output is used as the loss.
    loss = outputs[0]
    batch_loss = loss.item()
    epoch_train_loss += batch_loss
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()  # Update learning rate schedule

  epoch_train_loss = epoch_train_loss / len(train_dataloader)
  train_loss_values.append(epoch_train_loss)

  print('Average training loss: {0:.2f}'.format(epoch_train_loss))

  # Evaluation
  total_eval_accuracy = 0
  model.eval()

  for batch in validation_dataloader:

    input_ids = batch[0].to(device)
    attention_masks = batch[1].to(device)
    labels = batch[2].to('cpu').numpy()

    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

    # No labels were passed, so the first output is taken as the logits.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    total_eval_accuracy += flat_accuracy(logits, labels)

  # Mean of the per-batch accuracies: a smaller final batch carries the same
  # weight as a full one.
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  # Record the per-epoch dev accuracy (the list was previously never filled).
  dev_acc_values.append(avg_val_accuracy)
  print("  Accuracy: {0:.4f}".format(avg_val_accuracy))



Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


  Batch    40  of  1,487. Loss: 0.950  Elapsed: 0:00:09.
  Batch    80  of  1,487. Loss: 1.166  Elapsed: 0:00:18.
  Batch   120  of  1,487. Loss: 0.445  Elapsed: 0:00:27.
  Batch   160  of  1,487. Loss: 1.185  Elapsed: 0:00:36.
  Batch   200  of  1,487. Loss: 0.564  Elapsed: 0:00:46.
  Batch   240  of  1,487. Loss: 1.034  Elapsed: 0:00:55.
  Batch   280  of  1,487. Loss: 0.623  Elapsed: 0:01:04.
  Batch   320  of  1,487. Loss: 0.258  Elapsed: 0:01:13.
  Batch   360  of  1,487. Loss: 0.572  Elapsed: 0:01:22.
  Batch   400  of  1,487. Loss: 1.653  Elapsed: 0:01:31.
  Batch   440  of  1,487. Loss: 1.127  Elapsed: 0:01:41.
  Batch   480  of  1,487. Loss: 0.700  Elapsed: 0:01:50.
  Batch   520  of  1,487. Loss: 0.484  Elapsed: 0:01:59.
  Batch   560  of  1,487. Loss: 0.587  Elapsed: 0:02:08.
  Batch   600  of  1,487. Loss: 0.517  Elapsed: 0:02:17.
  Batch   640  of  1,487. Loss: 0.639  Elapsed: 0:02:27.
  Batch   680  of  1,487. Loss: 0.878  Elapsed: 0:02:36.
  Batch   720  of  1,487. Loss:

In [None]:
predicted_labels = [] ; true_labels = []; logits_list = []

# Switch to evaluation mode explicitly (disables dropout) instead of relying
# on the training cell having left the model in that state.
model.eval()

for batch in test_dataloader:

  input_ids = batch[0].to(device)
  attention_masks = batch[1].to(device)
  labels = batch[2]

  # Inference only: no gradients are needed.
  with torch.no_grad():
      outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

  # No labels were passed, so the first output is taken as the logits.
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  logits_list.append(logits)

  # Predicted class = index of the largest logit in each row
  predictions = np.argmax(logits, axis=1).flatten()
  labels = labels.numpy().flatten()

  predicted_labels.extend( predictions )
  true_labels.extend( labels )
  

In [None]:
# Parameters:
#epochs = 2
#lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
#adam_epsilon = 1e-8
#WARM_UP = 0
#87 86 87
from sklearn.metrics import classification_report 
print( classification_report(y_true=true_labels, y_pred=predicted_labels, zero_division=0) )

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       489
           1       0.86      0.83      0.84       370
           2       0.86      0.85      0.85       627

    accuracy                           0.85      1486
   macro avg       0.85      0.84      0.85      1486
weighted avg       0.85      0.85      0.85      1486



In [None]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = "/content/drive/My Drive/model_bert_finetuned_1_2"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)

# The target directory is not created by torch.save or save_pretrained, so
# make sure it exists first (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
# Last expression of the cell: displays the paths of the written tokenizer files.
tokenizer.save_pretrained(output_dir)

('/content/drive/My Drive/model_bert_finetuned_1_2/vocab.txt',
 '/content/drive/My Drive/model_bert_finetuned_1_2/special_tokens_map.json',
 '/content/drive/My Drive/model_bert_finetuned_1_2/added_tokens.json')

In [None]:
# Step 2: Re-load the saved model and vocabulary
model = BertForSequenceClassification.from_pretrained(output_dir)
# from_pretrained builds the model on the CPU; move it to the device that the
# inference cells send their inputs to, otherwise re-running them fails with a
# device mismatch.
model.to(device)
tokenizer = BertTokenizer.from_pretrained(output_dir)

**Products**

In [None]:
import os

# Task key into task_dict; selects the corpus files for the products domain.
TASK = 'SDS-P'

# Fetch the train/dev/test files into the working directory.
# `-nc` (no-clobber) skips files that were already downloaded.
for set_ in get_sets(TASK):
  os.system('wget https://wothub-data.s3.amazonaws.com/Corpus/%s -nc'%set_)
  # os.system never raises on a failed command, so verify the file is really
  # there instead of failing later with a confusing read_csv error.
  if not os.path.exists(set_):
    raise FileNotFoundError('Download failed, file is missing: %s' % set_)

In [None]:
# get_sets returns the file names in ("train", "dev", "test") order.
train_path, dev_path, test_path = get_sets(TASK)

In [None]:
# Read every split into a two-column frame ("text", "label"). Each line of the
# corpus files is "<sentence>__label__<label>", hence the multi-character
# separator, which requires the python parsing engine.
train_data, dev_data, test_data = [
    pd.read_csv(split_path, sep="__label__", header=None, names=["text", "label"], engine="python")
    for split_path in (train_path, dev_path, test_path)
]

In [None]:
# Discard rows labelled 'z_amb' from all three splits.
train_data, dev_data, test_data = [
    frame[frame.label != 'z_amb'] for frame in (train_data, dev_data, test_data)
]

In [None]:
# Separate each frame into its text column (position 0) and label column
# (position 1). These are still pandas Series at this point; the conversion to
# numpy arrays happens in remove_nulls.
train_sentences, dev_sentences, test_sentences = [data.iloc[:,0] for data in (train_data, dev_data, test_data)]
train_labels, dev_labels, test_labels = [data.iloc[:,1] for data in (train_data, dev_data, test_data)]

In [None]:
# Check for errors in data labeling, removing nans
def remove_nulls(sentences, labels):
  """Drop every example whose sentence or label is null.

  Both inputs are wrapped in pandas Series; a position is kept only when
  neither the sentence nor the label at that index is null.

  Returns:
    A two-element list ``[sentences, labels]`` of numpy arrays containing
    only the kept examples, in their original order.
  """
  label_series = pd.Series(labels)
  sentence_series = pd.Series(sentences)
  keep = ~(label_series.isna() | sentence_series.isna())
  kept_labels = label_series.loc[keep].to_numpy()
  kept_sentences = sentence_series.loc[keep].to_numpy()
  return [kept_sentences, kept_labels]

In [None]:
# Drop examples with a missing sentence or label; from here on the sentences
# and labels are numpy arrays (remove_nulls returns .to_numpy() results).
train_sentences, train_labels = remove_nulls(train_sentences, train_labels)
dev_sentences, dev_labels = remove_nulls(dev_sentences, dev_labels)
test_sentences, test_labels = remove_nulls(test_sentences, test_labels)

In [None]:
# Class distribution over all three splits combined: (distinct labels, counts).
np.unique(np.concatenate((train_labels, dev_labels, test_labels)), return_counts=True)

(array([0, 1, 2]), array([3429, 1828,  695]))

In [None]:
# 12.5% of the training set; the rounded value is the hard-coded number of
# examples that a later cell moves from train to dev.
len(train_sentences)  * (1-0.875)

594.25

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# Fit the label -> integer mapping on the training labels only, then apply the
# same mapping to test and dev (transform raises on a label unseen during fit).
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)
# Displayed for reference: position in classes_ is the encoded integer.
labelencoder.classes_

array(['z_minus_m', 'z_plus_m', 'z_zero'], dtype=object)

In [None]:
# Move the first 594 training examples to the end of the dev set; 594 is the
# rounded 12.5% figure computed from len(train_sentences) in an earlier cell.
# NOTE: this cell is not idempotent -- every re-run moves another 594 examples.
dev_sentences = np.append(dev_sentences, train_sentences[:594])
dev_labels = np.append(dev_labels, train_labels[:594])
train_sentences = train_sentences[594:]
train_labels = train_labels[594:]

In [None]:
# Sanity check: fraction of all examples that falls into each split.
print('Train data')
print(len(train_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))
print('Dev data')
print(len(dev_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))
print('Test data')
print(len(test_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))

Train data
0.6989247311827957
Dev data
0.1984206989247312
Test data
0.10265456989247312


In [None]:
# Run this cell to mount your Google Drive.
# Later cells write CSV files and model checkpoints under
# '/content/drive/My Drive', so the mount has to succeed first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pack the current dev/test sentences and their encoded labels into two-column
# frames: the two arrays become rows, and .T transposes them into columns 0
# (sentence) and 1 (label).
dev_polemo_data = pd.DataFrame([dev_sentences,dev_labels]).T
test_polemo_data = pd.DataFrame([test_sentences,test_labels]).T

In [None]:
# Persist the dev/test frames to Google Drive (requires the Drive mount).
# With to_csv defaults the row index is written as the first CSV column.
dev_polemo_data.to_csv('/content/drive/My Drive/dev_polemo_products_data_preprocessed.csv')
test_polemo_data.to_csv('/content/drive/My Drive/test_polemo_products_data_preprocessed.csv')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Re-split the remaining TRAINING data into train/dev/test, stratified by label.
# NOTE: this overwrites dev_* and test_* -- from here on they are held-out
# parts of the training set, not the dev/test sets that were exported to Drive
# in the previous cells.

# First split: 80% stays in train, 20% is held out.
X = train_sentences
y = train_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
  train_sentences, test_sentences = X[train_index], X[test_index]
  train_labels, test_labels = y[train_index], y[test_index]

# Second split: the held-out 20% is halved into dev and test.
X = test_sentences
y = test_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for train_index, test_index in sss.split(X, y):
  dev_sentences, test_sentences = X[train_index], X[test_index]
  dev_labels, test_labels = y[train_index], y[test_index]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# NOTE(review): the labels were already integer-encoded by the earlier
# LabelEncoder cell, so this re-fit looks redundant (an identity mapping as long
# as every class occurs in train_labels). It also replaces `labelencoder`, whose
# classes_ are then the integers rather than the original label strings.
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)

In [None]:
# Remove long sentences.
# TO-DO Possible cut?
def remove_big(sentences, labels):
  """Drop examples whose tokenized length exceeds MAX_LEN.

  Each sentence is encoded with the module-level `tokenizer` (special tokens
  included); examples with more than the module-level `MAX_LEN` token ids are
  removed from both arrays. The number of removed examples is printed.

  Returns:
    A ``(sentences, labels)`` tuple with the over-long examples deleted.
  """
  too_long = [
      position
      for position, text in enumerate(sentences)
      if len(tokenizer.encode(text, add_special_tokens=True)) > MAX_LEN
  ]

  kept_sentences = np.delete(sentences, too_long)
  kept_labels = np.delete(labels, too_long)

  print('{} samples removed.'.format(len(too_long)))

  return kept_sentences, kept_labels

In [None]:
# Downloading tokenizer
# From Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
# The same checkpoint name is used when the classification model is loaded, so
# the vocabulary matches the model's embeddings.
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")

In [None]:
# Maximum sequence length in tokens (special tokens included); read as a global
# by remove_big and tensor_dataset.
MAX_LEN = 128

# Drop over-long examples from every split; each call prints how many it removed.
train_sentences, train_labels = remove_big(train_sentences, train_labels)
test_sentences, test_labels = remove_big(test_sentences, test_labels)
dev_sentences, dev_labels = remove_big(dev_sentences, dev_labels)

12 samples removed.
2 samples removed.
4 samples removed.


In [None]:
from torch.utils.data import TensorDataset
# Create TensorDatasets for train/dev/test sets
def tensor_dataset(sentences, labels):
  """Encode sentences and bundle them with their labels in a TensorDataset.

  Every sentence is encoded with the module-level `tokenizer`, with special
  tokens added and padding up to the module-level `MAX_LEN`. The per-sentence
  tensors are concatenated along dimension 0.

  Returns:
    ``TensorDataset(input_ids, attention_masks, labels)``.
  """
  encodings = [
      tokenizer.encode_plus(
          text,
          add_special_tokens = True,
          max_length = MAX_LEN,
          pad_to_max_length = True,
          return_attention_mask = True,
          return_tensors = 'pt',     # Return pytorch tensors.
      )
      for text in sentences
  ]

  # Stack the per-sentence tensors into one tensor per field.
  ids_tensor = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
  mask_tensor = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
  label_tensor = torch.tensor(labels)
  return TensorDataset(ids_tensor, mask_tensor, label_tensor)

In [None]:
# Number of examples per batch for all three DataLoaders.
BATCH_SIZE = 8

# Tokenize each split into a TensorDataset of (input_ids, attention_mask, label).
train_dataset = tensor_dataset(train_sentences, train_labels)
test_dataset = tensor_dataset(test_sentences, test_labels)
dev_dataset = tensor_dataset(dev_sentences, dev_labels)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Create the DataLoaders for train/dev/test sets.
# Training batches are drawn in random order; dev and test are read sequentially.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = BATCH_SIZE)
validation_dataloader = DataLoader(dev_dataset, sampler = SequentialSampler(dev_dataset), batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = BATCH_SIZE)

In [None]:
# Load model with a sequence classification head
# num_labels = 3: one output per class produced by the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-uncased-v1", # Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the GPU; as the cell's last expression this also displays
# the module tree.
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import time, datetime
import numpy as np
from tqdm import tqdm
from transformers.optimization import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    `preds` holds one row of class scores per example; `labels` is a numpy
    array of the corresponding class indices.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

# Takes a time in seconds and returns a string hh:mm:ss
def format_time(elapsed):
  """Format a duration given in seconds as an h:mm:ss string.

  The duration is rounded to the nearest whole second first.
  """
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

# Parameters:
epochs = 3
#lr = 1e-3 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
adam_epsilon = 1e-8
WARM_UP = 0  # number of scheduler warm-up steps

optimizer = AdamW(model.parameters(), lr = lr, eps = adam_epsilon)

from transformers import get_linear_schedule_with_warmup
# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = WARM_UP, num_training_steps = total_steps)

# Per-epoch history: mean training loss and validation accuracy.
train_loss_values = []
dev_acc_values = []

model.zero_grad()

t0 = time.time()
for epoch_i in range(0, epochs):  
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  # https://github.com/huggingface/transformers/blob/master/examples/run_glue.py
  # lines 168-183
  epoch_train_loss = 0 # Cumulative loss
  loss = 0 ;     batch_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches.
    # batch_loss still holds the loss of the previous batch at this point.
    if step % 40 == 0 and not step == 0:
      # Elapsed time since training started, formatted as h:mm:ss.
      elapsed = format_time(time.time() - t0)      
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}. Loss: {:.3f}  Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
    

    batch_loss = 0
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)    

    # clear any previously calculated gradients before backward pass
    optimizer.zero_grad()

    # With `labels=` supplied, the first output element is the loss.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    loss = outputs[0]
    epoch_train_loss += loss.item()
    batch_loss += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()  # Update learning rate schedule

  epoch_train_loss = epoch_train_loss / len(train_dataloader)          
  train_loss_values.append(epoch_train_loss)
  
  print('Average training loss: {0:.2f}'.format(epoch_train_loss))

  # Evaluation
  # Accumulate the number of correct predictions and the number of examples,
  # so the final (possibly shorter) batch is weighted by its real size instead
  # of counting as a full batch in a mean of per-batch accuracies.
  total_eval_accuracy = 0
  total_eval_examples = 0
  model.eval()

  for batch in validation_dataloader:
    
    input_ids = batch[0].to(device)
    attention_masks = batch[1].to(device)
    labels = batch[2].to('cpu').numpy()
                
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

    # Without `labels=` the first output element is the logits tensor.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    # flat_accuracy is the batch's fraction correct; scaling by the batch size
    # turns it back into a count of correct predictions.
    total_eval_accuracy += flat_accuracy(logits, labels) * len(labels)
    total_eval_examples += len(labels)

  avg_val_accuracy = total_eval_accuracy / total_eval_examples
  # Keep the per-epoch validation accuracy alongside train_loss_values.
  dev_acc_values.append(avg_val_accuracy)
  print("  Accuracy: {0:.4f}".format(avg_val_accuracy))



Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


  Batch    40  of    415. Loss: 0.689  Elapsed: 0:00:09.
  Batch    80  of    415. Loss: 0.295  Elapsed: 0:00:17.
  Batch   120  of    415. Loss: 0.496  Elapsed: 0:00:26.
  Batch   160  of    415. Loss: 0.963  Elapsed: 0:00:34.
  Batch   200  of    415. Loss: 0.355  Elapsed: 0:00:43.
  Batch   240  of    415. Loss: 0.100  Elapsed: 0:00:52.
  Batch   280  of    415. Loss: 0.276  Elapsed: 0:01:01.
  Batch   320  of    415. Loss: 0.787  Elapsed: 0:01:10.
  Batch   360  of    415. Loss: 1.025  Elapsed: 0:01:19.
  Batch   400  of    415. Loss: 0.539  Elapsed: 0:01:28.
Average training loss: 0.59
  Accuracy: 0.8389

Training...
  Batch    40  of    415. Loss: 0.326  Elapsed: 0:01:44.
  Batch    80  of    415. Loss: 0.516  Elapsed: 0:01:54.
  Batch   120  of    415. Loss: 0.040  Elapsed: 0:02:03.
  Batch   160  of    415. Loss: 0.018  Elapsed: 0:02:12.
  Batch   200  of    415. Loss: 0.008  Elapsed: 0:02:21.
  Batch   240  of    415. Loss: 0.131  Elapsed: 0:02:30.
  Batch   280  of    415. Lo

In [None]:
# Run the fine-tuned model over the test set and collect, per example, the
# predicted class, the gold class, and (per batch) the raw logits.
predicted_labels = [] ; true_labels = []; logits_list = []

# Put the model in inference mode explicitly so dropout is disabled even if
# this cell is run without the training cell's own model.eval() having been
# the last mode switch.
model.eval()

for batch in test_dataloader:

  # Each batch is (input_ids, attention_mask, labels); only the model inputs
  # are moved to `device`, labels stay where the DataLoader produced them.
  input_ids = batch[0].to(device)
  attention_masks = batch[1].to(device)
  labels = batch[2]

  # No gradients are needed for inference.
  with torch.no_grad():
      outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

  # Without `labels=` the first output element is the logits tensor.
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  logits_list.append(logits)

  # Predicted class = index of the largest logit in each row.
  predictions = np.argmax(logits, axis=1).flatten()
  labels = labels.numpy().flatten()

  predicted_labels.extend( predictions )
  true_labels.extend( labels )

In [None]:
# Per-class precision / recall / F1 for the test-set predictions collected in
# the previous cell (true_labels vs predicted_labels).
# NOTE(review): the commented-out values below are stale notes -- the training
# cell of this section sets epochs = 3, not 2; the last line is presumably a
# set of scores from an earlier run (TODO confirm or delete).
# Parameters:
#epochs = 2
#lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
#adam_epsilon = 1e-8
#WARM_UP = 0
#87 86 87
from sklearn.metrics import classification_report 
# zero_division=0 makes a metric with a zero denominator report 0 without a warning.
print( classification_report(y_true=true_labels, y_pred=predicted_labels, zero_division=0) )

              precision    recall  f1-score   support

           0       0.87      0.91      0.89       238
           1       0.83      0.83      0.83       127
           2       0.69      0.51      0.59        49

    accuracy                           0.84       414
   macro avg       0.80      0.75      0.77       414
weighted avg       0.84      0.84      0.84       414



In [None]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = "/content/drive/My Drive/model_bert_finetuned_1_3"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)

# The target directory is not created by torch.save or save_pretrained, so
# make sure it exists first (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
# Last expression of the cell: displays the paths of the written tokenizer files.
tokenizer.save_pretrained(output_dir)

('/content/drive/My Drive/model_bert_finetuned_1_3/vocab.txt',
 '/content/drive/My Drive/model_bert_finetuned_1_3/special_tokens_map.json',
 '/content/drive/My Drive/model_bert_finetuned_1_3/added_tokens.json')

In [None]:
# Step 2: Re-load the saved model and vocabulary
model = BertForSequenceClassification.from_pretrained(output_dir)
# from_pretrained builds the model on the CPU; move it to the device that the
# inference cells send their inputs to, otherwise re-running them fails with a
# device mismatch.
model.to(device)
tokenizer = BertTokenizer.from_pretrained(output_dir)

**Reviews**

In [None]:
import os

# Task key into task_dict; selects the corpus files for the reviews domain.
TASK = 'SDS-R'

# Fetch the train/dev/test files into the working directory.
# `-nc` (no-clobber) skips files that were already downloaded.
for set_ in get_sets(TASK):
  os.system('wget https://wothub-data.s3.amazonaws.com/Corpus/%s -nc'%set_)
  # os.system never raises on a failed command, so verify the file is really
  # there instead of failing later with a confusing read_csv error.
  if not os.path.exists(set_):
    raise FileNotFoundError('Download failed, file is missing: %s' % set_)

In [None]:
# get_sets returns the file names in ("train", "dev", "test") order.
train_path, dev_path, test_path = get_sets(TASK)

In [None]:
# Read every split into a two-column frame ("text", "label"). Each line of the
# corpus files is "<sentence>__label__<label>", hence the multi-character
# separator, which requires the python parsing engine.
train_data, dev_data, test_data = [
    pd.read_csv(split_path, sep="__label__", header=None, names=["text", "label"], engine="python")
    for split_path in (train_path, dev_path, test_path)
]

In [None]:
# Discard rows labelled 'z_amb' from all three splits.
train_data, dev_data, test_data = [
    frame[frame.label != 'z_amb'] for frame in (train_data, dev_data, test_data)
]

In [None]:
# Separate each frame into its text column (position 0) and label column
# (position 1). These are still pandas Series at this point; the conversion to
# numpy arrays happens in remove_nulls.
train_sentences, dev_sentences, test_sentences = [data.iloc[:,0] for data in (train_data, dev_data, test_data)]
train_labels, dev_labels, test_labels = [data.iloc[:,1] for data in (train_data, dev_data, test_data)]

In [None]:
# Check for errors in data labeling, removing nans
def remove_nulls(sentences, labels):
  """Drop every example whose sentence or label is null.

  Both inputs are wrapped in pandas Series; a position is kept only when
  neither the sentence nor the label at that index is null.

  Returns:
    A two-element list ``[sentences, labels]`` of numpy arrays containing
    only the kept examples, in their original order.
  """
  label_series = pd.Series(labels)
  sentence_series = pd.Series(sentences)
  keep = ~(label_series.isna() | sentence_series.isna())
  kept_labels = label_series.loc[keep].to_numpy()
  kept_sentences = sentence_series.loc[keep].to_numpy()
  return [kept_sentences, kept_labels]

In [None]:
# Drop examples with a missing sentence or label; from here on the sentences
# and labels are numpy arrays (remove_nulls returns .to_numpy() results).
train_sentences, train_labels = remove_nulls(train_sentences, train_labels)
dev_sentences, dev_labels = remove_nulls(dev_sentences, dev_labels)
test_sentences, test_labels = remove_nulls(test_sentences, test_labels)

In [None]:
# Class distribution over all three splits combined: (distinct labels, counts).
np.unique(np.concatenate((train_labels, dev_labels, test_labels)), return_counts=True)

(array([0, 1, 2]), array([460, 900, 225]))

In [None]:
# 12.5% of the training set; the rounded value is the hard-coded number of
# examples that a later cell moves from train to dev.
len(train_sentences)  * (1-0.875)

159.5

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# Fit the label -> integer mapping on the training labels only, then apply the
# same mapping to test and dev (transform raises on a label unseen during fit).
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)
# Displayed for reference: position in classes_ is the encoded integer.
labelencoder.classes_

array(['z_minus_m', 'z_plus_m', 'z_zero'], dtype=object)

In [None]:
# Move the first 160 training examples to the end of the dev set; 160 is the
# rounded 12.5% figure computed from len(train_sentences) in an earlier cell.
# NOTE: this cell is not idempotent -- every re-run moves another 160 examples.
dev_sentences = np.append(dev_sentences, train_sentences[:160])
dev_labels = np.append(dev_labels, train_labels[:160])
train_sentences = train_sentences[160:]
train_labels = train_labels[160:]

In [None]:
# Sanity check: fraction of all examples that falls into each split.
print('Train data')
print(len(train_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))
print('Dev data')
print(len(dev_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))
print('Test data')
print(len(test_sentences) / ( len(train_sentences) + len(dev_sentences) + len(test_sentences) ))

Train data
0.7041009463722397
Dev data
0.2056782334384858
Test data
0.09022082018927445


In [None]:
# Pack the current dev/test sentences and their encoded labels into two-column
# frames: the two arrays become rows, and .T transposes them into columns 0
# (sentence) and 1 (label).
dev_polemo_data = pd.DataFrame([dev_sentences,dev_labels]).T
test_polemo_data = pd.DataFrame([test_sentences,test_labels]).T

In [None]:
# Persist the dev/test frames to Google Drive (requires the Drive mount).
# With to_csv defaults the row index is written as the first CSV column.
dev_polemo_data.to_csv('/content/drive/My Drive/dev_polemo_reviews_data_preprocessed.csv')
test_polemo_data.to_csv('/content/drive/My Drive/test_polemo_reviews_data_preprocessed.csv')

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Re-split the remaining TRAINING data into train/dev/test, stratified by label.
# NOTE: this overwrites dev_* and test_* -- from here on they are held-out
# parts of the training set, not the dev/test sets that were exported to Drive
# in the previous cells.

# First split: 80% stays in train, 20% is held out.
X = train_sentences
y = train_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in sss.split(X, y):
  train_sentences, test_sentences = X[train_index], X[test_index]
  train_labels, test_labels = y[train_index], y[test_index]

# Second split: the held-out 20% is halved into dev and test.
X = test_sentences
y = test_labels
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
for train_index, test_index in sss.split(X, y):
  dev_sentences, test_sentences = X[train_index], X[test_index]
  dev_labels, test_labels = y[train_index], y[test_index]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for labels
# NOTE(review): the labels were already integer-encoded by the earlier
# LabelEncoder cell, so this re-fit looks redundant (an identity mapping as long
# as every class occurs in train_labels). It also replaces `labelencoder`, whose
# classes_ are then the integers rather than the original label strings.
labelencoder = LabelEncoder()
train_labels = labelencoder.fit_transform(train_labels)
test_labels = labelencoder.transform(test_labels)
dev_labels = labelencoder.transform(dev_labels)

In [None]:
# Remove long sentences.
# TO-DO Possible cut?
def remove_big(sentences, labels):
  """Drop examples whose tokenized length exceeds MAX_LEN.

  Each sentence is encoded with the module-level `tokenizer` (special tokens
  included); examples with more than the module-level `MAX_LEN` token ids are
  removed from both arrays. The number of removed examples is printed.

  Returns:
    A ``(sentences, labels)`` tuple with the over-long examples deleted.
  """
  too_long = [
      position
      for position, text in enumerate(sentences)
      if len(tokenizer.encode(text, add_special_tokens=True)) > MAX_LEN
  ]

  kept_sentences = np.delete(sentences, too_long)
  kept_labels = np.delete(labels, too_long)

  print('{} samples removed.'.format(len(too_long)))

  return kept_sentences, kept_labels

In [None]:
# Downloading tokenizer
# From Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
# The same checkpoint name is used when the classification model is loaded, so
# the vocabulary matches the model's embeddings.
tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")

In [None]:
# Maximum sequence length in tokens (special tokens included); read as a global
# by remove_big and tensor_dataset.
MAX_LEN = 128

# Drop over-long examples from every split; each call prints how many it removed.
train_sentences, train_labels = remove_big(train_sentences, train_labels)
test_sentences, test_labels = remove_big(test_sentences, test_labels)
dev_sentences, dev_labels = remove_big(dev_sentences, dev_labels)

3 samples removed.
0 samples removed.
0 samples removed.


In [None]:
from torch.utils.data import TensorDataset
# Create TensorDatasets for train/dev/test sets
def tensor_dataset(sentences, labels):
  """Encode sentences and bundle them with their labels in a TensorDataset.

  Every sentence is encoded with the module-level `tokenizer`, with special
  tokens added and padding up to the module-level `MAX_LEN`. The per-sentence
  tensors are concatenated along dimension 0.

  Returns:
    ``TensorDataset(input_ids, attention_masks, labels)``.
  """
  encodings = [
      tokenizer.encode_plus(
          text,
          add_special_tokens = True,
          max_length = MAX_LEN,
          pad_to_max_length = True,
          return_attention_mask = True,
          return_tensors = 'pt',     # Return pytorch tensors.
      )
      for text in sentences
  ]

  # Stack the per-sentence tensors into one tensor per field.
  ids_tensor = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
  mask_tensor = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
  label_tensor = torch.tensor(labels)
  return TensorDataset(ids_tensor, mask_tensor, label_tensor)

In [None]:
# Number of examples per batch for all three DataLoaders.
BATCH_SIZE = 8

# Tokenize each split into a TensorDataset of (input_ids, attention_mask, label).
train_dataset = tensor_dataset(train_sentences, train_labels)
test_dataset = tensor_dataset(test_sentences, test_labels)
dev_dataset = tensor_dataset(dev_sentences, dev_labels)

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Create the DataLoaders for train/dev/test sets.
# Training batches are drawn in random order; dev and test are read sequentially.
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = BATCH_SIZE)
validation_dataloader = DataLoader(dev_dataset, sampler = SequentialSampler(dev_dataset), batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler = SequentialSampler(test_dataset), batch_size = BATCH_SIZE)

In [None]:
# Load model with a sequence classification head
# num_labels = 3: one output per class produced by the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "dkleczek/bert-base-polish-uncased-v1", # Polbert - Polish BERT by Darek Kłeczek: https://github.com/kldarek/polbert
    num_labels = 3,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the GPU; as the cell's last expression this also displays
# the module tree.
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(60000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [None]:
import time, datetime
import numpy as np
from tqdm import tqdm
from transformers.optimization import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    `preds` holds one row of class scores per example; `labels` is a numpy
    array of the corresponding class indices.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = predicted == expected
    return np.sum(hits) / len(expected)

# Takes a time in seconds and returns a string hh:mm:ss
def format_time(elapsed):
  """Format a duration given in seconds as an h:mm:ss string.

  The duration is rounded to the nearest whole second first.
  """
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

# Parameters:
epochs = 3
#lr = 1e-3 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
adam_epsilon = 1e-8
WARM_UP = 0  # number of scheduler warm-up steps

optimizer = AdamW(model.parameters(), lr = lr, eps = adam_epsilon)

from transformers import get_linear_schedule_with_warmup
# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = WARM_UP, num_training_steps = total_steps)

# Per-epoch history: mean training loss and validation accuracy.
train_loss_values = []
dev_acc_values = []

model.zero_grad()

t0 = time.time()
for epoch_i in range(0, epochs):  
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')

  # https://github.com/huggingface/transformers/blob/master/examples/run_glue.py
  # lines 168-183
  epoch_train_loss = 0 # Cumulative loss
  loss = 0 ;     batch_loss = 0
  model.train()

  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches.
    # batch_loss still holds the loss of the previous batch at this point.
    if step % 40 == 0 and not step == 0:
      # Elapsed time since training started, formatted as h:mm:ss.
      elapsed = format_time(time.time() - t0)      
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}. Loss: {:.3f}  Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))
    

    batch_loss = 0
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)    

    # clear any previously calculated gradients before backward pass
    optimizer.zero_grad()

    # With `labels=` supplied, the first output element is the loss.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

    loss = outputs[0]
    epoch_train_loss += loss.item()
    batch_loss += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()  # Update learning rate schedule

  epoch_train_loss = epoch_train_loss / len(train_dataloader)          
  train_loss_values.append(epoch_train_loss)
  
  print('Average training loss: {0:.2f}'.format(epoch_train_loss))

  # Evaluation
  # Accumulate the number of correct predictions and the number of examples,
  # so the final (possibly shorter) batch is weighted by its real size instead
  # of counting as a full batch in a mean of per-batch accuracies.
  total_eval_accuracy = 0
  total_eval_examples = 0
  model.eval()

  for batch in validation_dataloader:
    
    input_ids = batch[0].to(device)
    attention_masks = batch[1].to(device)
    labels = batch[2].to('cpu').numpy()
                
    with torch.no_grad():
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

    # Without `labels=` the first output element is the logits tensor.
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    # flat_accuracy is the batch's fraction correct; scaling by the batch size
    # turns it back into a count of correct predictions.
    total_eval_accuracy += flat_accuracy(logits, labels) * len(labels)
    total_eval_examples += len(labels)

  avg_val_accuracy = total_eval_accuracy / total_eval_examples
  # Keep the per-epoch validation accuracy alongside train_loss_values.
  dev_acc_values.append(avg_val_accuracy)
  print("  Accuracy: {0:.4f}".format(avg_val_accuracy))



Training...


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


  Batch    40  of    112. Loss: 0.836  Elapsed: 0:00:09.
  Batch    80  of    112. Loss: 0.699  Elapsed: 0:00:18.
Average training loss: 0.77
  Accuracy: 0.7411

Training...
  Batch    40  of    112. Loss: 0.129  Elapsed: 0:00:34.
  Batch    80  of    112. Loss: 0.936  Elapsed: 0:00:43.
Average training loss: 0.31
  Accuracy: 0.7946

Training...
  Batch    40  of    112. Loss: 0.073  Elapsed: 0:01:00.
  Batch    80  of    112. Loss: 0.005  Elapsed: 0:01:10.
Average training loss: 0.10
  Accuracy: 0.8036


In [None]:
# Run the fine-tuned model over the test set and collect, per example, the
# predicted class, the gold class, and (per batch) the raw logits.
predicted_labels = [] ; true_labels = []; logits_list = []

# Put the model in inference mode explicitly so dropout is disabled even if
# this cell is run without the training cell's own model.eval() having been
# the last mode switch.
model.eval()

for batch in test_dataloader:

  # Each batch is (input_ids, attention_mask, labels); only the model inputs
  # are moved to `device`, labels stay where the DataLoader produced them.
  input_ids = batch[0].to(device)
  attention_masks = batch[1].to(device)
  labels = batch[2]

  # No gradients are needed for inference.
  with torch.no_grad():
      outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)

  # Without `labels=` the first output element is the logits tensor.
  logits = outputs[0]
  logits = logits.detach().cpu().numpy()
  logits_list.append(logits)

  # Predicted class = index of the largest logit in each row.
  predictions = np.argmax(logits, axis=1).flatten()
  labels = labels.numpy().flatten()

  predicted_labels.extend( predictions )
  true_labels.extend( labels )

In [None]:
# Per-class precision / recall / F1 for the test-set predictions collected in
# the previous cell (true_labels vs predicted_labels).
# NOTE(review): the commented-out values below are stale notes -- the training
# cell of this section sets epochs = 3, not 2; the last line is presumably a
# set of scores from an earlier run (TODO confirm or delete).
# Parameters:
#epochs = 2
#lr = 3e-5 # Learning rate (Adam): 5e-5, 3e-5, 2e-5
#adam_epsilon = 1e-8
#WARM_UP = 0
#87 86 87
from sklearn.metrics import classification_report 
# zero_division=0 makes a metric with a zero denominator report 0 without a warning.
print( classification_report(y_true=true_labels, y_pred=predicted_labels, zero_division=0) )

              precision    recall  f1-score   support

           0       0.81      0.88      0.84        33
           1       0.86      0.86      0.86        63
           2       0.62      0.50      0.55        16

    accuracy                           0.81       112
   macro avg       0.76      0.75      0.75       112
weighted avg       0.81      0.81      0.81       112



In [None]:
import os
from transformers import WEIGHTS_NAME, CONFIG_NAME
output_dir = "/content/drive/My Drive/model_bert_finetuned_1_4"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)

# The target directory is not created by torch.save or save_pretrained, so
# make sure it exists first (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

model_to_save = model.module if hasattr(model, 'module') else model
# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
# Last expression of the cell: displays the paths of the written tokenizer files.
tokenizer.save_pretrained(output_dir)

('/content/drive/My Drive/model_bert_finetuned_1_4/vocab.txt',
 '/content/drive/My Drive/model_bert_finetuned_1_4/special_tokens_map.json',
 '/content/drive/My Drive/model_bert_finetuned_1_4/added_tokens.json')

In [None]:
# Step 2: Re-load the saved model and vocabulary
model = BertForSequenceClassification.from_pretrained(output_dir)
# from_pretrained builds the model on the CPU; move it to the device that the
# inference cells send their inputs to, otherwise re-running them fails with a
# device mismatch.
model.to(device)
tokenizer = BertTokenizer.from_pretrained(output_dir)