In [28]:
#Step 1- Mount Google Drive
from google.colab import drive

# Directory under which the Drive contents become visible to this notebook.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [27]:
#Step 2- Install Transformers
!pip install transformers



In [29]:
#Step 3- Read dataset
%cd /content/gdrive/My Drive/Colab Notebooks/Bert
import torch
from tqdm.notebook import tqdm
import pandas as pd
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification

df = pd.read_excel('dataset.xlsx')
df.head()

/content/gdrive/My Drive/Colab Notebooks/Bert


Unnamed: 0,comment,Label
0,Tüm bunların yanı sıra çalışanların ücretli iz...,HumanResource
1,Çalışanların hafta tatilleri ve resmi tatiller...,HumanResource
2,Ücretsiz izin çerçevesinde iş sözleşmesi askıy...,HumanResource
3,Memurlar hastalandıkları ve bu hastalıklarını ...,HumanResource
4,Bilindiği üzere 657 sayılı Kanunda devlet memu...,HumanResource


In [30]:
#Step 4- Take a look on number of labels
# Number of rows carrying each distinct value of the 'Label' column.
df.Label.value_counts()

HumanResource        511
ProjectManagement    501
Muhasebe             478
Name: Label, dtype: int64

In [31]:
#Step 5- See how many unique value is labels
# Give every distinct label an integer id, numbered in the order unique() returns them.
possible_labels = df.Label.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
label_dict

{'HumanResource': 0, 'Muhasebe': 2, 'ProjectManagement': 1}

In [32]:
#Step 6- Change 'Label' column name with 'Category'
# The textual label moves to 'Category', freeing 'Label' for the numeric id.
df = df.rename({'Label': 'Category'}, axis='columns')
df.head()

In [34]:
#Step 7- Add new column according to 'label dict'
# Series.map performs a plain dictionary lookup per row and yields an integer
# column directly. Series.replace on a string column relies on dtype
# downcasting, which newer pandas versions deprecate.
df['Label'] = df.Category.map(label_dict)
df.head()

In [36]:
#Step 8- Split the dataset as train, validation and test
#train: 63.75 percent of the dataset
#val:   21.25 percent of the dataset
#test:  15 percent of the dataset
from sklearn.model_selection import train_test_split

# First hold out 15% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(df.index.values,
                                                    df.Label.values,
                                                    shuffle=True,
                                                    test_size=0.15,
                                                    random_state=1,
                                                    stratify=None)

# ... then carve the validation set out of the remaining 85%.
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.25,
                                                  random_state=1) # 0.25 x 0.85 = 0.2125

# Tag every row with the split it belongs to.
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.loc[X_test, 'data_type'] = 'test'

# Every row must have landed in exactly one of the three splits.
assert (df.data_type != 'not_set').all()

# Only a cell's last expression is rendered automatically, so the per-class
# split sizes are displayed explicitly before the preview.
display(df.groupby(['Category', 'Label', 'data_type']).count())
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,comment
Category,Label,data_type,Unnamed: 3_level_1
HumanResource,0,test,67
HumanResource,0,train,342
HumanResource,0,val,102
Muhasebe,2,test,79
Muhasebe,2,train,293
Muhasebe,2,val,106
ProjectManagement,1,test,78
ProjectManagement,1,train,314
ProjectManagement,1,val,109


In [38]:
#Step 8- Encode train,val and test data separately 
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-uncased', do_lower_case=True)

MAX_SEQ_LENGTH = 256  # every comment is padded or truncated to this many tokens


def encode_split(split_name):
    """Tokenize the comments of one split and collect their labels.

    Args:
        split_name: value of ``df.data_type`` that selects the rows
            ('train', 'val' or 'test').

    Returns:
        Tuple ``(encoded, labels)``: the tokenizer output (PyTorch tensors
        under 'input_ids' and 'attention_mask') and a tensor built from the
        split's ``Label`` column.
    """
    split_rows = df[df.data_type == split_name]
    encoded = tokenizer.batch_encode_plus(
        split_rows.comment.values.astype('str'),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        truncation=True,       # explicit, so the tokenizer no longer has to warn and guess
        max_length=MAX_SEQ_LENGTH,
        return_tensors='pt'
    )
    return encoded, torch.tensor(split_rows.Label.values)


encoded_data_train, labels_train = encode_split('train')
encoded_data_val, labels_val = encode_split('val')
encoded_data_test, labels_test = encode_split('test')

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
#Step 9- Load pre-trained bert-base-turkish model
# num_labels is set to the number of entries in label_dict; attentions and
# hidden states are not requested in the model output.
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-128k-uncased",
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(label_dict),
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=386.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=740314769.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

In [39]:
#Step 10- Create DataLoaders; each one combines a dataset with a sampler and provides an iterable over that dataset.
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Training batches are drawn in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train, batch_size=batch_size, sampler=train_sampler)

# Validation and test batches follow the dataset order.
val_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(dataset_val, batch_size=batch_size, sampler=val_sampler)

test_sampler = SequentialSampler(dataset_test)
dataloader_test = DataLoader(dataset_test, batch_size=batch_size, sampler=test_sampler)

In [40]:
#Step 11- Create Optimizer and Scheduler
from transformers import get_linear_schedule_with_warmup

# torch.optim.AdamW is used instead of the AdamW class exported by
# transformers, which that library deprecated in favour of the PyTorch one.
# weight_decay=0.0 keeps the transformers default (PyTorch's default is 0.01),
# so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=1e-5,
                              eps=1e-8,
                              weight_decay=0.0)

epochs = 5

# Linear decay of the learning rate over all optimisation steps, no warm-up.
# The scheduler is stepped once per batch, hence batches-per-epoch * epochs.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [41]:
#Step 12- Define performance metrics
import numpy as np  # imported here so the helpers do not depend on a later cell's import
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    Args:
        preds: array of per-class scores with one row per sample; the
            predicted class is the arg-max along axis 1.
        labels: array of true class ids.

    Returns:
        The F1 score computed with ``average='weighted'``.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in ``labels``, correct/total counts.

    Args:
        preds: array of per-class scores with one row per sample; the
            predicted class is the arg-max along axis 1.
        labels: array of true class ids; each id must be a value of the
            global ``label_dict``.
    """
    # Map numeric ids back to their category names for display.
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        # Restrict to the samples whose true class is `label`.
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [21]:
#Step 13- In order for torch to use the GPU, we need to identify and specify the GPU as the device
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [23]:
#Step 14- Tell pytorch to run this model on the GPU.
# When CUDA is not available the model is left where it is.
if torch.cuda.is_available():
    model.cuda()

In [24]:
#Step 15- Training
import random
import numpy as np

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
seed_val = 17
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

def evaluate(dataloader_val):
    """Run the global ``model`` over a dataloader in evaluation mode.

    Args:
        dataloader_val: sized iterable of batches; each batch is indexed as
            input ids, attention mask and labels, in that order.

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the mean of the
        per-batch losses, then the logits and the labels of all batches
        concatenated along axis 0 as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for raw_batch in dataloader_val:
        on_device = [tensor.to(device) for tensor in raw_batch]
        input_ids, attention_mask, labels = on_device[0], on_device[1], on_device[2]

        # No gradients are recorded during the forward pass.
        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

        # The output is indexed as (loss, logits).
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(labels.cpu().numpy())

    return (
        running_loss / len(dataloader_val),
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )
    
# Fine-tune for `epochs` passes over the training data, saving a checkpoint
# and reporting validation metrics after every epoch.
import os

# torch.save does not create missing directories, so make sure the
# checkpoint folder exists before the first epoch finishes.
os.makedirs('models', exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        # With labels supplied, the first output element is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        # Show the batch loss as-is. `batch` is the (input_ids, mask, labels)
        # tuple, so dividing by len(batch) would divide by 3 regardless of
        # the batch size, and the loss is already averaged over the batch.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})
         
        
    torch.save(model.state_dict(), f'models/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=317.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.48594321819736214
Validation loss: 0.33391700398159335
F1 Score (Weighted): 0.911363589273412


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=317.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.11932963163100553
Validation loss: 0.26179007196827475
F1 Score (Weighted): 0.9495705080683421


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=317.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.044963104187379435
Validation loss: 0.2802848092739949
F1 Score (Weighted): 0.9527472716091263


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=317.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.020742356524529517
Validation loss: 0.25882253504192393
F1 Score (Weighted): 0.9495849034558178


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=317.0, style=ProgressStyle(description_widt…


Epoch 5
Training loss: 0.011693407562454834
Validation loss: 0.2653369741453372
F1 Score (Weighted): 0.9526892840772968



In [49]:
#Step 16- Load and evaluate the Model
# Build a fresh model with the same configuration used for training.
model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-128k-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=len(label_dict),
)
model.to(device)

# Read the epoch-5 checkpoint onto the CPU, then copy its weights into the model.
checkpoint_path = 'models/finetuned_BERT_epoch_5.model'
state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(state_dict)

# Score the held-out test split; the loss returned first is not used here.
_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)
print("F1 SCORE:")
f1_score_func(predictions, true_vals)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificatio

Class: HumanResource
Accuracy: 66/67

Class: ProjectManagement
Accuracy: 73/78

Class: Muhasebe
Accuracy: 73/79

F1 SCORE:


0.9462983370066118