In [58]:
import pandas as pd
import torch
import os

from torch.utils.data import Dataset
from transformers import (BertTokenizer, BertForSequenceClassification, Trainer,
                          TrainingArguments, BertPreTrainedModel)
# from simpletransformers.language_modeling import LanguageModelingModel
from sklearn.metrics import accuracy_score, f1_score


In [59]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_properties(0))

True
NVIDIA GeForce RTX 3060
_CudaDeviceProperties(name='NVIDIA GeForce RTX 3060', major=8, minor=6, total_memory=12050MB, multi_processor_count=28)


In [60]:
MODEL_NAME = '/home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert'
TRAIN_FILE_LOC = '/home/abdullah/Code/dl/bnlp-resources/sentiment/SAIL_data/BN_data_train.tsv'
TEST_FILE_LOC = '/home/abdullah/Code/dl/bnlp-resources/sentiment/SAIL_data/BN_data_test.tsv'
EVAL_FILE_LOC = '/home/abdullah/Code/dl/bnlp-resources/sentiment/SAIL_data/BN_data_dev.tsv'


In [61]:
def tsv_to_text(tsv_file_loc):
    file_name = tsv_file_loc.split("/")[-1].split(".")[0]
    txt_name = tsv_file_loc.replace(".tsv", ".txt")
    txt_name = txt_name.replace("split_merged", "texts")

    if os.path.exists(txt_name):
        return [txt_name, file_name]

    df = pd.read_csv(tsv_file_loc, sep="\t")

    for txt in df["text"]:
        with open(txt_name, "a", encoding="utf8") as f:
            f.writelines(txt + "\n")
    return [txt_name, file_name]


tsv_to_text(TRAIN_FILE_LOC)


['/home/abdullah/Code/dl/bnlp-resources/sentiment/SAIL_data/BN_data_train.txt',
 'BN_data_train']

In [62]:
def tsv_to_df(csv_file_loc):
    df = pd.read_csv(csv_file_loc, sep='\t')
    # remove id column
    df = df.drop(columns=['id'])

    # replace neutral with 0, positive with 1, negative with 2 in class_label column
    df.loc[df['class'] == 'BN_NEU', 'class'] = 0
    df.loc[df['class'] == 'BN_POS', 'class'] = 1
    df.loc[df['class'] == 'BN_NEG', 'class'] = 2

    return df


train_df = tsv_to_df(TEST_FILE_LOC)

texts = train_df['text'].tolist()
print(train_df)


                                                  text class
0    'পকেটে শখানেক টাকা ... আর এই মুহুর্তে দিনব্যাপ...     2
1    'শেষ পর্যায়ে এসে গল্পটা এভাবে ভুল পথে মোড় না...     2
2    'সংসদেও ঘুমালেন সমাজকল্যাণ মন্ত্রী  http://t.c...     2
3    'সীমান্তে নারী ও শিশুসহ আটক ৮ http://t.co/Fz3d...     2
4    'আমিও মানুষ ভালা না, মনে মনে শুয়োরের বাচ্চা বল...     2
..                                                 ...   ...
199  'আবের সঙ্গে সৌজন্য সাক্ষাৎ করেছেন খালেদা জিয়া...     1
200  'ভৈরব নদ সংস্কার ও খননের দাবিতে জনউদ্যোগের মান...     1
201  'আলো টেলিফিল্মটা দেখলাম। মাঝেমধ্যে পর্যাপ্ত পর...     1
202      '@arif_rony2 তারপরে ও দেখতে পেলে খুব মজা পাই'     1
203  'চল কাবার পানে ওহে মুহাজির,পেতে প্রেম শুধা আল্...     1

[204 rows x 2 columns]


In [63]:
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }


In [64]:
def f1_calculator(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  f1 = f1_score(labels, preds, average='weighted')
  return {
      'f1': f1,
  }


In [65]:
train_list = tsv_to_text(TRAIN_FILE_LOC)
test_list = tsv_to_text(TEST_FILE_LOC)
eval_list = tsv_to_text(EVAL_FILE_LOC)


In [66]:
os.environ["WANDB_DISABLED"] = "true"

train_df = tsv_to_df(TRAIN_FILE_LOC)
test_df = tsv_to_df(TEST_FILE_LOC)
eval_df = tsv_to_df(EVAL_FILE_LOC)


class TINYDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(value[idx])
                for key, value in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

train_encodings = tokenizer(train_df['text'].tolist(
), padding=True, truncation=True, max_length=512)
test_encodings = tokenizer(test_df['text'].tolist(
), padding=True, truncation=True, max_length=512)
eval_encodings = tokenizer(eval_df['text'].tolist(
), padding=True, truncation=True, max_length=512)

train_dataset = TINYDataset(train_encodings, train_df['class'].tolist())
test_dataset = TINYDataset(test_encodings, test_df['class'].tolist())
eval_dataset = TINYDataset(eval_encodings, eval_df['class'].tolist())

model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3).to('cuda')
model.manual_seed = 14


Didn't find file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/added_tokens.json. We won't load it.
Didn't find file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/tokenizer.json. We won't load it.
loading file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/vocab.txt
loading file None
loading file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/special_tokens_map.json
loading file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/tokenizer_config.json
loading file None
loading configuration file /home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert/config.json
Model config BertConfig {
  "_name_or_path": "google/bert_uncased_L-2_H-128_A-2",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
  

## 1 Epoch Train

In [67]:
# training_args = TrainingArguments(
#     output_dir=f"temp",
#     num_train_epochs=1,
#     per_device_train_batch_size=48,
#     per_device_eval_batch_size=96,
#     warmup_steps=500,
#     learning_rate=5e-5,
#     weight_decay=0.01,
#     overwrite_output_dir=True,
#     logging_dir=f"temp/logs",
#     logging_steps=15,
#     save_steps=15,
#     load_best_model_at_end=True,
#     evaluation_strategy="steps",
#     seed=14,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset

# )

# trainer.train()


using `logging_steps` to initialize `eval_steps` to 15
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 697
  Num Epochs = 1
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 15
                                      
  0%|          | 0/15 [03:27<?, ?it/s]         ***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 132.7965, 'learning_rate': 1.5e-06, 'epoch': 1.0}



                                      
[A                                            

  0%|          | 0/15 [03:27<?, ?it/s]
[A
[ASaving model checkpoint to temp/checkpoint-15
Configuration saved in temp/checkpoint-15/config.json


{'eval_loss': 145.7812957763672, 'eval_runtime': 0.0146, 'eval_samples_per_second': 6722.191, 'eval_steps_per_second': 137.188, 'epoch': 1.0}


Model weights saved in temp/checkpoint-15/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from temp/checkpoint-15 (score: 145.7812957763672).
                                      
100%|██████████| 15/15 [00:01<00:00, 13.32it/s]

{'train_runtime': 1.1286, 'train_samples_per_second': 617.573, 'train_steps_per_second': 13.291, 'train_loss': 132.7965087890625, 'epoch': 1.0}





TrainOutput(global_step=15, training_loss=132.7965087890625, metrics={'train_runtime': 1.1286, 'train_samples_per_second': 617.573, 'train_steps_per_second': 13.291, 'train_loss': 132.7965087890625, 'epoch': 1.0})

## 100 Epoch with 1 epoch model

In [69]:
MODEL_NAME = '/home/abdullah/Code/dl/499A/best_models/1/sail_sentiment'
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=3).to('cuda')
model.manual_seed = 14

for name, param in model.named_parameters():
    if 'bert.encoder.layer.0' in name:
        # print(name, param.requires_grad)
        param.requires_grad = False
# print("\n\n\n")
# for name, param in model.named_parameters():
#     print(name, param.requires_grad)

training_args = TrainingArguments(
    output_dir=f"temp",
    num_train_epochs=100,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=96,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    overwrite_output_dir=True,
    logging_dir=f"temp/logs",
    logging_steps=400,
    save_steps=400,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    seed=14,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=f1_calculator,

)

trainer.train()

# print(cool.metrics)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,

)

trainer.train()


loading configuration file /home/abdullah/Code/dl/499A/best_models/1/sail_sentiment/config.json
Model config BertConfig {
  "_name_or_path": "/home/abdullah/Code/dl/499A/best_models/epoch_3_merged_dataset_tinybert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 14,
  "intermediate_size": 512,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_

{'loss': 95.1685, 'learning_rate': 4e-05, 'epoch': 26.67}



                                      

[A[A                               
  0%|          | 0/15 [04:27<?, ?it/s]            
[A
[ASaving model checkpoint to temp/checkpoint-400
Configuration saved in temp/checkpoint-400/config.json
Model weights saved in temp/checkpoint-400/pytorch_model.bin


{'eval_loss': 52.47710418701172, 'eval_f1': 0.2783075178320948, 'eval_runtime': 0.0168, 'eval_samples_per_second': 5821.874, 'eval_steps_per_second': 118.814, 'epoch': 26.67}


                                      
  0%|          | 0/15 [04:33<?, ?it/s]            ***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 65.2994, 'learning_rate': 3.5e-05, 'epoch': 53.33}



                                      

[A[A                               
  0%|          | 0/15 [04:33<?, ?it/s]            
[A
[ASaving model checkpoint to temp/checkpoint-800
Configuration saved in temp/checkpoint-800/config.json


{'eval_loss': 25.14168930053711, 'eval_f1': 0.3366629323118066, 'eval_runtime': 0.0141, 'eval_samples_per_second': 6934.956, 'eval_steps_per_second': 141.53, 'epoch': 53.33}


Model weights saved in temp/checkpoint-800/pytorch_model.bin
                                      
  0%|          | 0/15 [04:39<?, ?it/s]             ***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 53.1563, 'learning_rate': 1.5e-05, 'epoch': 80.0}



                                      

[A[A                               
  0%|          | 0/15 [04:39<?, ?it/s]             
[A
[ASaving model checkpoint to temp/checkpoint-1200
Configuration saved in temp/checkpoint-1200/config.json


{'eval_loss': 22.037702560424805, 'eval_f1': 0.42408220185997964, 'eval_runtime': 0.0147, 'eval_samples_per_second': 6669.725, 'eval_steps_per_second': 136.117, 'epoch': 80.0}


Model weights saved in temp/checkpoint-1200/pytorch_model.bin
  0%|          | 0/15 [04:42<?, ?it/s]


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from temp/checkpoint-1200 (score: 22.037702560424805).

100%|██████████| 1500/1500 [00:21<00:00, 68.88it/s]
***** Running training *****
  Num examples = 697
  Num Epochs = 100
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 1
  Total optimization steps = 1500


{'train_runtime': 21.7818, 'train_samples_per_second': 3199.918, 'train_steps_per_second': 68.865, 'train_loss': 66.87862174479167, 'epoch': 100.0}


 27%|██▋       | 400/1500 [00:05<00:15, 71.50it/s]***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 50.1589, 'learning_rate': 4e-05, 'epoch': 26.67}



 27%|██▋       | 400/1500 [00:05<00:15, 71.50it/s]Saving model checkpoint to temp/checkpoint-400
Configuration saved in temp/checkpoint-400/config.json


{'eval_loss': 21.600048065185547, 'eval_accuracy': 0.3469387755102041, 'eval_runtime': 0.0216, 'eval_samples_per_second': 4540.544, 'eval_steps_per_second': 92.664, 'epoch': 26.67}


Model weights saved in temp/checkpoint-400/pytorch_model.bin
 53%|█████▎    | 800/1500 [00:14<00:08, 81.05it/s]***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 47.3478, 'learning_rate': 3.5e-05, 'epoch': 53.33}



 53%|█████▎    | 800/1500 [00:14<00:08, 81.05it/s]Saving model checkpoint to temp/checkpoint-800
Configuration saved in temp/checkpoint-800/config.json


{'eval_loss': 16.171661376953125, 'eval_accuracy': 0.42857142857142855, 'eval_runtime': 0.0148, 'eval_samples_per_second': 6619.03, 'eval_steps_per_second': 135.082, 'epoch': 53.33}


Model weights saved in temp/checkpoint-800/pytorch_model.bin
 80%|████████  | 1200/1500 [00:21<00:04, 73.47it/s]***** Running Evaluation *****
  Num examples = 98
  Batch size = 96


{'loss': 44.5711, 'learning_rate': 1.5e-05, 'epoch': 80.0}



 80%|████████  | 1200/1500 [00:21<00:04, 73.47it/s]Saving model checkpoint to temp/checkpoint-1200
Configuration saved in temp/checkpoint-1200/config.json


{'eval_loss': 13.91440486907959, 'eval_accuracy': 0.3877551020408163, 'eval_runtime': 0.0166, 'eval_samples_per_second': 5891.552, 'eval_steps_per_second': 120.236, 'epoch': 80.0}


Model weights saved in temp/checkpoint-1200/pytorch_model.bin
100%|█████████▉| 1493/1500 [00:27<00:00, 76.20it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from temp/checkpoint-1200 (score: 13.91440486907959).
100%|██████████| 1500/1500 [00:27<00:00, 55.24it/s]

{'train_runtime': 27.1635, 'train_samples_per_second': 2565.942, 'train_steps_per_second': 55.221, 'train_loss': 46.58281770833333, 'epoch': 100.0}





TrainOutput(global_step=1500, training_loss=46.58281770833333, metrics={'train_runtime': 27.1635, 'train_samples_per_second': 2565.942, 'train_steps_per_second': 55.221, 'train_loss': 46.58281770833333, 'epoch': 100.0})