#Necessary installations and Imports

In [1]:
!sudo apt-get install libomp-dev

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/gmihaila/ml_things.git
!pip install sentence-transformers
!pip install datasets
!pip install faiss

In [1]:
import io
import random
import os
import math
import torch
import warnings
from tqdm.notebook import tqdm
from ml_things import plot_dict, fix_text
from transformers import *
from sentence_transformers import util
import numpy as np
import pandas as pd
from datasets import Dataset
import faiss

set_seed(1)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
ROOTPATH = '/content/drive/MyDrive/NlpProject/'

In [3]:
device

device(type='cuda')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Reading the data and making the appropriate text file

In [5]:
def create_train(mode = 'mesra', numberOfTrainData = 100):
  lines = []
  with open(ROOTPATH + 'all_qazals_mesra.txt') as f:
    lines = [ x.strip() for x in f.readlines()]

  texts = []
  if mode=='beyt':
    for i in range(1,len(lines),2):
      texts.append(lines[i-1]+'.'+lines[i])
  elif mode=='mesra':
    texts = lines


  all_texts = '\n'.join(texts[:min(len(lines),numberOfTrainData)])

  io.open(file= ROOTPATH + 'train.txt', mode='w', encoding='utf-8').write(all_texts)
  return texts

In [6]:
texts = create_train(mode = 'mesra', numberOfTrainData = 329707)

#Fine tuning the model

###Helper fuctions and classes

In [7]:
class ModelDataArguments(object):
  def __init__(self, train_data_file=None, 
               line_by_line=False, mlm=False, mlm_probability=0.15, 
               whole_word_mask=False, max_span_length=5,
               block_size=-1, tokenizer_name=None, 
               model_name_or_path=None):
    
    self.train_data_file = train_data_file
    self.line_by_line = line_by_line
    self.mlm = mlm
    self.whole_word_mask = whole_word_mask
    self.mlm_probability = mlm_probability
    self.max_span_length = max_span_length
    self.block_size = block_size
    self.tokenizer_name = tokenizer_name
    self.model_name_or_path = model_name_or_path
    return


def get_model_config(args: ModelDataArguments):
  model_config = AutoConfig.from_pretrained(args.model_name_or_path)
  return model_config


def get_tokenizer(args: ModelDataArguments):
  if args.tokenizer_name:
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

  # Dont go beyond tokenizer maximum length.
  args.block_size = min(args.block_size, tokenizer.model_max_length)

  return tokenizer
  

def get_model(args: ModelDataArguments, model_config):
  if type(model_config) in MODEL_FOR_MASKED_LM_MAPPING.keys():
    return AutoModelForMaskedLM.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=model_config)


def get_dataset(args: ModelDataArguments, tokenizer: PreTrainedTokenizer, evaluate: bool=False):
  file_path = args.train_data_file
  if args.line_by_line:
    return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path,block_size=args.block_size)

  else:
    return TextDataset(tokenizer=tokenizer, file_path=file_path,block_size=args.block_size)

def get_collator(args: ModelDataArguments, model_config: PretrainedConfig,tokenizer: PreTrainedTokenizer):
  return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=args.mlm, mlm_probability=args.mlm_probability,)


###Defining the arguments

In [8]:
model_data_args = ModelDataArguments(
                                    train_data_file= ROOTPATH + 'train.txt', 
                                    line_by_line=True, 
                                    mlm=True,
                                    whole_word_mask=True,
                                    mlm_probability=0.15,
                                    max_span_length=5,
                                    block_size=50, 
                                    tokenizer_name='SajjadAyoubi/distil-bigbird-fa-zwnj', 
                                    model_name_or_path="SajjadAyoubi/distil-bigbird-fa-zwnj", 
                                    )


training_args = TrainingArguments(
                          output_dir=ROOTPATH + 'pretrain_persianbigbird',
                          overwrite_output_dir=True,
                          do_train=True, 
                          per_device_train_batch_size=10,
                          prediction_loss_only=True,
                          learning_rate = 5e-5,
                          weight_decay=0,
                          adam_epsilon = 1e-8,
                          max_grad_norm = 1.0,
                          num_train_epochs = 2,
                          save_steps = -1,
                          )


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


###Loading the model and the tokenizer

In [9]:
print('Loading model configuration...')
config = get_model_config(model_data_args)

print('Loading model`s tokenizer...')
tokenizer = get_tokenizer(model_data_args)

print('Loading actual model...')
model = get_model(model_data_args, config)

model.resize_token_embeddings(len(tokenizer))

Loading model configuration...


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--SajjadAyoubi--distil-bigbird-fa-zwnj/snapshots/98fd06440980957e6428dc823e16d56593fb805c/config.json
Model config BigBirdConfig {
  "_name_or_path": "SajjadAyoubi/distil-bigbird-fa-zwnj",
  "architectures": [
    "BigBirdForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "attention_type": "block_sparse",
  "block_size": 32,
  "bos_token_id": null,
  "classifier_dropout": null,
  "eos_token_id": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 4096,
  "model_type": "big_bird",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "num_random_blocks": 3,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "rescale_embeddings": false,
  "sep_token_id": 3,
  "torch_dtype": "float32",
  "transforme

Loading model`s tokenizer...


loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--SajjadAyoubi--distil-bigbird-fa-zwnj/snapshots/98fd06440980957e6428dc823e16d56593fb805c/vocab.txt
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--SajjadAyoubi--distil-bigbird-fa-zwnj/snapshots/98fd06440980957e6428dc823e16d56593fb805c/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--SajjadAyoubi--distil-bigbird-fa-zwnj/snapshots/98fd06440980957e6428dc823e16d56593fb805c/tokenizer_config.json
loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--SajjadAyoubi--distil-bigbird-fa-zwnj/snapshots/98fd06440980957e6428dc823e16d56593fb805c/pytorch_model.bin


Loading actual model...


All model checkpoint weights were used when initializing BigBirdForMaskedLM.

Some weights of BigBirdForMaskedLM were not initialized from the model checkpoint at SajjadAyoubi/distil-bigbird-fa-zwnj and are newly initialized: ['bert.pooler.weight', 'bert.pooler.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(42000, 768, padding_idx=0)

###Creating train dataset

In [10]:
print('Creating train dataset...')
train_dataset = get_dataset(model_data_args, tokenizer=tokenizer)
data_collator = get_collator(model_data_args, config, tokenizer)

Creating features from dataset file at /content/drive/MyDrive/NlpProject/train.txt


Creating train dataset...


### Training

In [11]:
mode = 'test'
if mode == 'train':
  print('Loading `trainer`...')
  trainer = Trainer(model=model,
                    args=training_args,
                    data_collator=data_collator,
                    train_dataset=train_dataset)

  if training_args.do_train:
    print('Start training...')
    model_path = (model_data_args.model_name_or_path 
                  if model_data_args.model_name_or_path is not None and 
                  os.path.isdir(model_data_args.model_name_or_path) 
                  else None
                  )
    trainer.train(model_path=model_path)
    trainer.save_model()
else:
  model.load_state_dict(torch.load(ROOTPATH + "pretrain_persianbigbird/pytorch_model.bin", map_location=torch.device('cuda')))

In [12]:
len(texts)

329707

#TESTING THE CODE

In [12]:
import faiss
def knn_score(train_set, test_set, n_neighbours=1):
    """
    Calculates the KNN distance
    """
    train_set=train_set.detach().cpu().numpy()
    test_set=test_set.detach().cpu().numpy()
    index = faiss.IndexFlatL2(train_set.shape[1])
    index.add(train_set)
    D, idx_ = index.search(test_set, n_neighbours)
    return idx_
    
def input_to_feature(input_,model):
    text_preprocessed = input_
    encoded = tokenizer.batch_encode_plus([text_preprocessed],max_length=50, padding='max_length', truncation=True)
    encoded = {key:torch.LongTensor(value) for key, value in encoded.items()}
    encoded['input_ids']=encoded['input_ids'].cuda()
    encoded['token_type_ids']=encoded['token_type_ids'].cuda()
    encoded['attention_mask']=encoded['attention_mask'].cuda()
    with torch.no_grad(): 
            outputs = model(**encoded)
    feature_outputs = outputs[0].mean(1)
    return feature_outputs



def training_set_feature_bank(model,Data):
        
    feature_bank=[]
 
    batch_size = 10  
    for idx in range(0, len(Data ), batch_size):
        batch = Data [idx : min(len( Data), idx+batch_size)]
        
        # encoded = tokenizer(batch)
        
        encoded = tokenizer(batch,max_length=50, padding='max_length', truncation=True)
    
        encoded = {key:torch.LongTensor(value) for key, value in encoded.items()}
        encoded['input_ids']=encoded['input_ids'].cuda()
        # encoded['token_type_ids']=encoded['token_type_ids'].cuda()
        encoded['attention_mask']=encoded['attention_mask'].cuda()

        with torch.no_grad():
            
            outputs = model(**encoded)
            
            feature_bank.append(outputs[0].mean(1))

    feature_bank_t=torch.cat(feature_bank)
    return feature_bank_t

In [13]:
docs = random.sample(texts, 15000)
queries = texts[20000:20025]

In [14]:
model.eval()
feature_bank = training_set_feature_bank(model.cuda(), docs)
test_feature_bank = training_set_feature_bank(model, queries)

Attention type 'block_sparse' is not possible if sequence_length: 50 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 352 with config.block_size = 32, config.num_random_blocks = 3. Changing attention type to 'original_full'...


In [15]:
from sklearn.neighbors import NearestNeighbors

def find_most_similar(_test_sentences_embedding, _train_sentences_embedding, _number_of_neighbors):
  knn = NearestNeighbors(n_neighbors=_number_of_neighbors)
  knn.fit(_train_sentences_embedding)
  most_similar = knn.kneighbors(_test_sentences_embedding) 
  return most_similar

In [16]:
most_similar = find_most_similar(test_feature_bank.cpu(), feature_bank.cpu(), 10)

In [17]:
for main_sent_index, main_sent in enumerate(most_similar[1]):
  print("\n مصرع اصلی شماره " + str(main_sent_index) + ":‌ " + queries[main_sent_index])
  for close_sent_index, close_sent in enumerate(main_sent):
    print(str(close_sent_index) + " : " + texts[close_sent])


 مصرع اصلی شماره 0:‌ که نه امشب آن سماعست که دف خلاص یابد
0 : دیدار دوستان که ببینند مرهمست
1 : دوشم آن سنگ دل پریشان داشت
2 : هر کجا هست خدایا به سلامت دارش
3 : چه نصیبت ز بلبل سحرست
4 : چه عذر بخت خود گویم که آن عیار شهرآشوب
5 : دردیست در دلم که ز دیوار بگذرد
6 : در پس آینه طوطی صفتم داشته‌اند
7 : باز ظفر به دست و شکاری نمی‌کنی
8 : بالله کز آفتاب فلک خوبتر شوی
9 : عجب از دیده گریان منت می‌آید

 مصرع اصلی شماره 1:‌ به طپانچه‌ای و بربط برهد به گوشمالی
0 : از جرعه تو خاک زمین در و لعل یافت
1 : مرا به عشق تو اندیشه از ملامت نیست
2 : عقلم اندر زمان نصیحت کرد
3 : ترسم این نکته به تحقیق ندانی دانست
4 : ز فکر آنان که در تدبیر درمانند در مانند
5 : بیا که با سر زلفت قرار خواهم کرد
6 : بی روی چو ماه آن نگارین
7 : صد بار توبه کردم و دیگر نمی‌کنم
8 : هزار یوسف مصری فتاده در چه ماست
9 : گر دوست می‌آید برم یا تیغ دشمن بر سرم

 مصرع اصلی شماره 2:‌ دگر آفتاب رویت منمای آسمان را
0 : که همه عمر دعاگوی و هوادار تو نیست
1 : جام مرصع تو بدین در شاهوار
2 : در خرقه زن آتش که خم ابروی ساقی
3 : که بندگان بنی