# WikiBert2WikiBert MOdel

## Tokenization and Preprocess

Installing required modules and libraries

In [1]:
%%capture
!pip install --upgrade pip
!pip install sentencepiece
!pip install transformers
!pip install datasets

In [2]:
%%capture
!rm seq2seq_trainer.py
!rm seq2seq_training_args.py
!wget https://raw.githubusercontent.com/huggingface/transformers/main/examples/legacy/seq2seq/seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/main/examples/legacy/seq2seq/seq2seq_training_args.py


In [3]:
%%capture
!pip install git-python==1.0.3
!pip install rouge_score
!pip install sacrebleu

In [4]:
%%capture
!pip install Rouge
!pip install evaluate
!pip install bert_score

Downloading pre-trained models (if applicable).

In [5]:
%%capture
!gdown --id 1gxfnXVUGETAMxMLUHdQ6ACheRm1MAk9v #download five-epoch model WikiBert
# !gdown --id 1dJMTzIRchSBChMLPSVUOBwCQedezQJdX #download four-epoch model WikiBert
# !gdown --id 10D-n04YqO9V_i7MAjTVxB7-29KpjJeBS #download three-epoch-model-wikibert
# !gdown 1Mt7eYw6j2qruCPmC-DYhUEbCrzprdh2O #Download two-epoch-model-wikibert
# !gdown 1--jm_GIOdOYn4ezJwE8553A6C2rNQog- #Download one-epoch-model-Wikibert
!gdown 1-8KABMqzZM0qb3myrauhUrhWE013v1mj #download Dataset pn-summary
!gdown --id 1-1OpFWHdQOzxnzatGet4M-CImr-76Gaz #download Dataset bbc-dataset


Importing Libraries

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import datasets
from sklearn.model_selection import train_test_split
from datasets import Dataset

from seq2seq_trainer import Seq2SeqTrainer
from seq2seq_training_args import Seq2SeqTrainingArguments

import torch

from evaluate import load
from rouge import Rouge 

Downloading our base model which will be fine-tuned.

In [8]:
from transformers import (
    BertTokenizerFast,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    BertConfig
)

model_name = 'm3hrdadfi/bert2bert-fa-wiki-summary'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
config = EncoderDecoderConfig.from_pretrained(model_name)
model = EncoderDecoderModel.from_pretrained(model_name, config=config)

Downloading tokenizer_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/3.61k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

Reading the dataset(it can be either bbc dataset or pn-summary dataset).

In [9]:
path_to_dataset = "/content/pn-summary.csv" # pn-summary dataset
# path_to_dataset = "/content/bbc_data.csv"
data_df = pd.read_csv(path_to_dataset)

In [10]:
data_df.head()

Unnamed: 0,News,Summarization,Summarization_Length,News_Length
0,به گزارش شانا، علی کاردر امروز (۲۷ دی ماه) در ...,مدیرعامل شرکت ملی نفت، عملکرد مدیریت امور بین‎...,39,245
1,به گزارش شانا به نقل از شرکت ملی صنایع پتروشیم...,سرپرست مدیریت برنامه‌ریزی و توسعه شرکت ملی صنا...,28,379
2,به گزارش شانا به نقل از شرکت پالایش گاز شهید ه...,پالایشگاه گاز خانگیران با هدف معرفی گوگرد بنتو...,23,325
3,به گزارش خبرنگار ایمنا، سعید نظری در صفحه اینس...,سخنگوی شورای شهر شیراز گفت: روند عمرانی و شهرس...,25,210
4,به گزارش شانا، سیدباقر مرتضوی، مشاور وزیر نفت ...,مشاور وزیر نفت و مدیرکل اچ اس یی و پدافند غیرع...,41,440


Converting pandas Dataframe to a Dataset Object

In [11]:
data_df = data_df[["News", "Summarization"]]
train_df, test_df = train_test_split(data_df, test_size=0.05)

train_dataset = Dataset.from_dict(train_df)
test_dataset = Dataset.from_dict(test_df)

dataset = datasets.DatasetDict({"train":train_dataset, "test": test_dataset})

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['News', 'Summarization'],
        num_rows: 88546
    })
    test: Dataset({
        features: ['News', 'Summarization'],
        num_rows: 4661
    })
})

In [13]:
rouge = datasets.load_metric("rouge")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Converting the target summaries and the news into features using tokenizer.

In [14]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["News"], max_length=512, truncation=True)
    
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["Summarization"], max_length=256, truncation=True)
        
    return {"input_ids": input_encodings["input_ids"], 
           "attention_mask": input_encodings["attention_mask"], 
           "labels": target_encodings["input_ids"]}

dataset_tf = dataset.map(convert_examples_to_features, batched=True)

  0%|          | 0/89 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [15]:
dataset_tf

DatasetDict({
    train: Dataset({
        features: ['News', 'Summarization', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 88546
    })
    test: Dataset({
        features: ['News', 'Summarization', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4661
    })
})

In [16]:
columns = ["input_ids", "labels", "attention_mask"]
dataset_tf.set_format(type="torch", columns=columns)

Dynamically padding the data using Data collator(will be passed into the trainer)

In [17]:
from transformers import DataCollatorForSeq2Seq
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Setting the training Arguments.

In [18]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir='wikibert2wikibert', num_train_epochs=1, warmup_steps=500, 
                                  per_device_train_batch_size=4, per_device_eval_batch_size=1, 
                                  weight_decay=0.01, logging_steps=10, push_to_hub=False, 
                                  evaluation_strategy='steps', eval_steps=500, save_steps=1e6, 
                                  gradient_accumulation_steps=16)

Moving the model on to the GPU.

In [38]:
device = torch.device("cuda")
model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

## First Epoch

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator, 
                  train_dataset=dataset_tf["train"], 
                  eval_dataset=dataset_tf["test"])

trainer.train()

The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 74601
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 4662


Step,Training Loss,Validation Loss
500,3.0381,2.810192


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1


Step,Training Loss,Validation Loss
500,3.0381,2.810192
1000,2.7444,2.562197
1500,2.5618,2.414572
2000,2.4424,2.321525
2500,2.2402,2.246251
3000,2.2948,2.166971
3500,2.1471,2.111527
4000,2.2957,2.075541
4500,2.24,2.05314


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  B

TrainOutput(global_step=4662, training_loss=2.5043685032236174, metrics={'train_runtime': 10665.4972, 'train_samples_per_second': 6.995, 'train_steps_per_second': 0.437, 'total_flos': 2.5273480404907776e+16, 'train_loss': 2.5043685032236174, 'epoch': 1.0})

In [None]:
torch.save(model, "//content/drive/MyDrive/Arshad/NLP/Project Models/model_wikibert_one_epoch.bin")

## Second Epoch

In [None]:
model = torch.load("/content/model_wikibert_one_epoch.bin")

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator, 
                  train_dataset=dataset_tf["train"], 
                  eval_dataset=dataset_tf["test"])

trainer.train()

The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 74601
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 4662


Step,Training Loss,Validation Loss
500,2.1007,1.930838
1000,2.1532,1.981867
1500,2.081,1.957588
2000,1.9527,1.940817
2500,2.0564,1.896281
3000,1.9677,1.871665
3500,1.9951,1.841687
4000,1.8074,1.815098


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  B

Step,Training Loss,Validation Loss
500,2.1007,1.930838
1000,2.1532,1.981867
1500,2.081,1.957588
2000,1.9527,1.940817
2500,2.0564,1.896281
3000,1.9677,1.871665
3500,1.9951,1.841687
4000,1.8074,1.815098
4500,1.8012,1.795329


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3927
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4662, training_loss=1.9809004528130754, metrics={'train_runtime': 11041.2622, 'train_samples_per_second': 6.757, 'train_steps_per_second': 0.422, 'total_flos': 2.5281571946981184e+16, 'train_loss': 1.9809004528130754, 'epoch': 1.0})

In [None]:
torch.save(model, "/content/drive/MyDrive/model_wikibert_two_epoch.bin")

## Third Epoch

In [None]:
model = torch.load("/content/model_wikibert_two_epoch.bin")

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator, 
                  train_dataset=dataset_tf["train"], 
                  eval_dataset=dataset_tf["test"])

trainer.train()

The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 52147
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 3259


Step,Training Loss,Validation Loss
500,3.1514,3.113261
1000,3.1296,2.945707
1500,2.9215,2.838193


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  B

Step,Training Loss,Validation Loss
500,3.1514,3.113261
1000,3.1296,2.945707
1500,2.9215,2.838193
2000,2.7771,2.762127
2500,2.7268,2.703403
3000,2.7962,2.663822


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: Summarization, News. If Summarization, News are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=3259, training_loss=2.9806972209863702, metrics={'train_runtime': 9171.7167, 'train_samples_per_second': 5.686, 'train_steps_per_second': 0.355, 'total_flos': 2.6709540348255936e+16, 'train_loss': 2.9806972209863702, 'epoch': 1.0})

In [None]:
torch.save(model, "/content/drive/MyDrive/model_wikibert_three_epoch.bin")

## Fourth Epoch

In [None]:
model = torch.load( "/content/model_wikibert_three_epoch.bin")

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator, 
                  train_dataset=dataset_tf["train"], 
                  eval_dataset=dataset_tf["test"])

trainer.train()

The following columns in the training set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 52147
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 16
  Total optimization steps = 3259


Step,Training Loss,Validation Loss
500,2.613,2.515038


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1


Step,Training Loss,Validation Loss
500,2.613,2.515038
1000,2.5516,2.543291
1500,2.639,2.505314
2000,2.5525,2.467169
2500,2.5411,2.435625
3000,2.4629,2.404697


The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  Batch size = 1
The following columns in the evaluation set don't have a corresponding argument in `EncoderDecoderModel.forward` and have been ignored: News, Summarization. If News, Summarization are not expected by `EncoderDecoderModel.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2745
  B

TrainOutput(global_step=3259, training_loss=2.546912772408948, metrics={'train_runtime': 9285.2371, 'train_samples_per_second': 5.616, 'train_steps_per_second': 0.351, 'total_flos': 2.6708383728777216e+16, 'train_loss': 2.546912772408948, 'epoch': 1.0})

In [None]:
torch.save(model, "/content/drive/MyDrive/model_wikibert_fourth_epoch.bin")

### Fifth Epoch

In [20]:
model = torch.load("/content/model_wikibert_fourth_epoch.bin")

In [None]:
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer, 
                  data_collator=seq2seq_data_collator, 
                  train_dataset=dataset_tf["train"], 
                  eval_dataset=dataset_tf["test"])

trainer.train()

In [None]:
torch.save(model, "/content/drive/MyDrive/model_wikibert_fivth_epoch.bin")

## Evaluation

In [None]:
model = torch.load("/content/model_wikibert_fivth_epoch.bin")

In [None]:
torch.no_grad()

<torch.autograd.grad_mode.no_grad at 0x7f651a2433d0>

Generating Summaries to evaluate the model

In [30]:
def generate_summary(batch):
    # cut off at BERT max length 512
    inputs = tokenizer(batch["News"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred_summary"] = output_str

    return batch

In [31]:
test_set_sample = Dataset.from_dict(test_dataset[1000:1500])

In [None]:
torch.cuda.empty_cache()

In [None]:
batch_size = 4  # change to 64 for full evaluation

# results = test_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
results = test_set_sample.map(generate_summary, batched=True, batch_size=batch_size)

  0%|          | 0/125 [00:00<?, ?ba/s]



In [None]:
num = 497
print(results['pred_summary'][num])
print(results['Summarization'][num])


دو طرح دانشبنیان در پژوهشگاه پلیمر و پتروشیمی برای درمان زخم پوستی تولید شده است. این دو طرح در نمایشگاه صنایع نساجی و پلاستیک رونمایی شد. این طرح با حضور رییس پژوهشگاه صنعت ، معدن و تجارت و جمعی از محققان این پژوهشگاه به صورت ویديو کنفرانس و با حضور وزیر علوم ، تحقیقات و فناوری به نمایش گذاشته شد.
با برگزاری مراسمی از دو طرح دانش‌بنیان محققان پژوهشگاه پلیمر و پتروشیمی رونمایی شد.


Computing The rouge Score

In [None]:
rouge = Rouge()
scores = rouge.get_scores(results["pred_summary"], results["Summarization"])

In [None]:
rough_dic = {}
rough_dic['rouge-1'] = []
rough_dic['rouge-2'] = []
rough_dic['rouge-l'] = []
for score in scores:
  for k,v in score.items():
    rough_dic[k].append(v['f'])

In [None]:
rough_df = pd.DataFrame(rough_dic)
rough_df.mean()

rouge-1    0.355140
rouge-2    0.156543
rouge-l    0.309140
dtype: float64

Computing Bert Score

In [None]:
bertscore = load("bertscore")
results = bertscore.compute(predictions=results["pred_summary"], references=results["Summarization"], lang="fa")

Downloading builder script:   0%|          | 0.00/7.79k [00:00<?, ?B/s]

https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpknpeplzj


Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpvqiakuwv


Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
creating metadata file for /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidde

Downloading vocab.txt:   0%|          | 0.00/972k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
creating metadata file for /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/

Downloading pytorch_model.bin:   0%|          | 0.00/681M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
creating metadata file for /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
loading weights file https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
All the weights of BertModel were initialized from the model checkpoint at bert-base-multilingual-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further t

In [None]:
bert_df = pd.DataFrame(results)

In [None]:
bert_df = bert_df[['precision','recall', 'f1']]
bert_df.mean()

precision    0.728616
recall       0.784571
f1           0.755035
dtype: float64

## Publishing the model!

In [None]:
!pip install huggingface_hub

In [23]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
model.push_to_hub("WikiBert2WikiBert")

Cloning https://huggingface.co/Arashasg/WikiBert2WikiBert into local empty directory.


Upload file pytorch_model.bin:   0%|          | 3.33k/1.32G [00:00<?, ?B/s]

In [None]:
tokenizer.push_to_hub("WikiBert2WikiBert")

In [None]:
model.save_pretrained("/content/drive/MyDrive/Arshad/NLP/Project Models/WikiBert2WikiBert")

Configuration saved in /content/drive/MyDrive/Arshad/NLP/Project Models/WikiBert2WikiBert/config.json
Model weights saved in /content/drive/MyDrive/Arshad/NLP/Project Models/WikiBert2WikiBert/pytorch_model.bin


In [27]:
from transformers import (
    BertTokenizerFast,
    EncoderDecoderConfig,
    EncoderDecoderModel,
    BertConfig
)

model_name = 'Arashasg/WikiBert2WikiBert'
tokenizer = BertTokenizerFast.from_pretrained(model_name)
config = EncoderDecoderConfig.from_pretrained(model_name)
model = EncoderDecoderModel.from_pretrained(model_name, config=config)

Downloading vocab.txt:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]