## Setup: device check and pip installs

In [None]:
import torch

# Choose the torch device used by the rest of the notebook.
if not torch.cuda.is_available():
    # No CUDA runtime: everything runs on the CPU.
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    # CUDA is usable: report how many GPUs exist and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
    device = torch.device("cuda")

No GPU available, using the CPU instead.


In [None]:
%%capture
!pip install sentencepiece
!pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read and written.
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [None]:
import warnings
# NOTE(review): this silences every warning category from here on, which can
# hide real problems (deprecations, numerical issues) — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

from transformers import *


We will use Weights & Biases for tracking experiments and runs. Project page: https://wandb.ai/tasmiah-tahsin/fake-news-blurr

## Function definitions

In [None]:
def calc_len(sent):
  """Return the number of whitespace-separated words in ``sent``."""
  return len(sent.split())

    

In [None]:
def calc_len_wtokenizer(sent):
  """Return how many token ids the global ``tokenizer_bert`` yields for ``sent``.

  The count includes the special tokens added by the tokenizer, because
  ``add_special_tokens=True`` is passed to ``encode``.
  """
  return len(tokenizer_bert.encode(sent, add_special_tokens=True))

In [None]:
def mT5_summarize(str_w):
  """Summarize ``str_w`` with the global mT5 ``model`` and ``tokenizer``.

  The text is whitespace-normalized, tokenized (truncated / padded to 512
  tokens) and passed to ``model.generate`` with beam search. The first
  generated sequence is decoded and returned.

  Args:
    str_w: The article text to summarize (a string).

  Returns:
    The decoded summary as a string, with special tokens removed.
  """
  # Local import keeps the function usable regardless of which cells have run.
  import re

  # Collapse every run of whitespace (newlines included) into a single space.
  # A raw string avoids the invalid "\s" escape sequence in a normal literal.
  article_text = re.sub(r'\s+', ' ', str_w.strip())

  input_ids = tokenizer(
      [article_text],
      return_tensors="pt",
      padding="max_length",
      truncation=True,
      max_length=512
  )["input_ids"]

  output_ids = model.generate(
      # Send the inputs to whichever device the model lives on; a hard-coded
      # .cuda() fails on CPU-only runtimes.
      input_ids=input_ids.to(model.device),
      max_length=480,
      min_length=360,
      no_repeat_ngram_size=2,
      num_beams=4
  )[0]

  summary = tokenizer.decode(
      output_ids,
      skip_special_tokens=True,
      clean_up_tokenization_spaces=False
  )

  return summary
    

## Load dataset

In [None]:
# Location of the dataset CSV on the mounted Google Drive.
path1 = '/content/drive/MyDrive/datasets/final_latest1.csv'

In [None]:
# Load the dataset into a DataFrame; later cells read and overwrite its 'text' column.
split1 = pd.read_csv(path1)

In [None]:
# Show (rows, columns) of the loaded dataset as a quick sanity check.
split1.shape

(10016, 2)

## Code for summarization

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Hugging Face checkpoint id of the summarization model.
model_name = "csebuetnlp/mT5_multilingual_XLSum"
# Load the matching tokenizer and seq2seq model; mT5_summarize reads both as globals.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/730 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/config.json
Model config MT5Config {
  "_name_or_path": "csebuetnlp/mT5_multilingual_XLSum",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.6,
  "max_length": 84,
  "model_type": "mt5",
  "no_repeat_ngram_size": 2,
  "num_beams": 4,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tok

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/spiece.model
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/config.json
Model config MT5Config {
  "_name_or_path": "csebuetnlp/mT5_multilingual_XLSum",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_f

Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--csebuetnlp--mT5_multilingual_XLSum/snapshots/2437a524effdbadc327ced84595508f1e32025b3/pytorch_model.bin
All model checkpoint weights were used when initializing MT5ForConditionalGeneration.

All the weights of MT5ForConditionalGeneration were initialized from the model checkpoint at csebuetnlp/mT5_multilingual_XLSum.
If your task is similar to the task the model of the checkpoint was trained on, you can already use MT5ForConditionalGeneration for predictions without further training.


In [None]:
# Move the summarization model onto the device chosen at the top of the notebook.
model.to(device)

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(250112, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (w

In [None]:
from transformers import BertTokenizer, AutoTokenizer

# Load the tokenizer of the Bangla fake-news mBERT checkpoint.
# calc_len_wtokenizer uses it (as the global tokenizer_bert) to count tokens.
print('Loading BERT tokenizer...')
tokenizer_bert = AutoTokenizer.from_pretrained('Tahsin-Mayeesha/bangla-fake-news-mbert')

Loading BERT tokenizer...


Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--Tahsin-Mayeesha--bangla-fake-news-mbert/snapshots/b9a6a1c334d68ccec965cb44e5bf62bf38dedad3/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--Tahsin-Mayeesha--bangla-fake-news-mbert/snapshots/b9a6a1c334d68ccec965cb44e5bf62bf38dedad3/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--Tahsin-Mayeesha--bangla-fake-news-mbert/snapshots/b9a6a1c334d68ccec965cb44e5bf62bf38dedad3/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--Tahsin-Mayeesha--bangla-fake-news-mbert/snapshots/b9a6a1c334d68ccec965cb44e5bf62bf38dedad3/tokenizer_config.json


**Main Code Section**

In [None]:
# Replace every over-long article in split1['text'] with its mT5 summary.
# Use start and end to control how much of the dataset you want to augment.
# The indices printed below show how far the run has been saved to your Google Drive.
start = 0
end = len(split1)
sum_count = 0

# Rows at or below this many tokenizer_bert tokens already fit and are left untouched.
MAX_TOKENS = 512
# Write a checkpoint after this many newly summarized rows.
CHECKPOINT_EVERY = 50
checkpoint_path = '/content/drive/MyDrive/datasets/final_latest1.csv'

# For every sentence...
for i in range(start, end):
  sent = split1['text'][i]
  if calc_len_wtokenizer(sent) <= MAX_TOKENS:
    continue

  split1.at[i, 'text'] = mT5_summarize(sent)
  sum_count += 1

  # Checkpoint on the number of summarized rows, not on the row index: the
  # index test is only reached for summarized rows, so it fired erratically.
  if sum_count % CHECKPOINT_EVERY == 0:
    split1.to_csv(checkpoint_path, index=False)
    print(i)

# Final save so summaries produced after the last checkpoint are not lost.
split1.to_csv(checkpoint_path, index=False)
print("successfully done. sum count: ", sum_count)

Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors


950
1050
1250
1300
1350
1550
1600
1650
1700
1750
1800
2000
2050
2250
2300
2350
2500
2600
2750
2800
2850
2950
3200
3350
3400
3450
3550
3600
3800
3850
3900
4000
4100
4200
4250
4300


In [None]:
# Report the index of every row whose token count still exceeds the limit,
# then print how many there are and the largest token count seen.
sum_count = 0
max_all = 512  # threshold above which a row is reported
max_len = 512  # running maximum; starts at the threshold, so it is never below 512
for itr, sent in enumerate(split1['text']):
  leng = calc_len_wtokenizer(sent)
  if leng > max_all:
    print(itr)
    sum_count += 1
  max_len = max(max_len, leng)


print(sum_count, max_len)

4301
4302
4303
4308
4309
4310
4313
4314
4315
4317
4319
4320
4321
4322
4323
4324
4326
4327
4328
4329
4330
4331
4332
4333
4335
4336
4337
4341
4342
4344
4345
4346
4348
4349
4353
4354
4356
4357
4358
4362
4365
4366
4367
4368
4369
4370
4371
4374
4375
4376
4377
4379
4380
4381
4382
4383
4389
4390
4393
4394
4399
4403
4407
4409
4411
4412
4413
4414
4415
4416
4418
4419
4420
4421
4423
4424
4425
4428
4429
4432
4433
4434
4435
4446
4448
4449
4450
4454
4456
4457
4458
4461
4462
4465
4466
4473
4474
4476
4477
4478
4479
4480
4482
4483
4484
4486
4487
4488
4489
4491
4492
4493
4494
4495
4496
4497
4498
4502
4503
4511
4516
4517
4521
4522
4524
4525
4526
4531
4532
4533
4534
4535
4537
4541
4542
4544
4546
4547
4550
4551
4553
4556
4558
4559
4561
4562
4563
4564
4565
4566
4571
4572
4573
4574
4577
4578
4579
4580
4581
4582
4586
4589
4591
4592
4593
4594
4595
4596
4598
4600
4601
4602
4603
4604
4605
4608
4609
4610
4611
4612
4613
4624
4629
4631
4632
4633
4634
4635
4636
4637
4639
4640
4643
4645
4647
4648
4649
4653
4657
4658


In [None]:
# Write the (partly summarized) dataset back to the same CSV it was loaded from.
split1.to_csv('/content/drive/MyDrive/datasets/final_latest1.csv', index=False)

# Resources

* fastai paper: https://arxiv.org/pdf/2002.04688.pdf
* [BERT Fine-Tuning Tutorial with PyTorch · Chris McCormick](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
* [multilingual bert](https://huggingface.co/bert-base-multilingual-cased)
* https://github.com/cdpierse/transformers-interpret
* https://blog.dataiku.com/the-learning-rate-finder-technique-how-reliable-is-it