In [6]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [7]:
!pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


pdf to txt

In [17]:
import PyPDF2

# Load and preprocess files
pdf_file1 = 'drive/MyDrive/data_bert/file1.pdf'
pdf_file2 = 'drive/MyDrive/data_bert/file2.pdf'
obj_1 = open(pdf_file1, 'rb')
obj_2 = open(pdf_file2, 'rb')

reader_1 = PyPDF2.PdfReader(obj_1)
reader_2 = PyPDF2.PdfReader(obj_2)

x1, x2 = len(reader_1.pages)-1, len(reader_2.pages)-1
page_1, page_2 = reader_1.pages[x1], reader_2.pages[x2]

text1 = page_1.extract_text()
text2 = page_2.extract_text()

file_text1 = open(r"drive/MyDrive/data_bert//v1.txt", "a")
file_text2 = open(r"drive/MyDrive/data_bert//v2.txt", "a")

file_text1.writelines(text1)
file_text2.writelines(text2)


preprocessing txt 

In [9]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

#download nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

#lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

#Preprocessing


def preprocess(text):
    #remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    #lowercase
    text = text.lower()

    #tokenize
    words = word_tokenize(text)

    #lemmatize => group words of the same lexical field
    words = [lemmatizer.lemmatize(word) for word in words]

    #remove stop words
    words = [word for word in words if word not in stop_words]
    words = [word for word in words if word not in ['•']]

    #join it back
    text = ''.join(words)

    return text


# Read input file
with open('drive/MyDrive/data_bert/v1.txt', 'r') as file:
    v1 = file.read()
with open('drive/MyDrive/data_bert/v2.txt', 'r') as file:
    v2 = file.read()

# Preprocess text
p_v1, p_v2 = preprocess(v1), preprocess(v2)

with open(r"drive/MyDrive/data_bert/p_v1.txt", 'w') as file:
    file.write(p_v1)
with open(r"drive/MyDrive/data_bert/p_v2.txt", 'w') as file:
    file.write(p_v2)

print("Preprocessing complete.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing complete.


In [11]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [32]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import Trainer, TrainingArguments
from transformers.data.processors.squad import SquadV2Processor
from transformers import squad_convert_examples_to_features


def fine_tune_bert_large(model_name, output_dir, train_file=r'drive/MyDrive/data_bert/squad.json', max_seq_length = 384, num_train_epochs = 2,
                         per_device_train_batch_size = 8, learning_rate = 10e-5):
    

    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)

    processor = SquadV2Processor()
    train_examples = processor.get_train_examples(train_file)
    train_features, _ = squad_convert_examples_to_features(
        examples = train_examples,
        tokenizer = tokenizer,
        max_seq_length = max_seq_length,
        doc_stride = 128,
        max_query_length = 64,
        is_training = True
    )

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_attention_masks = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)

    training_args = TrainingArguments(
        output_dir = output_dir,
        overwrite_output_dir = True,
        do_train = True,
        do_eval = False,
        per_device_train_batch_size = per_device_train_batch_size,
        num_train_epochs = num_train_epochs,
        learning_rate = learning_rate,
        save_steps = 10_000,
        save_total_limit = 2,
        logging_dir = output_dir,
        logging_steps = 1_000,
    )

    trainer = Trainer(
            model = model,
            args = training_args,
            train_dataset = torch.utils.data.TensorDataset(all_input_ids, all_attention_masks, all_token_type_ids, 
                                                     all_start_positions, all_end_positions),
            prediction_loss_only = True,
    )

    trainer.train()



In [20]:
from datasets import load_dataset

squad = load_dataset("squad", split="train[:5000]")



In [30]:
import json
from datasets import Dataset

def squad_dataset_to_json(dataset, output_file):
   
    data = dataset.to_dict()
    for key in data.keys():
        data[key] = data[key]

    with open(output_file, 'w') as f:
        json.dump(data, f)
    return output_file

squad_dataset_to_json(squad, 'drive/MyDrive/data_bert/squad.json')

'drive/MyDrive/data_bert/squad.json'

In [33]:
fine_tune_bert_large(model_name='bert-base-uncased', output_dir = 'drive/MyDrive/data_bert/ooutput-finetune')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

NotADirectoryError: ignored