# **Keyphrase Extraction using BERT Embeddings Method**

In [None]:
!pip install transformers torch seqeval sklearn datasets
!pip install seqeval
!pip install datasets
!pip install transformers

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned 

In [None]:
import os
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from collections import Counter
import re
import torch
from torch.nn import CrossEntropyLoss

# Download data
dataset = load_dataset("midas/kp20k", split="train").select(range(5000))  # استخدام 5000 عينة

# Convert BIO labels to numbers
BIO_MAP = {"O": 0, "B": 1, "I": 2}

# Text cleaning function
def clean_text(text):
    if isinstance(text, list):  # If the text is a list of words
        text = " ".join(text)
    # Remove symbols other than letters
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Divide texts into words
    tokens = text.split()
    return tokens

# Text processing and BIO nomenclature
def preprocess_data(example):
    # تنظيف النصوص
    tokens = clean_text(example['document'])
    tags = example['doc_bio_tags']

    # Make sure the length matches the words and labels
    max_length = 512
    tokens = tokens[:max_length]
    tags = tags[:max_length]

    # Convert labels to numbers
    tags = [BIO_MAP[tag] for tag in tags]

    return {"tokens": tokens, "tags": tags}

# Data processing application
processed_dataset = dataset.map(preprocess_data)

# Oversampling for classes "B" and "I"
def oversample_data(dataset):
    oversampled_data = []
    for example in dataset:
        tags = example["tags"]
        tag_counts = Counter(tags)

        # If it contains a "B" or an "I," repeat it several times
        if tag_counts[BIO_MAP["B"]] > 0 or tag_counts[BIO_MAP["I"]] > 0:
            oversampled_data.extend([example] * 3)  # تكرار 3 مرات
        else:
            oversampled_data.append(example)

    return oversampled_data

oversampled_dataset = oversample_data(processed_dataset)
balanced_dataset = Dataset.from_list(oversampled_dataset)

# Download Tokenizer for SciBERT model
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Convert text and labels to input format for the form
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=512
    )
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word IDs
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore these codes in the calculation
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the conversion function
tokenized_dataset = balanced_dataset.map(tokenize_and_align_labels, batched=True)

# Split data into training and testing
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Download the SciBERT model with classifications setup
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=3,  # عدد الفئات: B, I, O
    id2label={0: "O", 1: "B", 2: "I"},
    label2id={"O": 0, "B": 1, "I": 2}
)

# **Gradual Unfreezing **
# Freezing the first layers (8 layers)
for param in model.bert.encoder.layer[:8].parameters():
    param.requires_grad = False

# Trainer preparation
data_collator = DataCollatorForTokenClassification(tokenizer)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Initial training
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# First stage training
trainer.train()

# ** Unfreeze extra layers (Gradual Unfreezing) **
# Unfreeze the next layers (layers 8-12)
for param in model.bert.encoder.layer[:12].parameters():
    param.requires_grad = True

# Update training settings
training_args.num_train_epochs = 5  # Additional training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Second stage training
trainer.train()

# Model evaluation
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Restore text and labels while discarding filled symbols and reconverting numbers to text labels
reverse_BIO_MAP = {0: "O", 1: "B", 2: "I"}
true_labels = []
predicted_labels = []

for label_list, prediction_list in zip(labels, predictions):
    filtered_true_labels = []
    filtered_predicted_labels = []
    for label, prediction in zip(label_list, prediction_list):
        if label != -100:  # Ignore filled icons
            filtered_true_labels.append(reverse_BIO_MAP[label])
            filtered_predicted_labels.append(reverse_BIO_MAP[prediction])
    true_labels.append(filtered_true_labels)
    predicted_labels.append(filtered_predicted_labels)

# Performance calculation
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("Accuracy:", accuracy_score(true_labels, predicted_labels))
print("Precision:", precision_score(true_labels, predicted_labels))
print("Recall:", recall_score(true_labels, predicted_labels))
print("F1 Score:", f1_score(true_labels, predicted_labels))
print("Classification Report:\n", classification_report(true_labels, predicted_labels))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.63k [00:00<?, ?B/s]

kp20k.py:   0%|          | 0.00/6.53k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


test.jsonl:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.1537,0.152137
2,0.1296,0.130963
3,0.1263,0.126907


Epoch,Training Loss,Validation Loss
1,0.0821,0.095002
2,0.0473,0.060763
3,0.0358,0.044422


Epoch,Training Loss,Validation Loss
1,0.0821,0.095002
2,0.0473,0.060763
3,0.0358,0.044422
4,0.014,0.0389
5,0.0116,0.036633


Accuracy: 0.9908405334096623
Precision: 0.8444717780511138
Recall: 0.9032305218535301
F1 Score: 0.8728633987851391
Classification Report:
               precision    recall  f1-score   support

           _       0.84      0.90      0.87     13682

   micro avg       0.84      0.90      0.87     13682
   macro avg       0.84      0.90      0.87     13682
weighted avg       0.84      0.90      0.87     13682



# **Keyphrase Extraction using Traditional Method**

In [None]:
!pip install transformers torch seqeval sklearn datasets
!pip install seqeval
!pip install datasets
!pip install transformers

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned 

In [None]:
import os
import numpy as np
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from collections import Counter
import re
import torch
from torch.nn import CrossEntropyLoss

# Download data
dataset = load_dataset("midas/kp20k", split="train").select(range(5000))  # استخدام 5000 عينة

# Convert BIO labels to numbers
BIO_MAP = {"O": 0, "B": 1, "I": 2}

# Text cleaning function
def clean_text(text):
    if isinstance(text, list):  # If the text is a list of words
        text = " ".join(text)
    # Remove symbols other than letters
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Divide texts into words
    tokens = text.split()
    return tokens

# Text processing and BIO nomenclature
def preprocess_data(example):
    # تنظيف النصوص
    tokens = clean_text(example['document'])
    tags = example['doc_bio_tags']

    # Make sure the length matches the words and labels
    max_length = 512
    tokens = tokens[:max_length]
    tags = tags[:max_length]

    # Convert labels to numbers
    tags = [BIO_MAP[tag] for tag in tags]

    return {"tokens": tokens, "tags": tags}

# Data processing application
processed_dataset = dataset.map(preprocess_data)

# Oversampling for classes "B" and "I"
def oversample_data(dataset):
    oversampled_data = []
    for example in dataset:
        tags = example["tags"]
        tag_counts = Counter(tags)

        # If it contains a "B" or an "I," repeat it several times
        if tag_counts[BIO_MAP["B"]] > 0 or tag_counts[BIO_MAP["I"]] > 0:
            oversampled_data.extend([example] * 3)  # Repeat 3 times
        else:
            oversampled_data.append(example)

    return oversampled_data

oversampled_dataset = oversample_data(processed_dataset)
balanced_dataset = Dataset.from_list(oversampled_dataset)

# Download Tokenizer for SciBERT model
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

# Convert text and labels to input format for the form
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding="max_length", max_length=512
    )
    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Get word identifiers
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignore these codes in the calculation
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply the conversion function
tokenized_dataset = balanced_dataset.map(tokenize_and_align_labels, batched=True)

# Split data into training and testing
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Download the SciBERT model with classifications setup
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_uncased",
    num_labels=3,  # عدد الفئات: B, I, O
    id2label={0: "O", 1: "B", 2: "I"},
    label2id={"O": 0, "B": 1, "I": 2}
)

# ** Freeze the lower layers (Gradual Unfreezing) **
# Freezing the first layers (8 layers)
for param in model.bert.encoder.layer[:8].parameters():
    param.requires_grad = False

# Trainer preparation (Trainer)
data_collator = DataCollatorForTokenClassification(tokenizer)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Initial training
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# First stage training
trainer.train()

# ** Unfreeze extra layers (Gradual Unfreezing) **
# Unfreeze the next layers (layers 8-12)
for param in model.bert.encoder.layer[:12].parameters():
    param.requires_grad = True

# Update training settings
training_args.num_train_epochs = 5  # Additional training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

# Second stage training
trainer.train()

# Model evaluation
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Restore text and labels while discarding filled symbols and reconverting numbers to text labels
reverse_BIO_MAP = {0: "O", 1: "B", 2: "I"}
true_labels = []
predicted_labels = []

for label_list, prediction_list in zip(labels, predictions):
    filtered_true_labels = []
    filtered_predicted_labels = []
    for label, prediction in zip(label_list, prediction_list):
        if label != -100:  # تجاهل الرموز المعبأة
            filtered_true_labels.append(reverse_BIO_MAP[label])
            filtered_predicted_labels.append(reverse_BIO_MAP[prediction])
    true_labels.append(filtered_true_labels)
    predicted_labels.append(filtered_predicted_labels)

# Performance calculation
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

print("Accuracy:", accuracy_score(true_labels, predicted_labels))
print("Precision:", precision_score(true_labels, predicted_labels))
print("Recall:", recall_score(true_labels, predicted_labels))
print("F1 Score:", f1_score(true_labels, predicted_labels))
print("Classification Report:\n", classification_report(true_labels, predicted_labels))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.63k [00:00<?, ?B/s]

kp20k.py:   0%|          | 0.00/6.53k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


test.jsonl:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/51.6M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Map:   0%|          | 0/13694 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.1437,0.146108
2,0.1127,0.136016
3,0.1021,0.130053


Epoch,Training Loss,Validation Loss
1,0.0826,0.093281
2,0.0468,0.071336
3,0.0272,0.055528
4,0.0168,0.045713
5,0.0154,0.044336


Accuracy: 0.9898644819714069
Precision: 0.8407020364415863
Recall: 0.8919053372183925
F1 Score: 0.8655470878306148
Classification Report:
               precision    recall  f1-score   support

           _       0.84      0.89      0.87     14071

   micro avg       0.84      0.89      0.87     14071
   macro avg       0.84      0.89      0.87     14071
weighted avg       0.84      0.89      0.87     14071

