In [1]:
!pip install evaluate



In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv('/kaggle/input/reskill-dataset-v1/reskill_dataset_v1.csv')

# Get unique skills
all_skills = df['skills'].str.split(', ', expand=True).stack().unique()
unique_skills = {skill: idx for idx, skill in enumerate(all_skills)}

len(unique_skills)

42334

In [2]:
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'skills', '__index_level_0__'],
        num_rows: 8260
    })
    test: Dataset({
        features: ['resume_text', 'skills', '__index_level_0__'],
        num_rows: 2065
    })
})

In [7]:
# from transformers import BertTokenizer

# tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# # Convert the skills into multi-hot encoded vectors
# def tokenize_and_format(examples):
#     tokenized_inputs = tokenizer(
#         examples["resume_text"], 
#         padding="max_length", 
#         truncation=True,
#         max_length=512
#     )
    
#     # Convert skills into multi-hot encoded vectors
#     labels = []
#     for skill_list in examples["skills"]:
#         label = [0] * len(unique_skills)
#         for skill in skill_list.split(", "):
#             if skill in unique_skills:
#                 label[unique_skills[skill]] = 1
#         labels.append(label)
    
#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [6]:
# # Apply the tokenize_and_format function
# tokenized_dataset = dataset.map(tokenize_and_format, batched=True, remove_columns=["__index_level_0__"])

# print(tokenized_dataset)

In [8]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["resume_text"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples[f"skills"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)
    tokenized_inputs["skills"] = labels
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/8260 [00:00<?, ? examples/s]

ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).

In [9]:
tokenized_dataset['train'][69]

{'resume_text': 'about me i am a cloud engineering professional with over 2 years of experience in the it and telecommunications industry. i am currently working at sampath bank plc where i am mainly responsible for migrating the bank s on-prem infrastructure to cloud. i have also worked with several other telecommunications services providers and vendors in sri lanka including dialog axiata plc huawei technologies lanka pvt ltd and sri lanka telecom. i am skilled in a variety of technologies including linux systems cloud computing devops iot and automation etc. i am also an energetic and self-motivated individual who is always eager to develop my knowledge and skills. i am a good leader with proven leadership communication and interpersonal skills. work experince 2023 may present sampath bank plc lead cloud engineer 1. 2. 3. 4. planning and execution of end-to-end cloud migration project of sampath bank. feasibility study and cloud service provider discussions. creation of cloud roadm

In [8]:
print(tokenized_dataset["train"].features)

{'resume_text': Value(dtype='string', id=None), 'skills': Value(dtype='string', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}


In [11]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained("google-bert/bert-base-uncased", num_labels=42334)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    target_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

In [13]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [14]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mabhie7[0m ([33mabhie7-Navrachana University[0m). Use [1m`wandb login --relogin`[0m to force relogin




ValueError: Expected input batch_size (4096) to match target batch_size (338672).

In [21]:
# from transformers import Trainer, TrainingArguments, BertTokenizer, BertForTokenClassification
# from datasets import Dataset, DatasetDict

# # Initialize tokenizer and model
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(unique_skills))

# # Define dataset preprocessing
# def tokenize_and_format(examples):
#     tokenized_inputs = tokenizer(
#         examples["resume_text"], 
#         padding="max_length", 
#         truncation=True,
#         max_length=512
#     )
    
#     labels = []
#     for skill_list in examples["skills"]:
#         label = [0] * len(unique_skills)
#         for skill in skill_list.split(", "):
#             if skill in unique_skills:
#                 label[unique_skills[skill]] = 1
#         labels.append(label)
    
#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

# # Tokenize datasets
# dataset = DatasetDict({
#     'train': train_dataset,
#     'test': test_dataset
# })

# tokenized_dataset = dataset.map(tokenize_and_format, batched=True, remove_columns=["__index_level_0__"])

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/8260 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [22]:
# # Initialize Trainer
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
# )

# # Train the model
# trainer.train()



ValueError: Expected input batch_size (4096) to match target batch_size (338672).