In [43]:
# !pip install torch transformers pandas scikit-learn accelerate

# %pip install --upgrade transformers torch accelerate

%pip show transformers torch accelerate

Name: transformers
Version: 4.44.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /Users/abhiraj/Espresso/Developer_Stuff/projects/skill-prediction-model/.venv/lib/python3.12/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 
---
Name: torch
Version: 2.4.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /Users/abhiraj/Espresso/Developer_Stuff/projects/skill-prediction-model/.venv/lib/python3.12/site-packages
Requires: filelock, fsspec, jinja2, n

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# load dataset
df = pd.read_csv('dataset/reskill_dataset_v1.csv')

# split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_df.head()

Unnamed: 0,resume_text,skills
2821,mobile +61 469 778 024 email rahulsharma9431@g...,"amazon web services (aws), azure sentinel, car..."
1963,anupama b. pradeep anupamapradeep111@gmail.com...,"salesforce.com, salesforce, sfd, asp.net, boom..."
7,nikit juneja c l o u d e n g i n e e r s e n i...,"informatica, powercenter, teradata, oracle, un..."
693,ayman salaheldeen adam mohammed it system admi...,"microsoft azure, azure virtual machines, azure..."
1429,job description role devops engineer duration ...,"git, msbuild, maven, nuget, tsqlt, selenium, a..."


In [2]:
# extract skills from the dataset
all_skills = df['skills'].apply(lambda x: [skill.strip() for skill in x.split(',')])

# flatten the list of lists and find unique skills
unique_skills = set(skill for sublist in all_skills for skill in sublist)

unique_skills = list(unique_skills)

print(f"Unique skills: {unique_skills[:10]}")
print(f"Number of unique skills: {len(unique_skills)}")

Unique skills: ['mcas', 'apollo/relay', 'query variables', 'bluetooth low energy', 'pstn/isdn', 'nconf', 'ci/cd tools (bamboo', 'dirsearch', 'togaf', 'computer systems']
Number of unique skills: 18857


In [3]:
from transformers import BertTokenizer

# load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(df):
    return tokenizer(
        df['resume_text'].tolist(),
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

# tokenize the training and validation data
train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)

train_encodings

  from .autonotebook import tqdm as notebook_tqdm


{'input_ids': tensor([[  101,  4684,  1009,  ...,  7621,  1998,   102],
        [  101,  2019,  6279,  ...,  5604,  1998,   102],
        [  101, 23205,  4183,  ...,  6521,  2000,   102],
        ...,
        [  101,  7187,  3748,  ...,  2449,  3208,   102],
        [  101,  8882, 19300,  ..., 27390,  2229,   102],
        [  101,  6670,  7556,  ...,  5896,  2241,   102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [12]:
import torch
from sklearn.preprocessing import MultiLabelBinarizer

# initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# fit and transform the labels
train_labels = mlb.fit_transform(train_df['skills'].apply(lambda x: [skill.strip() for skill in x.split(',')]))
val_labels = mlb.transform(val_df['skills'].apply(lambda x: [skill.strip() for skill in x.split(',')]))

class ResumeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)  # convert skills to tensor
        return item

    def __len__(self):
        return len(self.labels)

# create dataset objects
train_dataset = ResumeDataset(train_encodings, train_labels)
val_dataset = ResumeDataset(val_encodings, val_labels)



In [22]:
train_dataset[10]

{'input_ids': tensor([  101,  1051,  1038,  1046,  1041,  1039,  1056,  1045,  1058,  1041,
          4813,  3076,  2007,  2051,  2968,  1998, 12317,  2764, 22676,  1998,
          6951,  2083,  3454,  2007,  2204,  3834,  4281,  2003,  6224,  2000,
          2707,  1037,  2476,  1012,  9657,  2551,  2007,  8146,  7578,  8578,
          1998,  3716,  3085,  1998, 10326,  4619,  2007,  9373,  4824,  1998,
          2070,  6742,  3325,  1012,  1039,  1051,  1050,  1056,  1037,  1039,
          1056,  5754,  1012,  2703, 24096,  1030,  5840,  2575,  2620, 24434,
         17788,  2692,  2475, 20917,  4014,  1012,  4012, 16770,  1013,  1013,
          7479,  1012,  4957,  2063, 11586,  1012,  4012,  1013,  1999,  1013,
          5754,  1011,  2984,  1011,  2703,  1011,  6365,  2692, 16086,  2683,
         16932,  2620,  1013,  2171,  5754,  2984,  2703,  5907,  2931, 10662,
          2796,  1041,  1040,  1057,  1039,  1037,  1056,  1045,  1051,  1050,
          5065,  2015,  2297,  1011,  2

In [23]:
from transformers import BertForTokenClassification

# load the BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(unique_skills))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.train_losses = []
        self.eval_losses = []

    def on_epoch_end(self):
        train_loss = self.state.log_history[-1]['loss']
        eval_loss = self.evaluate(self.eval_dataset)['eval_loss']

        self.train_losses.append(train_loss)
        self.eval_losses.append(eval_loss)

        predictions, labels, _ = self.predict(self.eval_dataset)
        preds = predictions.argmax(axis=2)
        accuracy = (preds == labels).mean()

        print(f"Train Loss: {train_loss:.4f}, Validation Loss: {eval_loss:.4f}, Accuracy: {accuracy:.4f}")

In [25]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='model/test_v1',
    evaluation_strategy='epoch',
    logging_dir='model/test_v1/logs',
    logging_steps=10,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)



In [26]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  0%|          | 0/1520 [18:29<?, ?it/s]
  0%|          | 0/1520 [17:03<?, ?it/s]
  0%|          | 0/1520 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 8.74 GB, other allocations: 194.67 MB, max allowed: 9.07 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(trainer.train_losses, label='Train Loss')
plt.plot(trainer.eval_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()