In [1]:
# Install the third-party packages imported below (pdfplumber, tiktoken).
# NOTE(review): versions are unpinned, and `%pip install` is the form that targets the
# running kernel's environment — consider switching and pinning.
!pip install pdfplumber
!pip install tiktoken

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m64.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading p

In [35]:
import os
import pdfplumber
from tqdm import tqdm
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizerFast
from torchinfo import summary

In [3]:
# Load the pretrained fast tokenizer for the "bert-base-uncased" checkpoint; the cells
# below call it to turn each resume into fixed-length chunks of token ids.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A single string. Pages for which ``extract_text()`` gives a falsy
        result (``None`` or ``""``) are left out; if no page has text the
        result is the empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extraction has to happen while the document is still open.
        extracted_pages = [page.extract_text() for page in pdf.pages]

    return "\n".join(page_text for page_text in extracted_pages if page_text)

In [5]:
def load_all_resumes(single_dir_abs_path):
    """Extract the text of every PDF found anywhere under a directory.

    The tree is walked in sorted order (sub-directories and file names), so
    the returned list has the same order on every run and every filesystem.

    Args:
        single_dir_abs_path: Directory to search recursively. A path that does
            not exist simply yields an empty list, because ``os.walk`` produces
            nothing for it.

    Returns:
        A list with one text string per PDF (matched case-insensitively on the
        ``.pdf`` suffix). PDFs whose extracted text is empty or whitespace-only
        are skipped.
    """
    documents = []

    for root, dirs, files in os.walk(single_dir_abs_path):
        # os.walk reports entries in arbitrary filesystem order. Sorting `dirs`
        # in place steers the traversal itself; sorting `files` fixes the order
        # within each directory.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            text = extract_text_from_pdf(os.path.join(root, file))
            if text.strip():
                documents.append(text)

    return documents

In [6]:
# Base directory of the resume dataset; process_resumes_per_category joins a
# sub-directory name onto this path before loading its PDFs.
root_dir = "/kaggle/input/resume-dataset/data/data"

In [7]:
def process_resumes_per_category(single_dir):
    """Load all resumes stored under ``root_dir/<single_dir>``.

    Args:
        single_dir: Name of a sub-directory of the module-level ``root_dir``.

    Returns:
        A ``(single_dir, documents)`` tuple, where ``documents`` is whatever
        ``load_all_resumes`` returns for that sub-directory. The name is
        echoed back so a caller can tell which directory a result belongs to.
    """
    category_path = os.path.join(root_dir, single_dir)
    documents = load_all_resumes(category_path)
    return single_dir, documents

In [None]:
# Disabled cell: the code is wrapped in a bare string literal, so none of it runs.
# It submits process_resumes_per_category for every entry of root_dir to a thread pool
# and collects the results into data_dict.
# NOTE(review): if this is re-enabled, `single_dir` in the `except` branch is only bound
# by an earlier successful `.result()` call, so the message can name the wrong directory
# (or raise NameError when the very first future fails).
"""
data_dict = dict()
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    
    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]
    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):
        try:
            single_dir, resumes_raw_text_list = single_pool.result()
            data_dict[single_dir] = resumes_raw_text_list
        except Exception as e:
            print(f"Error processing {single_dir}: {e}")
"""

In [None]:
# Disabled cell (bare string literal, nothing runs): writes data_dict to
# "data_dict.pkl" with pickle.
# NOTE(review): presumably this produced the pickle that is loaded below — confirm.
"""
with open("data_dict.pkl","wb") as file_handle:
    pickle.dump(data_dict,file_handle)
"""

In [8]:
# Load the cached mapping from a pickle file; the cells below iterate it as
# {label: iterable of resume texts}.
# NOTE(review): pickle.load can execute arbitrary code — only load a file you trust.
with open("/kaggle/input/proprocessed-data-pickle-file/data_dict.pkl","rb") as file_handle:
    data_dict = pickle.load(file_handle)

In [9]:
# Token length of one chunk: passed as `max_length` to the tokenizer by the
# training/testing data generators.
bert_base_context_len = 512

In [10]:
# Flatten the {label: [resume texts]} mapping into two parallel lists — one entry
# per resume — and wrap them in a two-column DataFrame.
resume_text = [
    single_resume_text
    for category_resumes in data_dict.values()
    for single_resume_text in category_resumes
]
label = [
    category
    for category, category_resumes in data_dict.items()
    for _ in category_resumes
]

data = pd.DataFrame(data={"Resume Text": resume_text, "Label": label})

In [11]:
# Give every key of data_dict a consecutive integer id (0, 1, 2, ...) in key order.
labels2idx = {category: idx for idx, category in enumerate(data_dict.keys())}

In [12]:
# Preview the first rows of the assembled DataFrame.
data.head()

Unnamed: 0,Resume Text,Label
0,"PRE-PRESS GRAPHIC DESIGNER\nSummary\nCreative,...",DESIGNER
1,PRINCIPLE DESIGNER / OWNER\nProfessional Summa...,DESIGNER
2,PROJECT DESIGNER\nSummary\nTeam-oriented and c...,DESIGNER
3,INTERIOR DESIGNER\nSummary\nA results oriented...,DESIGNER
4,PRESENTATION DESIGNER\nSummary\nCustomer Servi...,DESIGNER


In [13]:
# Shuffle the rows with a fixed seed: the train/test split further down is taken by
# row position (first 2000 rows vs. the rest), so an unseeded shuffle would put
# different resumes in each split on every run.
SHUFFLE_SEED = 42
idxes = np.random.default_rng(SHUFFLE_SEED).permutation(data.shape[0])

# Chain reset_index instead of mutating the iloc result in place, so re-running this
# cell always rebuilds shuffled_data from `data`.
shuffled_data = data.iloc[idxes].reset_index(drop=True)

In [14]:
# Display the shuffled DataFrame to check that the labels are now interleaved.
shuffled_data

Unnamed: 0,Resume Text,Label
0,SALES\nSummary\nDedicated security enforcement...,SALES
1,TIMESHARE SALES\nSummary\nI am extremely confi...,SALES
2,SIMULATOR TECHNICIAN\nSummary\nExperienced Ele...,AVIATION
3,SENIOR DIGITAL PRODUCER/MULTIMEDIA SPECIALIST\...,DIGITAL-MEDIA
4,ACCOUNTANT\nHighlights\nMicrosoft Office : Int...,ACCOUNTANT
...,...,...
2478,"TEACHER\nSummary\nHighly enthusiasticÂ ,motiva...",TEACHER
2479,NEW BUSINESS DEVELOPMENT MANAGER\nSummary\nBUS...,BUSINESS-DEVELOPMENT
2480,ACCOUNTANT I\nSummary\nFlexible A ccountant wh...,ACCOUNTANT
2481,CUSTOMER SERVICE REP\nCareer Focus\nTo find a ...,AVIATION


In [15]:
# Find the largest number of overlapping chunks that any single resume is split into.
# The chunk length comes from bert_base_context_len (rather than a second hard-coded
# 512) so this scan always agrees with the data generators below.
max_chunks = 0

for single_resume_text in resume_text:
    chunked_encoded_text = tokenizer(text=single_resume_text,max_length=bert_base_context_len,
                                     truncation=True,return_overflowing_tokens=True,
                                     stride=256,return_tensors="pt",padding="max_length")

    # One row of input_ids per chunk, so its length is the chunk count.
    max_chunks = max(max_chunks, len(chunked_encoded_text["input_ids"]))

In [16]:
# Report the largest per-resume chunk count.
print(max_chunks)

25


In [17]:
# Pull the shuffled columns out as {column name: Series}.
# NOTE(review): this rebinds `data_dict`, which earlier held {label: [resume texts]};
# re-running the cells that build `data` / `labels2idx` after this one will see the
# column mapping instead. Consider a separate name.
data_dict = {column: shuffled_data[column] for column in shuffled_data.columns}
resume_text, label = data_dict["Resume Text"], data_dict["Label"]

In [18]:
def training_data_generator():
    """Yield ``(input_ids, label_id)`` pairs for the first 2000 entries.

    Each entry of ``resume_text[0:2000]`` is tokenized into chunks of
    ``bert_base_context_len`` tokens with a stride of 256 (overflow kept,
    padded to the full length, returned as PyTorch tensors). Only the
    ``"input_ids"`` field of the encoding is yielded, paired with the label's
    integer id from ``labels2idx`` as a tensor.
    """
    tokenizer_options = dict(
        max_length=bert_base_context_len,
        truncation=True,
        return_overflowing_tokens=True,
        stride=256,
        return_tensors="pt",
        padding="max_length",
    )
    train_pairs = zip(resume_text[0:2000], label[0:2000])

    for single_resume_text, category in train_pairs:
        encoding = tokenizer(text=single_resume_text, **tokenizer_options)
        yield encoding["input_ids"], torch.tensor(labels2idx[category])

In [19]:
def testing_data_generator():
    """Yield ``(input_ids, label_id)`` pairs for every entry from index 2000 on.

    Each entry of ``resume_text[2000:]`` is tokenized into chunks of
    ``bert_base_context_len`` tokens with a stride of 256 (overflow kept,
    padded to the full length, returned as PyTorch tensors). Only the
    ``"input_ids"`` field of the encoding is yielded, paired with the label's
    integer id from ``labels2idx`` as a tensor.
    """
    tokenizer_options = dict(
        max_length=bert_base_context_len,
        truncation=True,
        return_overflowing_tokens=True,
        stride=256,
        return_tensors="pt",
        padding="max_length",
    )
    held_out_pairs = zip(resume_text[2000:], label[2000:])

    for single_resume_text, category in held_out_pairs:
        encoding = tokenizer(text=single_resume_text, **tokenizer_options)
        yield encoding["input_ids"], torch.tensor(labels2idx[category])

In [26]:
class SingleAttentionHead(torch.nn.Module):
    """One scaled dot-product attention head.

    Queries, keys and values are each projected (without bias) to ``sha_dim``
    features; the output is ``softmax(Q K^T / sqrt(sha_dim)) V``.

    Inputs may be unbatched ``(seq, dim)`` or carry leading batch dimensions
    ``(..., seq, dim)``: every operation acts on the last two axes.

    Args:
        query_key_embedding_dim: Feature size of the query and key inputs.
        value_embedding_dim: Feature size of the value input.
        sha_dim: Output feature size of this head.
        masked: If true, a query position may only attend to key positions at
            or before its own index (causal attention).
        is_dropout: If true, dropout is applied to the attention weights.
        dropout_probability: Dropout rate used when ``is_dropout`` is true.
    """

    def __init__(self,query_key_embedding_dim,value_embedding_dim,sha_dim,masked,is_dropout,
                dropout_probability):
        super().__init__()

        self.sha_dim = sha_dim
        self.masked = masked
        self.is_dropout = is_dropout

        self.query_projection_layer = torch.nn.Linear(in_features=query_key_embedding_dim,
                                                     out_features=sha_dim,bias=False)
        self.key_projection_layer = torch.nn.Linear(in_features=query_key_embedding_dim,
                                                   out_features=sha_dim,bias=False)
        if self.is_dropout:
            self.single_head_attn_mask_dropout = torch.nn.Dropout(p=dropout_probability)

        self.value_projection_layer = torch.nn.Linear(in_features=value_embedding_dim,
                                                     out_features=sha_dim,bias=False)
        # Normalise over the key axis (the last one) so it is correct for both
        # unbatched and batched score tensors.
        self.softmax_activation = torch.nn.Softmax(dim=-1)

    def forward(self,query_embedding,key_embedding,value_embedding):
        """Return the attention-weighted values, shape ``(..., query_len, sha_dim)``."""

        projected_query = self.query_projection_layer(query_embedding)
        projected_key = self.key_projection_layer(key_embedding)
        projected_value = self.value_projection_layer(value_embedding)

        # Swap only the last two axes of the keys so leading batch axes are kept.
        # The scale is a plain Python float, which avoids creating a CPU tensor
        # on every call (that would clash with inputs living on another device).
        query_key_similarity_search = torch.matmul(
            projected_query, projected_key.transpose(-2, -1)
        ) / (self.sha_dim ** 0.5)

        if self.masked:
            # Future positions must get -inf, not 0: a score of 0 still receives
            # exp(0) weight after the softmax and would leak future tokens.
            query_len = query_key_similarity_search.shape[-2]
            key_len = query_key_similarity_search.shape[-1]
            future_positions = torch.triu(
                torch.ones(query_len, key_len, device=query_key_similarity_search.device),
                diagonal=1,
            ).bool()
            query_key_similarity_search = query_key_similarity_search.masked_fill(
                future_positions, float("-inf")
            )

        query_key_soft_search = self.softmax_activation(query_key_similarity_search)

        if self.is_dropout:
            query_key_soft_search = self.single_head_attn_mask_dropout(query_key_soft_search)

        weighted_attn_embedding = torch.matmul(query_key_soft_search,projected_value)

        return weighted_attn_embedding

In [25]:
class MultiHeadAttentionLayer(torch.nn.Module):
    """Multi-head attention: parallel heads, concatenated and linearly projected.

    ``num_attn_heads`` instances of ``SingleAttentionHead`` each produce
    ``value_embedding_dim // num_attn_heads`` features; their outputs are
    concatenated along the feature axis and passed through a bias-free linear
    layer of size ``value_embedding_dim``.

    Args:
        query_key_embedding_dim: Feature size of the query and key inputs.
        value_embedding_dim: Feature size of the value input and of the output.
        num_attn_heads: Number of attention heads.
        masked: Forwarded to every head (causal masking on/off).
        is_dropout: Enables dropout inside the heads and on the layer output.
        dropout_probability: Dropout rate used when ``is_dropout`` is true.

    Raises:
        ValueError: If ``value_embedding_dim`` is not divisible by
            ``num_attn_heads`` (the concatenated heads would not match the
            output projection).
    """

    def __init__(self,query_key_embedding_dim,value_embedding_dim,num_attn_heads,masked,
                is_dropout,dropout_probability):
        super().__init__()

        if value_embedding_dim % num_attn_heads != 0:
            raise ValueError(
                f"value_embedding_dim ({value_embedding_dim}) must be divisible by "
                f"num_attn_heads ({num_attn_heads})"
            )

        sha_dim = value_embedding_dim//num_attn_heads

        # ModuleList (not a plain list) registers every head as a sub-module, so
        # their weights appear in .parameters(), follow .to(device) and are
        # included in the state dict.
        self.attn_heads = torch.nn.ModuleList(
            SingleAttentionHead(query_key_embedding_dim,value_embedding_dim,
                                sha_dim,masked,is_dropout,dropout_probability)
            for _ in range(num_attn_heads)
        )

        self.mha_projection_layer = torch.nn.Linear(in_features=value_embedding_dim,
                                                   out_features=value_embedding_dim,bias=False)
        self.is_dropout = is_dropout

        if self.is_dropout:
            self.mha_dropout_layer = torch.nn.Dropout(p=dropout_probability)

    def forward(self,query_embedding,key_embedding,value_embedding):
        """Run all heads on the same inputs and merge their outputs."""

        attn_heads_weighted_embeddings = [
            single_attn_head(query_embedding,key_embedding,value_embedding)
            for single_attn_head in self.attn_heads
        ]

        # Concatenate on the feature (last) axis so batched inputs work too.
        mha_concatenated_embeddings = torch.cat(attn_heads_weighted_embeddings,dim=-1)
        mha_output = self.mha_projection_layer(mha_concatenated_embeddings)

        if self.is_dropout:
            mha_output = self.mha_dropout_layer(mha_output)

        return mha_output

In [24]:
class EncoderLayer(torch.nn.Module):
    """Transformer encoder layer: self-attention block followed by a feed-forward block.

    Two residual arrangements are supported:

    * post-norm (``is_pre_norm=False``): ``h = LN(x + MHA(x))``, ``out = LN(h + FFN(h))``
    * pre-norm (``is_pre_norm=True``): ``h = x + MHA(LN(x))``, ``out = h + FFN(LN(h))``

    Args:
        input_embedding_dim: Feature size of the input and output.
        num_attn_heads: Number of heads in the (unmasked) self-attention.
        is_dropout: Enables dropout inside attention and after each block.
        dropout_probability: Dropout rate used when ``is_dropout`` is true.
        is_pre_norm: Selects the pre-norm arrangement instead of post-norm.
        ffn_projection_dim: Hidden size of the feed-forward block.
        ffn_activation: One of ``"relu"``, ``"sigmoid"``, ``"tanh"``, ``"gelu"``.

    Raises:
        KeyError: If ``ffn_activation`` is not one of the supported names.
    """

    def __init__(self,input_embedding_dim,num_attn_heads,is_dropout,dropout_probability,
                is_pre_norm,ffn_projection_dim,ffn_activation):
        super().__init__()

        activation_functions = {
            "relu": torch.nn.ReLU,
            "sigmoid": torch.nn.Sigmoid,
            "tanh": torch.nn.Tanh,
            "gelu": torch.nn.GELU
        }

        self.is_dropout = is_dropout
        self.is_pre_norm = is_pre_norm

        self.mha_layer = MultiHeadAttentionLayer(input_embedding_dim,input_embedding_dim,
                                                num_attn_heads,False,is_dropout,dropout_probability)
        self.first_layer_norm = torch.nn.LayerNorm(input_embedding_dim)

        if is_dropout:
            self.first_dropout_layer = torch.nn.Dropout(p=dropout_probability)

        self.ffn_inner_layer = torch.nn.Linear(in_features=input_embedding_dim,
                                              out_features=ffn_projection_dim)
        self.ffn_inner_activation = activation_functions[ffn_activation]()
        self.ffn_output_layer = torch.nn.Linear(in_features=ffn_projection_dim,
                                               out_features=input_embedding_dim)
        self.second_layer_norm = torch.nn.LayerNorm(input_embedding_dim)

        if is_dropout:
            self.second_dropout_layer = torch.nn.Dropout(p=dropout_probability)


    def forward(self,input_embedding):
        """Apply the attention block and then the feed-forward block."""

        # --- self-attention block ---
        if self.is_pre_norm:
            # Pre-norm: normalise what goes INTO the sub-layer and leave the
            # residual path untouched.
            normed_input = self.first_layer_norm(input_embedding)
            first_layer_norm_out = input_embedding + self.mha_layer(normed_input,normed_input,
                                                                    normed_input)
        else:
            mha_layer_out = self.mha_layer(input_embedding,input_embedding,input_embedding)
            first_layer_norm_out = self.first_layer_norm(input_embedding + mha_layer_out)

        # NOTE(review): dropout is applied to the block output (after residual/norm)
        # rather than to the sub-layer output before the residual add; kept as-is so
        # training behaviour of the post-norm path does not change.
        if self.is_dropout:
            first_layer_norm_out = self.first_dropout_layer(first_layer_norm_out)

        # --- feed-forward block ---
        if self.is_pre_norm:
            ffn_input = self.second_layer_norm(first_layer_norm_out)
        else:
            ffn_input = first_layer_norm_out

        higher_dim_projection = self.ffn_inner_layer(ffn_input)
        higher_dim_projection = self.ffn_inner_activation(higher_dim_projection)
        ffn_out = self.ffn_output_layer(higher_dim_projection)

        if self.is_pre_norm:
            encoder_layer_out = first_layer_norm_out + ffn_out
        else:
            encoder_layer_out = self.second_layer_norm(first_layer_norm_out + ffn_out)

        if self.is_dropout:
            encoder_layer_out = self.second_dropout_layer(encoder_layer_out)

        return encoder_layer_out

In [30]:
class BERT(torch.nn.Module):
    """BERT-style encoder: summed token/position/segment embeddings plus a stack of encoder layers.

    Args:
        model_context_len: Maximum sequence length (size of the position table).
        vocab_size: Number of entries in the token embedding table.
        model_dim: Embedding / hidden size.
        num_encoder_layers: Number of stacked ``EncoderLayer`` modules.
        num_attn_heads: Attention heads per encoder layer.
        dropout_probability: Dropout rate for the embeddings and encoder layers.
        is_sequence_classification: Stored on the instance. Single-segment
            (all-zero) segment ids are used whenever ``forward`` is not given
            explicit ``segment_ids``.
    """

    def __init__(self,model_context_len,vocab_size,model_dim,num_encoder_layers,num_attn_heads,
                dropout_probability,is_sequence_classification):
        super().__init__()

        self.model_context_len = model_context_len
        self.is_sequence_classification = is_sequence_classification
        self.token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size,
                                                  embedding_dim=model_dim)
        self.pos_encoding_layer = torch.nn.Embedding(num_embeddings=model_context_len,
                                                    embedding_dim=model_dim)
        self.segment_embedding_layer = torch.nn.Embedding(num_embeddings=2,
                                                          embedding_dim=model_dim)
        self.embedding_sum_layer_norm = torch.nn.LayerNorm(model_dim)
        self.first_dropout_layer = torch.nn.Dropout(p=dropout_probability)

        # ModuleList (not a plain list) registers the layers as sub-modules, so
        # their weights appear in .parameters(), follow .to(device)/.eval() and
        # are included in the state dict.
        self.encoder_layer_stack = torch.nn.ModuleList(
            EncoderLayer(model_dim,num_attn_heads,True,
                         dropout_probability,False,4*model_dim,
                         "gelu")
            for _ in range(num_encoder_layers)
        )


    def forward(self,X,segment_ids=None):
        """Encode token ids into contextual embeddings.

        Args:
            X: Integer token ids of shape ``(seq_len,)`` or ``(batch, seq_len)``,
                with ``seq_len <= model_context_len``.
            segment_ids: Optional 0/1 segment ids with the same shape as ``X``.
                Defaults to all zeros (a single segment).

        Returns:
            Tensor of shape ``X.shape + (model_dim,)``.

        Raises:
            ValueError: If the sequence is longer than ``model_context_len``.
        """

        X = X.to(torch.long)
        seq_len = X.shape[-1]

        if seq_len > self.model_context_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds model_context_len "
                f"{self.model_context_len}"
            )

        token_embedding = self.token_embedding_layer(X)

        # Positions follow the actual input length and live on the input's device;
        # the (seq_len, dim) result broadcasts over any leading batch axis.
        position_ids = torch.arange(start=0,end=seq_len,device=X.device)
        pos_encoding = self.pos_encoding_layer(position_ids)

        if segment_ids is None:
            # Embedding indices must be integers, hence zeros_like on the long ids.
            segment_ids = torch.zeros_like(X)
        else:
            segment_ids = segment_ids.to(device=X.device,dtype=torch.long)

        segment_embedding = self.segment_embedding_layer(segment_ids)

        input_embedding = token_embedding + pos_encoding + segment_embedding
        input_embedding = self.embedding_sum_layer_norm(input_embedding)
        input_embedding = self.first_dropout_layer(input_embedding)

        # Starting from the embeddings keeps the result defined for an empty stack.
        output_embedding = input_embedding
        for single_encoding_layer in self.encoder_layer_stack:
            output_embedding = single_encoding_layer(output_embedding)

        return output_embedding

In [31]:
# Build the model; keyword arguments spell out what each positional number means.
my_bert_model = BERT(
    model_context_len=512,
    vocab_size=30522,
    model_dim=768,
    num_encoder_layers=12,
    num_attn_heads=12,
    dropout_probability=0.1,
    is_sequence_classification=True,
)