In [1]:
# Install the PDF-parsing and tokenizer packages. The `%pip` magic (unlike
# `!pip`) installs into the environment of the kernel running this notebook.
%pip install pdfplumber tiktoken

Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six-20251230-py3-none-any.whl.metadata (4.3 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.3.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.9-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20251230-py3-none-any.whl (6.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading p

In [2]:
import os
import pdfplumber
from tqdm import tqdm
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import pandas as pd
import numpy as np
import torch

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``.

    Pages are read in document order. Pages whose ``extract_text()`` result is
    falsy (``None`` or an empty string) are skipped; the remaining page texts
    are joined with a single newline. A PDF with no extractable text yields
    an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extraction must happen while the PDF is still open.
        per_page = [page.extract_text() for page in pdf.pages]

    return "\n".join(chunk for chunk in per_page if chunk)

In [4]:
def load_all_resumes(single_dir_abs_path):
    """Collect the extracted text of every PDF found under a directory tree.

    The tree is walked in sorted order (sub-directories and file names), so
    the returned list has the same order on every run; ``os.walk`` on its own
    yields entries in arbitrary, filesystem-dependent order.

    Args:
        single_dir_abs_path: Directory to search recursively.

    Returns:
        A list with one string per PDF (the ``.pdf`` extension is matched
        case-insensitively) whose extracted text is not empty or
        whitespace-only. A directory that does not exist yields an empty list.
    """
    documents = []

    for root, dirs, files in os.walk(single_dir_abs_path):
        # Sorting in place steers the order of os.walk's top-down descent.
        dirs.sort()
        for file in sorted(files):
            if not file.lower().endswith(".pdf"):
                continue
            text = extract_text_from_pdf(os.path.join(root, file))
            if text.strip():
                documents.append(text)

    return documents

In [5]:
# Root of the resume dataset; process_resumes_per_category joins each
# sub-directory name onto this path before loading its PDFs.
root_dir = "/kaggle/input/resume-dataset/data/data"

In [6]:
def process_resumes_per_category(single_dir):
    """Load all resumes stored under ``root_dir/single_dir``.

    Returns a ``(single_dir, texts)`` tuple so the directory name travels
    with the list of resume texts loaded from it.
    """
    category_path = os.path.join(root_dir, single_dir)
    resume_texts = load_all_resumes(category_path)
    return single_dir, resume_texts

In [7]:
# Disabled one-off extraction step: it is wrapped in a bare string so the cell
# does not execute it. A later cell loads `data_dict` from a pickle file instead.
# NOTE(review): if this is re-enabled, the `except` branch prints `single_dir`,
# which is only assigned after `single_pool.result()` succeeds - on a failure it
# is either unbound or still holds the previous iteration's value.
"""
data_dict = dict()
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    
    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]
    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):
        try:
            single_dir, resumes_raw_text_list = single_pool.result()
            data_dict[single_dir] = resumes_raw_text_list
        except Exception as e:
            print(f"Error processing {single_dir}: {e}")
"""

'\ndata_dict = dict()\nwith ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:\n    \n    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]\n    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):\n        try:\n            single_dir, resumes_raw_text_list = single_pool.result()\n            data_dict[single_dir] = resumes_raw_text_list\n        except Exception as e:\n            print(f"Error processing {single_dir}: {e}")\n'

In [8]:
# Tokenizer (tiktoken's "o200k_base" encoding) used below to encode each
# resume's text.
gpt_tokenizer_encodings = tiktoken.get_encoding("o200k_base")

In [9]:
# Disabled: wrapped in a bare string so it does not run. When enabled it
# writes `data_dict` to "data_dict.pkl" in the working directory.
"""
with open("data_dict.pkl","wb") as file_handle:
    pickle.dump(data_dict,file_handle)
"""

'\nwith open("data_dict.pkl","wb") as file_handle:\n    pickle.dump(data_dict,file_handle)\n'

In [10]:
# Load `data_dict` from a previously saved pickle (presumably the output of the
# disabled extraction/dump cells - confirm) instead of re-parsing the PDFs.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if
# its source is trusted.
with open("/kaggle/input/proprocessed-data-pickle-file/data_dict.pkl","rb") as file_handle:
    data_dict = pickle.load(file_handle)

In [11]:
# Encode every resume with the tokenizer and record, row by row, the token list
# and the data_dict key it was stored under. `max_len` ends up as the length of
# the longest token list.
encoded_data_dict = {"Resume Encoded Text": [], "Suitable Job": []}
row_idx = 0
max_len = 0

token_column = encoded_data_dict["Resume Encoded Text"]
label_column = encoded_data_dict["Suitable Job"]

for k, v in data_dict.items():
    for resume_text in v:
        encoded_resume_text = gpt_tokenizer_encodings.encode(resume_text)

        token_column.append(encoded_resume_text)
        label_column.append(k)

        max_len = max(max_len, len(encoded_resume_text))

In [12]:
# One row per resume: column 0 holds the token list, column 1 the job label.
data = pd.DataFrame(data=encoded_data_dict)

In [13]:
# Preview the first rows of the encoded frame.
data.head()

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[31899, 10316, 14704, 160873, 2022, 91112, 866...",DESIGNER
1,"[8621, 58311, 111063, 91112, 866, 820, 114095,...",DESIGNER
2,"[91089, 91112, 866, 198, 18610, 198, 15804, 49...",DESIGNER
3,"[60309, 53016, 91112, 866, 198, 18610, 198, 32...",DESIGNER
4,"[47, 62591, 5710, 91112, 866, 198, 18610, 198,...",DESIGNER


In [14]:
# Right-pad every token list with 0 up to `max_len` so all rows have the same
# length. Re-running the cell is harmless: already-padded rows get 0 extra items.
# NOTE(review): 0 is used as the padding id - confirm it does not collide with
# a real token id produced by the tokenizer.
data["Resume Encoded Text"] = data["Resume Encoded Text"].apply(lambda x: x + [0]*(max_len - len(x)))

In [15]:
# Shuffle the rows with a fixed seed so the permutation (and everything derived
# from it) is identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffled_data = data.iloc[np.random.default_rng(SHUFFLE_SEED).permutation(data.shape[0])]

In [16]:
# Replace the permuted index with a fresh 0..n-1 index, discarding the old one.
shuffled_data = shuffled_data.reset_index(drop=True)

In [17]:
# Display the shuffled frame to check that the job labels are now interleaved.
shuffled_data

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[50701, 117626, 827, 65746, 195177, 50, 134235...",PUBLIC-RELATIONS
1,"[10683, 16978, 37901, 53509, 198, 18610, 198, ...",SALES
2,"[151336, 105894, 168651, 91112, 866, 198, 4217...",DESIGNER
3,"[28497, 197665, 198, 18610, 198, 147542, 261, ...",BANKING
4,"[28497, 197665, 5307, 193894, 94797, 198, 5974...",BANKING
...,...,...
2478,"[965, 44227, 152192, 92715, 99726, 53016, 3790...",INFORMATION-TECHNOLOGY
2479,"[17499, 84807, 15000, 163761, 37901, 53509, 19...",FITNESS
2480,"[191011, 2694, 14576, 2824, 149253, 15680, 379...",BANKING
2481,"[10683, 16978, 22069, 8695, 71255, 198, 18610,...",FITNESS


In [18]:
# Map each data_dict key (job label) to a consecutive integer id, in the
# dictionary's iteration order.
labels2idx = {label: idx for idx, label in enumerate(data_dict)}

In [19]:
# 70/30 train/test split taken from the *shuffled* frame. `data` is still in the
# order data_dict was iterated (all resumes of one job label, then the next), so
# slicing it directly would leave the last labels out of the training set.
split_at = int(0.7 * shuffled_data.shape[0])
training_data = shuffled_data.iloc[:split_at]
testing_data = shuffled_data.iloc[split_at:]

In [20]:
# Sanity check: length of the first training row's token list (after padding
# this should equal max_len).
len(training_data.iloc[0,0])

6697

In [21]:
def training_data_generator(mb_size=79):
    """Yield consecutive mini-batches from the global ``training_data`` frame.

    Each item is an ``(X_mb, y_mb)`` pair of NumPy arrays built from
    ``mb_size`` consecutive rows: ``X_mb`` from column 0 and ``y_mb`` from
    column 1. Only full batches are produced; rows left over after the last
    full batch are not yielded.
    """
    full_batches = training_data.shape[0] // mb_size

    for start in range(0, full_batches * mb_size, mb_size):
        rows = slice(start, start + mb_size)

        X_mb = np.array(training_data.iloc[rows, 0])
        y_mb = np.array(training_data.iloc[rows, 1])

        yield X_mb, y_mb

In [None]:
class BERTEncoderLayer(torch.nn.Module):
    """Unfinished encoder-layer stub holding a single ``MHA`` sub-module.

    NOTE(review): `MHA` (presumably multi-head attention - confirm) is not
    defined or imported in the visible notebook, and `forward` returns
    `encoder_embedding`, which is never assigned. As written, both
    constructing the layer and calling `forward` would raise NameError.
    """

    def __init__(self):
        super().__init__()

        # NOTE(review): `MHA` must be defined or imported before this runs.
        self.mha_layer = MHA() 

    def forward(self,emb_plus_pos_encoding):
        # Placeholder: the input is not used yet and `encoder_embedding` is
        # never computed. TODO implement the forward pass.

        pass
        return encoder_embedding