In [92]:
!pip install pdfplumber
!pip install tiktoken



In [93]:
import os
import pdfplumber
from tqdm import tqdm
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import pandas as pd
import numpy as np
import torch

In [94]:
def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF as a single newline-joined string.

    The file is opened with ``pdfplumber.open`` and ``extract_text()`` is
    called on every page. Pages for which it returns a falsy value
    (``None`` or an empty string) contribute nothing to the result.

    Parameters
    ----------
    pdf_path : path passed straight to ``pdfplumber.open``.

    Returns
    -------
    str
        Page texts joined with ``"\\n"``; ``""`` when no page yields text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Materialise the list inside the `with` so pages are read while the file is open.
        page_chunks = [
            chunk
            for chunk in (page.extract_text() for page in document.pages)
            if chunk
        ]

    return "\n".join(page_chunks)

In [95]:
def load_all_resumes(single_dir_abs_path):
    """Collect the extracted text of every PDF found under a directory.

    Walks ``single_dir_abs_path`` recursively with ``os.walk``. Each file whose
    name ends in ``.pdf`` (case-insensitive) is passed to
    ``extract_text_from_pdf``; results that are empty or whitespace-only are
    discarded.

    Returns
    -------
    list of str
        One entry per PDF that produced non-blank text, in walk order.
    """
    resume_texts = []

    for current_dir, _subdirs, filenames in os.walk(single_dir_abs_path):
        pdf_names = (name for name in filenames if name.lower().endswith(".pdf"))
        for name in pdf_names:
            content = extract_text_from_pdf(os.path.join(current_dir, name))
            if not content.strip():
                # Nothing usable was extracted from this file.
                continue
            resume_texts.append(content)

    return resume_texts

In [96]:
# Base directory of the resume dataset; process_resumes_per_category joins a
# sub-directory name onto this path.
root_dir = "/kaggle/input/resume-dataset/data/data"

In [97]:
def process_resumes_per_category(single_dir):
    """Load all resumes stored in one sub-directory of ``root_dir``.

    Parameters
    ----------
    single_dir : name of the sub-directory, joined onto ``root_dir``.

    Returns
    -------
    tuple
        ``(single_dir, texts)`` where ``texts`` is the list returned by
        ``load_all_resumes`` for that directory.
    """
    category_path = os.path.join(root_dir, single_dir)
    resumes = load_all_resumes(category_path)
    return single_dir, resumes

In [98]:
# Disabled cell: the code below is wrapped in a string literal, so it is not
# executed; the cell only evaluates to that string. When enabled it would submit
# process_resumes_per_category for every entry of os.listdir(root_dir) to a thread
# pool and collect the results into data_dict (presumably how data_dict.pkl was
# produced -- TODO confirm).
# NOTE(review): if this is re-enabled, `single_dir` in the except branch is only
# bound by an earlier successful iteration, so a failure of the first completed
# future raises NameError and later failures are reported under the wrong name.
"""
data_dict = dict()
with ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:
    
    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]
    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):
        try:
            single_dir, resumes_raw_text_list = single_pool.result()
            data_dict[single_dir] = resumes_raw_text_list
        except Exception as e:
            print(f"Error processing {single_dir}: {e}")
"""

'\ndata_dict = dict()\nwith ThreadPoolExecutor(max_workers=os.cpu_count()) as pool:\n    \n    parallel_pools = [pool.submit(process_resumes_per_category, single_dir) for single_dir in os.listdir(root_dir)]\n    for single_pool in tqdm(as_completed(parallel_pools), total=len(parallel_pools)):\n        try:\n            single_dir, resumes_raw_text_list = single_pool.result()\n            data_dict[single_dir] = resumes_raw_text_list\n        except Exception as e:\n            print(f"Error processing {single_dir}: {e}")\n'

In [99]:
# Load tiktoken's "o200k_base" encoding; its encode() is used below to turn
# resume text into lists of token ids.
gpt_tokenizer_encodings = tiktoken.get_encoding("o200k_base")

In [100]:
# Disabled cell (string literal, not executed): when enabled it would pickle
# data_dict to "data_dict.pkl" in the working directory.
"""
with open("data_dict.pkl","wb") as file_handle:
    pickle.dump(data_dict,file_handle)
"""

'\nwith open("data_dict.pkl","wb") as file_handle:\n    pickle.dump(data_dict,file_handle)\n'

In [101]:
# Load the previously extracted resumes instead of re-parsing the PDFs.
# Later cells iterate data_dict.items() as (job category, list of resume texts).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load
# pickles from a source you trust.
with open("/kaggle/input/proprocessed-data-pickle-file/data_dict.pkl","rb") as file_handle:
    data_dict = pickle.load(file_handle)

In [102]:
# Tokenise every resume and record its category alongside it.
# Both lists grow in lock-step, so position i in one matches position i in the other.
encoded_data_dict = {"Resume Encoded Text":[], "Suitable Job":[]}
row_idx = 0
# Length of the longest token sequence seen so far.
max_len = 0

for job_category, resume_texts in data_dict.items():
    for token_ids in map(gpt_tokenizer_encodings.encode, resume_texts):
        encoded_data_dict["Resume Encoded Text"].append(token_ids)
        encoded_data_dict["Suitable Job"].append(job_category)
        max_len = max(max_len, len(token_ids))

In [103]:
# One row per resume: column "Resume Encoded Text" holds the token-id list,
# column "Suitable Job" holds the data_dict key the resume came from.
data = pd.DataFrame(data=encoded_data_dict)

In [104]:
# Preview the first rows of the encoded frame.
data.head()

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[31899, 10316, 14704, 160873, 2022, 91112, 866...",DESIGNER
1,"[8621, 58311, 111063, 91112, 866, 820, 114095,...",DESIGNER
2,"[91089, 91112, 866, 198, 18610, 198, 15804, 49...",DESIGNER
3,"[60309, 53016, 91112, 866, 198, 18610, 198, 32...",DESIGNER
4,"[47, 62591, 5710, 91112, 866, 198, 18610, 198,...",DESIGNER


In [105]:
# Right-pad every token list with 0 up to max_len so all rows have the same length.
# NOTE(review): 0 may also be a regular token id in the encoding's vocabulary --
# confirm that downstream code masks or otherwise distinguishes padding.
data["Resume Encoded Text"] = data["Resume Encoded Text"].apply(lambda x: x + [0]*(max_len - len(x)))

In [106]:
# Shuffle all rows of `data` (every row appears exactly once, original index
# labels are kept). The seed is fixed so the row order -- and anything derived
# from it -- is identical on every Restart & Run All.
SHUFFLE_SEED = 42
shuffled_data = data.sample(frac=1, random_state=SHUFFLE_SEED)

In [107]:
# Renumber the shuffled rows 0..n-1 and discard the old index (drop=True).
# Rebinding the name instead of using inplace=True avoids mutating a frame
# derived from `data` and keeps the cell safe to re-run.
shuffled_data = shuffled_data.reset_index(drop=True)

In [108]:
# Display the shuffled frame to check that the categories are now interleaved.
shuffled_data

Unnamed: 0,Resume Encoded Text,Suitable Job
0,"[10683, 16978, 167548, 44376, 198, 59740, 3165...",SALES
1,"[164244, 5710, 134235, 2260, 11021, 28394, 202...",AVIATION
2,"[136931, 1301, 5307, 128858, 103434, 37273, 11...",DIGITAL-MEDIA
3,"[183308, 10271, 8004, 965, 22600, 198, 59740, ...",FITNESS
4,"[88139, 195177, 50, 182360, 4769, 198, 18610, ...",PUBLIC-RELATIONS
...,...,...
2478,"[18, 35, 91112, 866, 14, 160873, 2022, 91112, ...",DESIGNER
2479,"[49, 1233, 12069, 119679, 535, 198, 18610, 198...",CHEF
2480,"[8621, 58311, 140055, 182360, 4769, 198, 18610...",CONSULTANT
2481,"[145863, 10551, 103434, 457, 161225, 22600, 19...",DIGITAL-MEDIA


In [109]:
# Map every job-category key of data_dict to a consecutive integer id (0, 1, ...),
# following the dict's key order.
labels2idx = {label: idx for idx, label in enumerate(data_dict.keys())}

In [110]:
# 70/30 train/test split taken from the *shuffled* frame.
# `data` is still ordered category by category (it was filled while iterating
# data_dict), so slicing it directly would keep the last categories out of the
# training set and build the test set from those categories only.
split_idx = int(0.7*shuffled_data.shape[0])
training_data = shuffled_data.iloc[0:split_idx]
testing_data = shuffled_data.iloc[split_idx:]

In [111]:
def training_data_generator(mb_size=79):
    """Yield consecutive mini-batches from the global ``training_data`` frame.

    Only full batches are produced: ``training_data.shape[0] // mb_size`` of
    them, so any trailing rows that do not fill a batch are not yielded.

    Parameters
    ----------
    mb_size : int, default 79
        Number of rows per mini-batch.

    Yields
    ------
    (X_mb, y_mb) : tuple of numpy arrays
        ``X_mb`` is built from the first column of the batch rows and ``y_mb``
        from the second column.
    """
    n_full_batches = training_data.shape[0] // mb_size

    for start in range(0, n_full_batches * mb_size, mb_size):
        batch_rows = training_data.iloc[start:start + mb_size]
        yield np.array(batch_rows.iloc[:, 0]), np.array(batch_rows.iloc[:, 1])