<a href="https://colab.research.google.com/github/abhie7/advanced-python/blob/main/Llama3_8b_ResumeSummarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
#@title Import necessary dependencies
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes
!pip install pandas PyPDF2

In [2]:
from unsloth import FastLanguageModel
import torch
# Sequence-length cap; passed to the model loader below and reused by the trainer.
max_seq_length = 2048
# Forwarded unchanged to from_pretrained.
# NOTE(review): presumably None lets Unsloth choose the dtype for the GPU — confirm.
dtype = None
# Forwarded to from_pretrained as load_in_4bit.
load_in_4bit = True
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# NOTE(review): nothing visible in this notebook reads this list, and it names the
# Instruct variant while the base "unsloth/llama-3-8b-bnb-4bit" is what gets loaded.
fourbit_models = [
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
]

# Returns the model together with its tokenizer; both are used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.5
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
#We now add LoRA adapters so we only need to update 1 to 10% of all parameters!
# Rebinds `model` to the adapter-wrapped model; later cells train and generate with it.
# The recorded training run reports 41,943,040 trainable parameters for this setup.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    # Adapters are attached to the attention projections (q/k/v/o) and the MLP
    # projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    # Same value as the `seed` given to TrainingArguments in the trainer cell.
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

In [4]:
# Prompt template shared by training and inference. The three `{}` slots are filled
# positionally, in this order: instruction, input, response. At inference time the
# response slot is filled with an empty string so the model continues from there.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func appends
# it to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    '''Render a batch of dataset rows into Alpaca-style training strings.

    Args:
        examples: Batch mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one formatted string per row,
        each terminated with EOS_TOKEN.
    '''
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in rows
    ]
    return {"text": rendered}

from datasets import load_dataset
# Load the train split of the Alpaca-cleaned instruction dataset (51,760 examples in
# the recorded run).
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
# Add a "text" column with the fully formatted prompt; batched=True hands
# formatting_prompts_func whole batches of rows at a time.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

In [5]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning over the "text" column produced by formatting_prompts_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # 2 samples per device x 4 accumulation steps = total batch size of 8.
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # Short run: training stops after 60 steps (max_steps overrides num_train_epochs).
        max_steps = 60,
        learning_rate = 2e-4,
        # Mixed precision follows hardware support: bf16 when available, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [6]:
# Run the fine-tuning loop. The returned stats object is kept but not read by any
# other visible cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.8194
2,2.2928
3,1.691
4,1.9463
5,1.6427
6,1.6017
7,1.1935
8,1.2567
9,1.1061
10,1.1661


In [23]:
from PyPDF2 import PdfReader
import re
import os
import pandas as pd

class PdfParser:
    '''Extract and clean text from a PDF file.

    Typical use: ``PdfParser(path).extract_and_clean_text()``.
    '''
    def __init__(self, filepath: str):
        '''Remember the path of the PDF to parse; the file is not opened here.'''
        self.filepath = filepath

    def extract_and_clean_text(self) -> str:
        '''Read every page of the PDF and return its text as one cleaned line.

        Returns:
            The text of all pages joined together and passed through clean_text.
        '''
        reader = PdfReader(self.filepath)
        # A page may yield no text (for example a scanned image); treat it as empty
        # instead of letting str.join fail on a non-string value.
        text = '\n'.join((page.extract_text() or '') for page in reader.pages)
        return self.clean_text(text)

    @staticmethod
    def clean_text(text: str) -> str:
        '''Strip layout noise from extracted text and normalise its whitespace.

        Steps, in order:
          1. Remove the punctuation characters - ( ) " # ; : < > { } = ~ |
          2. Remove outline numbering written as Roman numerals ("I.", "IV.") and
             bullet glyphs. A numeral is only removed at the start of a
             whitespace-delimited token, so words that merely end in those letters
             ("HTML.", "SQL.") are left intact.
          3. Collapse every run of whitespace, newlines included, into one space.
             This runs last so gaps left by the removals are collapsed as well.
             Runs are replaced by a space, not deleted, so neighbouring words stay
             separated.

        Known limitation: a standalone token such as "C." or "M." still looks like a
        Roman numeral and is removed.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text with no leading or trailing whitespace.
        '''
        text = re.sub(r'[-()"#;:<>{}=~|]', '', text)
        text = re.sub(r'(?<!\S)[IVXLCDM]+\.|[•◦○●]', '', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

In [21]:
def process_resumes(directory_path, max_new_tokens=128):
    '''Summarise every PDF resume in a directory with the fine-tuned model.

    Each PDF is parsed with PdfParser, wrapped in the Alpaca prompt together with
    the recruiter instruction, and passed to the model for generation.

    Args:
        directory_path: Directory to scan for files ending in ".pdf" (any case).
            Sub-directories are not searched.
        max_new_tokens: Upper bound on generated tokens per summary. The default
            of 128 matches the value previously hard-coded here.

    Returns:
        A tuple (resumes, summaries). `resumes` is a list of
        (filename, cleaned_resume_text) pairs and `summaries` is the list of
        generated summaries, in the same order. Files are processed in sorted
        filename order so repeated runs line up.

    Raises:
        FileNotFoundError: If directory_path does not exist (from os.listdir).
    '''
    # The leading spaces on the continuation lines are part of the prompt text and
    # are kept exactly as they were; only the typo "ikn" has been corrected to "in".
    instruction = """Imagine you are a seasoned recruiter tasked with reviewing resumes.
                          Note:
                          1. Role Play: Assume the persona of a seasoned recruiter who is skilled in evaluating candidates' qualifications and experiences.
                          2. Objective: Provide a concise summary of the applicant's resume, highlighting their key skills, experiences, and achievements in approximately 100 words.
                          3. Clarity and Accessibility: Ensure the summary is understandable to individuals outside the applicant's field, avoiding industry-specific jargon.
                          4. Focus Areas: Do NOT write anything about YOURSELF. Concentrate on the applicant and just begin the summary with their information and their professional background, including job roles, responsibilities, and notable accomplishments. Also, mention any unique qualities or skills that differentiate the applicant from others. Do not Boast about the applicant.
                          5. Review and Refine: Make adjustments as needed to enhance clarity and ensure it accurately represents the applicant's profile. Do not write the summary in points. Write it in one single paragraph. Please complete the entire summary in approximately 100 words."""

    resumes = []
    summaries = []

    # Honour the caller's directory (it used to be ignored in favour of a hard-coded
    # path) and sort, because os.listdir returns entries in arbitrary order.
    pdf_names = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )

    # NOTE(review): the only `FastLanguageModel.for_inference(model)` call in this
    # notebook is commented out; confirm whether it should run before generating.
    for filename in pdf_names:
        filepath = os.path.join(directory_path, filename)
        resume_text = PdfParser(filepath).extract_and_clean_text()
        resumes.append((filename, resume_text))

        # The response slot is left blank so the model writes the summary itself.
        prompt = alpaca_prompt.format(instruction, resume_text, "")
        inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

        generated = model.generate(**inputs, max_new_tokens=max_new_tokens)

        # The generated sequence starts with the prompt tokens; drop them so only
        # the newly generated summary is decoded, not prompt + resume + summary.
        prompt_length = inputs["input_ids"].shape[1]
        summary_ids = generated[0][prompt_length:]
        summary_text = tokenizer.decode(summary_ids, skip_special_tokens=True).strip()
        summaries.append(summary_text)

    return resumes, summaries

# Summarise the PDFs under ./Resumes/; both results are consumed by save_to_excel.
resumes, summaries = process_resumes('./Resumes/')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [25]:
def save_to_excel(resumes, summaries, output_path='summarized_resumes.xlsx'):
    '''Write instruction / resume / summary triples to an Excel workbook.

    Args:
        resumes: Sequence of (filename, resume_text) entries; only the text at
            index 1 of each entry is written.
        summaries: Sequence of generated summaries, parallel to `resumes`.
        output_path: Destination workbook. Defaults to 'summarized_resumes.xlsx',
            the path that was previously hard-coded.

    Raises:
        ValueError: If `resumes` and `summaries` differ in length, raised before
            anything is written.
    '''
    if len(resumes) != len(summaries):
        raise ValueError(
            f"resumes and summaries must have the same length "
            f"(got {len(resumes)} and {len(summaries)})"
        )

    # The instruction is constant, so it is defined once and repeated per row.
    # The leading spaces on the continuation lines are part of the stored text.
    instruction = """Imagine you are a seasoned recruiter tasked with reviewing resumes.
                     Note:
                     1. Role Play: Assume the persona of a seasoned recruiter who is skilled in evaluating candidates' qualifications and experiences.
                     2. Objective: Provide a concise summary of the applicant's resume, highlighting their key skills, experiences, and achievements in approximately 100 words.
                     3. Clarity and Accessibility: Ensure the summary is understandable to individuals outside the applicant's field, avoiding industry-specific jargon.
                     4. Focus Areas: Do NOT write anything about YOURSELF. Concentrate on the applicant and just begin the summary with their information and their professional background, including job roles, responsibilities, and notable accomplishments. Also, mention any unique qualities or skills that differentiate the applicant from others. Do not Boast about the applicant.
                     5. Review and Refine: Make adjustments as needed to enhance clarity and ensure it accurately represents the applicant's profile. Do not write the summary in points. Write it in one single paragraph. Please complete the entire summary in approximately 100 words."""

    # One row per resume: the shared instruction, the resume text, its summary.
    df = pd.DataFrame({
        'Instruction': [instruction] * len(summaries),
        'Input': [entry[1] for entry in resumes],
        'Response': summaries,
    })
    df.to_excel(output_path, index=False)

# Persist the collected resumes and their summaries to summarized_resumes.xlsx.
save_to_excel(resumes, summaries)

In [15]:
# FastLanguageModel.for_inference(model)
# inputs = tokenizer(
# [
#     alpaca_prompt.format(
#         """Imagine you are a seasoned recruiter tasked with reviewing resumes.
#             Note:
#             1. Role Play: Assume the persona of a seasoned recruiter who is skilled in evaluating candidates' qualifications and experiences.
#             2. Objective: Provide a concise summary of the applicant's resume, highlighting their key skills, experiences, and achievements in approximately 100 words.
#             3. Clarity and Accessibility: Ensure the summary is understandable to individuals outside the applicant's field, avoiding industry-specific jargon.
#             4. Focus Areas: Do NOT write anything about YOURSELF. Concentrate on the applicant and just begin the summary with their information and their professional background, including job roles, responsibilities, and notable accomplishments. Also, mention any unique qualities or skills that differentiate the applicant from others. Do not Boast about the applicant.
#             5. Review and Refine: Make adjustments as needed to enhance clarity and ensure it accurately represents the applicant's profile. Do not write the summary ikn points. Write it in one single paragraph. Please complete the entire summary in approximately 100 words.""", # instruction
#         resume_input, # input
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer)
# _ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Imagine you are a seasoned recruiter tasked with reviewing resumes.
            Note:
            1. Role Play: Assume the persona of a seasoned recruiter who is skilled in evaluating candidates' qualifications and experiences.
            2. Objective: Provide a concise summary of the applicant's resume, highlighting their key skills, experiences, and achievements in approximately 100 words.
            3. Clarity and Accessibility: Ensure the summary is understandable to individuals outside the applicant's field, avoiding industry-specific jargon.
            4. Focus Areas: Do NOT write anything about YOURSELF. Concentrate on the applicant and just begin the summary with their information and their professional background, including job roles, responsibilities, and notable accomplishments. Als

In [17]:
# Save the fine-tuned model and its tokenizer side by side in ./lora_model.
# NOTE(review): `model` is the adapter-wrapped model from get_peft_model, so this
# presumably writes only the LoRA adapter weights rather than the full model — confirm.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')