In [None]:
!pip install transformers accelerate autoawq

In [None]:
!pip install --upgrade torch torchvision

In [31]:
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoAWQForCausalLM.from_pretrained(
  model_id,
  torch_dtype=torch.float16,
  low_cpu_mem_usage=True,
  device_map="auto",
)


def generate(input_text):
  prompt = [
    {"role": "system", "content": "You are a helpful assistant, that responds in very concise manner with only main points."},
    {"role": "user", "content": input_text},
  ]
  inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
  ).to("cuda")

  outputs = model.generate(**inputs, do_sample=True, max_new_tokens=512)
  return tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Import required libraries
from pathlib import Path

# Set up directories for data
data_folder = Path('/content/Data/')
pdf_folder = data_folder / "pdf_folder"
input_json_folder = data_folder / "input_json_folder"
output_json_folder = data_folder / "output_json_folder"

# Create directories if they don't exist
Path(pdf_folder).mkdir(parents=True, exist_ok=True)
Path(input_json_folder).mkdir(parents=True, exist_ok=True)
Path(output_json_folder).mkdir(parents=True, exist_ok=True)

In [None]:
# Install necessary libraries
!pip install -q PyMuPDF Pillow pdfplumber pytesseract Faker

# Install Tesseract OCR (if needed for OCR tasks)
!sudo apt-get install -qq tesseract-ocr

# Clone the specific branch from GitHub repository into the pdf_folder
!git clone -b pdfs https://github.com/jaimonjacob/codeinventory/ "/content/Data/pdf_folder"

In [41]:
# Load the Resume.csv file from the specified path
import pandas as pd

csv_path = "/content/Data/Resume.csv"  # Updated path
resumes_df = pd.read_csv(csv_path, sep=';', on_bad_lines='skip')

# Display the first few rows of the DataFrame to verify it loaded correctly
print(resumes_df.head())

                  ID,Resume_str,Resume_html,Category
0  16852973,"         HR ADMINISTRATOR/MARKETING ...
1  HR Administrator     Dec 2013   to   Current  ...
2      ""> <div class=""paragraph PARAGRAPH_NAME ...
3      ""> <div class=""name"" itemprop=""name"">...
4  HR ADMINISTRATOR</span> </div> </div> </div> <...


In [None]:
import os
import fitz
from PIL import Image
import io
import pytesseract
import json


def extract_all_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    all_text = ""

    for page_num in range(len(doc)):
        page = doc[page_num]


        page_text = page.get_text("text")
        all_text += page_text + "\n"

        image_list = page.get_images(full=True)
        for img in image_list:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            image = Image.open(io.BytesIO(image_bytes))

            ocr_text = pytesseract.image_to_string(image)
            all_text += ocr_text + "\n"

    return all_text

def process_pdfs_in_folder(folder_path):
    extracted_data = {}

    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            extracted_text = extract_all_text_from_pdf(pdf_path)


            extracted_data[filename] = extracted_text
            print(f"Processed {filename}")

    json_output_path = os.path.join(input_json_folder, "extracted_resumes.json")
    with open(json_output_path, "w", encoding="utf-8") as json_file:
        json.dump(extracted_data, json_file, indent=4, ensure_ascii=False)
    print(f"All text saved to {json_output_path}")

folder_path = pdf_folder
process_pdfs_in_folder(folder_path)

In [None]:
from faker import Faker
fake = Faker()

def summarize_resume(resume_text):
    sections = {
        "Employment History": "",
        "Skills": "",
        "Education": "",
        "Certifications": ""
    }

    prompts = {
        "Employment History": "Summarize the employment history from the following resume: ",
        "Skills": "List the skills mentioned in the following resume: ",
        "Education": "Summarize the education details from the following resume: ",
        "Certifications": "List any certifications mentioned in the following resume: "
    }

    for section, prompt in prompts.items():
        input_text = prompt + resume_text
        summary = generate(input_text)
        sections[section] = summary

    sections["Name"] = fake.name()
    return sections

def process_resumes_from_json(json_file_path):
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        resumes = json.load(json_file)

    summarized_resumes = {}

    for filename, resume_text in resumes.items():
        summarized_resumes[filename] = summarize_resume(resume_text)
        print(f"Summarized {filename}")

    with open(summarized_json_file_path, "w", encoding="utf-8") as output_json_file:
        json.dump(summarized_resumes, output_json_file, indent=4, ensure_ascii=False)
    print(f"All resumes summarized and saved to {summarized_json_file_path}")

json_file_path = input_json_folder / "extracted_resumes.json"
summarized_json_file_path = output_json_folder / "summarized_resumes.json"
process_resumes_from_json(json_file_path)

In [44]:
ideal = f"""

### **Experience:**

**Data Scientist**
XYZ Analytics, City, State
*January 2020 – Present*
- Developed machine learning models to predict customer behavior, improving targeting strategies.
- Analyzed large datasets to extract insights and optimize business processes.
- Collaborated with cross-functional teams to implement data-driven solutions.
- Led the deployment of AI-powered tools that increased operational efficiency by 20%.

**Junior Data Scientist**
Tech Innovators, City, State
*June 2017 – December 2019*
- Assisted in the development and maintenance of predictive models for financial forecasting.
- Conducted statistical analysis and data cleaning to prepare datasets for modeling.
- Created interactive dashboards using Tableau and Power BI for data visualization.
- Presented findings to stakeholders, helping to inform key business decisions.

### **Skills:**
- Machine Learning (Supervised & Unsupervised)
- Statistical Analysis & Data Visualization
- Data Cleaning & Feature Engineering
- Python (Pandas, NumPy, Scikit-learn, TensorFlow)
- SQL & NoSQL Databases
- Data Visualization Tools (Tableau, Power BI, Matplotlib)
- Big Data Technologies (Hadoop, Spark)
- Cloud Platforms (AWS, Azure)
- Model Deployment (Flask, FastAPI, Docker)

### **Education:**

**M.S. in Data Science**
State University, City, State
*May 2017*

**B.S. in Computer Science**
State University, City, State
*May 2015*

### **Certifications:**
- Certified Data Scientist (CDS)
- Google Cloud Professional Data Engineer
- Microsoft Certified: Azure Data Scientist Associate
- Coursera: Deep Learning Specialization
"""


In [None]:
with open(summarized_json_file_path, "r", encoding="utf-8") as json_file:
  jsfile = json.load(json_file)

final_results = []
for file, details in jsfile.items():
    output = ""
    output += f"Name:\n{details['Name']}\n\n"
    output += f"Employment History:\n{details['Employment History']}\n\n"
    output += f"Skills:\n{details['Skills']}\n\n"
    output += f"Education:\n{details['Education']}\n\n"
    output += f"Certifications:\n{details['Certifications']}\n\n"
    output += "\n" + "-"*40 + "\n\n"
    result = generate(f"I am hiring for a Data Scientist. I need you to compare the candidate resume with the ideal resume and give me a summary of your thoughts by mentioning the name of the candidate. I am more focused on experience and skills than education or certifications. All the summaries should follow the same format with these sections: Name; Score out of 10; Summary; and AI suggestion. The AI suggestion should specify whether we should hire the candidate or not and why. Ideal resume: {ideal}. Candidate resume: {output}")
    print(result)
    final_results.append(result)

In [46]:
from IPython.display import Markdown

for result in final_results:
  Markdown(print(result))

**Name:** Jennifer Smith; **Score:** 6/10; **Summary:** Jennifer has relevant internship experience at Niantic, but her role was an intern, and her experience is limited to a single company. She lacks experience in leading projects or teams and has limited skills in data visualization and cloud platforms. **AI Suggestion:** Do not hire Jennifer, as her experience is not as extensive as required for the Data Scientist position.
**Name:** Timothy May  
**Score:** 2/10  
**Summary:** The candidate lacks relevant experience in data science, with a history of management and technical support roles. The skills mentioned are not directly applicable to data science.  
**AI Suggestion:** Do not hire Timothy May. His experience and skills are not aligned with the ideal data scientist resume.
**Name:** Keith Martin
**Score:** 8/10
**Summary:** Keith Martin has relevant experience as a Data Scientist, with notable achievements in deploying recommendation engines, improving customer wait times, and