In [None]:
!pip install pdfplumber python-docx beautifulsoup4

In [None]:
import spacy
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import files
from bs4 import BeautifulSoup
import pdfplumber
import docx

# Load the spaCy English pipeline "en_core_web_sm" once and share it through
# the module-level name `nlp`; the keyword-extraction helpers in this notebook
# call `nlp(text)` and read each token's `pos_` tag.
# NOTE(review): the pip-install cell does not install this model — presumably
# it is preinstalled on Colab; confirm for other environments.
nlp = spacy.load("en_core_web_sm")

def extract_nouns_verbs(text):
    """Collect the nouns and verbs of `text` using spaCy part-of-speech tags.

    Args:
        text: String to run through the module-level `nlp` pipeline.

    Returns:
        A `(nouns, verbs)` tuple of lists holding the surface text of every
        token tagged "NOUN" and "VERB" respectively, in document order.
        Duplicates are kept.
    """
    by_tag = {"NOUN": [], "VERB": []}
    for tok in nlp(text):
        bucket = by_tag.get(tok.pos_)
        # Tokens with any other POS tag are ignored.
        if bucket is not None:
            bucket.append(tok.text)
    return by_tag["NOUN"], by_tag["VERB"]

def get_matching_percentage(resume_words, job_words):
    """Return the vocabulary overlap of two word lists as a percentage.

    The score is the Jaccard index of the two *distinct* word sets:
    ``|resume ∩ job| / |resume ∪ job| * 100``. Identical vocabularies score
    100 and disjoint ones score 0.

    The previous implementation divided by ``|resume| + |job|``, which counts
    every shared word twice in the denominator and therefore could never
    exceed 50 %, and it raised ``ZeroDivisionError`` when both inputs were
    empty.

    Args:
        resume_words: Iterable of hashable words taken from the resume.
        job_words: Iterable of hashable words taken from the job description.

    Returns:
        A float in the range [0, 100]; 0.0 when both inputs are empty.
    """
    resume_vocab = set(resume_words)
    job_vocab = set(job_words)
    combined_vocab = resume_vocab | job_vocab
    if not combined_vocab:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    shared_vocab = resume_vocab & job_vocab
    return len(shared_vocab) / len(combined_vocab) * 100

def calculate_matching_percentage(list1, list2):
    """Return the cosine similarity of two word collections as a percentage.

    Each collection is joined into one space-separated document, both
    documents are turned into term-count vectors with `CountVectorizer`, and
    the cosine similarity between the two vectors is scaled to 0-100.

    Args:
        list1: Iterable of word strings (first document).
        list2: Iterable of word strings (second document).

    Returns:
        The similarity as a percentage. Returns 0.0 when neither document
        contributes any token to the vocabulary (for example two empty
        lists), instead of propagating the vectorizer's error.
    """
    documents = [' '.join(list1), ' '.join(list2)]
    try:
        term_counts = CountVectorizer().fit_transform(documents)
    except ValueError:
        # CountVectorizer raises ValueError ("empty vocabulary") when no
        # document yields a token; there is nothing in common to measure.
        return 0.0
    # cosine_similarity accepts the sparse matrix directly, so no dense
    # conversion is needed; entry [0, 1] is document 1 vs document 2.
    return cosine_similarity(term_counts)[0, 1] * 100

# Function to upload files in Google Colab
def upload_files():
    """Open the Colab upload widget and return the names of the uploaded files.

    Returns:
        A list with the keys of the mapping returned by `files.upload()`,
        in the mapping's own order.
    """
    # Iterating the returned mapping yields its keys (the file names).
    return list(files.upload())

# Function to extract text from PDFs
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF.

    The previous implementation re-assigned `text` on each iteration, so only
    the last page of a multi-page document was returned.

    Args:
        file_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The text of all pages joined with newlines. Pages for which
        `extract_text()` yields nothing (None or an empty string) contribute
        an empty string; a PDF without pages yields "".
    """
    with pdfplumber.open(file_path) as pdf:
        # `or ""` guards against pages with no extractable text.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)

# Function to extract text from DOCX files
def extract_text_from_docx(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_path: Path of the DOCX file to open with `docx.Document`.

    Returns:
        The `.text` of every entry in the document's `paragraphs`, each
        followed by a newline; "" when there are no paragraphs.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return ''.join(para.text + '\n' for para in paragraphs)

# Function to extract text from TXT files
def extract_text_from_txt(file_path):
    """Read a plain-text file and return its content as a string.

    The file is decoded explicitly instead of relying on the platform's
    default encoding, so the result is the same on every machine.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The full file content. A leading UTF-8 byte-order mark is dropped and
        bytes that are not valid UTF-8 are replaced with U+FFFD rather than
        aborting the read.
    """
    # "utf-8-sig" decodes plain UTF-8 and also strips a BOM when one is present.
    with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as file:
        return file.read()


# Function to extract text based on file extension
def extract_text(file_path):
    """Extract text from a file, choosing the reader by file extension.

    The extension is compared case-insensitively, so names such as
    "Resume.PDF" or "CV.Docx" are handled as well.

    Args:
        file_path: Path of a .pdf, .docx or .txt file.

    Returns:
        The text returned by the matching `extract_text_from_*` helper.

    Raises:
        ValueError: If the extension is not .pdf, .docx or .txt.
    """
    # Only the comparison is lower-cased; the helpers get the original path.
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdf'):
        return extract_text_from_pdf(file_path)
    if lowered_path.endswith('.docx'):
        return extract_text_from_docx(file_path)
    if lowered_path.endswith('.txt'):
        return extract_text_from_txt(file_path)
    raise ValueError("Unsupported file format")

# Function to preprocess text (clean HTML tags, etc.)
def preprocess_text(text):
    """Normalise raw document text before keyword extraction.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup. This has to happen before the
         character filter: the filter removes "<" and ">", after which the
         parser can no longer recognise tags and tag names leak into the text.
      2. Lower-case the text.
      3. Drop every character that is not a letter, digit, whitespace,
         apostrophe, period or hyphen. The hyphen is placed last in the
         character class so it is a literal; written as `'-.` it formed a
         range that also kept `( ) * + ,`.
      4. Replace newlines with spaces and remove carriage returns.

    Args:
        text: Raw text, possibly containing HTML tags. May be None or empty.

    Returns:
        The cleaned, lower-cased, single-line string ("" for None/empty input).
    """
    if not text:
        # An extractor may yield nothing for an empty document.
        return ''
    # Clean html tags first, while "<" and ">" are still present.
    plain_text = BeautifulSoup(text, 'html.parser').get_text()
    # Convert to lowercase
    plain_text = plain_text.lower()
    # Regex cleaning: keep letters, digits, whitespace, "'", "." and "-".
    plain_text = re.sub(r"[^a-zA-Z0-9\s'.-]", '', plain_text)
    return plain_text.replace('\n', ' ').replace('\r', '')



In [None]:
# Upload resume files
# upload_files() calls files.upload() and returns the list of uploaded file
# names; the names are kept so the extraction cell can read the files.
print("Upload resume files (PDF, DOCX, or TXT)")
resume_file_names = upload_files()

# Upload job description files
# Second, separate upload call so job descriptions end up in their own list.
print("Upload job description files (PDF, DOCX, or TXT)")
job_description_file_names = upload_files()

Upload resume files (PDF, DOCX, or TXT)


Saving resume_1.pdf to resume_1.pdf
Upload job description files (PDF, DOCX, or TXT)


Saving JD_sde.txt to JD_sde (1).txt


In [None]:
# Extract and preprocess the text of every uploaded file, then merge each
# group into one space-separated string: one for all resumes and one for all
# job descriptions.
resume_text = ' '.join(
    preprocess_text(extract_text(file_name)) for file_name in resume_file_names
)
job_description_text = ' '.join(
    preprocess_text(extract_text(file_name)) for file_name in job_description_file_names
)

print("\nresume_text: ", resume_text)
print("\njob_description_text: ", job_description_text)


resume_text:  aman raj (cid131) 9334817772 (cid128) aman raj website  1717amanrajgmail.com (cid239) linkedin  github work experience national thermal power corporation limited (ntpc, barh) july 2023  august 2023 trainee ntpcs barh township  conducted a comprehensive data analysis using python, unveiling actionable insights for enhancing profitability through expense reduction in operations and maintenance of 1320mw power plant. projects  leadership authored a research paper on fog computing  cloud computing literature (cid128) click  presented a fog to cloud data control framework, incorporating an xor-based delta encoding algorithm to efficiently manage the data overload from iot devices to the cloud.  it was concluded from the simulated result that the proposed algorithm reduces the total computation cost by an average of 3.6, 6.2, 7.1, and 11.9 for bat, ga, pso and ga-pso algorithms respectively. private api for development  django, rest framework, mysql  github  focused on control

In [None]:
# Extract nouns and verbs
# extract_nouns_verbs returns (nouns, verbs) — note the order.
resume_nouns, resume_verbs = extract_nouns_verbs(resume_text)
job_nouns, job_verbs = extract_nouns_verbs(job_description_text)

print("Resume nouns:", resume_nouns)
print("\nResume verbs:", resume_verbs)
print("\nJob nouns:", job_nouns)
print("\nJob verbs:", job_verbs)

# Build one combined keyword list (nouns followed by verbs) per document
resume_words = resume_nouns + resume_verbs
job_words = job_nouns + job_verbs

# Calculate similarity score
# Cosine similarity over the distinct keywords (duplicates removed via set()).
similarity_score = calculate_matching_percentage(set(resume_words), set(job_words))
print(f"\nSimilarity Score: {similarity_score:.4f} %")

# Set-overlap percentage over the same keywords
# (get_matching_percentage de-duplicates its inputs itself).
matching_percentage = get_matching_percentage(resume_words, job_words)
print("\nMatching percentage:", matching_percentage)

# Cosine similarity computed separately for verbs only and for nouns only
verb_matching_percentage = calculate_matching_percentage(set(resume_verbs), set(job_verbs))
noun_matching_percentage = calculate_matching_percentage(set(resume_nouns), set(job_nouns))

print(f"\nVerb Matching Percentage: {verb_matching_percentage:.4f} %")
print(f"\nNoun Matching Percentage: {noun_matching_percentage:.4f} %")

Resume nouns: ['cid128', 'aman', 'raj', 'website', 'work', 'experience', 'power', 'corporation', 'barh', 'trainee', 'ntpcs', 'township', 'data', 'analysis', 'python', 'insights', 'profitability', 'expense', 'reduction', 'operations', 'maintenance', 'power', 'plant', 'projects', 'leadership', 'research', 'paper', 'cid128', 'fog', 'control', 'framework', 'delta', 'data', 'overload', 'devices', 'cloud', 'result', 'computation', 'cost', 'average', 'bat', 'ga', 'pso', 'algorithms', 'api', 'development', 'django', 'rest', 'framework', 'mysql', 'access', 'api', 'crud', 'operations', 'mysql', 'database', 'security', 'session', 'authentication', 'mechanism', 'api', 'hits', 'users', 'limits', 'frequency', 'api', 'calls', 'data', 'visualization', 'census', 'rest', 'cloud', 'cid128', 'stack', 'web', 'application', 'census', 'data', 'population', 'accessibility', 'insights', 'trends', 'eage', 'college', 'student', 'chapter', 'website', 'django', 'rest', 'framework', 'javascript', 'cid128', 'head', 

In [None]:
# Sectional Targeting.

# Function to extract specific sections from resume using regex
def extract_sections(resume_text):
    """Slice a resume into named sections by searching for heading keywords.

    For every section the first occurrence of one of its keywords marks the
    heading. The section body starts right after that keyword and stops at
    the earliest keyword of any section (its own included) found after that
    point, or at the end of the text when there is none.

    Args:
        resume_text: Full resume text to search (matching is case-insensitive).

    Returns:
        A dict with the keys "Summary", "Work Experience", "Skills",
        "Education", "Certifications" and "Achievements", each mapped to the
        stripped section body, or "" when no keyword for it was found.
    """
    # Keyword alternatives that identify each section heading.
    heading_patterns = {
        "Summary": re.compile(r"summary|objective", re.IGNORECASE),
        "Work Experience": re.compile(r"experience|employment|work history", re.IGNORECASE),
        "Skills": re.compile(r"skills|competencies|abilities", re.IGNORECASE),
        "Education": re.compile(r"education|academic", re.IGNORECASE),
        "Certifications": re.compile(r"certifications|licenses", re.IGNORECASE),
        "Achievements": re.compile(r"achievements|awards", re.IGNORECASE)
    }

    extracted = {}
    for name, heading in heading_patterns.items():
        heading_hit = heading.search(resume_text)
        if heading_hit is None:
            extracted[name] = ""
            continue

        body_start = heading_hit.end()
        # Candidate cut-off points: the next keyword hit of every section.
        later_hits = (
            candidate.search(resume_text, body_start)
            for candidate in heading_patterns.values()
        )
        body_end = min(
            (hit.start() for hit in later_hits if hit is not None),
            default=len(resume_text),
        )
        extracted[name] = resume_text[body_start:body_end].strip()

    return extracted

# Function to extract verbs and nouns
def extract_verbs_and_nouns(text):
    """Collect the verbs and nouns of `text` using spaCy part-of-speech tags.

    Args:
        text: String to run through the module-level `nlp` pipeline.

    Returns:
        A `(verbs, nouns)` tuple of lists — verbs first — holding the surface
        text of every token tagged "VERB" and "NOUN" respectively, in
        document order. Duplicates are kept.
    """
    found_verbs, found_nouns = [], []
    for tok in nlp(text):
        tag = tok.pos_
        if tag == "VERB":
            found_verbs.append(tok.text)
        elif tag == "NOUN":
            found_nouns.append(tok.text)
    return found_verbs, found_nouns


# Split the resume into its keyword-delimited sections.
sections = extract_sections(resume_text)

# Pool the verbs and nouns found in every section body.
# NOTE: resume_verbs / resume_nouns (and the job_* and score names below)
# replace the values computed by the whole-document scoring cell.
resume_keywords, resume_verbs, resume_nouns = [], [], []
for section_text in sections.values():
    print("\nSection: ",section_text)
    verbs, nouns = extract_verbs_and_nouns(section_text)
    resume_keywords += verbs + nouns
    resume_verbs += verbs
    resume_nouns += nouns

# The job description is not sectioned; extract its keywords in one go.
job_verbs, job_nouns = extract_verbs_and_nouns(job_description_text)
job_keywords = [*job_verbs, *job_nouns]

# Cosine similarity over the distinct keywords of both documents.
similarity_score = calculate_matching_percentage(set(resume_keywords), set(job_keywords))
print(f"\nSimilarity Score: {similarity_score:.4f} %")

# Set-overlap percentage over the same keywords.
matching_percentage = get_matching_percentage(resume_keywords, job_keywords)
print("\nMatching percentage:", matching_percentage)

# Cosine similarity for verbs only and for nouns only.
verb_matching_percentage = calculate_matching_percentage(set(resume_verbs), set(job_verbs))
noun_matching_percentage = calculate_matching_percentage(set(resume_nouns), set(job_nouns))

print(f"\nVerb Matching Percentage: {verb_matching_percentage:.4f} %")
print(f"\nNoun Matching Percentage: {noun_matching_percentage:.4f} %")



Section:  

Section:  national thermal power corporation limited (ntpc, barh) july 2023  august 2023 trainee ntpcs barh township  conducted a comprehensive data analysis using python, unveiling actionable insights for enhancing profitability through expense reduction in operations and maintenance of 1320mw power plant. projects  leadership authored a research paper on fog computing  cloud computing literature (cid128) click  presented a fog to cloud data control framework, incorporating an xor-based delta encoding algorithm to efficiently manage the data overload from iot devices to the cloud.  it was concluded from the simulated result that the proposed algorithm reduces the total computation cost by an average of 3.6, 6.2, 7.1, and 11.9 for bat, ga, pso and ga-pso algorithms respectively. private api for development  django, rest framework, mysql  github  focused on controlled access, the api integrates crud operations with a mysql database, enhancing security with session authentic



> **Observation**: With sectional targeting, when the similarity score is calculated using `cosine_similarity`, the overall matching percentage is reduced because of the net total number of tokens in the sections.

