<a href="https://colab.research.google.com/github/afzalzada/AfzalApps/blob/main/CV_Matcher_ATOMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Cell 1: Install/Setup (Run once)
!pip install streamlit pdfplumber python-docx pypandoc scikit-learn spacy nltk
!git clone https://github.com/srbhr/Resume-Matcher.git
%cd Resume-Matcher
!python -m spacy download en_core_web_sm  # For NLP parsing (experience, skills, education)
import nltk
nltk.download('stopwords')
nltk.download('punkt')

Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pypandoc
  Downloading pypandoc-1.15-py3-none-any.whl.metadata (16 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading streamli

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Cell 2: Import Libraries and Define Functions
import os
import pandas as pd
from pdfplumber import open as pdf_open
from docx import Document
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from io import BytesIO

# Text extraction function (handles PDF, DOCX, RTF; skips corrupt)
def extract_text(file_path):
    try:
        if file_path.endswith('.pdf'):
            with pdf_open(file_path) as pdf:
                return ' '.join(page.extract_text() or '' for page in pdf.pages)
        elif file_path.endswith(('.docx', '.doc')):
            doc = Document(file_path)
            return ' '.join(p.text for p in doc.paragraphs if p.text.strip())
        elif file_path.endswith('.rtf'):
            # Simple RTF strip (or use pypandoc if installed)
            with open(file_path, 'r') as f:
                text = f.read()
            text = re.sub(r'\\[^ ]*', '', text)  # Basic RTF clean
            return re.sub(r'[^\w\s]', ' ', text)  # Clean non-alphanum
        return None
    except:
        return None

# CV Parser (extracts experience, education, skills, name, phone, email using NLP)
import spacy
nlp = spacy.load('en_core_web_sm')

def parse_cv(text):
    doc = nlp(text)
    entities = {'name': 'Not found', 'position': 'Not found', 'org': 'Not found', 'total_exp': 0, 'relevant_exp': 0, 'education': 'Not found', 'phone': 'Not found', 'email': 'Not found', 'skills': []}

    # Name (first proper nouns)
    names = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    entities['name'] = names[0] if names else 'Not found'

    # Education (patterns like 'BSc', 'Master', 'PhD')
    edu_patterns = [tok.text for tok in doc if any(term in tok.text.lower() for term in ['bachelor', 'master', 'phd', 'degree', 'bsc', 'msc'])]
    entities['education'] = ', '.join(set(edu_patterns)) if edu_patterns else 'Not found'

    # Experience (years, roles; estimate total/relevant)
    years = re.findall(r'(\d+)(?:\s*(?:years?|yrs?))', text.lower())
    entities['total_exp'] = sum(int(y) for y in years[:3]) if years else 0  # Top 3 for total
    # Relevant: Telecom/role match (customize with JD keywords)
    telecom_keywords = ['telecom', 'telecommunications', 'network', '4g', 'wireless']  # Add from JD
    relevant_years = sum(int(y) for y in years if any(kw in text.lower() for kw in telecom_keywords))
    entities['relevant_exp'] = relevant_years or entities['total_exp'] * 0.5  # Fallback estimate

    # Position/Org (job titles, companies)
    positions = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'WORK_OF_ART']]
    entities['position'] = positions[0] if positions else 'Not found'
    entities['org'] = positions[1] if len(positions) > 1 else 'Not found'

    # Skills (noun chunks)
    entities['skills'] = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]

    # Phone/Email regex
    phone_match = re.search(r'\b(\+?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4})\b', text)
    entities['phone'] = phone_match.group(1) if phone_match else 'Not found'
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    entities['email'] = email_match.group(0) if email_match else 'Not found'

    return entities

In [3]:
# Cell 3: Upload Files (Run this to upload JD and CVs)
from google.colab import files
print("Upload JD (PDF/DOCX/RTF):")
jd_uploaded = files.upload()
jd_file = list(jd_uploaded.keys())[0]
jd_text = extract_text(jd_file)
if not jd_text:
    print("JD extraction failed!")

print("Upload CVs (multiple, PDF/DOCX/RTF):")
cv_uploaded = files.upload()  # Upload all 100+ at once

cv_data = []
for filename in cv_uploaded.keys():
    cv_text = extract_text(filename)
    if cv_text:
        parsed = parse_cv(cv_text)
        parsed['cv_name'] = filename
        parsed['text'] = cv_text    #add raw text back
        cv_data.append(parsed)
        print(f"Parsed {filename}: {parsed['name']}")

Upload JD (PDF/DOCX/RTF):


Saving Manager – Procurement.pdf to Manager – Procurement.pdf
Upload CVs (multiple, PDF/DOCX/RTF):


Saving @ S.H CVr 2.pdf to @ S.H CVr 2.pdf
Saving 0001- Waheedullah's CV Cover letter-Manager Procurment.pdf to 0001- Waheedullah's CV Cover letter-Manager Procurment.pdf
Saving 001 Khyber Resume CV.pdf to 001 Khyber Resume CV.pdf
Saving 01.Safiullah Afzali CV-Procurement Manager.pdf to 01.Safiullah Afzali CV-Procurement Manager.pdf
Saving 1- SAFI Khaleelullah..pdf to 1- SAFI Khaleelullah..pdf
Saving 2.Safiullah Afzali-Cover Letter.pdf to 2.Safiullah Afzali-Cover Letter.pdf
Saving 09- Wali Kochi's Updated  CV.pdf to 09- Wali Kochi's Updated  CV.pdf
Saving 2025 Faridullah Hotak Cv.pdf to 2025 Faridullah Hotak Cv.pdf
Saving 9102025-Mohammad Zubair Rahimi-CV.pdf to 9102025-Mohammad Zubair Rahimi-CV.pdf
Saving Abdul Maftoon Maftoon Cover Letter and CV.pdf to Abdul Maftoon Maftoon Cover Letter and CV.pdf
Saving Abdul Qader Sulaimankhail CV+Cover L.pdf to Abdul Qader Sulaimankhail CV+Cover L.pdf
Parsed @ S.H CVr 2.pdf: Samiullah
Parsed 0001- Waheedullah's CV Cover letter-Manager Procurment.pd

In [5]:
# JD Parsing and Scoring/Ranking
jd_doc = nlp(jd_text)
jd_keywords = [token.text.lower() for token in jd_doc if token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop]
jd_keywords.extend(['telecom', 'experience', 'years'])  # Boost priorities

# TF-IDF for similarity (weights experience/education/skills)
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
jd_corpus = [jd_text]
cv_corpus = [cv['text'][:2000] for cv in cv_data]  # Truncate for efficiency
all_corpus = jd_corpus + cv_corpus

tfidf_matrix = vectorizer.fit_transform(all_corpus)
jd_vec = tfidf_matrix[0:1]
cv_vecs = tfidf_matrix[1:]

# Cosine similarity scores (higher = better match)
scores = cosine_similarity(jd_vec, cv_vecs).flatten()

# Weighting: 50% experience (role/tenure/telecom), 30% education, 20% skills
weighted_scores = []
for i, score in enumerate(scores):
    exp_weight = 0.5 * (cv_data[i]['relevant_exp'] / 10) + 0.5 * score  # Normalize exp
    edu_weight = 0.3 * (1 if 'degree' in cv_data[i]['education'].lower() else 0)
    skill_weight = 0.2 * sum(1 for skill in jd_keywords if skill in cv_data[i]['skills'])
    weighted = exp_weight + edu_weight + skill_weight
    weighted_scores.append(weighted * 100)  # Scale to 0-100

# Rank top 20
ranked = sorted(zip(cv_data, weighted_scores), key=lambda x: x[1], reverse=True)[:20]
top_20 = [dict(cv, score=round(s, 2), reasons=f"Match: {s:.1f}% (Exp: {cv['relevant_exp']}yrs telecom/role, Edu: {cv['education']}, Skills: {len(cv['skills'])} matches)") for cv, s in ranked]

# Export Excel (fix: save to filesystem first)
output = BytesIO()
with pd.ExcelWriter(output, engine='openpyxl') as writer:
    df = pd.DataFrame(top_20)
    df[['cv_name', 'name', 'position', 'org', 'total_exp', 'relevant_exp', 'education', 'phone', 'email', 'reasons']].to_excel(writer, index=False, sheet_name='Top 20')
output.seek(0)  # Reset pointer
with open('top_20_candidates.xlsx', 'wb') as f:
    f.write(output.getvalue())  # Save to filesystem
from google.colab import files
files.download('top_20_candidates.xlsx')  # Download the saved file

print("Top 20 ranked and Excel downloaded!")
print(df[['cv_name', 'score', 'reasons']])  # Preview

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Top 20 ranked and Excel downloaded!
                                              cv_name  score  \
0                            001 Khyber Resume CV.pdf  250.0   
1                 2.Safiullah Afzali-Cover Letter.pdf  170.0   
2      01.Safiullah Afzali CV-Procurement Manager.pdf  135.0   
3   0001- Waheedullah's CV Cover letter-Manager Pr...  115.0   
4               9102025-Mohammad Zubair Rahimi-CV.pdf  115.0   
5                           1- SAFI Khaleelullah..pdf   60.0   
6                                     @ S.H CVr 2.pdf   30.0   
7                        2025 Faridullah Hotak Cv.pdf   30.0   
8            Abdul Qader Sulaimankhail CV+Cover L.pdf   25.0   
9                    09- Wali Kochi's Updated  CV.pdf    0.0   
10      Abdul Maftoon Maftoon Cover Letter and CV.pdf    0.0   

                                              reasons  
0   Match: 250.0% (Exp: 40yrs telecom/role, Edu: D...  
1   Match: 170.0% (Exp: 24yrs telecom/role, Edu: B...  
2   Match: 135.0% (Exp: 13y