<a href="https://colab.research.google.com/github/afzalzada/AfzalApps/blob/main/CV_Matcher_ATOMA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Cell 1: Install/Setup (Run once)
!pip install streamlit pdfplumber python-docx pypandoc scikit-learn spacy nltk
!git clone https://github.com/srbhr/Resume-Matcher.git
%cd Resume-Matcher
!python -m spacy download en_core_web_sm  # For NLP parsing (experience, skills, education)
import nltk
nltk.download('stopwords')
nltk.download('punkt')

Cloning into 'Resume-Matcher'...
remote: Enumerating objects: 3239, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 3239 (delta 0), reused 0 (delta 0), pack-reused 3235 (from 2)[K
Receiving objects: 100% (3239/3239), 110.84 MiB | 37.74 MiB/s, done.
Resolving deltas: 100% (1694/1694), done.
/content/Resume-Matcher/Resume-Matcher
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m12.8/12.8 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Cell 2: Import Libraries and Define Functions
import os
import pandas as pd
from pdfplumber import open as pdf_open
from docx import Document
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from io import BytesIO

# Text extraction function (handles PDF, DOCX, RTF; skips corrupt)
def extract_text(file_path):
    try:
        if file_path.endswith('.pdf'):
            with pdf_open(file_path) as pdf:
                return ' '.join(page.extract_text() or '' for page in pdf.pages)
        elif file_path.endswith(('.docx', '.doc')):
            doc = Document(file_path)
            return ' '.join(p.text for p in doc.paragraphs if p.text.strip())
        elif file_path.endswith('.rtf'):
            # Simple RTF strip (or use pypandoc if installed)
            with open(file_path, 'r') as f:
                text = f.read()
            text = re.sub(r'\\[^ ]*', '', text)  # Basic RTF clean
            return re.sub(r'[^\w\s]', ' ', text)  # Clean non-alphanum
        return None
    except:
        return None

# CV Parser (extracts experience, education, skills, name, phone, email using NLP)
import spacy
nlp = spacy.load('en_core_web_sm')

def parse_cv(text):
    doc = nlp(text)
    entities = {'name': 'Not found', 'position': 'Not found', 'org': 'Not found', 'total_exp': 0, 'relevant_exp': 0, 'education': 'Not found', 'phone': 'Not found', 'email': 'Not found', 'skills': []}

    # Name (first proper nouns)
    names = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
    entities['name'] = names[0] if names else 'Not found'

    # Education (patterns like 'BSc', 'Master', 'PhD')
    edu_patterns = [tok.text for tok in doc if any(term in tok.text.lower() for term in ['bachelor', 'master', 'phd', 'degree', 'bsc', 'msc'])]
    entities['education'] = ', '.join(set(edu_patterns)) if edu_patterns else 'Not found'

    # Experience (years, roles; estimate total/relevant)
    years = re.findall(r'(\d+)(?:\s*(?:years?|yrs?))', text.lower())
    entities['total_exp'] = sum(int(y) for y in years[:3]) if years else 0  # Top 3 for total
    # Relevant: Telecom/role match (customize with JD keywords)
    telecom_keywords = ['telecom', 'telecommunications', 'network', '5g', 'wireless']  # Add from JD
    relevant_years = sum(int(y) for y in years if any(kw in text.lower() for kw in telecom_keywords))
    entities['relevant_exp'] = relevant_years or entities['total_exp'] * 0.5  # Fallback estimate

    # Position/Org (job titles, companies)
    positions = [ent.text for ent in doc.ents if ent.label_ in ['ORG', 'WORK_OF_ART']]
    entities['position'] = positions[0] if positions else 'Not found'
    entities['org'] = positions[1] if len(positions) > 1 else 'Not found'

    # Skills (noun chunks)
    entities['skills'] = [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) <= 3]

    # Phone/Email regex
    phone_match = re.search(r'\b(\+?\d{1,3}[-.\s]?\d{3}[-.\s]?\d{3}[-.\s]?\d{4})\b', text)
    entities['phone'] = phone_match.group(1) if phone_match else 'Not found'
    email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    entities['email'] = email_match.group(0) if email_match else 'Not found'

    return entities

# Function to check if file contains CV/resume content (process if any CV elements present)
def is_resume(file_name, text):
    cv_indicators = ["work experience", "education", "skills", "employment", "experience", "qualifications", "resume", "cv"]
    text_lower = text.lower() if text else ""

    # Accept if any CV indicator is present, regardless of cover letter content
    if any(indicator in text_lower for indicator in cv_indicators):
        return True

    # Reject only if no CV indicators are found
    return False

In [16]:
# Cell 3: Upload Files (Run this to upload JD and CVs)
from google.colab import files
print("Upload JD (PDF/DOCX/RTF):")
jd_uploaded = files.upload()
jd_file = list(jd_uploaded.keys())[0]
jd_text = extract_text(jd_file)
if not jd_text:
    print("JD extraction failed!")

print("Upload CVs and other documents (multiple, PDF/DOCX/RTF):")
cv_uploaded = files.upload()  # Upload all 100+ at once

cv_data = []
for filename in cv_uploaded.keys():
    cv_text = extract_text(filename)
    if cv_text and is_resume(filename, cv_text):
        parsed = parse_cv(cv_text)
        parsed['cv_name'] = filename
        parsed['text'] = cv_text  # Preserve raw text
        cv_data.append(parsed)
        print(f"Processed document: {filename} - Name: {parsed['name']}")
    else:
        print(f"Skipped document: {filename} - No CV/resume content detected")

Upload JD (PDF/DOCX/RTF):


Saving Manager ‚Äì Procurement.pdf to Manager ‚Äì Procurement (4).pdf
Upload CVs and other documents (multiple, PDF/DOCX/RTF):


Saving MA-Sakha-CV.pdf to MA-Sakha-CV (1).pdf
Saving MOF - CV - 2025.pdf to MOF - CV - 2025 (1).pdf
Saving Sharifullah CV1.pdf to Sharifullah CV1 (1).pdf
Saving elham-Cv.pdf to elham-Cv (1).pdf
Saving my cv.pdf to my cv (1).pdf
Saving 1- SAFI Khaleelullah..pdf to 1- SAFI Khaleelullah. (1).pdf
Saving Abdul Maftoon Maftoon Cover Letter and CV.pdf to Abdul Maftoon Maftoon Cover Letter and CV (2).pdf
Saving Update of CV.pdf to Update of CV (1).pdf
Saving C V.pdf to C V (1).pdf
Saving fayaz new (7).pdf to fayaz new (7) (1).pdf
Saving Cv Iqbal.pdf to Cv Iqbal (1).pdf
Saving abdul_Qadir_Nawed_CV.pdf to abdul_Qadir_Nawed_CV (1).pdf
Saving MP. Mirbat Khan Mangal CV¬≤.üåç¬≤.pdf to MP. Mirbat Khan Mangal CV¬≤.üåç¬≤ (1).pdf
Saving Nasratullah Zarif's CV.pdf to Nasratullah Zarif's CV (1).pdf
Saving Abdul Rauof Azim  2025 Cover letter To whom it may concern..pdf to Abdul Rauof Azim  2025 Cover letter To whom it may concern. (1).pdf
Saving Hamid Resume & Cover Letter.pdf to Hamid Resume & Cover Let



Processed document: MA-Sakha-CV (1).pdf - Name: Munir Ahmad ‚ÄúSakha
Skipped document: MOF - CV - 2025 (1).pdf - No CV/resume content detected
Processed document: Sharifullah CV1 (1).pdf - Name: Skill
Processed document: elham-Cv (1).pdf - Name: Mohammad Elham Safi
Brand
Processed document: my cv (1).pdf - Name: phone:(+93)794657448
E-mill
Processed document: 1- SAFI Khaleelullah. (1).pdf - Name: MS Word
Processed document: Abdul Maftoon Maftoon Cover Letter and CV (2).pdf - Name: Abdul Maftoon




Processed document: Update of CV (1).pdf - Name: Azizurrahman Safi




Processed document: C V (1).pdf - Name: Shafiullah Amani
Processed document: fayaz new (7) (1).pdf - Name: Bahvan‚Äôs
Processed document: Cv Iqbal (1).pdf - Name: Curriculum Vitae




Processed document: abdul_Qadir_Nawed_CV (1).pdf - Name: Abdul Qadir




Processed document: MP. Mirbat Khan Mangal CV¬≤.üåç¬≤ (1).pdf - Name: MIRBAT KHAN MANGAL




Processed document: Nasratullah Zarif's CV (1).pdf - Name: Nasratullah Zarif
Telephone
Processed document: Abdul Rauof Azim  2025 Cover letter To whom it may concern. (1).pdf - Name: Abdul Rauof Azim
Processed document: Hamid Resume & Cover Letter (1).pdf - Name: Shah Shaheed
Processed document: mohammad aqua new cv (1).pdf - Name: Mohammad Aqua kohistani
Processed document: Ziauddin ZIa-p (1).pdf - Name: Ziauddin Zia





Processed document: saeed+cv (1).pdf - Name: Madam
Processed document: fahim Mirkhil CV (1).pdf - Name: Khushhal Khan
Processed document: UPDATED CV AHMAD FIRDOWS BEHGUZIN (1).pdf - Name: Ahmad Firdows
Processed document: @ S.H CVr 2 (1).pdf - Name: Samiullah
Processed document: Sabawoon Cv for Procurement Manager - ATOMA nternational (1).pdf - Name: Farhatullah Sabawoon
sabawoon466@gmail.com
Processed document: 001 Khyber Resume CV (1).pdf - Name: Khyber Massoudy
Processed document: Mohammad yahia nori cv (1).pdf - Name: Request
Processed document: Farhad Mahir CV (1).pdf - Name: Alami Plaza
Processed document: Shamsulhadi_Ahadi CV (1).pdf - Name: Shamsulhadi Ahadi
Skipped document: Samiullah Cv (1).PDF - No CV/resume content detected
Processed document: 0001- Waheedullah's CV Cover letter-Manager Procurment (1).pdf - Name: Warehouse
Processed document: resume (1).pdf - Name: Hikmatullah Atal
Processed document: CL & CV Zabiullah Rahmani (1).pdf - Name: Zabiullah Rahmani Curr Residenc



Processed document: Mirwais Attaie CV & Application Letter (1).pdf - Name: Khair Khana
Processed document: Asadullah‚Äôs CV (1).pdf - Name: Curriculum Vitae
Processed document: Cover letter of Haidari (1).pdf - Name: Cultures
Processed document: Rahimullah Asadi CV (1).pdf - Name: Rahimullah Asadi CV
Processed document: Azizullah_Saleem_CV (1).pdf - Name: C++,Matlab
Processed document: Khalil cv  (1).pdf - Name: KHALILURAHAN ASTAN
khalilurahman1752@gmail.com
Processed document: Muhibullah khaksar CV (1).pdf - Name: Muhibullah Khaksar
Email
Processed document: mohammad abid(1) (1).pdf - Name: Ihaveabachelordegreeinpoliticalscienceand4years‚Äôexperiencein"FinanceManager




Processed document: imran cv 1 (1).pdf - Name: Jalal Abad




Processed document: Navedullah Resume (1).pdf - Name: Navedullah Ibrahim
Processed document: Hqjoo CV & Cover Letter for Porcuement Manager at ATOMA (1).pdf - Name: Amir Mohammad
Processed document: Fawad Bustan - CV (1).pdf - Name: Concern
Processed document: Khwaja.Shafi.Mohammad.sediqi (1).pdf - Name: Khwaja Shafi Mohammad Sediqi
Processed document: Atta M Taniwal CV. (1).pdf - Name: Atta Mohammad Taniwal
Processed document: ahmadi cv (1).pdf - Name: Jafar Ali "Ahmadi
Processed document: Sayed Idrees Sadat - Cover Letter and CV (1).pdf - Name: Sayed Idrees Sadat
Processed document: Sayed Shafi Mosawi CV (1).pdf - Name: Sayed Shafi Mosawi
Processed document: Mohammad Suliman Nesaar (1).pdf - Name: Mohammad Suliman Nesaar
Processed document: Assad Ullah Hamidi Resume (1).pdf - Name: Abbottabad Pakistan
Processed document: C&V Muohammad ullah Mukhlis-13-1-1-1-1 (1).pdf - Name: Mohammad ullah Mukhlis

Processed document: CV + Cover letter _ Abdul Mujeeb Omar _ ATOMA (1).pdf - Name: Stat



Processed document: Aziz Rahim CV (1).docx - Name: April.2022-Nov.2024
Skipped document: Ahmad Khisraw Hotak_Manager Procurement.xlsx - No CV/resume content detected
Skipped document: khwaja_shafi_mohammad.sediqi_Manager - Procurement.xlsx - No CV/resume content detected
Skipped document: Job Application Form Afsar Salihe.xlsx - No CV/resume content detected
Skipped document: Job Application Form, Mashal Behnaward_ Procurement Manager.xlsx - No CV/resume content detected
Skipped document: AbdulWaliKochi_ Manager - Procurement.xlsx - No CV/resume content detected
Skipped document: Job Application Form.xlsx - No CV/resume content detected
Skipped document: Abdul Bashir Yumgani_ Procurement manager.xlsx - No CV/resume content detected
Skipped document: Amanullah Yawari-Manager - Procurement.xlsx - No CV/resume content detected
Skipped document: Ahmad Omar Dost Job Application Form.xlsx - No CV/resume content detected
Skipped document: Atta M Taniwal Application.xlsx - No CV/resume content

In [17]:
# Cell4 # JD Parsing and Scoring/Ranking
jd_doc = nlp(jd_text)
jd_keywords = [token.text.lower() for token in jd_doc if token.pos_ in ['NOUN', 'PROPN'] and not token.is_stop]
jd_keywords.extend(['telecom', 'experience', 'years'])  # Boost priorities

# TF-IDF for similarity (weights experience/education/skills)
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
jd_corpus = [jd_text]
cv_corpus = [cv['text'][:2000] for cv in cv_data]  # Truncate for efficiency
all_corpus = jd_corpus + cv_corpus

tfidf_matrix = vectorizer.fit_transform(all_corpus)
jd_vec = tfidf_matrix[0:1]
cv_vecs = tfidf_matrix[1:]

# Cosine similarity scores (higher = better match)
scores = cosine_similarity(jd_vec, cv_vecs).flatten()

# Weighting: 50% experience (role/tenure/telecom), 30% education, 20% skills
weighted_scores = []
for i, score in enumerate(scores):
    exp_weight = 0.5 * (cv_data[i]['relevant_exp'] / 10) + 0.5 * score  # Normalize exp
    edu_weight = 0.3 * (1 if 'degree' in cv_data[i]['education'].lower() else 0)
    skill_weight = 0.2 * sum(1 for skill in jd_keywords if skill in cv_data[i]['skills'])
    weighted = exp_weight + edu_weight + skill_weight
    weighted_scores.append(weighted * 100)  # Scale to 0-100

# Rank top 20
ranked = sorted(zip(cv_data, weighted_scores), key=lambda x: x[1], reverse=True)[:20]
top_20 = [dict(cv, score=round(s, 2), reasons=f"Match: {s:.1f}% (Exp: {cv['relevant_exp']}yrs telecom/role, Edu: {cv['education']}, Skills: {len(cv['skills'])} matches)") for cv, s in ranked]

# Export Excel (save to filesystem first)
output = BytesIO()
with pd.ExcelWriter(output, engine='openpyxl') as writer:
    df = pd.DataFrame(top_20)
    df[['cv_name', 'name', 'position', 'org', 'total_exp', 'relevant_exp', 'education', 'phone', 'email', 'reasons']].to_excel(writer, index=False, sheet_name='Top 20')
output.seek(0)  # Reset pointer
with open('top_20_candidates.xlsx', 'wb') as f:
    f.write(output.getvalue())  # Save to filesystem
from google.colab import files
files.download('top_20_candidates.xlsx')  # Download the saved file

print("Top 20 ranked and Excel downloaded!")
print(df[['cv_name', 'score', 'reasons']])  # Preview

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Top 20 ranked and Excel downloaded!
                                              cv_name  score  \
0                        001 Khyber Resume CV (1).pdf  250.0   
1                                Update of CV (1).pdf  190.0   
2    Sayed Idrees Sadat - Cover Letter and CV (1).pdf  185.0   
3             2.Safiullah Afzali-Cover Letter (1).pdf  170.0   
4      Mirwais Attaie CV & Application Letter (1).pdf  160.0   
5                           Resume_Habibullah (1).pdf  140.0   
6   01.Safiullah Afzali CV-Procurement Manager (1)...  135.0   
7                           Fawad Bustan - CV (1).pdf  130.0   
8                                 MA-Sakha-CV (1).pdf  120.0   
9   0001- Waheedullah's CV Cover letter-Manager Pr...  115.0   
10          9102025-Mohammad Zubair Rahimi-CV (1).pdf  115.0   
11              MP. Mirbat Khan Mangal CV¬≤.üåç¬≤ (1).pdf  100.0   
12                        Noorullah Noor's CV (1).pdf  100.0   
13                       Jalaluddin Hakimi CV (1).pdf   90.0   