In [4]:
!pip install PyPDF2
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-p

In [5]:
import pdfplumber
import re
from difflib import get_close_matches
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Create the dictionary with job roles as keys and skills as values
data = {
    'Software Engineer': 'Skilled in Python, machine learning, and web development with Django',
    'Backend Developer': 'Experience in Java, Spring Boot, and RESTful web services',
    'Data Analyst': 'Proficient in SQL, data analysis, and Tableau dashboards',
    'Frontend Developer': 'Expert in HTML, CSS, JavaScript, and React development',
    'AI/ML Engineer': 'Knowledge of deep learning, neural networks, and NLP tasks',
    'DevOps Engineer': 'Experienced in DevOps, Docker, Kubernetes, and cloud deployment'
}

skill_keywords = [
    'python', 'java', 'machine learning', 'django', 'spring boot', 'sql',
    'data analysis', 'tableau', 'html', 'css', 'javascript', 'react',
    'nlp', 'neural networks', 'docker', 'kubernetes'
]

# Step 2: Preprocess the text
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    return set(text.split())  # Convert to a set of words

preprocessed_data = {role: preprocess_text(skills) for role, skills in data.items()}

# Step 3: Extract text from PDF resume using pdfplumber
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""  # Handle None if text extraction fails
    return text

# Step 4: Extract the "Objective" section from the resume
def extract_objective(text):
    objective_regex = re.compile(r"(objective|career objective)(.*?)(\n[A-Z]|$)", re.IGNORECASE | re.DOTALL)
    match = objective_regex.search(text)
    if match:
        return match.group(2).strip()  # Return the text between Objective and next section
    else:
        return None

# Step 5: Extract skills from the resume text based on predefined keywords
def extract_skills_from_resume(resume_text):
    resume_text = preprocess_text(resume_text)  # Preprocess the text
    matched_skills = set()
    for skill in skill_keywords:
        if get_close_matches(skill, resume_text, n=1, cutoff=0.8):  # 80% similarity threshold
            matched_skills.add(skill)
    return matched_skills

# Step 6: Match extracted skills to the job roles
def match_skills_to_roles(matched_skills):
    matching_roles = []
    for role, skills in preprocessed_data.items():
        matching_skills = matched_skills.intersection(skills)
        if len(matching_skills) >= 2:  # If two or more skills match, consider the role suitable
            matching_roles.append((role, matching_skills))
    return matching_roles

# Step 7: Calculate Cosine Similarity
def calculate_similarity(resume_text, job_description):
    vectorizer = TfidfVectorizer()
    documents = [job_description, resume_text]
    tfidf_matrix = vectorizer.fit_transform(documents)
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0] * 100  # Return similarity as percentage

# Step 8: Main logic to analyze the PDF resume
def analyze_resume(pdf_path):
    # Extract the text from the PDF
    resume_text = extract_text_from_pdf(pdf_path)

    # Extract the objective section
    objective_text = extract_objective(resume_text)
    if objective_text:
        print("Extracted Objective:", objective_text)
    else:
        print("Objective not found in the resume.")

    # Define a job description for comparison (You can modify this)
    job_description = input("Enter the job description: ")

    # Calculate similarity
    similarity_percentage = calculate_similarity(resume_text, job_description)
    print(f"Resume is {similarity_percentage:.2f}% similar to the job description.")

    # Extract skills from the resume
    extracted_skills = extract_skills_from_resume(resume_text)

    # Match the extracted skills to job roles
    matching_roles = match_skills_to_roles(extracted_skills)

    # Print the results
    if matching_roles:
        print("The resume is suitable for the following job roles based on skill matches:")
        for role, matched_skills in matching_roles:
            print(f"- {role}: matched skills {', '.join(matched_skills)}")
    else:
        print("No suitable job roles found based on the resume's skills.")

# Get PDF file path from user input
pdf_path = input("Please enter the file path of the resume (PDF): ")
print()
analyze_resume(pdf_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Please enter the file path of the resume (PDF): /content/Shree_Krishna_Kanth (1).pdf

Objective not found in the resume.
Enter the job description: Good knowledge in python, sql
Resume is 10.05% similar to the job description.
The resume is suitable for the following job roles based on skill matches:
- Data Analyst: matched skills sql, tableau


In [6]:
import pdfplumber
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import make_pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Prepare labeled dataset for job roles and skills
data = {
    'resume_text': [
        "Skilled in Python, machine learning, and web development with Django.",
        "Experience in Java, Spring Boot, and RESTful web services.",
        "Proficient in SQL, data analysis, and Tableau dashboards.",
        "Expert in HTML, CSS, JavaScript, and React development.",
        "Knowledge of deep learning, neural networks, and NLP tasks.",
        "Experienced in DevOps, Docker, Kubernetes, and cloud deployment."
    ],
    'job_role': [
        'Software Engineer',
        'Backend Developer',
        'Data Analyst',
        'Frontend Developer',
        'AI/ML Engineer',
        'DevOps Engineer'
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Create the ML model
X = df['resume_text']
y = df['job_role']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that first vectorizes the text and then applies Logistic Regression
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))

# Train the model
pipeline.fit(X_train, y_train)

# Step 3: Evaluate the model
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%')

# Step 4: Function to extract text from PDF resume using pdfplumber
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() or ""  # Handle None if text extraction fails
    return text

# Step 5: Extract the "Objective" section from the resume
def extract_objective(text):
    objective_regex = re.compile(r"(objective|career objective)(.*?)(\n[A-Z]|$)", re.IGNORECASE | re.DOTALL)
    match = objective_regex.search(text)
    if match:
        return match.group(2).strip()  # Return the text between Objective and next section
    else:
        return None

# Step 6: Main logic to analyze the PDF resume
def analyze_resume(pdf_path):
    # Extract the text from the PDF
    resume_text = extract_text_from_pdf(pdf_path)

    # Extract the objective section
    objective_text = extract_objective(resume_text)
    if objective_text:
        print("Extracted Objective:", objective_text)
    else:
        print("Objective not found in the resume.")

    # Use the ML model to predict job role based on resume text
    predicted_role = pipeline.predict([resume_text])[0]
    print(f'The predicted job role for the resume is: {predicted_role}')

# Get PDF file path from user input
pdf_path = input("Please enter the file path of the resume (PDF): ")
print()
analyze_resume(pdf_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                    precision    recall  f1-score   support

 Backend Developer       0.00      0.00      0.00       1.0
   DevOps Engineer       0.00      0.00      0.00       0.0
Frontend Developer       0.00      0.00      0.00       0.0
 Software Engineer       0.00      0.00      0.00       1.0

          accuracy                           0.00       2.0
         macro avg       0.00      0.00      0.00       2.0
      weighted avg       0.00      0.00      0.00       2.0

Accuracy: 0.00%
Please enter the file path of the resume (PDF): /content/Shree_Krishna_Kanth (1).pdf

Objective not found in the resume.
The predicted job role for the resume is: Data Analyst
