In [12]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [27]:
# Import necessary libraries
import pandas as pd
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Define functions for text extraction and processing

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_objective(text):
    objective_regex = re.compile(r"(objective|career objective)(.*?)(\n[A-Z]|$)", re.IGNORECASE | re.DOTALL)
    objective_regex
    match = objective_regex.search(text)
    if match:
        return match.group(2).strip()  # Return the text between Objective and next section
    else:
        return "Objective not found"

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Step 2: Load job descriptions from CSV
job_descriptions_df = pd.read_csv("/content/shuffled_job_descriptions.csv")
job_descriptions_df
# Preprocess job descriptions
preprocessed_job_descriptions = job_descriptions_df['Job Description'].apply(preprocess_text)

# Prepare for model training
job_titles = job_descriptions_df['Job Title']

# Step 3: Input resume path
resume_pdf_path = input("Enter the resume[pdf] path: ")
resume_text = extract_text_from_pdf(resume_pdf_path)
objective_text = extract_objective(resume_text)

# If an objective is found, preprocess it; otherwise, use the full resume
if objective_text != "Objective not found":
    preprocessed_resume = preprocess_text(objective_text)
else:
    preprocessed_resume = preprocess_text(resume_text)

# Combine job descriptions and the processed resume for TF-IDF
documents = preprocessed_job_descriptions.tolist() + [preprocessed_resume]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Step 4: Create train-test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix[:-1], job_titles, test_size=0.2, random_state=42)

# Step 5: Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Check the accuracy of the model
accuracy = model.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

# Step 6: Predict the job title for the provided resume
predicted_job_title = model.predict(tfidf_matrix[-1:])
print(f"The resume matches the job title: {predicted_job_title[0]}")

# Step 7: Calculate the similarity and reasons for selection
cosine_sim = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1])
similarity_percentage = cosine_sim[0] * 100

# Get the index of the best matching job description
best_match_index = cosine_sim.argmax()
best_job_description = preprocessed_job_descriptions.iloc[best_match_index]
best_job_title = job_titles.iloc[best_match_index]

# Extracting keywords for explanation
def extract_keywords(text):
    # A simple keyword extraction: words appearing frequently
    tokens = word_tokenize(text)
    return set(word for word in tokens if word.isalpha())

# Extract keywords from the best job description and resume
job_description_keywords = extract_keywords(best_job_description)
resume_keywords = extract_keywords(preprocessed_resume)

# Find matching keywords
matching_keywords = job_description_keywords.intersection(resume_keywords)

# Print similarity and reasons for selection
print(f"Similarity score for {best_job_title}: {similarity_percentage[best_match_index]:.2f}%")
print("Reasons for selection:")
print(f"- Matching Keywords: {', '.join(matching_keywords) if matching_keywords else 'None'}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enter the resume[pdf] path: /content/Stockholm-Resume-Template-Simple.pdf
Model accuracy: 0.96
The resume matches the job title: Accountant
Similarity score for Data Analyst: 16.86%
Reasons for selection:
- Matching Keywords: performance, inventory, chain, order, supply


In [23]:
job_descriptions_df

Unnamed: 0,Job Title,Job Description
0,Human Resources Manager,Collaborate with department heads to create tr...
1,Sales Manager,Develop strategies for selling to different cu...
2,Software Engineer,"Build and maintain web applications, ensuring ..."
3,Sales Manager,Lead the team in identifying new customer acqu...
4,Data Analyst,Use machine learning techniques to develop pre...
...,...,...
329,Customer Service Representative,Ensure that all customer interactions are in c...
330,Sales Manager,Ensure that the sales team is compliant with a...
331,Accountant,Accountants frequently analyze trends in finan...
332,Data Analyst,Analyze data using various tools such as Excel...


NameError: name 'objective_regex' is not defined

In [29]:
f=pd.read_csv("/content/shuffled_job_descriptions.csv")
f

Unnamed: 0,Job Title,Job Description
0,Human Resources Manager,Collaborate with department heads to create tr...
1,Sales Manager,Develop strategies for selling to different cu...
2,Software Engineer,"Build and maintain web applications, ensuring ..."
3,Sales Manager,Lead the team in identifying new customer acqu...
4,Data Analyst,Use machine learning techniques to develop pre...
...,...,...
329,Customer Service Representative,Ensure that all customer interactions are in c...
330,Sales Manager,Ensure that the sales team is compliant with a...
331,Accountant,Accountants frequently analyze trends in finan...
332,Data Analyst,Analyze data using various tools such as Excel...


In [30]:
f["Job Title"].unique()

array(['Human Resources Manager', 'Sales Manager', 'Software Engineer',
       'Data Analyst', 'Customer Service Representative', 'Accountant',
       'Marketing Manager', 'Project Manager'], dtype=object)

In [32]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.11


In [41]:
import pandas as pd
import re
import fitz  # PyMuPDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load job descriptions dataset
job_df = pd.read_csv("/content/shuffled_job_descriptions.csv")

# Function to extract the career objective from the resume
def extract_profile(resume_text):
    # Assume the profile is the first section in the resume
    # This regex looks for lines starting with "Profile" or "Objective"
    match = re.search(r'(Profile|Objective|Career|OBJECTIVE|Career Objective|Summary)(.*?)(Education|Experience|Skills|$)', resume_text, re.DOTALL)
    if match:
        return match.group(2).strip()
    return ""

# Load and read the resume from a PDF file
resume_file_path = input("Enter the resume path :")  # Change this to your resume file path
doc = fitz.open(resume_file_path)  # Open the PDF file
resume_text = ""
for page in doc:  # Iterate through each page
    resume_text += page.get_text()  # Extract text from each page
doc.close()  # Close the PDF file

# Extract the profile or career objective from the resume
profile_text = extract_profile(resume_text)

# If no profile is found, alert the user
if not profile_text:
    print("No career objective/profile found in the resume.")
else:
    print("Extracted Profile/Career Objective:")
    print(profile_text)

# Combine the job titles and descriptions into a single text for analysis
job_descriptions = job_df['Job Title'] + " " + job_df['Job Description']
job_descriptions = job_descriptions.astype(str)  # Ensure all entries are strings

# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit the model on job descriptions and the extracted profile
tfidf_matrix = vectorizer.fit_transform(job_descriptions.tolist() + [profile_text])

# Calculate cosine similarity between the profile and job descriptions
similarity_matrix = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1])

# Get job titles sorted by similarity score
similarity_scores = list(enumerate(similarity_matrix.flatten()))
similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

# Number of job suggestions
num_suggestions = 5
print(f"\nTop {num_suggestions} Job Roles Suggested:")

# Display the top suggested job roles
for idx, score in similarity_scores[:num_suggestions]:
    print(f"Job Title: {job_df['Job Title'][idx]}, Similarity Score: {score:.4f}")


Enter the resume path :/content/Shree_Krishna_Kanth (1).pdf
Extracted Profile/Career Objective:
Highly motivated M.Sc. student in Data Science with a strong foundation in data analysis, visualization, and proficiency in machine 
learning techniques like Python libraries and algorithms. Eager to leverage my experience in data cleaning, feature engineering, and model 
evaluation to excel as a Data Analyst. Possesses excellent problem-solving skills and a passion for extracting valuable insights from data to 
inform strategic decision-making. 
 
EDUCATION 
Bishop Heber College Tiruchirappalli, Tamil Nadu. 
• 
Master of Science in Data Science CGPA- 7.71 
 
 
 
 
 
AUG 2022 – MAY 2024 
• 
Bachelor of Science in Mathematics CGPA- 8.38 
 
 
 
 
 
 JUN 2019 – MAY 2022 
Areas of Study: 
 
 
Machine Learning: Regression, Classification, Clustering, Algorithms, Model Building 
 
Data Analysis: Data Mining, Data cleaning, Data Modeling, Data Visualization 
 
Big Data, SQL, NLP, TensorFlow, PyTorc

In [47]:
import pandas as pd
import fitz  # PyMuPDF or use pdfplumber
import re

# Step 1: Load skills from the CSV file
def load_skills_from_csv(file_path):
    df = pd.read_csv(file_path)
    return df

# Step 2: Extract text from PDF resume
def extract_text_from_pdf(pdf_file):
    text = ""
    with fitz.open(pdf_file) as doc:
        for page in doc:
            text += page.get_text()
    return text

# Step 3: Extract skills from resume text
def extract_skills_from_text(text):
    # Basic extraction method; can be improved
    skills_pattern = r"\b[a-zA-Z\s]+\b"  # regex to find words
    words = re.findall(skills_pattern, text)
    return [word.strip().lower() for word in words if word.strip()]

# Step 4: Compare extracted skills with the CSV skills
def match_skills(extracted_skills, df):
    matched_roles = {}
    for index, row in df.iterrows():
        profession = row['Profession']
        skills = row['Skills'].lower().split(", ")
        matched_skills = set(extracted_skills).intersection(set(skills))
        if matched_skills:
            matched_roles[profession] = list(matched_skills)
    return matched_roles

# Step 5: Main Function
def main(pdf_file, csv_file):
    # Load skills from CSV
    df = load_skills_from_csv(csv_file)

    # Extract text from the PDF
    resume_text = extract_text_from_pdf(pdf_file)

    # Extract skills from the resume text
    extracted_skills = extract_skills_from_text(resume_text)

    # Match extracted skills with job roles
    matched_roles = match_skills(extracted_skills, df)

    # Print results
    if matched_roles:
        print("Suggested Job Roles based on matched skills:")
        for role, skills in matched_roles.items():
            print(f" - {role}: Matched Skills: {', '.join(skills)}")
    else:
        print("No matching roles found based on the extracted skills.")

# Example Usage
if __name__ == "__main__":
    # Path to your PDF resume and CSV file
    pdf_file_path = '/content/Shree_Krishna_Kanth (1).pdf'  # Change this to your PDF file path
    csv_file_path = '/content/dataset.csv'  # Change this to your CSV file path

    main(pdf_file_path, csv_file_path)


Suggested Job Roles based on matched skills:
 - Doctor: Matched Skills: communication
 - Teacher: Matched Skills: communication
 - Nurse: Matched Skills: communication, time management
 - Software Developer: Matched Skills: teamwork
 - Graphic Designer: Matched Skills: communication
 - Lawyer: Matched Skills: communication
 - Pharmacist: Matched Skills: communication
 - Veterinarian: Matched Skills: communication
 - Marketing Specialist: Matched Skills: communication
 - Construction Manager: Matched Skills: leadership
 - Financial Analyst: Matched Skills: communication
 - Human Resources Manager: Matched Skills: communication
 - Journalist: Matched Skills: communication, time management
 - Chef: Matched Skills: teamwork, time management
 - Psychologist: Matched Skills: communication
 - Social Worker: Matched Skills: communication
 - Pilot: Matched Skills: communication, teamwork


In [54]:
ds=pd.read_csv("/content/UpdatedResumeDataSet.csv")
ds

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...
