Dataset : https://www.kaggle.com/datasets/jillanisofttech/updated-resume-dataset

# Import Library

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from google.colab import drive

import re

from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import pickle
import os

In [30]:
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Read Dataset

In [31]:
df = pd.read_csv('/content/gdrive/MyDrive/Reference/Data Science/Fundamental/Data Science Project/Wingstop/Resume-Screening-App/UpdatedResumeDataSet.csv')

In [32]:
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [33]:
df.shape

(962, 2)

# Exploring Categories

In [34]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Java Developer,84
Testing,70
DevOps Engineer,55
Python Developer,48
Web Designing,45
HR,44
Hadoop,42
Blockchain,40
ETL Developer,40
Operations Manager,40


In [35]:
px.bar(df['Category'].value_counts())

In [36]:
df['Category'].unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [42]:
counts = df['Category'].value_counts()
labels = counts.index

fig = px.pie(
    names=labels,
    values=counts,
    title='Category Distribution',
    color_discrete_sequence=px.colors.sequential.Plasma
)

fig.show()

# Cleaning Data

In [10]:
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub('RT|cc', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub('#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub('@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [11]:
cleanResume("follow my #### $ #  #noorsaeed link http://ngodingyo.com and mail in @gmain.com")

'follow my link and mail in '

In [12]:
df['Resume'] = df['Resume'].apply(lambda x: cleanResume(x))

In [13]:
df['Resume'][0]

'Skills Programming Languages Python pandas numpy scipy scikit learn matplotlib Sql Java JavaScript JQuery Machine learning Regression SVM Na ve Bayes KNN Random Forest Decision Trees Boosting techniques Cluster Analysis Word Embedding Sentiment Analysis Natural Language processing Dimensionality reduction Topic Modelling LDA NMF PCA Neural Nets Database Visualizations Mysql SqlServer Cassandra Hbase ElasticSearch D3 js DC js Plotly kibana matplotlib ggplot Tableau Others Regular Expression HTML CSS Angular 6 Logstash Kafka Python Flask Git Docker computer vision Open CV and understanding of Deep learning Education Details Data Science Assurance Associate Data Science Assurance Associate Ernst Young LLP Skill Details JAVASCRIPT Exprience 24 months jQuery Exprience 24 months Python Exprience 24 monthsCompany Details company Ernst Young LLP description Fraud Investigations and Dispute Services Assurance TECHNOLOGY ASSISTED REVIEW TAR Technology Assisted Review assists in a elerating the 

# words into categorical values

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [16]:
#save raw category before encode, this is important to crosscheck the original data
df['Category_Raw'] = df['Category']

In [17]:
le.fit(df['Category'])
df['Category'] = le.transform(df['Category'])

In [18]:
df.Category.unique()

array([ 6, 12,  0,  1, 24, 16, 22, 14,  5, 15,  4, 21,  2, 11, 18, 20,  8,
       17, 19,  7, 13, 10,  9,  3, 23])

In [19]:
df.head()

Unnamed: 0,Category,Resume,Category_Raw
0,6,Skills Programming Languages Python pandas num...,Data Science
1,6,Education Details May 2013 to May 2017 B E UIT...,Data Science
2,6,Areas of Interest Deep Learning Control System...,Data Science
3,6,Skills R Python SAP HANA Tableau SAP HANA SQL ...,Data Science
4,6,Education Details MCA YMCAUST Faridabad Haryan...,Data Science


# Vactorization

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

tfidf.fit(df['Resume'])
requredresume = tfidf.transform(df['Resume'])

# Splitting

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(requredresume, df['Category'], test_size=0.2, random_state=42)

In [23]:
X_train.shape

(769, 7351)

In [24]:
X_test.shape

(193, 7351)

# Modeling

In [25]:
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train,y_train)
ypred = clf.predict(X_test)

In [26]:
print(accuracy_score(y_test,ypred))
print(mean_absolute_error(y_test,ypred))
print(mean_squared_error(y_test,ypred))
print(r2_score(y_test,ypred))

0.9844559585492227
0.21243523316062177
2.9585492227979273
0.9376943465095084


In [27]:
print(classification_report(y_test,ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         4
           5       1.00      1.00      1.00         9
           6       1.00      0.60      0.75         5
           7       1.00      1.00      1.00         8
           8       1.00      0.93      0.96        14
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         7
          11       1.00      1.00      1.00         6
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         4
          14       1.00      1.00      1.00         7
          15       1.00      1.00      1.00        15
          16       1.00      1.00      1.00         8
          17       1.00    

# Prediction System

In [None]:
pickle.dump(tfidf,open('tfidf.pkl','wb'))
pickle.dump(clf, open('clf.pkl', 'wb'))


In [None]:
myresumetest = """I am a data scientist specializing in machine
learning, deep learning, and computer vision. With
a strong background in mathematics, statistics,
and programming, I am passionate about
uncovering hidden patterns and insights in data.
I have extensive experience in developing
predictive models, implementing deep learning
algorithms, and designing computer vision
systems. My technical skills include proficiency in
Python, Sklearn, TensorFlow, and PyTorch.
What sets me apart is my ability to effectively
communicate complex concepts to diverse
audiences. I excel in translating technical insights
into actionable recommendations that drive
informed decision-making.
If you're looking for a dedicated and versatile data
scientist to collaborate on impactful projects, I am
eager to contribute my expertise. Let's harness the
power of data together to unlock new possibilities
and shape a better future.
Contact & Sources
Email: ali.gitcode@gmail.com
Phone: 081219404957
Youtube: Artificial Intelligence
ABOUT ME
WORK EXPERIENCE
SKILLES
Muhamad Ali
LANGUAGES
English
Java
Indonesia
I am a versatile data scientist with expertise in a wide
range of projects, including machine learning,
recommendation systems, deep learning, and computer
vision. Throughout my career, I have successfully
developed and deployed various machine learning models
to solve complex problems and drive data-driven
decision-making
Machine Learnine
Deep Learning
Computer Vision
Recommendation Systems
Data Visualization
Programming Languages (Python, SQL)
Data Preprocessing and Feature Engineering
Model Evaluation and Deployment
Statistical Analysis
Communication and Collaboration
"""

In [None]:
lf = pickle.load(open('clf.pkl', 'rb'))
cleaned_resume = cleanResume(myresumetest)
input_features = tfidf.transform([cleaned_resume])
prediction_id = clf.predict(input_features)[0]

# Map category ID to category name
category_mapping = {
    15: "Java Developer",
    23: "Testing",
    8: "DevOps Engineer",
    20: "Python Developer",
    24: "Web Designing",
    12: "HR",
    13: "Hadoop",
    3: "Blockchain",
    10: "ETL Developer",
    18: "Operations Manager",
    6: "Data Science",
    22: "Sales",
    16: "Mechanical Engineer",
    1: "Arts",
    7: "Database",
    11: "Electrical Engineering",
    14: "Health and fitness",
    19: "PMO",
    4: "Business Analyst",
    9: "DotNet Developer",
    2: "Automation Testing",
    17: "Network Security Engineer",
    21: "SAP Developer",
    5: "Civil Engineer",
    0: "Advocate",
}

category_name = category_mapping.get(prediction_id, "Unknown")

print("Predicted Category:", category_name)
print(prediction_id)


Predicted Category: Data Science
6


# Upload Resume from PDF

In [None]:
pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.24.11


In [None]:
import fitz

In [None]:
# Load the trained classifier and TfidfVectorizer
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

def clean_resume(text):
    """Clean the resume text by applying basic preprocessing."""
    return text.lower()

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path):
    """Extract text from a PDF document."""
    text = ""
    doc = fitz.open(file_path)
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()
    return text

In [None]:
category_mapping = {
    15: "Java Developer", 23: "Testing", 8: "DevOps Engineer",
    20: "Python Developer", 24: "Web Designing", 12: "HR",
    13: "Hadoop", 3: "Blockchain", 10: "ETL Developer",
    18: "Operations Manager", 6: "Data Science", 22: "Sales",
    16: "Mechanical Engineer", 1: "Arts", 7: "Database",
    11: "Electrical Engineering", 14: "Health and fitness",
    19: "PMO", 4: "Business Analyst", 9: "DotNet Developer",
    2: "Automation Testing", 17: "Network Security Engineer",
    21: "SAP Developer", 5: "Civil Engineer", 0: "Advocate",
}

# Function to predict the category of a resume
def predict_category(file_path):
    """Predict the job category of the uploaded resume."""
    # Extract and clean the resume text
    resume_text = extract_text_from_pdf(file_path)
    cleaned_text = clean_resume(resume_text)

    # Transform the cleaned text using TfidfVectorizer
    input_features = tfidf.transform([cleaned_text])

    # Make prediction using the trained classifier
    prediction_id = clf.predict(input_features)[0]

    # Get the category name from the mapping
    category_name = category_mapping.get(prediction_id, "Unknown")

    return category_name

## Read PDF Resume

In [None]:
# Test the function with a sample resume PDF
resume_file_path = "aliresume.pdf"

if os.path.exists(resume_file_path):
    predicted_category = predict_category(resume_file_path)
    print(f"Predicted Category: {predicted_category}")
else:
    print(f"File not found: {resume_file_path}")

Predicted Category: Data Science
