Installing dependencies

In [1]:
!pip install PyPDF2 scikit-learn pandas numpy


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


Importing Libraries

In [2]:
import PyPDF2
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re


Dataset (Sample job/resume classification data)

In [8]:
data = {
    "resume_text": [
        "Python, Data analysis, Machine Learning, Pandas, SQL, Statistics",
        "Java, Spring Boot, Microservices, REST API, AWS",
        "Excel, Financial reporting, Forecasting, Accounting, Power BI",
        "Deep learning, CNN, image processing, TensorFlow, Keras",
        "Customer support, communication skills, sales CRM"
    ],
    "label": ["Data Science", "Software", "Finance", "AI Engineer", "Support"]
}

df = pd.DataFrame(data)
df


Unnamed: 0,resume_text,label
0,"Python, Data analysis, Machine Learning, Panda...",Data Science
1,"Java, Spring Boot, Microservices, REST API, AWS",Software
2,"Excel, Financial reporting, Forecasting, Accou...",Finance
3,"Deep learning, CNN, image processing, TensorFl...",AI Engineer
4,"Customer support, communication skills, sales CRM",Support


Converting to TF-IDF

In [4]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["resume_text"])
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))


Accuracy: 0.0
              precision    recall  f1-score   support

    Software       0.00      0.00      0.00       1.0
     Support       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Resume PDF Reader Function

In [5]:
def extract_text_from_pdf(uploaded_file):
    pdf_reader = PyPDF2.PdfReader(uploaded_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text()
    return text


Skill Extraction Function

In [6]:
skills_list = ["python", "sql", "excel", "machine learning", "deep learning", "power bi",
               "nlp", "aws", "java", "tableau", "statistics", "pandas", "r", "communication"]

def extract_skills(text):
    text = text.lower()
    found = []
    for skill in skills_list:
        if skill in text:
            found.append(skill)
    return list(set(found))


Uploading & Predicting Resume Category

In [7]:
from google.colab import files
uploaded = files.upload()

for filename in uploaded.keys():
    resume_text = extract_text_from_pdf(filename)
    cleaned_text = re.sub(r'\s+', ' ', resume_text)
    vectorized = tfidf.transform([cleaned_text])
    prediction = model.predict(vectorized)[0]

    print("\n Resume Name:", filename)
    print(" Predicted Job Role:", prediction)
    print(" Extracted Skills:", extract_skills(cleaned_text))


Saving AZHAR DETAILED RESUME.pdf to AZHAR DETAILED RESUME.pdf

 Resume Name: AZHAR DETAILED RESUME.pdf
 Predicted Job Role: Data Science
 Extracted Skills: ['sql', 'r', 'deep learning', 'power bi', 'machine learning', 'nlp', 'excel', 'statistics', 'python']
