<a href="https://colab.research.google.com/github/anukriti-khare/Resume-Classifier/blob/main/ResumeParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

uploaded = files.upload()

Saving Resume_data.zip to Resume_data (1).zip


In [2]:
import zipfile
import os

zip_path = "Resume_data.zip"
extract_dir = "resumes_pdf"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

data_dir = os.path.join(extract_dir, "data")

print("Sample extracted PDF resumes:")
print(os.listdir(data_dir)[:5])

Sample extracted PDF resumes:
['HR', 'APPAREL', 'BPO', 'AUTOMOBILE', 'CONSULTANT']


In [3]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [4]:
import os
print(os.listdir("/content/resumes_pdf"))
print(os.listdir("/content/resumes_pdf/data"))

['data']
['HR', 'APPAREL', 'BPO', 'AUTOMOBILE', 'CONSULTANT', 'FITNESS', 'INFORMATION-TECHNOLOGY', 'ADVOCATE', 'BANKING', 'ARTS', 'BUSINESS-DEVELOPMENT', 'SALES', 'CHEF', 'AVIATION', 'FINANCE', 'ACCOUNTANT', 'HEALTHCARE', 'AGRICULTURE', 'DIGITAL-MEDIA', 'PUBLIC-RELATIONS', 'TEACHER', 'CONSTRUCTION', 'ENGINEERING', 'DESIGNER']


In [5]:
import os
import fitz

base_path = "/content/resumes_pdf/data"
extracted_resumes = []

for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)

    if os.path.isdir(folder_path):
        for file in os.listdir(folder_path):
            if file.endswith(".pdf"):
                file_path = os.path.join(folder_path, file)
                try:
                    doc = fitz.open(file_path)
                    text = ""
                    for page in doc:
                        text += page.get_text()
                    doc.close()

                    extracted_resumes.append({
                        "filename": file,
                        "label": folder,
                        "text": text
                    })
                except Exception as e:
                    print(f"Error reading {file_path}: {e}")

print(f"Total resumes extracted: {len(extracted_resumes)}")
print("Sample data:")
print(extracted_resumes[0]) if extracted_resumes else print("No data extracted.")

Total resumes extracted: 2484
Sample data:
{'filename': '10399912.pdf', 'label': 'HR', 'text': "HR PERSONNEL ASSISTANT\nSummary\nI am a U.S. citizen who is authorized to work in the US for any employer. I have worked 8 years as an Office Clerk, 2 years as a Student\nIntern/Office Assistant, and 4 years as a Contractor. I am applying for the Data Entry Clerk position (Advert ID# 224278 Advert ID# 224278).\nMy skills and experiences include: Administrative Support, Auditing, File Management, Meeting Facilitation, Office Materials Management, &\nInventory Management.\nHighlights\nCOMPUTER SKILLS: Microsoft Word, MS Excel, MS Outlook, MS PowerPoint, PeopleSoft. TYPING SKILLS: 40-60 WPM.\nADDITIONAL SKILLS: Administrative Support, Auditing, Clerical, Copy, Customer Service, Data Entry, Delivery, Documentation, Fax, File\nManagement, Letters, Meeting Facilitation, Organizational Skills, Proofreading, Receptionist, Research, Scanning, Scheduling, Secretarial,\nTelephone Skills, Office Equipme

In [6]:
import pandas as pd

df = pd.DataFrame(extracted_resumes)
df = df[['text', 'label']]  # Only keep relevant columns
print(df.head())

                                                text label
0  HR PERSONNEL ASSISTANT\nSummary\nI am a U.S. c...    HR
1  HR GENERALIST\nProfessional Summary\nDependabl...    HR
2  HR SPECIALIST\nSummary\nResults-driven profess...    HR
3  HR MANAGER\nSummary\nHUMAN RESOURCES MANAGER E...    HR
4  HR ASSISTANT\nSummary\nHighly motivated, and a...    HR


In [7]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)  # Keep only letters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return ' '.join(words)

df['clean_text'] = df['text'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [24]:
import random

def fake_synonym_replacement(text, n=3):
    words = text.split()
    if len(words) < 1:
        return text  # return as-is if no words
    for _ in range(min(n, len(words))):
        idx = random.randint(0, len(words)-1)
        words[idx] = words[idx][::-1]  # reverse the word
    return ' '.join(words)

In [25]:
df_aug = df.copy()
df_aug['clean_text'] = df_aug['clean_text'].apply(lambda x: fake_synonym_replacement(x, n=2))

df_combined = pd.concat([df, df_aug]).reset_index(drop=True)

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 10000
max_len = 300

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_combined['clean_text'])

X = tokenizer.texts_to_sequences(df_combined['clean_text'])
X = pad_sequences(X, maxlen=max_len)

In [27]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

le = LabelEncoder()
y = le.fit_transform(df_combined['label'])
y = to_categorical(y)

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [30]:
history = model.fit(X_train, y_train, epochs=15, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 133ms/step - accuracy: 0.0635 - loss: 3.1521 - val_accuracy: 0.2837 - val_loss: 2.9191
Epoch 2/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 130ms/step - accuracy: 0.3014 - loss: 2.6663 - val_accuracy: 0.5201 - val_loss: 1.8875
Epoch 3/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 132ms/step - accuracy: 0.5033 - loss: 1.8933 - val_accuracy: 0.6087 - val_loss: 1.4790
Epoch 4/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 142ms/step - accuracy: 0.6261 - loss: 1.3856 - val_accuracy: 0.7153 - val_loss: 1.2048
Epoch 5/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 131ms/step - accuracy: 0.7595 - loss: 0.9750 - val_accuracy: 0.8008 - val_loss: 0.8998
Epoch 6/15
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 132ms/step - accuracy: 0.8539 - loss: 0.6214 - val_accuracy: 0.8541 - val_loss: 0.6855
Epoch 7/15

In [31]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.8751 - loss: 0.5441
Test Accuracy: 0.8823


In [32]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    epochs=30,
                    batch_size=16,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stop])

Epoch 1/30
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 74ms/step - accuracy: 0.9938 - loss: 0.0378 - val_accuracy: 0.8773 - val_loss: 0.5550
Epoch 2/30
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 79ms/step - accuracy: 0.9941 - loss: 0.0424 - val_accuracy: 0.8803 - val_loss: 0.5932
Epoch 3/30
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.9915 - loss: 0.0493 - val_accuracy: 0.8823 - val_loss: 0.6446
Epoch 4/30
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 78ms/step - accuracy: 0.9879 - loss: 0.0480 - val_accuracy: 0.8732 - val_loss: 0.6490


In [33]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.8646 - loss: 0.5412
Test Accuracy: 0.8773


In [41]:
model.save("resume_classifier_cnn.h5")



In [42]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [43]:
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [44]:
with open("accuracy.txt", "w") as f:
    f.write(f"Test Accuracy: {accuracy:.4f}")

In [45]:
from google.colab import files

files.download("resume_classifier_cnn.h5")
files.download("tokenizer.pkl")
files.download("label_encoder.pkl")
files.download("accuracy.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>