In [None]:
import pandas as pd
import re
import numpy as np  # Added for centroid computation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import joblib

In [3]:
df = pd.read_csv('resume_data.csv')
print("Number of resumes per category:\n", df['Category'].value_counts())
print("Dataset Shape:", df.shape)

Number of resumes per category:
 Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Sales                        40
Data Science                 40
Mechanical Engineer          40
ETL Developer                40
Blockchain                   40
Operations Manager           40
Arts                         36
Database                     33
Health and fitness           30
PMO                          30
Electrical Engineering       30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
Civil Engineer               24
SAP Developer                24
Advocate                     20
Name: count, dtype: int64
Dataset Shape: (962, 2)


In [7]:
# Clean resume text
def clean_resume(text):
    text = re.sub(r'<[^>]+>', ' ', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters & numbers
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Split into words
    return ' '.join(text)

In [8]:
df['Cleaned_Resume'] = df['Resume'].apply(clean_resume)

In [9]:
vectorizer = TfidfVectorizer(max_features=3000)  # Extract top 3000 keywords
X = vectorizer.fit_transform(df['Cleaned_Resume']).toarray()
y = df['Category']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


In [11]:
# === New Code: Compute Centroids for Each Category ===
centroids = {}
for category in set(y_train):
    # Get indices of resumes in this category
    category_indices = np.where(y_train == category)[0]
    # Extract their TF-IDF vectors
    category_vectors = x_train[category_indices]
    # Compute the mean vector (centroid)
    centroid = np.mean(category_vectors, axis=0)
    centroids[category] = centroid
# Save centroids to a file
joblib.dump(centroids, 'category_centroids.pkl')
print("📦 Category centroids saved successfully!")

📦 Category centroids saved successfully!


In [15]:
model = MultinomialNB()
model.fit(x_train, y_train)


In [None]:

y_pred = model.predict(x_test)
print("\n✅ Accuracy of Model:", accuracy_score(y_test, y_pred))
print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
report = classification_report(y_test, y_pred, output_dict=True)


✅ Accuracy of Model: 0.9278350515463918
📊 Confusion Matrix:
 [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0]
 [0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
joblib.dump(model, 'ats_nb_model.pkl')
joblib.dump(vectorizer, 'ats_vectorizer.pkl')
print("\n📦 Model and vectorizer saved successfully!")


📦 Model and vectorizer saved successfully!


In [22]:
report_summary = {label: round(metrics["f1-score"], 2) for label, metrics in report.items() if label in df['Category'].unique()}

In [23]:
report_summary

{'Advocate': 0.0,
 'Arts': 1.0,
 'Automation Testing': 0.4,
 'Blockchain': 1.0,
 'Business Analyst': 1.0,
 'Civil Engineer': 1.0,
 'Data Science': 1.0,
 'Database': 1.0,
 'DevOps Engineer': 1.0,
 'DotNet Developer': 1.0,
 'ETL Developer': 1.0,
 'Electrical Engineering': 1.0,
 'HR': 1.0,
 'Hadoop': 1.0,
 'Health and fitness': 1.0,
 'Java Developer': 0.78,
 'Mechanical Engineer': 1.0,
 'Network Security Engineer': 1.0,
 'Operations Manager': 1.0,
 'PMO': 1.0,
 'Python Developer': 1.0,
 'SAP Developer': 0.86,
 'Sales': 1.0,
 'Testing': 0.82,
 'Web Designing': 1.0}