## EDA

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [131]:
df = pd.read_csv('datasets/UpdatedResumeDataset.csv')
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [129]:
synth_df = pd.read_csv('datasets/synthetic_resume_dataset.csv')
synth_df.head()

Unnamed: 0,Resume,Category
0,Here are 20 synthetic resume texts for the job...,Java Developer
1,**Java Backend Developer.** Proficient in Core...,Java Developer
2,**Senior Java Engineer.** Specializing in Java...,Java Developer
3,**Junior Java Developer.** Solid understanding...,Java Developer
4,"**Java Developer.** Key skills include Java 8,...",Java Developer


In [133]:
combined_df = pd.concat([df, synth_df], ignore_index=True)
combined_df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [134]:
combined_df.isna().sum()

Category    0
Resume      0
dtype: int64

In [135]:
combined_df['Category'].value_counts()

Category
Java Developer               105
Testing                       91
DevOps Engineer               76
Python Developer              69
Web Designing                 66
HR                            65
Hadoop                        63
Sales                         61
Data Science                  61
Mechanical Engineer           61
ETL Developer                 61
Blockchain                    61
Operations Manager            61
Arts                          57
Database                      54
Health and fitness            51
PMO                           51
Electrical Engineering        51
Business Analyst              49
DotNet Developer              49
Automation Testing            47
Network Security Engineer     46
Civil Engineer                45
SAP Developer                 45
Advocate                      41
Name: count, dtype: int64

In [5]:
#duplicate resume
df['Resume'].duplicated().sum()

np.int64(796)

In [6]:
#drop duplicate resume
df.drop_duplicates(subset=['Resume'], keep='first', inplace=True)

In [7]:
df['Resume'].duplicated().sum()

np.int64(0)

In [8]:
df['Category'].value_counts()

Category
Java Developer               13
Database                     11
Advocate                     10
HR                           10
Data Science                 10
Automation Testing            7
DevOps Engineer               7
Testing                       7
DotNet Developer              7
Hadoop                        7
SAP Developer                 6
Python Developer              6
Health and fitness            6
Civil Engineer                6
Arts                          6
Business Analyst              6
Sales                         5
Blockchain                    5
Mechanical Engineer           5
ETL Developer                 5
Electrical Engineering        5
Network Security Engineer     5
Web Designing                 4
Operations Manager            4
PMO                           3
Name: count, dtype: int64

In [10]:
cleaned_df = df.to_csv("datasets/cleaned_resume.csv", index=False)

## FE

In [11]:
cleaned_df = pd.read_csv('datasets/cleaned_resume.csv')
cleaned_df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [12]:
from sklearn.preprocessing import  LabelEncoder
le = LabelEncoder()
cleaned_df['Category_Encoded'] = le.fit_transform(cleaned_df['Category'])

In [41]:
for i, label in enumerate(le.classes_):
    print(f"{i}: {label}")

0: Advocate
1: Arts
2: Automation Testing
3: Blockchain
4: Business Analyst
5: Civil Engineer
6: Data Science
7: Database
8: DevOps Engineer
9: DotNet Developer
10: ETL Developer
11: Electrical Engineering
12: HR
13: Hadoop
14: Health and fitness
15: Java Developer
16: Mechanical Engineer
17: Network Security Engineer
18: Operations Manager
19: PMO
20: Python Developer
21: SAP Developer
22: Sales
23: Testing
24: Web Designing


In [13]:
import re
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    text = re.sub('http\\S+\\s*', ' ', text)
    text = re.sub('RT|cc', ' ', text)
    text = re.sub('#\\S+', '', text)
    text = re.sub('@\\S+', ' ', text)
    text = re.sub('[^a-zA-Z0-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [14]:
cleaned_df['Cleaned_Resume'] = cleaned_df['Resume'].apply(preprocess_text)

cleaned_df.head()

Unnamed: 0,Category,Resume,Category_Encoded,Cleaned_Resume
0,Data Science,Skills * Programming Languages: Python (pandas...,6,skill programming language python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,6,education detail may 2013 may 2017 b e uit rgp...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",6,area interest deep learning control system des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,6,skill r python sap hana tableau sap hana sql s...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",6,education detail mca ymcaust faridabad haryana...


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.9,
    min_df=2,
    stop_words='english'
)

X = tf_idf.fit_transform(cleaned_df['Cleaned_Resume'])
vocab = tf_idf.get_feature_names_out()

In [16]:
from sklearn.model_selection import train_test_split

y = cleaned_df['Category_Encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Training

In [17]:
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score,precision_score,f1_score

In [18]:
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'CatBoost': CatBoostClassifier(),
    'SVM': SVC(),
}

In [19]:
metrics_list = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    metrics_list.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1
    })

metrics_df = pd.DataFrame(metrics_list)
metrics_df.sort_values(by='Accuracy', ascending=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Learning rate set to 0.071363
0:	learn: 3.1966716	total: 1.02s	remaining: 16m 57s
1:	learn: 3.1870944	total: 1.9s	remaining: 15m 45s
2:	learn: 3.1638456	total: 3.01s	remaining: 16m 39s
3:	learn: 3.1434410	total: 4.05s	remaining: 16m 48s
4:	learn: 3.1124683	total: 4.94s	remaining: 16m 22s
5:	learn: 3.0906770	total: 5.77s	remaining: 15m 55s
6:	learn: 3.0740520	total: 6.61s	remaining: 15m 37s
7:	learn: 3.0576071	total: 7.47s	remaining: 15m 26s
8:	learn: 3.0391843	total: 8.51s	remaining: 15m 36s
9:	learn: 3.0160755	total: 9.35s	remaining: 15m 25s
10:	learn: 2.9966768	total: 10.2s	remaining: 15m 14s
11:	learn: 2.9771656	total: 11.1s	remaining: 15m 12s
12:	learn: 2.9545412	total: 12.7s	remaining: 16m 4s
13:	learn: 2.9454437	total: 14.1s	remaining: 16m 34s
14:	learn: 2.9185140	total: 16.1s	remaining: 17m 38s
15:	learn: 2.9004272	total: 17.3s	remaining: 17m 45s
16:	learn: 2.8906519	total: 18.9s	remaining: 18m 12s
17:	learn: 2.8720522	total: 20.3s	remaining: 18m 26s
18:	learn: 2.8620818	total: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score
5,CatBoost,0.852941,0.852941,0.887255,0.848039
1,KNN,0.823529,0.823529,0.980392,0.868137
3,Random Forest,0.764706,0.764706,0.740196,0.735651
4,XGBoost,0.735294,0.735294,0.754902,0.716993
2,Decision Tree,0.676471,0.676471,0.843137,0.706933
0,Logistic Regression,0.382353,0.382353,0.496732,0.391051
6,SVM,0.294118,0.294118,0.471814,0.311176


In [110]:
from imblearn.over_sampling import SMOTE

# Fix: Set k_neighbors to a value less than or equal to the number of samples in the smallest class
min_class_count = y_train.value_counts().min()
k_neighbors = min(5, min_class_count - 1) if min_class_count > 1 else 1
print(k_neighbors)
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

1


In [111]:
knn_smote = KNeighborsClassifier()
knn_smote.fit(X_train_resampled, y_train_resampled)
knn_smote_pred = knn_smote.predict(X_test)

print(accuracy_score(y_test, knn_smote_pred))
print(recall_score(y_test, knn_smote_pred, average='weighted'))
print(precision_score(y_test, knn_smote_pred, average='weighted'))
print(f1_score(y_test, knn_smote_pred, average='weighted'))

0.8823529411764706
0.8823529411764706
0.9352941176470588
0.8897058823529411


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [117]:
for i, label in enumerate(le.classes_):
    print(f"{i}: {label}")

0: Advocate
1: Arts
2: Automation Testing
3: Blockchain
4: Business Analyst
5: Civil Engineer
6: Data Science
7: Database
8: DevOps Engineer
9: DotNet Developer
10: ETL Developer
11: Electrical Engineering
12: HR
13: Hadoop
14: Health and fitness
15: Java Developer
16: Mechanical Engineer
17: Network Security Engineer
18: Operations Manager
19: PMO
20: Python Developer
21: SAP Developer
22: Sales
23: Testing
24: Web Designing


In [116]:
from sklearn.metrics import classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
print(classification_report(y_test, knn_smote_pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       1.00      1.00      1.00         2
           4       0.60      1.00      0.75         3
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         2
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         3
          14       1.00      0.50      0.67         2
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         1
          17       1.00      1.00      1.00         1
          18       1.00      1.00      1.00         2
          20       1.00    

In [38]:
knn = KNeighborsClassifier()
knn.fit(X_train,y_train)
knn_pred = knn.predict(X_test)

print(accuracy_score(y_test, knn_pred))
print(recall_score(y_test, knn_pred, average='weighted'))
print(precision_score(y_test, knn_pred, average='weighted'))
print(f1_score(y_test, knn_pred, average='weighted'))

import joblib
joblib.dump(knn, 'models/knn_model.pkl')

0.8235294117647058
0.8235294117647058
0.980392156862745
0.8681372549019608


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['models/knn_model.pkl']

In [36]:
catboost = CatBoostClassifier(verbose=0)
catboost.fit(X_train, y_train)
catboost_pred = catboost.predict(X_test)

print(accuracy_score(y_test, catboost_pred))
print(recall_score(y_test, catboost_pred, average='weighted'))
print(precision_score(y_test, catboost_pred, average='weighted'))
print(f1_score(y_test, catboost_pred, average='weighted'))


0.8529411764705882
0.8529411764705882
0.8872549019607843
0.8480392156862746


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
import joblib
joblib.dump(catboost, 'models/catboost_model.pkl')

['models/catboost_model.pkl']

In [126]:
resume = input("Enter your text: ")

preprocess_text(resume)

vectorized = tf_idf.transform([resume])

prediction = knn_smote.predict(vectorized)
predicted_category = le.inverse_transform([prediction[0]])[0]
print(f"Predicted Category: {predicted_category}")


Predicted Category: Arts


In [127]:
cleaned_df['Category'].value_counts()

Category
Java Developer               13
Database                     11
Advocate                     10
HR                           10
Data Science                 10
Automation Testing            7
DevOps Engineer               7
Testing                       7
DotNet Developer              7
Hadoop                        7
SAP Developer                 6
Python Developer              6
Health and fitness            6
Civil Engineer                6
Arts                          6
Business Analyst              6
Sales                         5
Blockchain                    5
Mechanical Engineer           5
ETL Developer                 5
Electrical Engineering        5
Network Security Engineer     5
Web Designing                 4
Operations Manager            4
PMO                           3
Name: count, dtype: int64