## EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Zainab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [120]:
df = pd.read_csv('datasets/UpdatedResumeDataset.csv')
df['Category'].value_counts()

Category
Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Sales                        40
Data Science                 40
Mechanical Engineer          40
ETL Developer                40
Blockchain                   40
Operations Manager           40
Arts                         36
Database                     33
Health and fitness           30
PMO                          30
Electrical Engineering       30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
Civil Engineer               24
SAP Developer                24
Advocate                     20
Name: count, dtype: int64

In [121]:
synth_df = pd.read_csv('datasets/synthetic_resume_dataset.csv')
synth_df.shape

(525, 2)

In [122]:
synth_df2 = pd.read_csv('datasets/synthetic_resume_dataset2.csv')
synth_df2

Unnamed: 0,Resume,Category
0,**Full-Stack Web Developer** Proficient in mod...,Web Developer
1,**Front-End Developer** Specialized in creatin...,Web Developer
2,**Back-End Engineer** Skilled in building robu...,Web Developer
3,**Junior Web Developer** Eager to apply founda...,Web Developer
4,**Senior Web Applications Developer** Led deve...,Web Developer
5,**React Developer** Focused on component-based...,Web Developer
6,**Node.js Developer** Adept at building fast a...,Web Developer
7,**UI/UX Developer** Bridge between design and ...,Web Developer
8,**PHP Laravel Developer** Developed and mainta...,Web Developer
9,**Full-Stack JavaScript Developer** Experience...,Web Developer


In [124]:
combined_df = pd.concat([df, synth_df, synth_df2], ignore_index=True)
combined_df.shape

(1517, 2)

In [125]:
combined_df.isna().sum()

Category    0
Resume      0
dtype: int64

In [126]:
combined_df['Category'].value_counts()

Category
Java Developer               105
Testing                       91
DevOps Engineer               76
Python Developer              69
Web Designing                 66
HR                            65
Hadoop                        63
Data Science                  61
Sales                         61
Mechanical Engineer           61
ETL Developer                 61
Blockchain                    61
Operations Manager            61
Arts                          57
Database                      54
Electrical Engineering        51
Health and fitness            51
PMO                           51
DotNet Developer              49
Business Analyst              49
Automation Testing            47
Network Security Engineer     46
SAP Developer                 45
Civil Engineer                45
Advocate                      41
Web Developer                 30
Name: count, dtype: int64

In [127]:
#duplicate resume
combined_df['Resume'].duplicated().sum()

np.int64(796)

In [128]:
#drop duplicate resume
combined_df.drop_duplicates(subset=['Resume'], keep='first', inplace=True)

In [129]:
combined_df['Resume'].duplicated().sum()

np.int64(0)

In [130]:
combined_df['Category'].value_counts()

Category
Java Developer               34
Database                     32
HR                           31
Advocate                     31
Data Science                 31
Web Developer                30
DevOps Engineer              28
Automation Testing           28
DotNet Developer             28
Hadoop                       28
Testing                      28
Python Developer             27
Business Analyst             27
Civil Engineer               27
Arts                         27
Health and fitness           27
SAP Developer                27
Mechanical Engineer          26
Blockchain                   26
Sales                        26
Electrical Engineering       26
Network Security Engineer    26
ETL Developer                26
Web Designing                25
Operations Manager           25
PMO                          24
Name: count, dtype: int64

In [131]:
# Show rows where Resume starts with the specified string
mask = combined_df['Resume'].str.startswith("Here are 20 synthetic resume texts", na=False)
print(display(combined_df[mask]))


Unnamed: 0,Category,Resume
962,Java Developer,Here are 20 synthetic resume texts for the job...
983,Database,Here are 20 synthetic resume texts for the 'Da...
1004,Advocate,Here are 20 synthetic resume texts for the job...
1025,HR,Here are 20 synthetic resume texts for the job...
1046,Data Science,Here are 20 synthetic resume texts for the job...
1067,Automation Testing,Here are 20 synthetic resume texts for the 'Au...
1088,DevOps Engineer,Here are 20 synthetic resume texts for the job...
1109,Testing,Here are 20 synthetic resume texts for the 'Te...
1130,DotNet Developer,Here are 20 synthetic resume texts for DotNet ...
1151,Hadoop,Here are 20 synthetic resume texts for the job...


None


In [132]:
#drop these rows
combined_df.drop(combined_df[combined_df['Resume'].str.startswith("Here are 20 synthetic resume texts", na=False)].index, inplace=True)

In [133]:
cleaned_df = combined_df.to_csv("datasets/cleaned_resume.csv", index=False)

## FE

In [134]:
cleaned_df = pd.read_csv('datasets/cleaned_resume.csv')
cleaned_df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [175]:
from sklearn.preprocessing import  LabelEncoder
le = LabelEncoder()
cleaned_df['Category_Encoded'] = le.fit_transform(cleaned_df['Category'])

joblib.dump(le, 'models/le.pkl')

['models/le.pkl']

In [136]:
for i, label in enumerate(le.classes_):
    print(f"{i}: {label}")

0: Advocate
1: Arts
2: Automation Testing
3: Blockchain
4: Business Analyst
5: Civil Engineer
6: Data Science
7: Database
8: DevOps Engineer
9: DotNet Developer
10: ETL Developer
11: Electrical Engineering
12: HR
13: Hadoop
14: Health and fitness
15: Java Developer
16: Mechanical Engineer
17: Network Security Engineer
18: Operations Manager
19: PMO
20: Python Developer
21: SAP Developer
22: Sales
23: Testing
24: Web Designing
25: Web Developer


In [138]:
import re
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    text = re.sub('http\\S+\\s*', ' ', text)
    text = re.sub('RT|cc', ' ', text)
    text = re.sub('#\\S+', '', text)
    text = re.sub('@\\S+', ' ', text)
    text = re.sub('[^a-zA-Z0-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [139]:
cleaned_df['Cleaned_Resume'] = cleaned_df['Resume'].apply(preprocess_text)

cleaned_df.head()

Unnamed: 0,Category,Resume,Category_Encoded,Cleaned_Resume
0,Data Science,Skills * Programming Languages: Python (pandas...,6,skill programming language python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,6,education detail may 2013 may 2017 b e uit rgp...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",6,area interest deep learning control system des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,6,skill r python sap hana tableau sap hana sql s...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",6,education detail mca ymcaust faridabad haryana...


In [174]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf = TfidfVectorizer(
    ngram_range=(1,3),
    max_df=0.9,
    min_df=2,
    stop_words='english'
)

X = tf_idf.fit_transform(cleaned_df['Cleaned_Resume'])
vocab = tf_idf.get_feature_names_out()

import joblib
joblib.dump(tf_idf, 'models/tf_idf.pkl')

['models/tf_idf.pkl']

In [161]:
from sklearn.model_selection import train_test_split

y = cleaned_df['Category_Encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Training

In [142]:
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score,precision_score,f1_score

In [143]:
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    # 'CatBoost': CatBoostClassifier(),
    'SVM': SVC(),
}

In [144]:
metrics_list = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    metrics_list.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'F1 Score': f1
    })

metrics_df = pd.DataFrame(metrics_list)
metrics_df.sort_values(by='Accuracy', ascending=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 Score
3,Random Forest,0.885714,0.885714,0.917063,0.885798
4,XGBoost,0.864286,0.864286,0.905237,0.863843
0,Logistic Regression,0.835714,0.835714,0.895431,0.837072
5,SVM,0.8,0.8,0.885083,0.791444
2,Decision Tree,0.764286,0.764286,0.80824,0.76653
1,KNN,0.707143,0.707143,0.706893,0.68239


In [145]:
for i, label in enumerate(le.classes_):
    print(f"{i}: {label}")

0: Advocate
1: Arts
2: Automation Testing
3: Blockchain
4: Business Analyst
5: Civil Engineer
6: Data Science
7: Database
8: DevOps Engineer
9: DotNet Developer
10: ETL Developer
11: Electrical Engineering
12: HR
13: Hadoop
14: Health and fitness
15: Java Developer
16: Mechanical Engineer
17: Network Security Engineer
18: Operations Manager
19: PMO
20: Python Developer
21: SAP Developer
22: Sales
23: Testing
24: Web Designing
25: Web Developer


In [164]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

print(accuracy_score(y_test, rf_pred))
print(recall_score(y_test, rf_pred, average='weighted'))
print(precision_score(y_test, rf_pred, average='weighted'))
print(f1_score(y_test, rf_pred, average='weighted'))

import joblib
joblib.dump(rf, 'models/rf.pkl')

0.9
0.9
0.9194217687074829
0.8996562960848675


['models/rf.pkl']

In [173]:
resume = input("Enter your text: ")

preprocess_text(resume)

vectorized = tf_idf.transform([resume])

prediction = rf.predict(vectorized)
predicted_category = le.inverse_transform([prediction[0]])[0]
print(f"Predicted Category: {predicted_category}")


Predicted Category: HR
