In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
# Load the resume dataset. The path is relative, so Resume.csv must sit in the
# kernel's working directory; otherwise read_csv raises FileNotFoundError.
df=pd.read_csv('Resume.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Resume.csv'

In [None]:
# Peek at five randomly chosen rows (no seed is passed, so the rows differ between runs).
df.sample(5)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Distinct labels present in the target column.
df['Category'].unique()

In [None]:
# How many distinct labels there are.
df['Category'].nunique()

In [None]:
# Number of resumes per label.
df['Category'].value_counts()

In [None]:
# Bar chart of how many resumes fall into each category; seaborn does the counting.
plt.figure(figsize=(12, 4))
sns.countplot(data=df, x='Category')
plt.xticks(rotation=90)  # draw the category names vertically
plt.show()

In [None]:
# The same distribution drawn by pandas from the value_counts() result.
df['Category'].value_counts().plot.bar(figsize=(12, 4))
plt.title('Distribution of Resume Categories')
plt.xticks(rotation=90)  # draw the category names vertically
plt.show()

In [None]:
# Share of resumes per category as a pie chart.
category_counts = df['Category'].value_counts()

# Sample one colour per wedge. With fewer colours than wedges matplotlib cycles
# through the list, so unrelated categories end up with the same colour.
wedge_colors = plt.cm.coolwarm(np.linspace(0, 1, len(category_counts)))

category_counts.plot(kind='pie', autopct='%.2f%%', figsize=(15, 10), shadow=True, colors=wedge_colors)
plt.ylabel('')  # drop the y-axis label pandas adds to pie charts
plt.title('Share of Resumes per Category')
plt.show()

## Preprocessing and Encoding:

In [None]:
# Show the text of the resume at row label 0 as loaded, before any cleaning.
df['Resume_str'][0]

In [None]:
import re 
import string
def clean_text(text):
    """Normalise a resume string before vectorisation.

    Every non-word character (anything other than a letter, digit or
    underscore) is replaced by a space, runs of whitespace are collapsed to a
    single space, and the result is lower-cased. Leading and trailing spaces
    are not stripped.

    Args:
        text: Raw resume text. Missing values (``None`` / ``NaN``) and other
            non-string values are tolerated.

    Returns:
        The cleaned, lower-cased string. ``''`` for ``None`` or ``NaN``;
        any other non-string value is converted with ``str()`` first.
    """
    if not isinstance(text, str):
        # A missing cell arrives as None or float('nan'); re.sub would raise
        # TypeError on either, so map them to an empty document instead.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower()

In [None]:
# Overwrite each resume with its cleaned form; the uncleaned text is no longer
# available in df after this cell.
df['Resume_str']=df['Resume_str'].apply(clean_text)

In [None]:
# The resume at row label 0 again, now after cleaning.
df['Resume_str'][0]

Workflow:

1. Fit the label encoder on the Category column to learn the unique categories.
2. Transform the Category column into encoded numerical values.
3. Fit the TF-IDF vectorizer on the Resume_str column to learn the vocabulary and the IDF (Inverse Document Frequency) of the words.
4. Transform the resume texts into a sparse matrix of TF-IDF features.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the category names with integer ids. The fitted encoder stays
# available as `le`, which holds the learned name <-> id mapping.
le = LabelEncoder()
df['Category'] = le.fit_transform(df['Category'])

In [None]:
# Distinct values of the target column after encoding.
df['Category'].unique()

In [None]:
# Number of resumes per encoded label.
df['Category'].value_counts()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Turn every cleaned resume into a TF-IDF vector, with English stop words
# removed and the vocabulary capped at 5000 terms. The vectorizer is fitted on
# every resume in df, i.e. before any train/validation/test split is made.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
requredTaxt = tfidf.fit_transform(df['Resume_str'])

In [None]:
# Display the feature matrix's repr as a quick sanity check.
requredTaxt

## Train Test Data and Modeling

In [None]:
from sklearn.model_selection import train_test_split

# Split the *text* first (70% train / 15% validation / 15% test, stratified by
# category) and only then build TF-IDF features. A vectorizer fitted on the
# whole corpus lets validation and test documents shape the vocabulary and IDF
# weights, which makes the scores reported later optimistic.
text_train, text_temp, y_train, y_temp = train_test_split(
    df['Resume_str'], df['Category'],
    test_size=0.3, random_state=42, stratify=df['Category'])
text_val, text_test, y_val, y_test = train_test_split(
    text_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Re-fit the existing vectorizer (same settings) on the training text only and
# reuse it, unchanged, for the other two splits. After this cell `tfidf` is the
# vectorizer that matches the features the models are trained on.
X_train = tfidf.fit_transform(text_train)
X_val = tfidf.transform(text_val)
X_test = tfidf.transform(text_test)

print("Training set size:", X_train.shape)
print("Validation set size:", X_val.shape)
print("Test set size:", X_test.shape)


In [None]:
# (documents, features) of the training matrix.
X_train.shape

In [None]:
# (documents, features) of the test matrix.
X_test.shape

In [None]:
# Parallel lists filled in as each model is evaluated: model_lis[i] is a
# model's name and accuracy_lis[i] its validation accuracy in percent.
model_lis = []
accuracy_lis = []

# Training and Evaluating Multiple Machine Learning Models

In order to build a robust NLP-based resume classification system, it's beneficial to train and compare the performance of multiple machine learning models. This approach allows us to select the best model based on performance metrics such as accuracy.

Models Considered:

1. K-Nearest Neighbors (KNN) Classifier: A non-parametric, instance-based learning algorithm that labels a sample according to the classes of its nearest training samples.
2. Logistic Regression: A linear model for classification that predicts the probability of a categorical dependent variable.
3. Random Forest Classifier: An ensemble learning method that constructs a multitude of decision trees at training time and outputs the class that is the mode of the classes predicted by the individual trees.
4. Multinomial Naive Bayes: A probabilistic learning method that is particularly suited for text classification tasks.

Evaluation Metric:

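The models are compared using the accuracy score, which is the proportion of correct predictions over all predictions made, measured on the validation set. Weighted precision, recall and F1 on the test set are also reported for each model.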

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, trained on the training split.
ml= MultinomialNB()
ml.fit(X_train, y_train)

In [None]:
# Accuracy on the validation split. y_val_pred is read again by the following
# cell when it records this model's accuracy.
y_val_pred = ml.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
# Test-split metrics for Multinomial Naive Bayes. Precision, recall and F1 are
# support-weighted averages over the classes; zero_division=0 is passed to all
# of them so an undefined per-class score counts as 0 without a warning.
y_test_pred = ml.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred))

print("precision:",precision_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("recall:",recall_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("f1:",f1_score(y_test, y_test_pred, average='weighted',zero_division=0))
print(classification_report(y_test, y_test_pred,zero_division=0))

# Record the validation accuracy (in percent) for the comparison chart. An
# existing entry is updated rather than appended to, so re-running this cell
# does not create duplicate rows.
model_name = "MultinomialNB"
val_accuracy = accuracy_score(y_val, y_val_pred)*100
if model_name in model_lis:
    accuracy_lis[model_lis.index(model_name)] = val_accuracy
else:
    model_lis.append(model_name)
    accuracy_lis.append(val_accuracy)

In [None]:
from sklearn.linear_model import LogisticRegression
# classification_report and accuracy_score are already imported in the first cell;
# this line only repeats that import.
from sklearn.metrics import classification_report, accuracy_score
# Logistic regression trained on the training split, allowing the solver up to
# 1000 iterations.
LR = LogisticRegression(max_iter=1000)
LR.fit(X_train, y_train)

In [None]:
# Accuracy on the validation split. y_val_pred is read again by the following
# cell when it records this model's accuracy.
y_val_pred = LR.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
# Test-split metrics for logistic regression. Precision, recall and F1 are
# support-weighted averages over the classes; zero_division=0 is passed to all
# of them so an undefined per-class score counts as 0 without a warning.
y_test_pred = LR.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred))

print("precision:",precision_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("recall:",recall_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("f1:",f1_score(y_test, y_test_pred, average='weighted',zero_division=0))
print(classification_report(y_test, y_test_pred,zero_division=0))

# Record the validation accuracy (in percent) for the comparison chart. An
# existing entry is updated rather than appended to, so re-running this cell
# does not create duplicate rows.
model_name = "LogisticRegression"
val_accuracy = accuracy_score(y_val, y_val_pred)*100
if model_name in model_lis:
    accuracy_lis[model_lis.index(model_name)] = val_accuracy
else:
    model_lis.append(model_name)
    accuracy_lis.append(val_accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with default settings, trained on the training split.
clf=KNeighborsClassifier()
clf.fit(X_train,y_train)

In [None]:
# Accuracy on the validation split. y_val_pred is read again by the following
# cell when it records this model's accuracy.
y_val_pred = clf.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
# Test-split metrics for k-nearest neighbours. Precision, recall and F1 are
# support-weighted averages over the classes; zero_division=0 is passed to all
# of them so an undefined per-class score counts as 0 without a warning.
y_test_pred = clf.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred))

print("precision:",precision_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("recall:",recall_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("f1:",f1_score(y_test, y_test_pred, average='weighted',zero_division=0))
print(classification_report(y_test, y_test_pred,zero_division=0))

# Record the validation accuracy (in percent) for the comparison chart. An
# existing entry is updated rather than appended to, so re-running this cell
# does not create duplicate rows.
model_name = "KNeighborsClassifier"
val_accuracy = accuracy_score(y_val, y_val_pred)*100
if model_name in model_lis:
    accuracy_lis[model_lis.index(model_name)] = val_accuracy
else:
    model_lis.append(model_name)
    accuracy_lis.append(val_accuracy)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the trained model, its scores and the pickled file are the
# same on every run; 42 matches the seed used for the data split.
RFC = RandomForestClassifier(random_state=42)
RFC.fit(X_train, y_train)

In [None]:
# Accuracy on the validation split. y_val_pred is read again by the following
# cell when it records this model's accuracy.
y_val_pred = RFC.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_val_pred))

In [None]:
# Test-split metrics for the random forest. Precision, recall and F1 are
# support-weighted averages over the classes; zero_division=0 is passed to all
# of them so an undefined per-class score counts as 0 without a warning.
y_test_pred = RFC.predict(X_test)
print("Accuracy:",accuracy_score(y_test, y_test_pred))

print("precision:",precision_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("recall:",recall_score(y_test, y_test_pred, average='weighted',zero_division=0))
print("f1:",f1_score(y_test, y_test_pred, average='weighted',zero_division=0))
print(classification_report(y_test, y_test_pred,zero_division=0))

# Record the validation accuracy (in percent) for the comparison chart. An
# existing entry is updated rather than appended to, so re-running this cell
# does not create duplicate rows.
model_name = "RandomForestClassifier"
val_accuracy = accuracy_score(y_val, y_val_pred)*100
if model_name in model_lis:
    accuracy_lis[model_lis.index(model_name)] = val_accuracy
else:
    model_lis.append(model_name)
    accuracy_lis.append(val_accuracy)

In [None]:
# Confusion matrix for whatever y_test_pred currently holds; when the notebook is
# run top to bottom that is the random forest's test predictions.
print(confusion_matrix(y_test,y_test_pred))

# Summarization of the models and accuracy

In [None]:
# One row per evaluated model. The stored values are validation accuracies that
# were already multiplied by 100 when they were recorded.
accuracy_data = pd.DataFrame({'model': model_lis, 'accuracy': accuracy_lis})

# Plot the data
plt.figure(figsize=(8, 6))
plt.bar(accuracy_data['model'], accuracy_data['accuracy'])
plt.xlabel('Model Name')
plt.ylabel('Validation accuracy (%)')  # say which split and which unit the bars show
plt.xticks(rotation=45)
plt.title('Overview of the models and accuracy')
plt.show()

# Pickle File

In [None]:
import pickle

# Persist the fitted objects. The classifier alone cannot score a new resume:
# the text must first go through the same fitted TF-IDF vectorizer, and the
# predicted ids only map back to category names via the fitted label encoder.
# NOTE(review): the random forest is saved regardless of which model scored
# best in the comparison; confirm that is the intended choice.
for artifact, path in ((RFC, 'model.pkl'), (tfidf, 'tfidf.pkl'), (le, 'label_encoder.pkl')):
    # The context manager flushes and closes the file even if pickling fails.
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)

In [None]:
# Look up one resume's (cleaned) text by its ID.
ID_to_find = 22323967
matching_resumes = df.loc[df['ID'] == ID_to_find, 'Resume_str']

if matching_resumes.empty:
    # Report an unknown ID explicitly instead of failing with an IndexError
    # from indexing an empty result.
    resume_str_value = None
    print(f"No resume found with ID {ID_to_find}")
else:
    resume_str_value = matching_resumes.iloc[0]
    print(resume_str_value)