In [1]:
import os
from typing import Tuple, List

notebook_dir = os.path.abspath(os.getcwd())
project_root = os.path.dirname(notebook_dir)

csv_path = os.path.join(project_root, "data/processed/processed_resume.csv")

In [70]:
import pandas as pd
import numpy as np 
import pickle

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score

In [3]:
df = pd.read_csv(csv_path)
df.isna().sum()

Unnamed: 0    0
category      0
resume        0
word_count    0
dtype: int64

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,category,resume,word_count
0,0,6,skills programming languages python pandas num...,615
1,1,6,education details may 2013 to may 2017 data sc...,136
2,2,6,areas of interest deep learning control system...,234
3,3,6,skills r python sap hana tableau sap hana sql ...,904
4,4,6,education details mca ymcaust faridabad haryan...,55


In [5]:
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,category,resume,word_count
0,6,skills programming languages python pandas num...,615
1,6,education details may 2013 to may 2017 data sc...,136
2,6,areas of interest deep learning control system...,234
3,6,skills r python sap hana tableau sap hana sql ...,904
4,6,education details mca ymcaust faridabad haryan...,55


In [18]:
vectorizer_1 = CountVectorizer()
X_one = vectorizer_1.fit_transform(df["resume"]).toarray()

vectorizer_2 = TfidfVectorizer(stop_words="english")
X_two = vectorizer_2.fit_transform(df["resume"]).toarray()

X_one.shape, X_two.shape

((962, 7272), (962, 7062))

In [19]:
X_train_one, X_test_one, y_train_one, y_test_one = train_test_split(X_one, df["category"].values, test_size=0.33)

In [20]:
X_train_two, X_test_two, y_train_two, y_test_two = train_test_split(X_one, df["category"].values, test_size=0.33)

In [21]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [28]:
## for gnb

model = gnb.fit(X_train_one, y_train_one)
y_hat_one = model.predict(X_test_one)
print("The accuracy score for count vectorized data is: ", round(accuracy_score(y_test_one, y_hat_one) * 100, 2))
print("The precision score for count vectorized data is: ", round(precision_score(y_test_one, y_hat_one, average='weighted') * 100, 2))

model = gnb.fit(X_train_two, y_train_two)
y_hat_two = model.predict(X_test_two)
print("The accuracy score for tfidf vectorized data is: ", round(accuracy_score(y_test_two, y_hat_two) * 100, 2))
print("The precision score for tfidf vectorized data is: ", round(precision_score(y_test_two, y_hat_two, average='weighted') * 100, 2))

The accuracy score for count vectorized data is:  96.86
The precision score for count vectorized data is:  97.43
The accuracy score for tfidf vectorized data is:  99.69
The precision score for tfidf vectorized data is:  99.71


In [29]:
## for mnb

model = mnb.fit(X_train_one, y_train_one)
y_hat_one = model.predict(X_test_one)
print("The accuracy score for count vectorized data is: ", round(accuracy_score(y_test_one, y_hat_one) * 100, 2))
print("The precision score for count vectorized data is: ", round(precision_score(y_test_one, y_hat_one, average='weighted') * 100, 2))

model = mnb.fit(X_train_two, y_train_two)
y_hat_two = model.predict(X_test_two)
print("The accuracy score for tfidf vectorized data is: ", round(accuracy_score(y_test_two, y_hat_two) * 100, 2))
print("The precision score for tfidf vectorized data is: ", round(precision_score(y_test_two, y_hat_two, average='weighted') * 100, 2))

The accuracy score for count vectorized data is:  96.86
The precision score for count vectorized data is:  97.57
The accuracy score for tfidf vectorized data is:  96.23
The precision score for tfidf vectorized data is:  96.86


In [30]:
## for bnb

model = bnb.fit(X_train_one, y_train_one)
y_hat_one = model.predict(X_test_one)
print("The accuracy score for count vectorized data is: ", round(accuracy_score(y_test_one, y_hat_one) * 100, 2))
print("The precision score for count vectorized data is: ", round(precision_score(y_test_one, y_hat_one, average='weighted') * 100, 2))

model = bnb.fit(X_train_two, y_train_two)
y_hat_two = model.predict(X_test_two)
print("The accuracy score for tfidf vectorized data is: ", round(accuracy_score(y_test_two, y_hat_two) * 100, 2))
print("The precision score for tfidf vectorized data is: ", round(precision_score(y_test_two, y_hat_two, average='weighted') * 100, 2))

The accuracy score for count vectorized data is:  71.38
The precision score for count vectorized data is:  86.53
The accuracy score for tfidf vectorized data is:  65.72
The precision score for tfidf vectorized data is:  88.1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Let's now check the accuracy and the precision score for other machine learning algorithms.

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)

In [39]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
}

In [58]:
def train_classifier(clf, X_train, y_train, X_test, y_test) -> tuple[float, float]:
    clf.fit(X_train, y_train)
    y_hat = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_hat) * 100
    precision = precision_score(y_test, y_hat, average="weighted") * 100
    return float(accuracy), float(precision)

In [59]:
##test 
score_acc, score_prec = train_classifier(clf=mnb, X_train=X_train_two, y_train=y_train_two, X_test=X_test_two, y_test=y_test_two)
score_acc, score_prec

(96.22641509433963, 96.85558235210668)

In [66]:
from typing import Any


def clf_metric_calculator(Xtrain, ytrain, Xtest, ytest) -> tuple[Any, Any]:
    accuracy_scores = []
    precision_scores = []

    for algo, clf in clfs.items():
        current_acc, current_prec = train_classifier(clf, Xtrain, ytrain, Xtest, ytest)
        accuracy_scores.append(current_acc)
        precision_scores.append(current_prec)
    return accuracy_scores, precision_scores

In [65]:
# For count vectorized data
accuracy_scores_cv, precision_scores_cv = clf_metric_calculator(
    Xtrain=X_train_one,
    ytrain=y_train_one,
    Xtest=X_test_one,
    ytest=y_test_one
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [68]:
import warnings
warnings.filterwarnings("ignore")

## For tf-idf vectorized data
accuracy_scores_tfidf, precision_scores_tfidf = clf_metric_calculator(
    Xtrain=X_train_two,
    ytrain=y_train_two,
    Xtest=X_test_two,
    ytest=y_test_two
)

In [67]:
performance_df_cv = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores_cv, "Precision":precision_scores_cv}).sort_values('Precision',ascending=True)
performance_df_cv

Unnamed: 0,Algorithm,Accuracy,Precision
0,SVC,10.377358,1.076896
5,AdaBoost,19.811321,10.577829
2,DT,38.050314,43.645937
1,KN,81.446541,87.988458
7,ETC,98.742138,99.245283
6,BgC,99.371069,99.425759
3,LR,100.0,100.0
4,RF,100.0,100.0
8,GBDT,100.0,100.0


In [69]:
performance_df_tfidf = pd.DataFrame({'Algorithm':clfs.keys(), "Accuracy":accuracy_scores_tfidf, "Precision":precision_scores_tfidf}).sort_values('Precision', ascending=True)
performance_df_tfidf

Unnamed: 0,Algorithm,Accuracy,Precision
0,SVC,10.377358,1.076896
5,AdaBoost,24.842767,17.739977
2,DT,34.591195,36.063649
1,KN,85.849057,88.859775
4,RF,99.056604,99.184809
3,LR,99.685535,99.716981
7,ETC,99.685535,99.716981
8,GBDT,99.685535,99.737945
6,BgC,100.0,100.0


In [76]:
model_dir = os.path.join(project_root, "models")
model = os.path.join(model_dir, "model.pkl")
vectorizer = os.path.join(model_dir, "vectorizer.pkl")

pickle.dump(lrc, open(model, "wb"))
pickle.dump(vectorizer_1, open(vectorizer, "wb"))