In [26]:
import pandas as pd
from datasets import load_dataset
from processor import Preprocessor
from similarity import SentenceTransformerSimilarity, BertSimilarity, TFIDFSimilarity
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Dataset identifier handed to `load_dataset` in the next cell.
data_path = "cnamuangtoun/resume-job-description-fit"

In [3]:
# Fetch the dataset, then keep handles to its train and test splits.
datasets = load_dataset(data_path)
train_dataset, test_dataset = datasets["train"], datasets["test"]

In [4]:
# Build the text preprocessor and the three similarity scorers once, at module
# level; `compute_similarity` below reads them as globals for every example.
preprocessor = Preprocessor()
sentence_transformer = SentenceTransformerSimilarity()
bert_similarity = BertSimilarity()
tfidf_similarity = TFIDFSimilarity()

In [5]:
def compute_similarity(example):
    """Preprocess one resume / job-description pair and score it with each model.

    Args:
        example: Mapping with the keys "resume_text", "job_description_text"
            and "label". "label" is normally the raw string label; a value
            that has already been binarised (int/bool) is accepted as well so
            that mapping an already-processed dataset again does not silently
            turn every label into 1.

    Returns:
        dict with the preprocessed texts ("resume", "job_description"), the
        binary "label" (0 for "No Fit", 1 for anything else) and one score per
        model ("tfidf_similarity", "bert_similarity",
        "sentence_transformer_similarity").
    """
    resume = preprocessor.preprocess(example["resume_text"])
    job_description = preprocessor.preprocess(example["job_description_text"])

    raw_label = example["label"]
    if isinstance(raw_label, str):
        label = 0 if raw_label == "No Fit" else 1
    else:
        # The returned "label" overwrites the input column, so a second pass
        # sees 0/1 here. Comparing that to "No Fit" would always yield 1;
        # keep the existing binary value instead.
        label = int(bool(raw_label))

    tfidf = tfidf_similarity.similarity(resume, job_description)
    bert = bert_similarity.similarity(resume, job_description)
    transformer = sentence_transformer.similarity(resume, job_description)

    return {
        "resume": resume, "job_description": job_description, "label": label,
        "tfidf_similarity": tfidf, "bert_similarity": bert, "sentence_transformer_similarity": transformer
    }

In [6]:
# Score every test example with all three models (the recorded run took
# roughly 19 minutes for 1759 rows). NOTE: this rebinds `test_dataset`, so
# running the cell again operates on the already-mapped dataset.
test_dataset = test_dataset.map(compute_similarity)

Map: 100%|██████████| 1759/1759 [18:43<00:00,  1.56 examples/s]


In [14]:
# Inspect the mapped dataset: the preprocessed-text and similarity columns
# should now be listed alongside the original ones.
test_dataset

Dataset({
    features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity'],
    num_rows: 1759
})

In [13]:
def convert_to_label(example, threshold=0.5):
    """Turn the three similarity scores of one example into binary predictions.

    Args:
        example: Mapping with the keys "tfidf_similarity", "bert_similarity"
            and "sentence_transformer_similarity".
        threshold: Decision cutoff. A score strictly below it becomes 0,
            anything else becomes 1. Defaults to 0.5, the value that was
            previously hard-coded. When used with `Dataset.map`, a different
            cutoff can be supplied through `fn_kwargs={"threshold": ...}`.

    Returns:
        dict with the keys "tfidf", "bert" and "transformer", each 0 or 1.
    """
    def to_binary(score):
        # Same comparison as before: only `score < threshold` maps to 0.
        return 0 if score < threshold else 1

    return {
        "tfidf": to_binary(example["tfidf_similarity"]),
        "bert": to_binary(example["bert_similarity"]),
        "transformer": to_binary(example["sentence_transformer_similarity"]),
    }
    

In [15]:
# Add the thresholded 0/1 prediction columns ("tfidf", "bert", "transformer")
# to every row of the test set.
test_dataset = test_dataset.map(convert_to_label)

Map: 100%|██████████| 1759/1759 [00:00<00:00, 13685.32 examples/s]


In [16]:
# Inspect the dataset again: the three binary prediction columns should now
# appear after the similarity columns.
test_dataset

Dataset({
    features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity', 'tfidf', 'bert', 'transformer'],
    num_rows: 1759
})

In [32]:
# Pull the ground-truth column and the three prediction columns out of the
# dataset in one pass (same read order as the column names listed here).
actual, bert_predict, tfidf_predict, transformer_predict = (
    test_dataset[column] for column in ("label", "bert", "tfidf", "transformer")
)

In [33]:
def evaluate(actual, predict):
    """Compute the four binary-classification metrics for one set of predictions.

    Args:
        actual: Ground-truth binary labels.
        predict: Predicted binary labels, aligned with `actual`.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    return {
        "accuracy": accuracy_score(actual, predict),
        # zero_division=0 keeps the 0.0 result scikit-learn already returns
        # when a ratio is undefined (e.g. a model that never predicts the
        # positive class) but without emitting UndefinedMetricWarning.
        "precision": precision_score(actual, predict, zero_division=0),
        "recall": recall_score(actual, predict, zero_division=0),
        "f1": f1_score(actual, predict, zero_division=0),
    }

In [34]:
# Score each model's predictions against the same ground truth.
bert_evaluate, tfidf_evaluate, transformer_evaluate = (
    evaluate(actual, predictions)
    for predictions in (bert_predict, tfidf_predict, transformer_predict)
)

In [35]:
# Print the metric dict of each model.
# NOTE(review): in the recorded output BERT has recall 1.0 with precision equal
# to accuracy, and TF-IDF has precision/recall 0.0 — this looks like the fixed
# 0.5 cutoff makes BERT predict every pair as a fit and TF-IDF none; confirm
# against the score distributions before drawing conclusions.
print("bert evaluate : ", bert_evaluate)
print("tfidf evaluate : ", tfidf_evaluate)
print("transformer evaluate : ", transformer_evaluate)

bert evaluate :  {'accuracy': 0.5127913587265491, 'precision': 0.5127913587265491, 'recall': 1.0, 'f1': 0.6779406238256295}
tfidf evaluate :  {'accuracy': 0.4866401364411598, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
transformer evaluate :  {'accuracy': 0.5548607163160887, 'precision': 0.5572666025024061, 'recall': 0.6419068736141907, 'f1': 0.5965996908809892}


In [36]:
# Collect the per-model metric dicts under their display names; each model
# becomes a column and each metric a row of the resulting table.
data = dict(zip(
    ("BERT", "TF-IDF", "Transformer"),
    (bert_evaluate, tfidf_evaluate, transformer_evaluate),
))
dataframe = pd.DataFrame(data)

In [37]:
# Render the comparison table (rows: metrics, columns: models).
dataframe

Unnamed: 0,BERT,TF-IDF,Transformer
accuracy,0.512791,0.48664,0.554861
precision,0.512791,0.0,0.557267
recall,1.0,0.0,0.641907
f1,0.677941,0.0,0.5966


In [39]:
# Persist the comparison table to "evaluation.csv" (relative to the current
# working directory); the metric names are written as the index column.
dataframe.to_csv("evaluation.csv")