In [1]:
import re
import math
import nltk
import pandas as pd
import mysql.connector
from datetime import date
from html import unescape
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
def preprocessing(data):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopword = StopWordRemoverFactory().create_stop_word_remover()

    def cleansing(row):
        text = re.sub(r'&[^\s;&]+;', '', unescape(row['text']))
        html_pattern = re.compile('<.*?>')
        text = html_pattern.sub(r' ', text)
        text = re.sub(
            r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', text)
        return text
    
    def caseFolding(row):
        text = row['text'].lower()
        return text

    def tokenizing(row):
        tokenized = word_tokenize(str(row['text']))
        return tokenized

    def stemming(row):
        stemmed = [stemmer.stem(token) for token in row['text']]
        stemmed = " ".join(stemmed)
        return stemmed

    def stopwording(row):
        stopworded = stopword.remove(row['text'])
        return stopworded

    data['text'] = data.apply(cleansing, axis=1)
    data['text'] = data.apply(caseFolding, axis=1)
    data['text'] = data.apply(tokenizing, axis=1)
    data['text'] = data.apply(stemming, axis=1)
    data['text'] = data.apply(stopwording, axis=1)

    return data

In [3]:
vacancies = [
    "Lowongan Kerja Web Developer di Perusahaan ABC",
    "Dibutuhkan Software Engineer untuk Proyek Inovatif",
    "Lowongan Kerja Data Analyst dengan Gaji Menarik",
    "Dicari UI/UX Designer berpengalaman untuk Startup",
    "Lowongan Kerja Mobile App Developer Full-time",
    "Dibutuhkan Software Engineer dengan pengalaman di bidang pengembangan web.",
    "Dicari UI/UX Designer yang kreatif dan berpengalaman.",
    "Perusahaan mencari Data Scientist untuk menganalisis data dan membuat model prediktif.",
    "Dibutuhkan Digital Marketing Specialist untuk mengelola kampanye pemasaran online.",
    "Perusahaan mencari Product Manager yang memiliki pengalaman di industri teknologi.",
    "Dicari Content Writer yang kreatif dan mampu menulis konten berkualitas.",
    "Perusahaan membutuhkan Frontend Developer untuk mengembangkan antarmuka pengguna.",
    "Dibutuhkan HR Manager dengan pengalaman dalam manajemen sumber daya manusia.",
    "Dicari Graphic Designer yang mampu menciptakan desain visual yang menarik.",
    "Perusahaan mencari Business Analyst untuk menganalisis kebutuhan bisnis.",
    "Dibutuhkan Sales Executive untuk menjalin hubungan bisnis dengan klien.",
    "Dicari Full Stack Developer yang memiliki pengetahuan luas tentang teknologi terkini.",
    "Perusahaan membutuhkan Customer Service Representative yang ramah dan efisien.",
    "Dibutuhkan Project Manager untuk mengelola proyek dan tim dengan baik.",
    "Dicari Social Media Specialist untuk mengelola kehadiran online perusahaan.",
    "Perusahaan mencari Android Developer yang berpengalaman dalam pengembangan aplikasi.",
    "Dibutuhkan Accountant untuk mengelola keuangan perusahaan.",
    "Dicari Network Administrator yang memiliki pemahaman tentang jaringan komputer.",
    "Perusahaan membutuhkan Legal Counsel untuk memberikan nasihat hukum.",
    "Dibutuhkan UI/UX Researcher untuk mengumpulkan dan menganalisis data pengguna."
]
itemPerPage = 5
page = 1
minWeight = 0.15
keyword = "UI/UX"

In [4]:
dframeKeyword = pd.DataFrame([keyword], columns=["text"])
dframeVacancies = pd.DataFrame(vacancies, columns=["text"])
dframeVacancies = dframeVacancies.dropna()

dframeKeyword = preprocessing(dframeKeyword)
dframeVacancies = preprocessing(dframeVacancies)

dframeKeyword

Unnamed: 0,text
0,ui ux


In [5]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(dframeVacancies['text'])
new_tfidf_vector = vectorizer.transform(dframeKeyword["text"])

vectorizerWithoutNorm = TfidfVectorizer(norm=None)
tfidf_matrixWithoutNorm = vectorizerWithoutNorm.fit_transform(dframeVacancies['text'])
new_tfidf_vectorWithoutNorm = vectorizerWithoutNorm.transform(dframeKeyword["text"])

countVectorizer = CountVectorizer()
tf = countVectorizer.fit_transform(dframeVacancies['text'])
tf_keyword = countVectorizer.transform(dframeKeyword["text"])

document_frequencies = tfidf_matrix.astype(bool).sum(axis=0).A1
words = vectorizer.get_feature_names_out()
idf_values = vectorizer.idf_

document_frequenciesWithoutNorm = tfidf_matrixWithoutNorm.astype(bool).sum(axis=0).A1
wordsWithoutNorm = vectorizerWithoutNorm.get_feature_names_out()
idf_valuesWithoutNorm = vectorizerWithoutNorm.idf_


In [6]:
print('Jumlah dokumen dengan dan tanpa Norm L2:', tf.shape[0])
print('Jumlah term dengan dan tanpa Norm L2:', tf.shape[1])
print('Daftar Term dengan dan tanpa Norm L2:', countVectorizer.get_feature_names_out())

Jumlah dokumen dengan dan tanpa Norm L2: 25
Jumlah term dengan dan tanpa Norm L2: 100
Daftar Term dengan dan tanpa Norm L2: ['abc' 'accountant' 'administrator' 'alam' 'analis' 'analyst' 'android'
 'antarmuka' 'aplikasi' 'app' 'baik' 'beri' 'bidang' 'bisnis' 'buat'
 'business' 'butuh' 'cari' 'cipta' 'content' 'counsel' 'customer' 'data'
 'daya' 'desain' 'designer' 'developer' 'digital' 'efisien' 'engineer'
 'executive' 'frontend' 'full' 'gaji' 'graphic' 'hadir' 'hr' 'hubung'
 'hukum' 'industri' 'inovatif' 'jalin' 'jaring' 'kampanye' 'kelola'
 'kembang' 'kerja' 'kini' 'klien' 'komputer' 'konten' 'kreatif' 'kualitas'
 'kumpul' 'legal' 'lowong' 'luas' 'mampu' 'manager' 'manajemen' 'manusia'
 'marketing' 'media' 'milik' 'mobile' 'model' 'nasihat' 'network' 'online'
 'paham' 'pasar' 'prediktif' 'product' 'project' 'proyek' 'ramah'
 'representative' 'researcher' 'sales' 'scientist' 'service' 'social'
 'software' 'specialist' 'stack' 'startup' 'sumber' 'tahu' 'tarik'
 'teknologi' 'tim' 'time' 

In [7]:
print('Matriks TF dengan dan tanpa Norm L2:')
pd.DataFrame(tf.toarray(), columns=countVectorizer.get_feature_names_out())

Matriks TF dengan dan tanpa Norm L2:


Unnamed: 0,abc,accountant,administrator,alam,analis,analyst,android,antarmuka,aplikasi,app,...,tim,time,tulis,uang,ui,usaha,ux,visual,web,writer
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
7,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
dframeWords = pd.DataFrame(words, columns=["Word"])
dframeDF = pd.DataFrame(document_frequencies, columns=["DF"])
dframeIDF = pd.DataFrame(idf_values, columns=["IDF"])
pd.concat([dframeWords, dframeDF, dframeIDF], axis=1, join='inner')


Unnamed: 0,Word,DF,IDF
0,abc,1,3.564949
1,accountant,1,3.564949
2,administrator,1,3.564949
3,alam,6,2.312186
4,analis,3,2.871802
...,...,...,...
95,usaha,10,1.860201
96,ux,3,2.871802
97,visual,1,3.564949
98,web,2,3.159484


In [9]:
dframeWordsWithoutNorm = pd.DataFrame(wordsWithoutNorm, columns=["Word"])
dframeDFWithoutNorm = pd.DataFrame(document_frequenciesWithoutNorm, columns=["DF"])
dframeIDFWithoutNorm = pd.DataFrame(idf_valuesWithoutNorm, columns=["IDF"])
pd.concat([dframeWordsWithoutNorm, dframeDFWithoutNorm, dframeIDFWithoutNorm], axis=1, join='inner')


Unnamed: 0,Word,DF,IDF
0,abc,1,3.564949
1,accountant,1,3.564949
2,administrator,1,3.564949
3,alam,6,2.312186
4,analis,3,2.871802
...,...,...,...
95,usaha,10,1.860201
96,ux,3,2.871802
97,visual,1,3.564949
98,web,2,3.159484


In [10]:
print('TFIDF dengan Norm L2:')
pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

TFIDF dengan Norm L2:


Unnamed: 0,abc,accountant,administrator,alam,analis,analyst,android,antarmuka,aplikasi,app,...,tim,time,tulis,uang,ui,usaha,ux,visual,web,writer
0,0.510693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.266481,0.0,0.0,0.452609,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.416968,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.341132,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.423696,0.0,0.423696,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42402,...,0.0,0.42402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.300725,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.410926,0.0
6,0.0,0.0,0.0,0.351729,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.436858,0.0,0.436858,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.2891,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.187264,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.289373,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.232807,0.0,0.0,0.0,0.0


In [11]:
print('TFIDF tanpa Norm L2:')
pd.DataFrame(tfidf_matrixWithoutNorm.toarray(), columns=vectorizerWithoutNorm.get_feature_names_out())

TFIDF tanpa Norm L2:


Unnamed: 0,abc,accountant,administrator,alam,analis,analyst,android,antarmuka,aplikasi,app,...,tim,time,tulis,uang,ui,usaha,ux,visual,web,writer
0,3.564949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.860201,0.0,0.0,3.159484,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,3.159484,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.312186,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.871802,0.0,2.871802,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.564949,...,0.0,3.564949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,2.312186,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.159484,0.0
6,0.0,0.0,0.0,2.312186,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.871802,0.0,2.871802,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,2.871802,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.860201,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,2.312186,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.860201,0.0,0.0,0.0,0.0


In [14]:
similarity_scores = cosine_similarity(new_tfidf_vector, tfidf_matrix)

vacanciesWeighted = similarity_scores[0]                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             
sortedIndexVacancies = vacanciesWeighted.argsort()[::-1]

vacanciesWeighted

array([0.        , 0.        , 0.        , 0.59919644, 0.        ,
       0.        , 0.61780983, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.51883981])

In [23]:
print(sortedIndexVacancies)
pd.DataFrame(vacanciesWeighted, columns=["Skor"]).transpose()

[ 6  3 24 11  1  2  4  5  7  8  9 10 12 23 13 14 15 16 17 18 19 20 21 22
  0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Skor,0.0,0.0,0.0,0.599196,0.0,0.0,0.61781,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.51884


In [28]:
for num, index in enumerate(sortedIndexVacancies):
    print(num+1, vacancies[index])

1 Dicari UI/UX Designer yang kreatif dan berpengalaman.
2 Dicari UI/UX Designer berpengalaman untuk Startup
3 Dibutuhkan UI/UX Researcher untuk mengumpulkan dan menganalisis data pengguna.
4 Perusahaan membutuhkan Frontend Developer untuk mengembangkan antarmuka pengguna.
5 Dibutuhkan Software Engineer untuk Proyek Inovatif
6 Lowongan Kerja Data Analyst dengan Gaji Menarik
7 Lowongan Kerja Mobile App Developer Full-time
8 Dibutuhkan Software Engineer dengan pengalaman di bidang pengembangan web.
9 Perusahaan mencari Data Scientist untuk menganalisis data dan membuat model prediktif.
10 Dibutuhkan Digital Marketing Specialist untuk mengelola kampanye pemasaran online.
11 Perusahaan mencari Product Manager yang memiliki pengalaman di industri teknologi.
12 Dicari Content Writer yang kreatif dan mampu menulis konten berkualitas.
13 Dibutuhkan HR Manager dengan pengalaman dalam manajemen sumber daya manusia.
14 Perusahaan membutuhkan Legal Counsel untuk memberikan nasihat hukum.
15 Dicari 

In [32]:
similar_docs_indices = similarity_scores.argsort()[0][::-1]
recommended_docs = [vacancies[index] for index in similar_docs_indices]

similar_docs_indices, recommended_docs

(array([ 6,  3, 24, 11,  1,  2,  4,  5,  7,  8,  9, 10, 12, 23, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22,  0], dtype=int64),
 ['Dicari UI/UX Designer yang kreatif dan berpengalaman.',
  'Dicari UI/UX Designer berpengalaman untuk Startup',
  'Dibutuhkan UI/UX Researcher untuk mengumpulkan dan menganalisis data pengguna.',
  'Perusahaan membutuhkan Frontend Developer untuk mengembangkan antarmuka pengguna.',
  'Dibutuhkan Software Engineer untuk Proyek Inovatif',
  'Lowongan Kerja Data Analyst dengan Gaji Menarik',
  'Lowongan Kerja Mobile App Developer Full-time',
  'Dibutuhkan Software Engineer dengan pengalaman di bidang pengembangan web.',
  'Perusahaan mencari Data Scientist untuk menganalisis data dan membuat model prediktif.',
  'Dibutuhkan Digital Marketing Specialist untuk mengelola kampanye pemasaran online.',
  'Perusahaan mencari Product Manager yang memiliki pengalaman di industri teknologi.',
  'Dicari Content Writer yang kreatif dan mampu menulis konten berkualitas.'

In [96]:
ground_truth_list = [6, 3, 24]

22


(3, 22, 0, 22)

In [109]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score
# Menghitung confusion matrix
tn, fp, fn, tp = confusion_matrix([6, 3, 24], similar_docs_indices).ravel()


# Menampilkan hasil pengujian
print("Confusion Matrix:")
print(tn, fp, fn, tp)
print("Precision:", precision)
print("Recall:", recall)

ValueError: Found input variables with inconsistent numbers of samples: [3, 25]