In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
tokenizer = BertTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = TFBertModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

2023-11-20 20:16:22.734023: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2023-11-20 20:16:22.734048: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2023-11-20 20:16:22.734060: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2023-11-20 20:16:22.734272: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-20 20:16:22.734525: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
I

In [4]:
vacancy_data = pd.read_csv('data/job_list_all.csv')

### Select Features from the Vacancy Data

Selected Feature from Vacancy Data that will be used in the recommendation system:
- Job Title
- Skills Requiered

In [5]:
job_title = vacancy_data['Job_title']
skill_required = vacancy_data['Skills'].fillna('')

In [6]:
vacancy_data[['Job_title', 'Skills']]

Unnamed: 0,Job_title,Skills
0,Perawat,"Dapat Berkomunikasi Secara Baik, Berorientasi ..."
1,Sales Sosial Media,
2,Fashion Designer,"Trend Forecasting, Pattern Making, Fashion Sty..."
3,RPA Software Developer (UIPATH),
4,SEO Specialist,"Search Engine Optimization (SEO), Seo/sem, SEO..."
...,...,...
2965,Sales Executive,
2966,Drafter,"2D & 3D Drafter, Construction, Arsitek & Draft..."
2967,3D Animator,"Character Animation, Target Oriented, 3D Animator"
2968,Crew Outlet,"Jujur, teliti, dan rajin, Bekerja Sama Secara ..."


### Generate Embedding Vector from Vacancy Data Features

Job Title

In [7]:
job_title_embedding = tokenizer(list(job_title), padding=True, truncation=True, return_tensors="tf")

job_title_output = model(**job_title_embedding)
job_title_embedding = tf.reduce_mean(job_title_output.last_hidden_state, axis=1)

print(job_title_embedding)

tf.Tensor(
[[ 0.09893861  0.6709868  -0.05763523 ... -0.09025701 -0.05610005
   0.09990735]
 [ 0.06790408  0.29846144 -0.03877079 ... -0.05968623 -0.20636159
   0.39817366]
 [-0.18341003  0.81219375  0.13489076 ... -0.6137716  -0.19563222
   0.21891122]
 ...
 [-0.33673918  0.46139348  0.12747812 ... -0.16478837 -0.06735568
   0.23283784]
 [ 0.10142635  0.6584376   0.13544369 ...  0.1945655   0.37138313
   0.26483524]
 [-0.02047656  0.60060596 -0.08322085 ... -0.25413296  0.06816447
   0.35279298]], shape=(2970, 384), dtype=float32)


### User Dataset

In [16]:
user_data = pd.read_csv('data/user_data.csv')


### Select Features from the User Data

Selected Feature from User Data that will be used in the recommendation system:
- Summary
- Specialities
- Preference

In [17]:
preference = user_data.loc[39,['Preference']].fillna('')
print(preference)

Preference    Screenwriter, Film Producer
Name: 39, dtype: object


In [12]:
'''
summary = user_data['Summary'].fillna('')
specialities = user_data['Specialities'].fillna('')
preference = user_data['Preference'].fillna('')
'''
user_data_pivot = user_data.loc[0, ['Specialities', 'Preference']].fillna('')
user_data_text = ', '.join(user_data_pivot)
print(user_data_pivot)
print(user_data_text)

Specialities      Pyhton, Machine Learning, Front-end Engineering
Preference      Machine Learning Engineer, Data Engineer, AI E...
Name: 0, dtype: object
Pyhton, Machine Learning, Front-end Engineering, Machine Learning Engineer, Data Engineer, AI Engineer


### Generate Embedding Vector from User Data Features

In [18]:
preference_embedding = tokenizer(list(preference), padding=True, truncation=True, return_tensors="tf")

preference_output = model(**preference_embedding)
preference_embedding = tf.reduce_mean(preference_output.last_hidden_state, axis=1)

print(preference_embedding)

tf.Tensor(
[[-3.18766922e-01 -2.05217168e-01 -1.48510754e-01  3.17142725e-01
  -1.86311170e-01  1.98808908e-01  3.02221477e-01  1.82692513e-01
   4.14625168e-01  6.83556795e-02  7.62336999e-02 -1.74791798e-01
  -1.26591414e-01  5.23503534e-02 -6.57366991e-01  1.28093228e-01
  -9.11315531e-02 -1.78633141e-03  3.28003019e-02 -7.52546549e-01
  -2.20756143e-01  1.52366757e-01  1.63122043e-01  5.78575991e-02
   4.30588216e-01 -3.72516006e-01  3.33673954e-02  1.27522081e-01
  -3.71765316e-01 -3.51653516e-01 -4.16352749e-01  2.68474191e-01
   1.21864140e-01 -2.31091380e-01  2.32149839e-01  8.75296816e-02
  -2.71304369e-01  3.02366972e-01 -2.96890318e-01 -5.44056177e-01
   9.39824283e-02 -1.05612122e-01  9.23461616e-02  5.04346788e-02
   2.01043934e-01 -5.52764498e-02 -3.73398280e-03 -1.99091107e-01
  -4.88896370e-01  5.52245915e-01 -2.11830422e-01  6.45608529e-02
  -1.81081265e-01 -4.41145986e-01  2.91735411e-01 -3.52525085e-01
   6.24299496e-02  4.37565744e-02  3.47053766e-01 -2.41071641e-01

In [15]:
user_data_text_embedding = tokenizer(list(user_data_text), padding=True, truncation=True, return_tensors="tf")

user_data_text_output = model(**user_data_text_embedding)
user_data_text_embedding = tf.reduce_mean(user_data_text_output.last_hidden_state, axis=1)

print(user_data_text_embedding)

tf.Tensor(
[[-0.2821684   0.17624943 -0.17334244 ... -0.22395012 -0.37560448
   0.08452722]
 [-0.42910144  0.08270818 -0.07904025 ...  0.435364    0.37753874
  -0.24304257]
 [-0.3762169   0.61999106 -0.2830522  ...  0.3662709   0.09408029
   0.32485762]
 ...
 [-0.09205972  0.31469434  0.24819134 ... -0.0672051  -0.02212937
   0.07603661]
 [-0.09205972  0.31469434  0.24819134 ... -0.0672051  -0.02212937
   0.07603661]
 [-0.34301218  0.00452676 -0.4503737  ...  0.40050817  0.27370155
  -0.18284445]], shape=(102, 384), dtype=float32)


### Searching Similarity
I'm using cosine similarity to searching the similarity between both vectorized embedding text

In [19]:
similarities = cosine_similarity(job_title_embedding, preference_embedding)

In [20]:
top_10_indices = np.argsort(similarities, axis=1)[:, -10:]

In [25]:
top_10_indices = np.argsort(similarities.flatten())[-10:][::-1]
top_10_values = similarities[top_10_indices]

# Menampilkan hasil
for i, (index, value) in enumerate(zip(top_10_indices, top_10_values)):
    print(f"{i+1}. Index: {index}, Similarity: {value}\n{vacancy_data.iloc[index]}")

1. Index: 927, Similarity: [0.4563548]
Company                         Djalaluddin Pane Foundation (DPF)
Job_title                                     Editor Video & Film
Location         Medan Selayang, Medan, Sumatera Utara, Indonesia
Salary                          Perusahaan tidak menampilkan gaji
Work Type                                             Penuh Waktu
Skills          Graphic Design, Fotografi & Videografi, Video ...
Descriptions                                                  NaN
Links           https://glints.com/id/opportunities/jobs/edito...
Name: 927, dtype: object
2. Index: 287, Similarity: [0.438792]
Company                                                 Ex1st Com
Job_title                         video editor dan konten kreator
Location            Medan Barat, Medan, Sumatera Utara, Indonesia
Salary                                            Rp1,5jt-Rp3,5jt
Work Type                                             Penuh Waktu
Skills                                  