# Collaborative Filtering

### Import Model & Dependencies

In [26]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
#tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
#model = TFBertModel.from_pretrained("indobenchmark/indobert-base-p1")

tokenizer = BertTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = TFBertModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Vacancy Dataset

In [28]:
vacancy_data = pd.read_csv('data/job_list_all.csv')

In [None]:
vacancy_data.head()

In [None]:
vacancy_data.shape

In [None]:
vacancy_data.info()

### Select Features from the Vacancy Data

Selected Feature from Vacancy Data that will be used in the recommendation system:
- Job Title
- Skills Requiered

In [29]:
job_title = vacancy_data['Job_title']
skill_required = vacancy_data['Skills'].fillna('')

In [61]:
vacancy_data[['Job_title', 'Skills']]

Unnamed: 0,Job_title,Skills
0,Perawat,"Dapat Berkomunikasi Secara Baik, Berorientasi ..."
1,Sales Sosial Media,
2,Fashion Designer,"Trend Forecasting, Pattern Making, Fashion Sty..."
3,RPA Software Developer (UIPATH),
4,SEO Specialist,"Search Engine Optimization (SEO), Seo/sem, SEO..."
...,...,...
2965,Sales Executive,
2966,Drafter,"2D & 3D Drafter, Construction, Arsitek & Draft..."
2967,3D Animator,"Character Animation, Target Oriented, 3D Animator"
2968,Crew Outlet,"Jujur, teliti, dan rajin, Bekerja Sama Secara ..."


### Generate Embedding Vector from Vacancy Data Features

Job Title

In [30]:
job_title_embedding = tokenizer(list(job_title), padding=True, truncation=True, return_tensors="tf")

job_title_output = model(**job_title_embedding)
job_title_embedding = tf.reduce_mean(job_title_output.last_hidden_state, axis=1)

print(job_title_embedding)

tf.Tensor(
[[ 0.09893861  0.6709868  -0.05763523 ... -0.09025701 -0.05610005
   0.09990735]
 [ 0.06790408  0.29846144 -0.03877079 ... -0.05968623 -0.20636159
   0.39817366]
 [-0.18341003  0.81219375  0.13489076 ... -0.6137716  -0.19563222
   0.21891122]
 ...
 [-0.33673918  0.46139348  0.12747812 ... -0.16478837 -0.06735568
   0.23283784]
 [ 0.10142635  0.6584376   0.13544369 ...  0.1945655   0.37138313
   0.26483524]
 [-0.02047656  0.60060596 -0.08322085 ... -0.25413296  0.06816447
   0.35279298]], shape=(2970, 384), dtype=float32)


Skills Required

In [31]:
batch_size = 8  # Adjust the batch size based on your available memory
num_batches = int(np.ceil(len(skill_required) / batch_size))

# Initialize an empty list to store the embeddings
all_embeddings = []

# Process the skills in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(skill_required))
    
    # Get the current batch of skills
    current_batch = skill_required[start_idx:end_idx]

    # Tokenize and get embeddings for the current batch
    skills_required_embedding = tokenizer(list(current_batch), padding=True, truncation=True, return_tensors="tf")
    skills_required_output = model(**skills_required_embedding)
    skills_required_embedding_batch = tf.reduce_mean(skills_required_output.last_hidden_state, axis=1)

    # Append the batch embeddings to the list
    all_embeddings.append(skills_required_embedding_batch)

# Concatenate the embeddings from all batches
skills_required_embedding = tf.concat(all_embeddings, axis=0)

print(skills_required_embedding)


tf.Tensor(
[[-0.04734316 -0.04068359 -0.05341089 ... -0.0326941   0.12434901
   0.08022784]
 [-0.14200833  0.06820566  0.30239564 ...  0.05579124  0.03658259
  -0.19898558]
 [-0.36620596 -0.27244887  0.29949224 ... -0.3608912  -0.32922253
   0.33754823]
 ...
 [-0.41038263 -0.00828709  0.04550349 ... -0.10936541  0.06763899
   0.09309419]
 [-0.34337884  0.11199798 -0.09946654 ...  0.00533551 -0.30713984
   0.09499694]
 [-0.14812972 -0.07734243  0.12420789 ... -0.21625116  0.47002015
   0.10632435]], shape=(2970, 384), dtype=float32)


Combine embeddings vector from job_title and skills_required

In [32]:
''' 
vacancy_data_embedding = tf.concat([job_title_embedding, skills_required_embedding], axis=1)
print(vacancy_data_embedding)
'''

vacancy_data_embedding = job_title_embedding + skills_required_embedding
print(vacancy_data_embedding)

tf.Tensor(
[[ 5.15954420e-02  6.30303204e-01 -1.11046121e-01 ... -1.22951105e-01
   6.82489574e-02  1.80135190e-01]
 [-7.41042495e-02  3.66667092e-01  2.63624847e-01 ... -3.89498845e-03
  -1.69779003e-01  1.99188083e-01]
 [-5.49615979e-01  5.39744854e-01  4.34383005e-01 ... -9.74662781e-01
  -5.24854779e-01  5.56459427e-01]
 ...
 [-7.47121811e-01  4.53106403e-01  1.72981620e-01 ... -2.74153769e-01
   2.83300877e-04  3.25932026e-01]
 [-2.41952494e-01  7.70435572e-01  3.59771475e-02 ...  1.99901015e-01
   6.42432868e-02  3.59832168e-01]
 [-1.68606281e-01  5.23263514e-01  4.09870371e-02 ... -4.70384121e-01
   5.38184643e-01  4.59117323e-01]], shape=(2970, 384), dtype=float32)


### User Dataset

In [33]:
user_data = pd.read_csv('data/user_data.csv')


In [None]:
user_data.head()

In [None]:
user_data.shape

In [None]:
user_data.info()

### Select Features from the User Data

Selected Feature from User Data that will be used in the recommendation system:
- Summary
- Specialities
- Preference

In [62]:
'''
summary = user_data['Summary'].fillna('')
specialities = user_data['Specialities'].fillna('')
preference = user_data['Preference'].fillna('')
'''
user_data_pivot = user_data.loc[0, ['Specialities', 'Preference']].fillna('')
user_data_text = ' '.join(user_data_pivot)
print(user_data_text)

Pyhton, Machine Learning, Front-end Engineering Machine Learning Engineer, Data Engineer, AI Engineer


In [None]:
# user_data[['Summary', 'Specialities', 'Preference']].head()

### Generate Embedding Vector from User Data Features

Summary

In [None]:
summary_embedding = tokenizer(list(summary), padding=True, truncation=True, return_tensors="tf")

summary_output = model(**summary_embedding)
summary_embedding = tf.reduce_mean(summary_output.last_hidden_state, axis=1)

print(summary_embedding)

Specialities

In [44]:
specialities_embedding = tokenizer(list(specialities), padding=True, truncation=True, return_tensors="tf")

specialities_output = model(**specialities_embedding)
specialities_embedding = tf.reduce_mean(specialities_output.last_hidden_state, axis=1)

print(specialities_embedding)

tf.Tensor(
[[-0.7591557  -0.2853693  -0.04422127 ...  0.22754283  0.51835626
   0.30709255]
 [-0.37234053 -0.16718821  0.20641777 ...  0.2686259   0.04418864
   0.32030675]
 [-0.03494948  0.03984269 -0.00202686 ... -0.06028648 -0.05858291
   0.3398166 ]
 ...
 [ 0.08599003  0.43470374  0.01070081 ... -0.37930927  0.21111889
   0.43160692]
 [-0.23071201  0.15263826 -0.1491357  ...  0.2772762   0.0031032
   0.2470112 ]
 [-0.37705415 -0.2945982   0.18425955 ...  0.1156285   0.3197995
   0.20341104]], shape=(40, 384), dtype=float32)


Preference

In [45]:
preference_embedding = tokenizer(list(preference), padding=True, truncation=True, return_tensors="tf")

preference_output = model(**preference_embedding)
preference_embedding = tf.reduce_mean(preference_output.last_hidden_state, axis=1)

print(preference_embedding)

tf.Tensor(
[[-0.27622938 -0.5186211   0.28838944 ... -0.23163259  0.15096131
   0.10167424]
 [-0.23453705 -0.12980202 -0.15797941 ...  0.23230988  0.34744868
   0.4337092 ]
 [-0.11287393  0.06269799  0.02106555 ... -0.31289685 -0.20406961
   0.11203896]
 ...
 [ 0.03132712  0.31291494  0.09176126 ... -0.7587497  -0.00762711
   0.6370269 ]
 [-0.23912387 -0.18840477  0.23998892 ... -0.15042606  0.03189412
   0.29902086]
 [-0.41676292 -0.11168133 -0.09661361 ... -0.08892183  0.5930506
   0.02420565]], shape=(40, 384), dtype=float32)


Combine Embedding Vector from User Data

### Searching Similarity
I'm using cosine similarity to searching the similarity between both vectorized embedding text

In [None]:
''' 
# Reshape cf_embedding menjadi matrix 2 dimensi
cf_embedding_np = cf_embedding.numpy().reshape(1, -1)
'''

In [50]:
similarities = cosine_similarity(vacancy_data_embedding, user_data_embedding)

In [53]:
top_10_indices = np.argsort(similarities, axis=1)[:, -10:]

In [54]:
for i, job_index in enumerate(top_10_indices[0]):
    similarity_score = similarities[0, job_index]
    job_title = vacancy_data.iloc[job_index]
    print(f"Similarity {i + 1}: {similarity_score}, Job Title: {job_title}")

Similarity 1: 0.17065221071243286, Job Title: Company                                           Yanoshi Omiyage
Job_title                                      Sales Sosial Media
Location            Bekasi Selatan, Bekasi, Jawa Barat, Indonesia
Salary                          Perusahaan tidak menampilkan gaji
Work Type                                             Penuh Waktu
Skills                                                        NaN
Descriptions    Mengelola aktifitas penjualan di platform medi...
Links           https://glints.com/id/opportunities/jobs/sales...
Name: 1, dtype: object
Similarity 2: 0.19600795209407806, Job Title: Company                                         MII CMT Indonesia
Job_title                                        Fashion Designer
Location                   Karawaci, Tangerang, Banten, Indonesia
Salary                          Perusahaan tidak menampilkan gaji
Work Type                                             Penuh Waktu
Skills          Trend Forec