In [None]:
import sys
import os
os.environ['OPENAI_API_KEY'] = # your api-key here or get it from your environment
import pandas as pd
import numpy as np
from ast import literal_eval

from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score



from embeddings_utils import get_embedding

# [TODO] Libs from openai and translate are not compatible!
# from preprocess.translate import process_job_translation



from lightgbm import LGBMClassifier

import pickle

# Add the src directory to the system path
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

models_path = os.path.abspath(os.path.join('..', 'models'))

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)

# Translate all jobs to english

In [5]:
df_experience_raw = pd.read_csv('../data/raw/experience.csv')
df_experience_raw

Unnamed: 0,person_id,company_id,role,location,start_date,end_date,description
0,0,0,Desenvolvimento de recursos humanos,"São Paulo, Brasil",Sep 2022,Ongoing,"Organização, planejamento e acompanhamento de ..."
1,0,0,Agente de operações,"São Paulo, Brasil",Jun 2018,Jan 2023,"Responsável por vendas, administração de capit..."
2,0,1,Psicanálise Clínica,,May 2022,Ongoing,
3,0,2,Professor de inglês e Matemática,"São Paulo, Brasil",Jan 2010,Ongoing,Elaboração e cumprimento do plano de trabalho ...
4,0,3,Consultor Ambiental,"São Paulo, Brasil",2010,2017,"Atendimento à clientes internos e externos, ag..."
...,...,...,...,...,...,...,...
35941,9244,10398,Desenvolvedor Python,"São Paulo e Região, Brasil",Sep 2019,Feb 2020,- Desenvolvimento e depuração com Django2/Pyth...
35942,9244,3110,Analista,"São Paulo e Região, Brasil",Jul 2019,Sep 2019,- Atendimento de suporte técnico- Uso de Banco...
35943,9244,10399,Voluntário de pesquisa,"Santos, São Paulo",Apr 2018,May 2018,"Voluntário FAPESP (processo nº 2016/24551-4), ..."
35944,9245,652,BackOffice - Financeira Veículos,"São Paulo, São Paulo, Brasil",Jan 2024,Ongoing,


In [None]:
df_experience = df_experience_raw.copy()
df_experience['role_english'] = df_experience['role'].apply(lambda x: process_job_translation(x))
df_experience['description_english'] = df_experience['role'].apply(lambda x: process_job_translation(x))

In [None]:
df_experience.to_parquet('../data/interim/experience_df_translated.parquet')
df_experience[['person_id', 'company_id', 'role_english', 'description_english', 'location', 'start_date', 'end_date']].to_parquet('../data/interim/experience_df_role_english_only.parquet')

# Get embeddings for all job experiences

In [None]:
df_experience = pd.read_parquet('../data/interim/experience_df_role_english_only.parquet')

embedding_model = "text-embedding-3-large"
embedding_encoding = "cl100k_base"
max_tokens = 8000  # the maximum for text-embedding-3-small is 8191

df_experience['role_embedding'] = df_experience['role_english'].fillna('empty').apply(lambda x: get_embedding(x, model=embedding_model))
df_experience['description_embedding'] = df_experience['description_english'].fillna('empty').apply(lambda x: get_embedding(x, model=embedding_model))

df_experience.head(3)

In [None]:
df_experience.to_parquet('../data/interim/experience_df_processed.parquet')

# Load embedded dataframe and LGBM model

In [4]:
# Load it
df_experience_embedded = pd.read_parquet('../data/interim/experience_df_embedded.parquet')
df_experience_embedded

Unnamed: 0,person_id,company_id,role,location,start_date,end_date,description,role_english,description_english,role_embedding,description_embedding
0,0,0,Desenvolvimento de recursos humanos,"São Paulo, Brasil",Sep 2022,Ongoing,"Organização, planejamento e acompanhamento de ...",Human resource development,"Organization, planning and monitoring of inter...","[-0.0243100356310606, 0.016762226819992065, -0...","[-0.035512685775756836, -0.0035126367583870888..."
1,0,0,Agente de operações,"São Paulo, Brasil",Jun 2018,Jan 2023,"Responsável por vendas, administração de capit...",Operations Agent,"Responsible for sales, capital management, con...","[-0.03315284103155136, -0.023175520822405815, ...","[0.00040178062045015395, 0.0025061615742743015..."
2,0,1,Psicanálise Clínica,,May 2022,Ongoing,,Clinical Psychoanalysis,,"[-0.041004203259944916, -0.029672371223568916,...","[-0.00956551730632782, -0.025751212611794472, ..."
3,0,2,Professor de inglês e Matemática,"São Paulo, Brasil",Jan 2010,Ongoing,Elaboração e cumprimento do plano de trabalho ...,English and Mathematics teacher,Preparation and fulfillment of the work plan a...,"[-0.05433833971619606, 0.032104719430208206, -...","[-0.021015996113419533, 0.013301520608365536, ..."
4,0,3,Consultor Ambiental,"São Paulo, Brasil",2010,2017,"Atendimento à clientes internos e externos, ag...",Environmental Consultant,"Service to internal and external customers, sc...","[-0.024257482960820198, -0.010218966752290726,...","[-0.003762776032090187, -0.013486848212778568,..."
...,...,...,...,...,...,...,...,...,...,...,...
35941,9244,10398,Desenvolvedor Python,"São Paulo e Região, Brasil",Sep 2019,Feb 2020,- Desenvolvimento e depuração com Django2/Pyth...,Python Developer,- Development and debugging with Django 2/Pyth...,"[-0.019558709114789963, -0.01988365687429905, ...","[-0.023483239114284515, 0.0031624201219528913,..."
35942,9244,3110,Analista,"São Paulo e Região, Brasil",Jul 2019,Sep 2019,- Atendimento de suporte técnico- Uso de Banco...,Analyst,- Technical support service - Use of Relationa...,"[-0.025528710335493088, 0.0021096516866236925,...","[-0.029761090874671936, 0.0009133976418524981,..."
35943,9244,10399,Voluntário de pesquisa,"Santos, São Paulo",Apr 2018,May 2018,"Voluntário FAPESP (processo nº 2016/24551-4), ...",Research Volunteer,"FAPESP Volunteer (process no. 2016/24551-4), a...","[-0.009537572041153908, -0.006213328801095486,...","[-0.020103486254811287, 0.04703690484166145, -..."
35944,9245,652,BackOffice - Financeira Veículos,"São Paulo, São Paulo, Brasil",Jan 2024,Ongoing,,BackOffice - Finance Vehicles,,"[-0.030772071331739426, 0.03125680983066559, -...","[-0.009593404829502106, -0.025807352736592293,..."


In [5]:
pca_role = pickle.load(open('../models/pca_role.pkl', 'rb'))
pca_description = pickle.load(open('../models/pca_description.pkl', 'rb'))
svm_only_t0_info = pickle.load(open('../models/svm_only_t0_info.pkl', 'rb'))

# Apply PCA on embeddings

In [6]:
role_embedding_matrix = np.vstack(df_experience_embedded['role_embedding'].values)
description_embedding_matrix = np.vstack(df_experience_embedded['description_embedding'].values)

# Check shape
print("Original Role Embedding shape:", role_embedding_matrix.shape)  # (n_samples, 1536)

Original Role Embedding shape: (35946, 3072)


In [7]:
reduced_role_embedding = pca_role.transform(role_embedding_matrix)
reduced_description_embedding = pca_description.transform(description_embedding_matrix)

print("Reduced Role Embedding shape:", reduced_role_embedding.shape)  # (n_samples, pca_dim)
print("Reduced Description Embedding shape:", reduced_description_embedding.shape)  # (n_samples, pca_dim)

Reduced Role Embedding shape: (35946, 600)
Reduced Description Embedding shape: (35946, 600)


In [8]:
df_experience_reduced_embedded = df_experience_embedded
df_experience_reduced_embedded['reduced_role_embedding'] = list(reduced_role_embedding)
df_experience_reduced_embedded['reduced_description_embedding'] = list(reduced_description_embedding)
df_experience_reduced_embedded = df_experience_reduced_embedded[['reduced_role_embedding', 'reduced_description_embedding']]
df_experience_reduced_embedded.head()

Unnamed: 0,reduced_role_embedding,reduced_description_embedding
0,"[-0.1158553133153693, -0.11004434991438014, 0....","[0.4426139590255054, 0.05279842006011781, 0.10..."
1,"[0.022750665412124414, -0.23555141316738556, 0...","[0.46800080037921377, 0.2292870307959621, 0.15..."
2,"[0.0780982360973996, -0.11631413537977832, -0....","[-0.6333038390506511, 0.007129625750741803, 0...."
3,"[-0.12479493816885473, -0.26310070816022557, -...","[0.3805213839015129, -0.16264063789702646, 0.0..."
4,"[0.08260825357384458, -0.08143872776924596, -0...","[0.451384523846458, 0.1808521656945917, 0.2021..."


# Predict

In [9]:
# Flatten the array embeddings and concatenate with the integer feature
arr1s = np.vstack(df_experience_reduced_embedded['reduced_role_embedding'].values)  # Shape: (n_samples, embedding_size)
arr2s = np.vstack(df_experience_reduced_embedded['reduced_description_embedding'].values)  # Shape: (n_samples, embedding_size)

# Concatenate all features horizontally
X = np.hstack((arr1s, arr2s))  # Shape: (n_samples, total_features)

preds = svm_only_t0_info.predict(X)

In [10]:
df_experience_embedded['model_classification'] = preds
df_experience_embedded

Unnamed: 0,person_id,company_id,role,location,start_date,end_date,description,role_english,description_english,role_embedding,description_embedding,reduced_role_embedding,reduced_description_embedding,model_classification
0,0,0,Desenvolvimento de recursos humanos,"São Paulo, Brasil",Sep 2022,Ongoing,"Organização, planejamento e acompanhamento de ...",Human resource development,"Organization, planning and monitoring of inter...","[-0.0243100356310606, 0.016762226819992065, -0...","[-0.035512685775756836, -0.0035126367583870888...","[-0.1158553133153693, -0.11004434991438014, 0....","[0.4426139590255054, 0.05279842006011781, 0.10...",131071
1,0,0,Agente de operações,"São Paulo, Brasil",Jun 2018,Jan 2023,"Responsável por vendas, administração de capit...",Operations Agent,"Responsible for sales, capital management, con...","[-0.03315284103155136, -0.023175520822405815, ...","[0.00040178062045015395, 0.0025061615742743015...","[0.022750665412124414, -0.23555141316738556, 0...","[0.46800080037921377, 0.2292870307959621, 0.15...",131111
2,0,1,Psicanálise Clínica,,May 2022,Ongoing,,Clinical Psychoanalysis,,"[-0.041004203259944916, -0.029672371223568916,...","[-0.00956551730632782, -0.025751212611794472, ...","[0.0780982360973996, -0.11631413537977832, -0....","[-0.6333038390506511, 0.007129625750741803, 0....",000000
3,0,2,Professor de inglês e Matemática,"São Paulo, Brasil",Jan 2010,Ongoing,Elaboração e cumprimento do plano de trabalho ...,English and Mathematics teacher,Preparation and fulfillment of the work plan a...,"[-0.05433833971619606, 0.032104719430208206, -...","[-0.021015996113419533, 0.013301520608365536, ...","[-0.12479493816885473, -0.26310070816022557, -...","[0.3805213839015129, -0.16264063789702646, 0.0...",030000
4,0,3,Consultor Ambiental,"São Paulo, Brasil",2010,2017,"Atendimento à clientes internos e externos, ag...",Environmental Consultant,"Service to internal and external customers, sc...","[-0.024257482960820198, -0.010218966752290726,...","[-0.003762776032090187, -0.013486848212778568,...","[0.08260825357384458, -0.08143872776924596, -0...","[0.451384523846458, 0.1808521656945917, 0.2021...",192041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35941,9244,10398,Desenvolvedor Python,"São Paulo e Região, Brasil",Sep 2019,Feb 2020,- Desenvolvimento e depuração com Django2/Pyth...,Python Developer,- Development and debugging with Django 2/Pyth...,"[-0.019558709114789963, -0.01988365687429905, ...","[-0.023483239114284515, 0.0031624201219528913,...","[0.02966283225624864, -0.1153839263572885, -0....","[0.35839147856485065, -0.08176828767448942, -0...",151252
35942,9244,3110,Analista,"São Paulo e Região, Brasil",Jul 2019,Sep 2019,- Atendimento de suporte técnico- Uso de Banco...,Analyst,- Technical support service - Use of Relationa...,"[-0.025528710335493088, 0.0021096516866236925,...","[-0.029761090874671936, 0.0009133976418524981,...","[0.32939866844117754, -0.17043647204447057, -0...","[0.3371295969502092, 0.05449387299599148, -0.1...",151211
35943,9244,10399,Voluntário de pesquisa,"Santos, São Paulo",Apr 2018,May 2018,"Voluntário FAPESP (processo nº 2016/24551-4), ...",Research Volunteer,"FAPESP Volunteer (process no. 2016/24551-4), a...","[-0.009537572041153908, -0.006213328801095486,...","[-0.020103486254811287, 0.04703690484166145, -...","[-0.1970355267459052, 0.03132192732784206, -0....","[0.3773113997248327, -0.30851527071993146, 0.1...",000000
35944,9245,652,BackOffice - Financeira Veículos,"São Paulo, São Paulo, Brasil",Jan 2024,Ongoing,,BackOffice - Finance Vehicles,,"[-0.030772071331739426, 0.03125680983066559, -...","[-0.009593404829502106, -0.025807352736592293,...","[0.13632986562546073, -0.057441053139577064, 0...","[-0.6332811684980748, 0.007137518406986398, 0....",132051


In [11]:
df_experience_embedded['model_classification'].value_counts()

model_classification
000000    7354
131111    4157
152051    2645
151211    2506
132051    2060
          ... 
173028       1
419021       1
536051       1
173027       1
499052       1
Name: count, Length: 129, dtype: int64

In [12]:
df_experience_embedded[['person_id', 'company_id', 'start_date', 'end_date', 'model_classification']].to_parquet('../data/processed/experience_df_with_model_classification.parquet')