In [1]:
import ast
import os
from datetime import datetime

import numpy as np
import pandas as pd
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, ZeroShotClassification
from deep_translator import GoogleTranslator
from unidecode import unidecode

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


# Imports, getting the data, and basic transformation

We have a dataset with 4486 rows.

In [3]:
def process_url(url):
    """Reduce an application URL to its host name, collapsing known job boards.

    The scheme (everything up to the leading '//'), the path and any 'www.'
    are removed. Hosts that are subdomains of gupy.io or infojobs.com.br are
    collapsed to 'gupy.io' / 'infojobs.com.br' so that every company page on
    those boards is counted under the same name.

    Parameters
    ----------
    url : str or missing value
        The application link. Non-string values (e.g. NaN or None coming from
        an empty cell) are returned unchanged instead of raising.

    Returns
    -------
    str or the original non-string value
        The simplified host name.
    """
    # Missing links arrive as NaN/None when this is mapped over a column
    if not isinstance(url, str):
        return url
    # Only treat '//' as the scheme separator when it is the first slash in
    # the URL; otherwise the URL has no scheme and the host starts at column 0
    scheme_sep = url.find('//')
    if scheme_sep != -1 and scheme_sep == url.find('/'):
        url = url[scheme_sep + 2:]
    host = url.split('/')[0].replace('www.', '')
    if '.gupy.io' in host:
        return 'gupy.io'
    if '.infojobs.com.br' in host:
        return 'infojobs.com.br'
    return host

def translate_message(msg, chunk_size=4999):
    """Translate a text to English with GoogleTranslator, chunking long texts.

    Texts of at most `chunk_size` characters are sent in a single request.
    Longer texts are cut into consecutive, non-overlapping slices of
    `chunk_size` characters, each slice is translated separately, and the
    translations are concatenated in order.

    Parameters
    ----------
    msg : str or missing value
        Text to translate. Non-string values (e.g. NaN) are returned unchanged.
    chunk_size : int, default 4999
        Maximum number of characters per request. The default matches the
        original 5000-character threshold (presumably the request limit of the
        translation backend — confirm against the deep_translator docs).

    Returns
    -------
    str
        The English translation (or whatever the translator returns for a
        short text).
    """
    if not isinstance(msg, str):
        return msg
    translator = GoogleTranslator(source='auto', target='en')
    if len(msg) <= chunk_size:
        return translator.translate(msg)
    translated_chunks = []
    # The stride equals the slice length, so no character is skipped between
    # chunks (slicing 4999 characters while advancing by 5000 dropped one
    # character at every chunk boundary).
    for start in range(0, len(msg), chunk_size):
        translated = translator.translate(msg[start:start + chunk_size])
        # NOTE(review): guard in case the translator returns None for a chunk
        # (e.g. whitespace only), so the final join cannot fail.
        translated_chunks.append(translated or '')
    return ''.join(translated_chunks)

In [61]:
# Importing the raw data: every ';'-separated CSV file in the 'data' folder.
# File names are sorted so that the row order (and therefore which duplicate
# survives below) does not depend on the order returned by the file system.
csv_files = sorted(name for name in os.listdir('data') if name.endswith('.csv'))
df_jobs = pd.concat(
    [pd.read_csv(os.path.join('data', name), sep=';') for name in csv_files],
    ignore_index=True,
)
# Removing unused columns
df_jobs = df_jobs.drop(columns=['competitive_advantages', 'company'])
# Dropping duplicated rows (same company, position and location); the first occurrence is kept
df_jobs = df_jobs.drop_duplicates(subset=['company_name', 'position', 'location'])
# Dropping the duplicated descriptions of remote jobs, since they are probably repeated jobs
mask_repeated_remote = (df_jobs['type_workplace'] == 'Remoto') & df_jobs['description'].duplicated()
# The index is rebuilt so it stays contiguous after the rows were removed
df_jobs = df_jobs[~mask_repeated_remote].reset_index(drop=True)

# Transforming location data into three columns: city, state, and country
location = df_jobs['location']
location_parts = location.str.split(', ')
# na=False keeps the masks strictly boolean even when a location is missing
mask_has_comma = location.str.contains(', ', regex=False, na=False)
mask_is_regiao = location.str.contains('Região', regex=False, na=False)
mask_is_only_brasil = location.eq('Brasil')
# City: first part of 'city, state, country'; a comma-less 'Região ...' value is kept whole
df_jobs['city'] = location_parts.str[0].where(mask_has_comma, location.where(mask_is_regiao))
# State: second part, unless the location is just 'city, Brasil'
second_part = location_parts.str[1]
df_jobs['state'] = second_part.where(mask_has_comma & second_part.ne('Brasil'))
# Country: last part; a comma-less value that is not a 'Região' is itself the country
df_jobs['country'] = location_parts.str[-1].where(mask_has_comma)
df_jobs.loc[mask_is_only_brasil, 'country'] = 'Brasil'
mask_country_only = ~mask_has_comma & ~mask_is_regiao & ~mask_is_only_brasil
df_jobs.loc[mask_country_only, 'country'] = location[mask_country_only]

# Transforming dtype of date columns (only the first 10 characters of 'posted_date' are parsed)
df_jobs['posted_date'] = pd.to_datetime(df_jobs['posted_date'].str[:10])
df_jobs['date_collected'] = pd.to_datetime(df_jobs['date_collected'])

# Transforming dtype of 'no_applicants' column: the leading token of the text is the count,
# and a missing value means no applicants. The result is assigned back explicitly because a
# chained `.loc[...].fillna(inplace=True)` may act on a copy and leave the NaNs in place.
df_jobs['no_applicants'] = (
    df_jobs['no_applicants']
    .map(lambda text: int(text.split()[0]), na_action='ignore')
    .fillna(0)
    .astype(int)
)

# NOTE: rows are currently NOT filtered by 'worktype' (values containing '/month' or
# 'funcionários') nor by 'level' ('Diretor' / 'Executivo'); both filters were disabled.

# Transforming data: getting the main part of the application urls
df_jobs['link_application_simplified'] = df_jobs['link_application'].map(process_url)
# Getting all the skills, one row per (job, skill).
# ast.literal_eval only parses Python literals, so text read from the CSV files can never
# execute code (assumes each cell holds the repr of a list — TODO confirm).
df_exploded_skills = df_jobs['required_skills'].dropna().map(ast.literal_eval).explode().reset_index()
df_jobs.shape

(2186, 19)

# Usage of BERTopic

In [62]:
docs = df_jobs['position'] + ' - ' + df_jobs['description']
docs[:5]

0    Analista de Gestão de Dados - Sobre a vaga\nPo...
1    Marketing Data Analyst - Sobre a vaga\nA Valte...
2    Football Statistician - Sobre a vaga\nLove spo...
3    ­­­Analista Privacidade de Dados Sr – Prazo De...
4    ANALISTA CADASTRO JR - Sobre a vaga\nSomos pio...
dtype: object

In [63]:
docs.shape

(2186,)

## 1st Model

In [64]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)

In [65]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,537,-1_de_em_para_com,"[de, em, para, com, que, do, da, no, um, na]",[Administrador(a) de banco de dados - DBMS - S...
1,0,633,0_and_to_the_of,"[and, to, the, of, in, with, for, our, experie...",[Cloud Data Engineer - Sobre a vaga\nQuestrade...
2,1,244,1_de_dados_em_para,"[de, dados, em, para, com, bi, conhecimento, d...",[Analista de Dados - Sobre a vaga\nEsta vaga f...
3,2,189,2_de_em_para_da,"[de, em, para, da, com, do, no, dos, os, ou]",[Analista Compras Sênior - Unidade Sede (Zona ...
4,3,88,3_de_em_desenvolvimento_com,"[de, em, desenvolvimento, com, para, experinci...",[Analista de Suporte de Cibersegurança (Networ...
5,4,46,4_de_com_para_em,"[de, com, para, em, que, dados, da, do, voc, c...",[Coordenador(a) Engenharia de Dados - Sobre a ...
6,5,34,5_marketing_de_em_para,"[marketing, de, em, para, com, do, que, da, no...",[Analista de Suporte ao Cliente - Sobre a vaga...
7,6,34,6_de_que_em_com,"[de, que, em, com, accenture, para, ou, no, pe...",[Pessoas Analistas e Consultoras SAP MM/WM - S...
8,7,31,7_de_para_com_em,"[de, para, com, em, que, um, do, da, uma, no]",[Programador(a) Trainee - Sobre a vaga\nQuem s...
9,8,29,8_sports_statistician_you_football,"[sports, statistician, you, football, game, re...",[Sports Statistician - Sobre a vaga\nLove spor...


In [66]:
topic_model.get_topic(0)

[('and', 0.050803501236358156),
 ('to', 0.039817751968041076),
 ('the', 0.0390618054517621),
 ('of', 0.034633163785285116),
 ('in', 0.03213291841067927),
 ('with', 0.029445406284697457),
 ('for', 0.02413292902086082),
 ('our', 0.021286184818855983),
 ('experience', 0.021080755981464392),
 ('we', 0.020760841782740645)]

In [67]:
topic_model.get_topic(9)

[('de', 0.050008209254730936),
 ('que', 0.02709464536224373),
 ('em', 0.025880101925622605),
 ('para', 0.022543084311346356),
 ('suporte', 0.022394582330773245),
 ('certified', 0.021793134195582962),
 ('ou', 0.02150083080446207),
 ('dados', 0.020392608301944),
 ('banco', 0.02021025761156366),
 ('solutis', 0.019217714678926)]

In [68]:
topic_model.get_topic_freq()

Unnamed: 0,Topic,Count
12,0,633
1,-1,537
0,1,244
5,2,189
20,3,88
8,4,46
22,5,34
7,6,34
17,7,31
2,8,29


In [69]:
topic_model.get_document_info(docs).head()

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Analista de Gestão de Dados - Sobre a vaga\nPo...,1,1_de_dados_em_para,"[de, dados, em, para, com, bi, conhecimento, d...",[Analista de Dados - Sobre a vaga\nEsta vaga f...,de - dados - em - para - com - bi - conhecimen...,1.0,False
1,Marketing Data Analyst - Sobre a vaga\nA Valte...,-1,-1_de_em_para_com,"[de, em, para, com, que, do, da, no, um, na]",[Administrador(a) de banco de dados - DBMS - S...,de - em - para - com - que - do - da - no - um...,0.0,False
2,Football Statistician - Sobre a vaga\nLove spo...,8,8_sports_statistician_you_football,"[sports, statistician, you, football, game, re...",[Sports Statistician - Sobre a vaga\nLove spor...,sports - statistician - you - football - game ...,0.046419,False
3,­­­Analista Privacidade de Dados Sr – Prazo De...,-1,-1_de_em_para_com,"[de, em, para, com, que, do, da, no, um, na]",[Administrador(a) de banco de dados - DBMS - S...,de - em - para - com - que - do - da - no - um...,0.0,False
4,ANALISTA CADASTRO JR - Sobre a vaga\nSomos pio...,-1,-1_de_em_para_com,"[de, em, para, com, que, do, da, no, um, na]",[Administrador(a) de banco de dados - DBMS - S...,de - em - para - com - que - do - da - no - um...,0.0,False


In [70]:
topic_model.get_representative_docs()

{-1: ['Administrador(a) de banco de dados - DBMS - Sobre a vaga\n618551BR\n\nIntrodução\n\nNa IBM, o trabalho é mais que uma tarefa. É um chamado para construir. Para projetar. Para codificar. Para consultar. Para pensar junto com os clientes e vender. Para construir mercados. Para inventar. Para colaborar. Não apenas para fazer algo melhor, mas para tentar coisas que você nunca imaginou serem possíveis. Para liderar nesta nova era da tecnologia e resolver alguns dos problemas mais desafiadores do mundo.\n\nSeu papel e responsabilidades\n\nNesta função, você trabalhará em nosso IBM Client Innovation Center (CIC), onde oferecemos profundo conhecimento técnico e do setor para uma ampla gama de clientes do setor público e privado em todo o mundo. Esses centros oferecem aos nossos clientes habilidades e conhecimento técnico para impulsionar a inovação e a adoção de novas tecnologias.\n\nAs Principais Funções Incluem\n\nEspecialistas de TI nesta função terão experiência em uma ou mais áreas

In [71]:
print(topic_model.topics_)

[1, -1, 8, -1, -1, 24, 1, 9, 2, 17, -1, 2, 1, 6, -1, 4, 15, 25, 1, 11, 2, 0, 4, 11, 4, 8, 2, 1, 15, 21, 1, 0, 0, 17, -1, 19, 0, 2, 9, 1, 1, 17, 0, 1, 2, 0, 0, 20, 1, 4, 8, 11, 0, 2, 4, 0, -1, 1, 0, 0, 8, -1, 0, 1, -1, 1, 8, 4, 10, -1, 10, -1, -1, -1, 8, -1, 24, 0, 0, 21, 21, -1, 11, 0, 11, 8, 11, 24, 8, 2, 1, 7, -1, 7, -1, 11, 11, 0, 8, 17, 24, 1, -1, -1, 0, 4, 0, -1, 2, 0, 1, 1, 0, 2, 2, 1, 0, 0, -1, 8, 0, 17, 1, 1, 0, 8, -1, -1, -1, 1, -1, 2, 8, 6, 2, 23, 11, 4, 4, -1, -1, 18, 0, 8, -1, -1, 1, 0, 8, 6, 18, -1, 19, 7, 1, 1, 2, 2, 0, 0, 9, -1, 3, 0, 1, 1, 0, 1, 0, 8, -1, 8, -1, 22, 0, 21, 18, -1, -1, 17, 2, 3, 5, 2, -1, 5, 0, 0, -1, 7, -1, -1, -1, 0, 0, 1, 1, 11, -1, -1, -1, 4, 1, 0, 16, 9, 20, 1, 1, 3, 7, -1, 2, 2, -1, 1, -1, 0, -1, 0, 0, 0, 5, 15, -1, 0, 1, -1, 2, 0, 1, 0, -1, 0, 0, 17, -1, -1, 2, -1, -1, 2, 0, 1, 24, 24, 24, -1, 11, 0, -1, 8, 3, 24, 0, 18, 16, 0, -1, -1, -1, 1, 18, -1, -1, 0, 9, 0, 8, -1, 3, 8, 22, 8, 0, 17, -1, 20, 2, 2, 0, 17, -1, 0, 0, 17, 2, -1, 0, 2, -1, 0, 0, 

In [72]:
print(topic_model.topic_sizes_)

Counter({0: 633, -1: 537, 1: 244, 2: 189, 3: 88, 4: 46, 6: 34, 5: 34, 7: 31, 8: 29, 9: 26, 10: 26, 11: 25, 12: 24, 14: 22, 13: 22, 15: 20, 17: 18, 16: 18, 18: 16, 21: 15, 19: 15, 20: 15, 23: 13, 22: 13, 24: 12, 25: 11, 26: 10})


In [73]:
topic_model.topic_labels_

{-1: '-1_de_em_para_com',
 0: '0_and_to_the_of',
 1: '1_de_dados_em_para',
 2: '2_de_em_para_da',
 3: '3_de_em_desenvolvimento_com',
 4: '4_de_com_para_em',
 5: '5_marketing_de_em_para',
 6: '6_de_que_em_com',
 7: '7_de_para_com_em',
 8: '8_sports_statistician_you_football',
 9: '9_de_que_em_para',
 10: '10_de_crdito_para_com',
 11: '11_agoda_marketing_data_and',
 12: '12_java_de_em_para',
 13: '13_de_em_para_com',
 14: '14_de_com_da_em',
 15: '15_de_em_com_da',
 16: '16_de_com_que_deloitte',
 17: '17_de_com_que_em',
 18: '18_de_pagamento_em_com',
 19: '19_de_que_digital_em',
 20: '20_de_em_dados_que',
 21: '21_de_em_siemens_para',
 22: '22_que_de_nossa_por',
 23: '23_accenture_pessoa_desenvolvedora_espera',
 24: '24_agoda_marketing_and_data',
 25: '25_de_em_bunge_syngenta',
 26: '26_de_integral_perodo_analista'}

In [74]:
topic_model.visualize_topics()

In [75]:
topic_model.visualize_hierarchy()

In [76]:
topic_model.visualize_barchart()

In [77]:
topic_model.visualize_heatmap()

In [78]:
topic_model.visualize_term_rank()

## 2nd Model: KeyBERTInspired

In [79]:
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)

KeyboardInterrupt: 

In [None]:
topic_model2 = topic_model

In [None]:
topic_model2.get_topic_info()[:5]

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,951,-1_como_projetos_trabalho_clientes,"[como, projetos, trabalho, clientes, managemen...",[Analista de Suporte - Vaga Exclusiva para Pes...
1,0,234,0_atividades_analytics_habilidades_responsabil...,"[atividades, analytics, habilidades, responsab...",[Analista de Dados - Sobre a vaga\nEsta vaga f...
2,1,88,1_software_como_projetos_responsabilidades,"[software, como, projetos, responsabilidades, ...",[Analista de Desenvolvimento Python - Sobre a ...
3,2,46,2_sap_salesforce_services_service,"[sap, salesforce, services, service, support, ...","[Support Engineer, SAP SD (Remote Brazil) - So..."
4,3,43,3_developers_developer_devops_engineers,"[developers, developer, devops, engineers, cli...",[Senior DevOps Engineer - Remote - Latin Ameri...


In [None]:
pd.DataFrame(topic_model2.get_representative_docs()).T.head(10)

Unnamed: 0,0,1,2
-1,Analista de Suporte - Vaga Exclusiva para Pess...,Analista de Operações de Sucesso do Cliente - ...,Software Architect - Sobre a vaga\nAbout The R...
0,Analista de BI Senior - Sobre a vaga\n🌱👩‍💻 Com...,Analista de Dados com Conhecimentos em Power B...,Analista Business Intelligence Pleno - Sobre a...
1,Analista de Desenvolvimento Python - Sobre a v...,Fullstack (Analista .NET) - Sobre a vaga\nObje...,Desenvolvedor - Sobre a vaga\nBuscamos pessoas...
2,"Senior Technical Architect, Gigster Network - ...",Java Engineer - Remote - Latin America - Sobre...,DevOps Developer (Mainframe zOS) - Sobre a vag...
3,Pessoa Consultora - FI/FICO/TRM/Mastersaf - So...,Analistas e Consultores Data migration - Sobre...,Analistas e Consultores SAP SD - Sobre a vaga\...
4,Marketing Automation Developer - Sobre a vaga\...,Analista de Marketing Jr. - Sobre a vaga\nO Tr...,Marketing Analyst - Sobre a vaga\nResumo do ca...
5,Accounting Analyst - Sobre a vaga\nAbout Addi\...,City Operations Analyst - Sobre a vaga\nAbout ...,Senior Data Analyst - Sobre a vaga\nAbout The ...
6,Senior Data Analyst - Remote - Latin America -...,Senior Data Analyst - Remote - Latin America -...,Senior Data Analyst - Remote - Latin America -...
7,Customer Excellence Analyst - Sobre a vaga\nAB...,Sales Strategy Analyst - Data Scientist - Sobr...,Sales Strategy Analyst - Data Intelligence - S...
8,Programador(a) Trainee - Sobre a vaga\nQuem so...,Engenheiro(a) de Software Trainee - Sobre a va...,Engenheiro(a) de Software Trainee - Sobre a va...


## 3rd Model: ChatGPT 3.5

In [None]:
# The OpenAI client is only needed for this experiment, so it is imported here:
# the rest of the notebook keeps running when the `openai` package is not installed.
import openai
from bertopic.representation import OpenAI

# SECURITY: never commit an API key. The key that used to be hardcoded in this
# cell must be considered leaked and revoked. The key is now read from the
# OPENAI_API_KEY environment variable.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError('Set the OPENAI_API_KEY environment variable before running this cell.')
openai.api_key = os.environ['OPENAI_API_KEY']
# The account is limited to 3 requests per minute (see the RateLimitError below),
# so consecutive prompts are spaced 20 seconds apart.
representation_model = OpenAI(model='gpt-3.5-turbo', chat=True, delay_in_seconds=20)
topic_model_3 = BERTopic(representation_model=representation_model)
topics, probs = topic_model_3.fit_transform(docs)

RateLimitError: Rate limit reached for default-gpt-3.5-turbo in organization org-ZpN5JXSPGSaGzA1Vq7nc7Boj on requests per min. Limit: 3 / min. Please try again in 20s. Contact us through our help center at help.openai.com if you continue to have issues. Please add a payment method to your account to increase your rate limit. Visit https://platform.openai.com/account/billing to add a payment method.

## 4th Model: multilingual online, with KeyBert

In [None]:
representation_model = KeyBERTInspired()
topic_model4 = BERTopic(representation_model=representation_model, language='multilingual')
topics, probs = topic_model4.fit_transform(docs)

Downloading (…)0fe39/.gitattributes: 100%|██████████| 968/968 [00:00<00:00, 262kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 189kB/s]
Downloading (…)83e900fe39/README.md: 100%|██████████| 3.79k/3.79k [00:00<00:00, 3.45MB/s]
Downloading (…)e900fe39/config.json: 100%|██████████| 645/645 [00:00<?, ?B/s] 
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 81.0kB/s]
Downloading pytorch_model.bin: 100%|██████████| 471M/471M [15:51<00:00, 495kB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 867B/s]
Downloading (…)tencepiece.bpe.model: 100%|██████████| 5.07M/5.07M [00:09<00:00, 562kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 157kB/s]
Downloading tokenizer.json: 100%|██████████| 9.08M/9.08M [00:16<00:00, 559kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 480/480 [00:00<00:00, 478kB/s]
Downloading unigram.json: 100%|██████████| 14.8M/14.8M [00:42<00:00,

In [None]:
topic_model2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,940,-1_como_trabalho_projetos_management,"[como, trabalho, projetos, management, cliente...",[Analista de Suporte - Vaga Exclusiva para Pes...
1,0,234,0_comunicao_habilidades_responsabilidades_anal...,"[comunicao, habilidades, responsabilidades, an...",[Analista de BI Senior - Sobre a vaga\n🌱👩‍💻 Co...
2,1,88,1_software_como_projetos_responsabilidades,"[software, como, projetos, responsabilidades, ...",[Analista de Desenvolvimento Python - Sobre a ...
3,2,50,2_developers_devops_developer_engineer,"[developers, devops, developer, engineer, deve...","[Senior Technical Architect, Gigster Network -..."
4,3,42,3_como_extensivo_consultoria_pessoas,"[como, extensivo, consultoria, pessoas, inovao...",[Pessoa Consultora - FI/FICO/TRM/Mastersaf - S...
5,4,35,4_marketing_comunicao_atividades_publicidade,"[marketing, comunicao, atividades, publicidade...",[Marketing Automation Developer - Sobre a vaga...
6,5,33,5_analyst_accounting_management_financial,"[analyst, accounting, management, financial, o...",[Accounting Analyst - Sobre a vaga\nAbout Addi...
7,6,32,6_clients_hiring_software_engineers,"[clients, hiring, software, engineers, develop...",[Senior Data Analyst - Remote - Latin America ...
8,7,30,7_customers_analyst_sales_commerce,"[customers, analyst, sales, commerce, business...",[Customer Excellence Analyst - Sobre a vaga\nA...
9,8,30,8_comunicao_atividades_como_contato,"[comunicao, atividades, como, contato, projeto...",[Programador(a) Trainee - Sobre a vaga\nQuem s...


In [None]:
topic_model4.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,939,-1_empresa_company_management_software,"[empresa, company, management, software, busin...",[Solution Architect-Industry Solutions Deliver...
1,0,119,0_gestão_técnicos_empresas_atendimento,"[gestão, técnicos, empresas, atendimento, serv...",[Analista Compras Sênior - Unidade Sede (Zona ...
2,1,62,1_data_analytics_database_scientist,"[data, analytics, database, scientist, enginee...",[Data Scientist - Sobre a vaga\nA well-establi...
3,2,51,2_desenvolver_nós_apoiar_temos,"[desenvolver, nós, apoiar, temos, participar, ...",[AGENTE NEGOCIOS | 3026-SR.DOS PASSOS-UFS./BA ...
4,3,50,3_analista_intelligence_insights_dados,"[analista, intelligence, insights, dados, info...",[Analista Business Intelligence Pleno - Sobre ...
5,4,46,4_consultancy_software_net_companies,"[consultancy, software, net, companies, client...",[Senior Data Analyst - Remote - Latin America ...
6,5,45,5_profissional_especialização_analista_contrat...,"[profissional, especialização, analista, contr...",[ANALISTA DE INFORMAÇÕES JR - ANALISTA DE FRAU...
7,6,44,6_brasil_brazil_tecnologia_carreira,"[brasil, brazil, tecnologia, carreira, oportun...",[Staff Software Engineer - Sobre a vaga\nA Uni...
8,7,42,7_dados_data_analista_monitoramento,"[dados, data, analista, monitoramento, metodol...",[Engenheiro de Dados (HDL) - Sobre a vaga\nFor...
9,8,41,8_analista_projetos_técnicos_administrar,"[analista, projetos, técnicos, administrar, pr...",[Especialista de Tecnologia – Banco de Dados -...


# Translating the data to English

In [80]:
# Descriptions can be long, so they go through translate_message
df_jobs['job_description_en'] = df_jobs['description'].map(translate_message)
# Position titles are translated directly; one translator instance is reused
# for every row instead of building a new one per title.
position_translator = GoogleTranslator(source='auto', target='en')
df_jobs['position_en'] = df_jobs['position'].map(position_translator.translate)

In [17]:
# df_jobs = pd.read_csv('data/transformed_data/df_2023_6_17.csv', sep=';')
# df_jobs = df_jobs.drop('description_en', axis=1)

In [84]:
# Rows whose translation text contains 'Error 400'.
# na=False keeps the mask strictly boolean: a missing translation would otherwise
# produce NaN in the mask and make the boolean indexing below raise.
mask_translation_error = df_jobs['job_description_en'].str.contains('Error 400', regex=False, na=False)
df_jobs[mask_translation_error]

Unnamed: 0,position,location,posted_date,no_applicants,date_collected,type_workplace,required_skills,level,worktype,description,...,link_linkedin,company_name,company_size,company_sector,city,state,country,link_application_simplified,job_description_en,position_en


In [82]:
# Retry the failed translations on an ASCII-transliterated copy of the original description
failed_descriptions = df_jobs[mask_translation_error].description
retried_translations = failed_descriptions.map(lambda text: translate_message(unidecode(text)))
df_jobs.loc[mask_translation_error, 'job_description_en'] = retried_translations

In [83]:
now = datetime.now()
df_jobs.to_csv(f'data/transformed_data/df_{now.year}_{now.month}_{now.day}.csv', sep=';', index=None)

# Loading data already translated

In [2]:
df_jobs = pd.read_csv('data/transformed_data/df_2023_6_22.csv', sep=';')

# 5th Model

In [3]:
docs = df_jobs['position_en'] + ' - ' + df_jobs['job_description_en']
representation_model = KeyBERTInspired()
topic_model = BERTopic(representation_model=representation_model)
topics, probs = topic_model.fit_transform(docs)

In [4]:
df_jobs.shape

(2186, 21)

In [5]:
df_jobs_backup = df_jobs.copy()

In [42]:
df_jobs_backup.to_csv('bert_topic_data.csv', sep=';', index=None)

In [6]:
topic_model.get_document_info(docs).shape

(2186, 8)

In [7]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,916,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...
1,0,82,0_databases_databricks_data_analytics,"[databases, databricks, data, analytics, pipel...",[Data Platform Engineer | Consultor - About th...
2,1,72,1_analyst_bi_analytical_reports,"[analyst, bi, analytical, reports, dashboards,...",[BI analyst - About the vacancy\nWe are lookin...
3,2,52,2_customers_analyst_business_customer,"[customers, analyst, business, customer, sales...",[Customer Excellence Analyst - Sobre a vaga\nA...
4,3,51,3_cybersecurity_security_analyst_firewalls,"[cybersecurity, security, analyst, firewalls, ...",[Technology Analyst - Networks/Security (Senio...
5,4,48,4_consulting_sap_business_customers,"[consulting, sap, business, customers, develop...",[People Analysts and Consultants SAP MM/WM - A...
6,5,41,5_internship_intern_engineering_sap,"[internship, intern, engineering, sap, require...",[Engineer Intern - About the vacancy\nRINA is ...
7,6,40,6_logistics_responsibilities_freight_management,"[logistics, responsibilities, freight, managem...",[Logistics - Checker (Barriers) - About the va...
8,7,36,7_laboratory_certification_inspection_management,"[laboratory, certification, inspection, manage...",[BUSINESS CONSULTANT - About the vacancy\nComp...
9,8,36,8_developer_development_career_vacancy,"[developer, development, career, vacancy, java...",[Full Java Developer - Hybrid - About the vaca...


In [47]:
topic_model.get_document_info(docs).to_csv('document_info.csv', sep=';', index=None)
topic_model.get_topic_info().to_csv('topic_info.csv', sep=';', index=None)

In [48]:
pd.DataFrame(topic_model.get_representative_docs()).T.reset_index().melt(id_vars='index').sort_values(by=['index', 'variable']).head(30)

Unnamed: 0,index,variable,value
0,-1,0,Data Consultant - M&A IT - About the vacancy\n...
50,-1,1,JR Data Analyst (Affirmative vacancy for peopl...
100,-1,2,Software Architect - Sobre a vaga\nAbout The R...
1,0,0,Data Platform Engineer | Consultor - About the...
51,0,1,Data Engineer Specialist - About the vacancy\n...
101,0,2,"Digital Architect (BI, Analytics & DataOps) - ..."
2,1,0,BI analyst - About the vacancy\nWe are looking...
52,1,1,Senior BI Analyst - About the vacancy\n🌱👩‍💻 Wh...
102,1,2,Data Analyst with Power BI Knowledge - About t...
3,2,0,Customer Excellence Analyst - Sobre a vaga\nAB...


In [49]:
# Approximate, for every document, its distribution over all topics
topic_distr, _ = topic_model.approximate_distribution(docs, batch_size=1000)
# For each document keep only the weight of its assigned topic; outliers (topic -1) get 0
distributions = []
for assigned_topic, document_distribution in zip(topics, topic_distr):
    if assigned_topic != -1:
        distributions.append(document_distribution[assigned_topic])
    else:
        distributions.append(0)
topic_model.get_document_info(docs, metadata={"Topic_distribution": distributions})

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document,Topic_distribution
0,Data Management Analyst - About the vacancy\nP...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
1,Marketing Data Analyst - About the vacancy\nVa...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
2,Football Statistician - Sobre a vaga\nLove spo...,13,13_sports_statistician_httpsgeniussportscomsta...,"[sports, statistician, httpsgeniussportscomsta...",[Sports Statistician - Sobre a vaga\nLove spor...,sports - statistician - httpsgeniussportscomst...,0.051840,False,0.375623
3,Sr. Data Privacy Analyst - Fixed Term (6 month...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
4,JR REGISTRATION ANALYST - About the vacancy\nW...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
...,...,...,...,...,...,...,...,...,...
2181,Embedded Linux Engineer - About the vacancy\nE...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
2182,DELMIA Apriso Consultant - Sobre a vaga\nImagi...,2,2_customers_analyst_business_customer,"[customers, analyst, business, customer, sales...",[Customer Excellence Analyst - Sobre a vaga\nA...,customers - analyst - business - customer - sa...,1.000000,False,0.041714
2183,SRE Specialist - About the vacancy\nJob Descri...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000
2184,Internship in Python Development - About the v...,-1,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...,analyst - management - clients - develop - dev...,0.000000,False,0.000000


Possibly useful info:

https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.get_document_info
https://maartengr.github.io/BERTopic/api/bertopic.html#bertopic._bertopic.BERTopic.get_representative_docs
https://maartengr.github.io/BERTopic/api/plotting/heatmap.html
https://maartengr.github.io/BERTopic/api/plotting/hierarchical_documents.html
https://maartengr.github.io/BERTopic/api/representation/zeroshot.html#bertopic.representation._zeroshot.ZeroShotClassification

Next steps:
- Analyse the topics themselves in the README: what the topics revealed, what is interesting about them, which ones are the most frequent, and what they indicate.
- Bring the topic data into the same dataframe as the rest of the job data:
  - Add the topic name as a column (a new feature).
  - (Better, but more complex) Add the topic-distribution array, one column per topic, and melt it, increasing the granularity to job–topic. This allows analysing each attribute against the probability of belonging to a topic, e.g. which topic junior jobs mainly fall into.
- To do that, save the data generated here to a new .csv file to make later analysis easier, in particular the translated data and the BERTopic output, which take a long time to generate.

In [50]:
topic_distr.shape

(2186, 49)

In [51]:
df_jobs.shape

(2185, 77)

In [52]:
topic_model.get_document_info(docs).shape

(2186, 8)

In [53]:
topic_model.get_document_info(docs)[topic_model.get_document_info(docs)['Topic'] == 0].shape

(82, 8)

In [61]:
# Attach each document's topic information to its job row (aligned on the index).
topic_info_columns = ['Topic', 'Name', 'Representation', 'Top_n_words', 'Probability', 'Representative_document']
document_info = topic_model.get_document_info(docs)
# Columns left over from a previous run are dropped first, so re-running the cell
# replaces them instead of creating 'Topic_x' / 'Topic_y' duplicates.
df_jobs = (
    df_jobs
    .drop(columns=topic_info_columns, errors='ignore')
    .join(document_info[topic_info_columns], how='left')
)

In [None]:
# One column per topic ('topic0', 'topic1', ...) holding that topic's weight for every job
for topic_idx, topic_weights in enumerate(topic_distr.T):
    df_jobs[f'topic{topic_idx}'] = topic_weights

In [67]:
df_jobs.to_csv('bert_topic_data.csv', sep=';', index=None)

# Melt analysis

In [76]:
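# We can start running from here: the full dataset, including the translated text and the BERTopic output, was saved to 'bert_topic_data.csv' above.
# NOTE(review): `melted_topics` and `get_top_probability_by_category`, used in the cells below, are not defined in the visible cells.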


In [85]:


get_top_probability_by_category('worktype')

Unnamed: 0,index,worktype,variable,value
0,779,Temporário,topic5,0.052432
1,686,Tempo integral,topic0,0.036131
2,643,R$ 9.546/month,topic14,0.087958
3,623,R$ 7.500/month,topic40,0.14363
4,584,R$ 5.998/month,topic6,0.099677
5,515,R$ 5.000/month,topic31,0.103774
6,466,R$ 4/month,topic31,0.198125
7,393,R$ 4.200/month,topic1,0.129522
8,343,R$ 38.000/month,topic0,0.074209
9,320,R$ 15.600/month,topic32,0.107346


In [26]:
# Mean topic weight for every (workplace type, topic) pair
mean_weight_by_workplace = (
    melted_topics
    .groupby(['type_workplace', 'variable'])['value']
    .mean()
    .reset_index()
)
# Highest mean first within each workplace type, then keep only the top topic of each
ranked_weights = mean_weight_by_workplace.sort_values(by=['type_workplace', 'value'], ascending=False)
ranked_weights.groupby('type_workplace').head(1).reset_index()

Unnamed: 0,index,type_workplace,variable,value
0,98,Remoto,topic0,0.045225
1,50,Presencial,topic1,0.040627
2,1,Híbrido,topic1,0.035446


In [27]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,916,-1_analyst_management_clients_develop,"[analyst, management, clients, develop, develo...",[Data Consultant - M&A IT - About the vacancy\...
1,0,82,0_databases_databricks_data_analytics,"[databases, databricks, data, analytics, pipel...",[Data Platform Engineer | Consultor - About th...
2,1,72,1_analyst_bi_analytical_reports,"[analyst, bi, analytical, reports, dashboards,...",[BI analyst - About the vacancy\nWe are lookin...
3,2,52,2_customers_analyst_business_customer,"[customers, analyst, business, customer, sales...",[Customer Excellence Analyst - Sobre a vaga\nA...
4,3,51,3_cybersecurity_security_analyst_firewalls,"[cybersecurity, security, analyst, firewalls, ...",[Technology Analyst - Networks/Security (Senio...
5,4,48,4_consulting_sap_business_customers,"[consulting, sap, business, customers, develop...",[People Analysts and Consultants SAP MM/WM - A...
6,5,41,5_internship_intern_engineering_sap,"[internship, intern, engineering, sap, require...",[Engineer Intern - About the vacancy\nRINA is ...
7,6,40,6_logistics_responsibilities_freight_management,"[logistics, responsibilities, freight, managem...",[Logistics - Checker (Barriers) - About the va...
8,7,36,7_laboratory_certification_inspection_management,"[laboratory, certification, inspection, manage...",[BUSINESS CONSULTANT - About the vacancy\nComp...
9,8,36,8_developer_development_career_vacancy,"[developer, development, career, vacancy, java...",[Full Java Developer - Hybrid - About the vaca...


In [28]:
topic_model.get_document_info(docs)['Document'].duplicated().sum()

74

In [29]:
# Sorted array of the distinct topic ids present in the jobs frame
unique_topics = np.sort(df_jobs['Topic'].unique())

# Repeatable analysis

In [98]:
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

In [117]:
def generate_graphs_by_topic(df):
    """Return the number of postings per workplace type for the given jobs frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Jobs to summarise (e.g. the rows of a single topic). It is passed to
        `count_columns`, so it needs the 'type_workplace' and 'position' columns.

    Returns
    -------
    pandas.DataFrame
        The result of `count_columns(df, 'type_workplace')`.
    """
    # TODO: the planned figure (a bar plot of the 15 most frequent words from
    # count_words(df) on a 3-row subplot grid) is not drawn yet.
    # The frame received as argument is summarised, not the global `df_jobs`,
    # and the counts are returned instead of being discarded.
    return count_columns(df, 'type_workplace')

def count_words(df):
    """Count how often each word appears in the 'position' column.

    All position titles are lower-cased and joined, the '-' and '|' characters
    are removed, the text is split on whitespace, and a small set of
    uninteresting words (de, e, em, from, &) is discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'position' column. Missing titles are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'frequency', sorted by descending frequency.
        No figure is created here; plotting is left to the caller.
    """
    chars_to_strip = ['-', '|']
    words_to_exclude = {'de', 'e', 'em', 'from', '&'}
    # Joining the data of all the positions
    all_positions = ' '.join(df['position'].dropna().astype(str)).lower()
    for char in chars_to_strip:
        all_positions = all_positions.replace(char, '')
    # Uninteresting words are dropped as whole tokens after splitting: cutting
    # ' de ' out of the text would glue the neighbouring words together
    # ('analista de dados' -> 'analistadados').
    words_list = [word for word in all_positions.split() if word not in words_to_exclude]
    counted_words = Counter(words_list)
    df_position_words = pd.DataFrame.from_dict(counted_words, orient='index', columns=['frequency'])
    df_position_words.index.name = 'word'
    df_position_words = df_position_words.reset_index().sort_values(by='frequency', ascending=False)

    return df_position_words

def generate_count(dataframe, to_group_column, to_count_column):
    """Count the non-null `to_count_column` values in each `to_group_column` group.

    The group keys are returned as regular columns and the rows are ordered
    from the largest count to the smallest.
    """
    grouped = dataframe.groupby(to_group_column)
    counts = grouped[to_count_column].count()
    counts_table = counts.reset_index()
    return counts_table.sort_values(by=to_count_column, ascending=False)

def count_columns(df_jobs, key):
    """Return the number of positions per value of one supported column.

    Parameters
    ----------
    df_jobs : pandas.DataFrame
        Jobs frame containing `key` and the 'position' column.
    key : str
        Column to group by; must be one of the supported columns listed below.

    Returns
    -------
    pandas.DataFrame
        Output of `generate_count(df_jobs, key, ['position'])`.

    Raises
    ------
    KeyError
        If `key` is not one of the supported columns.
    """
    countable_columns = ['posted_date', 'no_applicants', 'type_workplace', 'level', 'worktype',
       'company_name', 'company_size', 'company_sector', 'city', 'state',
       'country', 'link_application_simplified']
    if key not in countable_columns:
        raise KeyError(key)
    # Only the requested column is aggregated; grouping all twelve columns
    # just to return one of them did eleven unnecessary group-bys per call.
    return generate_count(df_jobs, key, ['position'])



In [118]:
generate_graphs_by_topic(df_jobs)

In [95]:
for i in unique_topics:
    print(i)
    
    # df_jobs[df_jobs['Topic'] == 0]

-1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
