In [2]:
import sys
import os

# Add the src directory to the system path
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

# Now you can import the process_job_translation method from translate.py
# from preprocess.translate import process_job_translation

import pandas as pd

# Import data

In [3]:
people_df = pd.read_csv("../data/raw/person.csv")
education_df = pd.read_csv("../data/raw/education.csv")
experience_df = pd.read_csv("../data/raw/experience.csv")

# Get list of people_id from different field_of_study
We need to get a list of people from different field_of_study so that when we train our classifier it can generalize to different kinds of jobs that people from different field_of_study work.

- There are 29 unique courses that UFABC offer in the undergrad level

- Bacharelado em Ciência e Tecnologia (BC&T) which then turn into -> Biotecnologia, Ciência da Computação, Ciências Biológicas, Física, Matemática, Química, Neurociência, 
                                                                   Engenharia Ambiental e Urbana, Engenharia de Energia, Engenharia de Informação, Engenharia de Instrumentação, Automação e Robótica,
                                                                   Engenharia de Materiais, Engenharia Aeroespacial, Engenharia Biomédica, Engenharia de Gestão

- Bacharelado em Ciências e Humanidades (BC&H) which then turn into -> Ciências Econômicas, Filosofia, Planejamento Territorial, Políticas Públicas, Relações Internacionais

- Licenciatura em Ciências Humanas (LCH) which then turn into -> Filosofia (a different one)

- Licenciatura em Ciências Naturais e Exatas (LCNE) which then turn into -> Ciências Biológicas, Física, Matemática, Química

In [4]:
# UFABC id is school_id=0
education_ufabc_df = education_df[education_df['school_id'] == 0].drop_duplicates(subset=['person_id', 'degree', 'field_of_study'])

person_id_by_field_of_study = education_ufabc_df.groupby('field_of_study')['person_id'].apply(lambda x: x.to_list())

df_person_id_by_field_of_study = person_id_by_field_of_study.reset_index(name='people_ids')
df_person_id_by_field_of_study['list_length'] = df_person_id_by_field_of_study['people_ids'].apply(len)
df_person_id_by_field_of_study = df_person_id_by_field_of_study.sort_values(by='list_length', ascending=False)
df_person_id_by_field_of_study.head()

Unnamed: 0,field_of_study,people_ids,list_length
344,Ciência e Tecnologia,"[13, 18, 26, 29, 47, 53, 55, 62, 63, 64, 71, 1...",1102
326,Ciência da Computação,"[50, 77, 96, 104, 129, 136, 143, 169, 189, 196...",464
633,Engenharia de Gestão,"[17, 142, 185, 186, 203, 216, 295, 335, 356, 3...",341
1112,Relações Internacionais,"[36, 109, 162, 182, 208, 233, 235, 254, 258, 2...",292
517,Economia,"[15, 42, 65, 120, 132, 217, 222, 230, 265, 285...",291


In [7]:
df_person_id_by_field_of_study.head(top_field_of_study_to_consider)

Unnamed: 0,field_of_study,people_ids,list_length
344,Ciência e Tecnologia,"[13, 18, 26, 29, 47, 53, 55, 62, 63, 64, 71, 1...",1102
326,Ciência da Computação,"[50, 77, 96, 104, 129, 136, 143, 169, 189, 196...",464
633,Engenharia de Gestão,"[17, 142, 185, 186, 203, 216, 295, 335, 356, 3...",341
1112,Relações Internacionais,"[36, 109, 162, 182, 208, 233, 235, 254, 258, 2...",292
517,Economia,"[15, 42, 65, 120, 132, 217, 222, 230, 265, 285...",291
433,Ciências e Humanidades,"[15, 43, 46, 111, 164, 166, 182, 197, 204, 286...",283
661,Engenharia de Materiais,"[116, 138, 176, 189, 280, 315, 345, 346, 368, ...",188
469,Computer Science,"[89, 455, 772, 985, 985, 1006, 1067, 1374, 140...",185
572,Engenharia,"[58, 127, 153, 170, 213, 218, 242, 247, 482, 4...",184
148,Bacharelado em Ciência e Tecnologia,"[94, 107, 131, 151, 185, 186, 253, 256, 335, 3...",178


In [None]:
# From education_ufabc_df, I want to get the person_id of people from different field_of_study (at least 50)
# Specifically, I want to get 30 people from each field_of_study

top_field_of_study_to_consider = 55 # This number was chosen so that we could select some people_ids from field_of_study='Filosofia' (arbitrary but I think that if contains Filosofia, it will contain a considerable mix of different jobs)
people_ids_to_consider = 18 # number of people from each field_of_study to consider (we will change this to get a total amount of jobs around 3,000)

df_top_field_of_study = df_person_id_by_field_of_study.head(top_field_of_study_to_consider)

# Creating a set of all people_ids from the top "top_field_of_study_to_consider" entries
top_50_person_ids_set = set()
for people_ids in df_top_field_of_study['people_ids']:
    people_ids = people_ids[0:people_ids_to_consider]
    top_50_person_ids_set.update(people_ids)

len(top_50_person_ids_set)

871

# Get a dataframe with the jobs of those people

In [7]:
# These are the jobs that I am going to manually classify!
filtered_experience_df = experience_df[experience_df['person_id'].isin(top_50_person_ids_set)]
filtered_experience_df

Unnamed: 0,person_id,company_id,role,location,start_date,end_date,description
10,2,6,CX Operations Analyst,"São Paulo, Brasil",fev. de 2023,Ongoing,Creation and adjustment of processes and tools...
11,2,6,Digital Commerce Specialist Program,,jul. de 2021,jan. de 2023,Customer Experience Operations:- Customer Serv...
12,2,7,Estagiário em Administração de Vendas,,mai. de 2019,mai. de 2021,- Validação de documentos para atestar a elegi...
13,3,8,Pesquisador júnior,São Paulo,jul. de 2022,Ongoing,
21,7,5,Estágio em Riscos,"São Paulo, Brasil",Jun 2021,Ongoing,- Análise de dados para Risco de Mercado e Liq...
...,...,...,...,...,...,...,...
32728,759,1384,Coordenador de integração,"São Paulo e Região, Brasil",Jan 2018,Feb 2020,
32729,759,1385,Estagiário,São Paulo,Nov 2018,Oct 2019,Realização de reuniões e oficinas;Levantamento...
34820,367,746,Analista de TIC,"São Paulo Area, Brazil",Mar 2016,Ongoing,"Desenvolvimento de aplicações Java, .Net e PHP..."
34821,367,747,Técnico em Desenvolvimento de Aplicações Pleno,"São Paulo Area, Brazil",Aug 2010,Feb 2016,Técnico em Desenvolvimento de Aplicações Júnio...


## Translate the job title and description to english

In [10]:
filtered_experience_df_translated = filtered_experience_df
filtered_experience_df_translated['role_english'] = filtered_experience_df_translated['role'].apply(lambda x: process_job_translation(x))

Role: CX Operations Analyst - Role in english: CX Operations Analyst
Role: Digital Commerce Specialist Program - Role in english: Digital Commerce Specialist Program
Role: Estagiário em Administração de Vendas - Role in english: Sales Administration Intern
Role: Pesquisador júnior - Role in english: Junior researcher
Role: Estágio em Riscos - Role in english: Risk Internship
Role: Coembaixadora em Marketing - Role in english: Marketing Co-Ambassador
Role: Analista de CRM PL - Role in english: CRM PL Analyst
Role: Analista de Processos Jr - Role in english: Junior Process Analyst
Role: Analista de MIS Jr - Role in english: MIS Jr Analyst
Role: Estagiária de riscos e prevenção à lavagem de dinheiro - Role in english: Risk and money laundering prevention intern
Role: Site Safety Specialist - Role in english: Site Safety Specialist
Role: Trainee - Role in english: Trainee
Role: Voluntária no Departamento de Marketing e Pesquisa do Projeto. - Role in english: Volunteer in the Project's Mark

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_experience_df_translated['role_english'] = filtered_experience_df_translated['role'].apply(lambda x: process_job_translation(x))


In [None]:
filtered_experience_df_translated['description_english'] = filtered_experience_df_translated['description'].apply(lambda x: process_job_translation(x))

In [6]:
# Save it
filtered_experience_df_translated.to_parquet('../data/interim/jobs_used_to_manually_classify.parquet')
filtered_experience_df_translated.to_excel('../data/interim/jobs_used_to_manually_classify.xlsx')

In [3]:
# Load it
filtered_experience_df_translated = pd.read_parquet('../data/interim/jobs_used_to_manually_classify.parquet')