In [1]:
import sys
import os
import re

# Add the src directory to the system path
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

import pandas as pd

# Import and merge data

In [2]:
# Load it
df_manual_classification = pd.read_excel('../data/interim/manual_classification.xlsx')

df_occupation_data_onet = pd.read_excel('../data/raw/occupation_data_onet_29_0_database.xlsx')

In [None]:
df_manual_classification['onet_title_final'].value_counts().head()

onet_title_final
unidentified job                         353
Management Analysts                      273
student - extension or student entity    243
Business Intelligence Analysts           210
Financial and Investment Analysts        176
Name: count, dtype: int64

In [None]:
# Merge manual classification with ONET codes
df_manual_classification_codes = df_manual_classification.merge(df_occupation_data_onet, left_on='onet_title_final', right_on='Title', how='left')
df_manual_classification_codes.head()

Unnamed: 0.1,Unnamed: 0,person_id,filter,company_id,role,location,start_date,end_date,description,role_english,description_english,onet_title_final,Level,Industry,O*NET-SOC Code,Title,Description
0,10,2,DON'T SHOW,6,CX Operations Analyst,"São Paulo, Brasil",fev. de 2023,Ongoing,Creation and adjustment of processes and tools...,CX Operations Analyst,Creation and adjustment of processes and tools...,Search Marketing Strategists,analyst,,13-1161.01,Search Marketing Strategists,Employ search marketing tactics to increase vi...
1,11,2,DON'T SHOW,6,Digital Commerce Specialist Program,,jul. de 2021,jan. de 2023,Customer Experience Operations:- Customer Serv...,Digital Commerce Specialist Program,Customer Experience Operations:- Customer Serv...,Search Marketing Strategists,analyst,,13-1161.01,Search Marketing Strategists,Employ search marketing tactics to increase vi...
2,12,2,DON'T SHOW,7,Estagiário em Administração de Vendas,,mai. de 2019,mai. de 2021,- Validação de documentos para atestar a elegi...,Sales Administration Intern,- Validation of documents to certify the eligi...,"Sales and Related Workers, All Other",internship,,41-9099.00,"Sales and Related Workers, All Other",All sales and related workers not listed separ...
3,13,3,DON'T SHOW,8,Pesquisador júnior,São Paulo,jul. de 2022,Ongoing,,Junior researcher,,researcher,analyst,,,,
4,21,7,DON'T SHOW,5,Estágio em Riscos,"São Paulo, Brasil",2021-06-01 00:00:00,Ongoing,- Análise de dados para Risco de Mercado e Liq...,Risk Internship,- Data analysis for Market and Liquidity Risk ...,Financial Quantitative Analysts,internship,,13-2099.01,Financial Quantitative Analysts,Develop quantitative techniques to inform secu...


# Simple EDA

In [5]:
# number of unique jobs (besides the ones that are not part of ONET, such as "teacher", "student")
df_manual_classification_codes['O*NET-SOC Code'].nunique()

145

In [6]:
# The jobs that have a O*NET-SOC Code
df_manual_classification_codes['O*NET-SOC Code'].value_counts().head()

O*NET-SOC Code
13-1111.00    273
15-2051.01    210
13-2051.00    176
15-1211.00    153
41-9099.00    136
Name: count, dtype: int64

# Process the data

### Reducing the number of classes

In [7]:
# We have too many classes...let's reduce the number by using only the first few digits of each O*NET-SOC Code
df_manual_classification_codes['onet_short_code_str2'] = df_manual_classification_codes['O*NET-SOC Code'].fillna('').str.replace('-', '').str[:2]
df_manual_classification_codes['onet_short_code_str3'] = df_manual_classification_codes['O*NET-SOC Code'].fillna('').str.replace('-', '').str[:3]
df_manual_classification_codes['onet_short_code_str4'] = df_manual_classification_codes['O*NET-SOC Code'].fillna('').str.replace('-', '').str[:4]
df_manual_classification_codes['onet_short_code_str5'] = df_manual_classification_codes['O*NET-SOC Code'].fillna('').str.replace('-', '').str[:5]
df_manual_classification_codes['onet_short_code_str6'] = df_manual_classification_codes['O*NET-SOC Code'].fillna('').str.replace('-', '').str[:6]

print(df_manual_classification_codes['onet_short_code_str2'].nunique())
print(df_manual_classification_codes['onet_short_code_str3'].nunique())
print(df_manual_classification_codes['onet_short_code_str4'].nunique())
print(df_manual_classification_codes['onet_short_code_str5'].nunique())
print(df_manual_classification_codes['onet_short_code_str6'].nunique())

20
47
54
113
132


In [8]:
# Great, let's prepare the dataframe to get our final classification (the one that the model will use)
df_manual_classification_codes['final_classification_str2'] = df_manual_classification_codes['onet_short_code_str2']
df_manual_classification_codes['final_classification_str3'] = df_manual_classification_codes['onet_short_code_str3']
df_manual_classification_codes['final_classification_str4'] = df_manual_classification_codes['onet_short_code_str4']
df_manual_classification_codes['final_classification_str5'] = df_manual_classification_codes['onet_short_code_str5']
df_manual_classification_codes['final_classification_str6'] = df_manual_classification_codes['onet_short_code_str6']

In [9]:
# Now, with respect to the jobs that don't have a O*NET-SOC Code....
df_manual_classification_codes[df_manual_classification_codes['O*NET-SOC Code'].isna()]['onet_title_final'].value_counts(dropna=False)

onet_title_final
unidentified job                         353
student - extension or student entity    243
student - scientific initiation          126
teacher                                   99
researcher                                64
student - tutor                           49
student - others                          30
student - teaching initiation             23
entrepreneur                              11
master's degree student                   11
volunteer                                  9
teacher - coordinator                      8
phd candidate                              3
teacher - others                           2
teacher - language                         1
teacher - assistant                        1
Name: count, dtype: int64

In [10]:
# Creating "codes" to the jobs that don't have a O*NET-SOC Code
students = df_manual_classification_codes['onet_title_final'].isin(\
    ['student - extension or student entity', 'student - scientific initiation',
     'student - tutor', 'student - teaching initiation', 'student - others', 
     'master\'s degree student'])
master_degree_student = df_manual_classification_codes['onet_title_final'].isin(\
    [])
phd_candidate = df_manual_classification_codes['onet_title_final'].isin(\
    ['phd candidate'])
teachers = df_manual_classification_codes['onet_title_final'].isin(\
    ['teacher', 'teacher - coordinator', 'teacher - others', 'teacher - language',
     'teacher - assistant'])
researcher = df_manual_classification_codes['onet_title_final'].isin(\
    ['researcher'])
volunteer = df_manual_classification_codes['onet_title_final'].isin(\
    ['volunteer'])
entrepreneur = df_manual_classification_codes['onet_title_final'].isin(\
    ['entrepreneur'])
unidentified_job = df_manual_classification_codes['onet_title_final'].isin(
    ['unidentified job'])

columns = ['final_classification_str2', 'final_classification_str3', 'final_classification_str4', 'final_classification_str5', 'final_classification_str6']
df_manual_classification_codes.loc[students, columns] = ['00', '000', '0000', '00000', '000000']
df_manual_classification_codes.loc[master_degree_student, columns] = ['01', '010', '0100', '01000', '010000']
df_manual_classification_codes.loc[phd_candidate, columns] = ['02', '020', '0200', '02000', '020000']
df_manual_classification_codes.loc[teachers, columns] = ['03', '030', '0300', '03000', '030000']
df_manual_classification_codes.loc[researcher, columns] = ['04', '040', '0400', '04000', '040000']
df_manual_classification_codes.loc[volunteer, columns] = ['05', '050', '0500', '05000', '050000']
df_manual_classification_codes.loc[entrepreneur, columns] = ['06', '060', '0600', '06000', '060000']
df_manual_classification_codes.loc[unidentified_job, columns] = ['09', '090', '0900', '09000', '090000']

In [11]:
df_manual_classification_codes.head()

Unnamed: 0.1,Unnamed: 0,person_id,filter,company_id,role,location,start_date,end_date,description,role_english,...,onet_short_code_str2,onet_short_code_str3,onet_short_code_str4,onet_short_code_str5,onet_short_code_str6,final_classification_str2,final_classification_str3,final_classification_str4,final_classification_str5,final_classification_str6
0,10,2,DON'T SHOW,6,CX Operations Analyst,"São Paulo, Brasil",fev. de 2023,Ongoing,Creation and adjustment of processes and tools...,CX Operations Analyst,...,13.0,131.0,1311.0,13116.0,131161.0,13,131,1311,13116,131161
1,11,2,DON'T SHOW,6,Digital Commerce Specialist Program,,jul. de 2021,jan. de 2023,Customer Experience Operations:- Customer Serv...,Digital Commerce Specialist Program,...,13.0,131.0,1311.0,13116.0,131161.0,13,131,1311,13116,131161
2,12,2,DON'T SHOW,7,Estagiário em Administração de Vendas,,mai. de 2019,mai. de 2021,- Validação de documentos para atestar a elegi...,Sales Administration Intern,...,41.0,419.0,4190.0,41909.0,419099.0,41,419,4190,41909,419099
3,13,3,DON'T SHOW,8,Pesquisador júnior,São Paulo,jul. de 2022,Ongoing,,Junior researcher,...,,,,,,4,40,400,4000,40000
4,21,7,DON'T SHOW,5,Estágio em Riscos,"São Paulo, Brasil",2021-06-01 00:00:00,Ongoing,- Análise de dados para Risco de Mercado e Liq...,Risk Internship,...,13.0,132.0,1320.0,13209.0,132099.0,13,132,1320,13209,132099


In [13]:
print(df_manual_classification_codes['final_classification_str2'].nunique(dropna=False))
print(df_manual_classification_codes['final_classification_str3'].nunique(dropna=False))
print(df_manual_classification_codes['final_classification_str4'].nunique(dropna=False))
print(df_manual_classification_codes['final_classification_str5'].nunique(dropna=False))
print(df_manual_classification_codes['final_classification_str6'].nunique(dropna=False))

26
53
60
119
138


### Normalizing dates

In [14]:
def clean_string_column(terms_to_remove, replace_dict, df, col):
    """ Remove and/or replace some terms in a column, and also removes leading/trailing whitespaces."""

    def remove_terms(terms_to_remove, df, col):
        terms_to_remove_pattern = '|'.join(map(re.escape, terms_to_remove))
        df.loc[:, col] = df.loc[:, col].str.replace(terms_to_remove_pattern, '', regex=True)
        return df

    def replace_terms(replace_dict, df, col):
        df.loc[:, col] = df.loc[:, col].replace(replace_dict, regex=True)
        return df
    
    df = remove_terms(terms_to_remove, df, col)
    df = replace_terms(replace_dict, df, col)
    df.loc[:, col] = df.loc[:, col].str.lstrip()
    df.loc[:, col] = df.loc[:, col].str.rstrip()

    return df

In [15]:
### Start and end date


dates_dict = {
    'jan. de': 'Jan',
    'fev. de': 'Feb',
    'mar. de': 'Mar',
    'abr. de': 'Apr',
    'mai. de': 'May',
    'jun. de': 'Jun',
    'jul. de': 'Jul',
    'ago. de': 'Aug',
    'set. de': 'Sep',
    'out. de': 'Oct',
    'nov. de': 'Nov',
    'dez. de': 'Dec'
}

terms_to_remove = []
df_manual_classification_codes = clean_string_column(terms_to_remove, dates_dict, df_manual_classification_codes, 'start_date')
df_manual_classification_codes = clean_string_column(terms_to_remove, dates_dict, df_manual_classification_codes, 'end_date')

def transform_date(date_str):
    month, year = date_str.split()
    month_dict = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    return f"{year}-{month_dict[month]}"

df_manual_classification_codes['start_date'] = df_manual_classification_codes['start_date'].str.replace(r'(\w{3}) (\d{4})', lambda x: transform_date(x.group()), regex=True)
df_manual_classification_codes['end_date'] = df_manual_classification_codes['end_date'].str.replace(r'(\w{3}) (\d{4})', lambda x: transform_date(x.group()), regex=True)

df_manual_classification_codes.head()

Unnamed: 0.1,Unnamed: 0,person_id,filter,company_id,role,location,start_date,end_date,description,role_english,...,onet_short_code_str2,onet_short_code_str3,onet_short_code_str4,onet_short_code_str5,onet_short_code_str6,final_classification_str2,final_classification_str3,final_classification_str4,final_classification_str5,final_classification_str6
0,10,2,DON'T SHOW,6,CX Operations Analyst,"São Paulo, Brasil",2023-02,Ongoing,Creation and adjustment of processes and tools...,CX Operations Analyst,...,13.0,131.0,1311.0,13116.0,131161.0,13,131,1311,13116,131161
1,11,2,DON'T SHOW,6,Digital Commerce Specialist Program,,2021-07,2023-01,Customer Experience Operations:- Customer Serv...,Digital Commerce Specialist Program,...,13.0,131.0,1311.0,13116.0,131161.0,13,131,1311,13116,131161
2,12,2,DON'T SHOW,7,Estagiário em Administração de Vendas,,2019-05,2021-05,- Validação de documentos para atestar a elegi...,Sales Administration Intern,...,41.0,419.0,4190.0,41909.0,419099.0,41,419,4190,41909,419099
3,13,3,DON'T SHOW,8,Pesquisador júnior,São Paulo,2022-07,Ongoing,,Junior researcher,...,,,,,,4,40,400,4000,40000
4,21,7,DON'T SHOW,5,Estágio em Riscos,"São Paulo, Brasil",,Ongoing,- Análise de dados para Risco de Mercado e Liq...,Risk Internship,...,13.0,132.0,1320.0,13209.0,132099.0,13,132,1320,13209,132099


In [16]:
df_manual_classification_codes[['person_id', 'start_date', 'end_date', 'role_english', 'description_english', 'final_classification_str2', 'final_classification_str3', 'final_classification_str4', 'final_classification_str5', 'final_classification_str6']]

Unnamed: 0,person_id,start_date,end_date,role_english,description_english,final_classification_str2,final_classification_str3,final_classification_str4,final_classification_str5,final_classification_str6
0,2,2023-02,Ongoing,CX Operations Analyst,Creation and adjustment of processes and tools...,13,131,1311,13116,131161
1,2,2021-07,2023-01,Digital Commerce Specialist Program,Customer Experience Operations:- Customer Serv...,13,131,1311,13116,131161
2,2,2019-05,2021-05,Sales Administration Intern,- Validation of documents to certify the eligi...,41,419,4190,41909,419099
3,3,2022-07,Ongoing,Junior researcher,,04,040,0400,04000,040000
4,7,,Ongoing,Risk Internship,- Data analysis for Market and Liquidity Risk ...,13,132,1320,13209,132099
...,...,...,...,...,...,...,...,...,...,...
3644,8013,2009-12,2009-12,Sales Attendant,Temporary Christmas job as sales attendant in ...,41,419,4190,41909,419099
3645,8013,,,Administration Assistant,,43,436,4360,43601,436014
3646,8094,2022-12,Ongoing,Anl Gest Coml I - Commercial Management & Anal...,,41,419,4190,41909,419099
3647,8094,2021-09,2022-12,Cluster Wholesale Intern,,41,419,4190,41909,419099


In [17]:
df_manual_classification_codes[['person_id', 'start_date', 'end_date', 
                                'role_english', 'description_english', 
                                'final_classification_str2', 'final_classification_str3', 'final_classification_str4', 'final_classification_str5', 'final_classification_str6']
                                ].to_parquet('../data/interim/train_test.parquet')