In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_rows = 200
pd.options.display.max_columns = 50
from itertools import product

import matplotlib.pyplot as plt

import plotly.express as px

import app_rais.dicts_constants as dc

import app_rais.analytics_potec as potec

import bamboolib as bam
#bam.enable()
bam.disable()

The bamboolib extension was disabled. You can enable it again via 'bam.enable()'. In case that bamboolib was not helpful to you, we are sorry and would like to fix this. Please write us a quick mail to info@8080labs.com so that we can serve you better in the future. Best regards, Tobias and Florian


Um teste com o Paraná, em 2017, mostrou que o CNAE 3699-4 correspondia a diversos CNAES 2.0, com distintos níveis de intensidade tecnológica. Talvez a proporção seja necessária para identificar o encaminhamento da classificação no CNAE 95.

CNAE-95|CNAE-2.0|Tecnologia|Pessoal|%
-------|--------|-------|--------|-------
36994|20924|Medium-high-technology|939|29,79
36994|23991|Medium-low-technology|385|12,21
36994|32124|Low-technology|302|9,58
36994|32990|Low-technology|1086|34,45
36994|33295|Medium-low-technology|440|13,96


Verifiquei no Excel 33 códigos CNAE 1.0 que apresentam dous ou mais códigos CNAE 2.0 correspondentes, aos quais são associados diferentes níveis de tecnologia e conhecimento. Trata-se dos casos abaixo:

CNAE95 | Descrição
-------|----------
01.61-9 | Atividades de serviços relacionados com a agricultura
01.62-7 | Atividades de serviços relacionados com a pecuária exceto atividades veterinárias
15.81-4 | Fabricação de produtos de padaria, confeitaria e pastelaria
17.64-7 | Fabricação de tecidos especiais - inclusive artefatos
18.22-8 | Fabricação de acessórios para segurança industrial e pessoal
20.23-0 | Fabricação de artefatos de tanoaria e embalagens de madeira
22.19-5 | Edição; edição e impressão de outros produtos gráficos
23.30-2 | Elaboração de combustíveis nucleares
24.29-5 | Fabricação de outros produtos químicos orgânicos
24.54-6 | Fabricação de materiais para usos médicos, hospitalares e odontológicos
25.29-1 | Fabricação de artefatos diversos de material plástico
26.99-9 | Fabricação de outros produtos de minerais não-metálicos
28.99-1 | Fabricação de outros produtos elaborados de metal
29.29-7 | Fabricação de outras máquinas e equipamentos de uso geral
29.31-9 | Fabricação de máquinas e equipamentos para agricultura, avicultura e obtenção de produtos animais
29.40-8 | Fabricação de máquinas-ferramenta
29.52-1 | Fabricação de outras máquinas e equipamentos para a extração de minérios e indústria da construção
29.69-6 | Fabricação de outras máquinas e equipamentos de uso específico
30.12-0 | Fabricação de máquinas de escrever e calcular, copiadoras e outros equipamentos eletrônicos destinados à automação gerencial e comercial
33.40-5 | Fabricação de aparelhos, instrumentos e materiais ópticos, fotográficos e cinematográficos
36.99-4 | Fabricação de produtos diversos
63.22-3 | Atividades auxiliares aos transportes aquaviários
63.23-1 | Atividades auxiliares aos transportes aéreos
64.20-3 | Telecomunicações
65.93-5 | Gestão de ativos intangíveis não financeiros
70.10-6 | Incorporação e compra e venda de imóveis
72.40-0 | Atividades de banco de dados e distribuição on-line de conteúdo eletrônico
72.50-8 | Manutenção e reparação de máquinas de escritório e de informática
74.16-0 | Atividades de assessoria em gestão empresarial
74.99-3 | Outras atividades de serviços prestados principalmente às empresas não especificadas anteriormente
92.32-0 | Gestão de salas de espetáculos
92.40-1 | Atividades de agências de notícias
92.62-2 | Outras atividades relacionadas ao lazer


In [3]:
cnae10problema = [
    '01619'
    ,'01627'
    ,'15814'
    ,'17647'
    ,'18228'
    ,'20230'
    ,'22195'
    ,'23302'
    ,'24295'
    ,'24546'
    ,'25291'
    ,'26999'
    ,'28991'
    ,'29297'
    ,'29319'
    ,'29408'
    ,'29521'
    ,'29696'
    ,'30120'
    ,'33405'
    ,'36994'
    ,'63223'
    ,'63231'
    ,'64203'
    ,'65935'
    ,'70106'
    ,'72400'
    ,'72508'
    ,'74160'
    ,'74993'
    ,'92320'
    ,'92401'
    ,'92622'
]

list_ufs=['AM', 'BA', 'CE', 'DF', 'ES', 'GO', 'MG','PA','PE','PR','RJ','RS','SC','SP']

In [4]:
cnaeproblemacompleto = [
    '01619'
    ,'01627'
    ,'15814'
    ,'17647'
    ,'18228'
    ,'20230'
    ,'22195'
    ,'23302'
    ,'24295'
    ,'24546'
    ,'25291'
    ,'26999'
    ,'28991'
    ,'29114'
    ,'29122'
    ,'29130'
    ,'29149'
    ,'29157'
    ,'29211'
    ,'29220'
    ,'29238'
    ,'29246'
    ,'29297'
    ,'29319'
    ,'29408'
    ,'29513'
    ,'29521'
    ,'29548'
    ,'29610'
    ,'29629'
    ,'29637'
    ,'29645'
    ,'29653'
    ,'29696'
    ,'29726'
    ,'30120'
    ,'31119'
    ,'31127'
    ,'31135'
    ,'31216'
    ,'31526'
    ,'31925'
    ,'31992'
    ,'32212'
    ,'33103'
    ,'33200'
    ,'33308'
    ,'33405'
    ,'34312'
    ,'36110'
    ,'36129'
    ,'36137'
    ,'36943'
    ,'36994'
    ,'63223'
    ,'63231'
    ,'64203'
    ,'65935'
    ,'65994'
    ,'70106'
    ,'72400'
    ,'72508'
    ,'74160'
    ,'74993'
    ,'92320'
    ,'92401'
    ,'92622'    
]

In [5]:
dict_tec_know = {
    '011':'Without Classification'
    ,'012':'Without Classification'
    ,'013':'Without Classification'
    ,'014':'Without Classification'
    ,'015':'Without Classification'
    ,'016':'Without Classification'
    ,'017':'Without Classification'
    ,'021':'Without Classification'
    ,'022':'Without Classification'
    ,'023':'Without Classification'
    ,'031':'Without Classification'
    ,'032':'Without Classification'
    ,'050':'Without Classification'
    ,'060':'Without Classification'
    ,'071':'Without Classification'
    ,'072':'Without Classification'
    ,'081':'Without Classification'
    ,'089':'Without Classification'
    ,'091':'Without Classification'
    ,'099':'Without Classification'
    ,'101':'Low-technology'
    ,'102':'Low-technology'
    ,'103':'Low-technology'
    ,'104':'Low-technology'
    ,'105':'Low-technology'
    ,'106':'Low-technology'
    ,'107':'Low-technology'
    ,'108':'Low-technology'
    ,'109':'Low-technology'
    ,'111':'Low-technology'
    ,'112':'Low-technology'
    ,'121':'Low-technology'
    ,'122':'Low-technology'
    ,'131':'Low-technology'
    ,'132':'Low-technology'
    ,'133':'Low-technology'
    ,'134':'Low-technology'
    ,'135':'Low-technology'
    ,'141':'Low-technology'
    ,'142':'Low-technology'
    ,'151':'Low-technology'
    ,'152':'Low-technology'
    ,'153':'Low-technology'
    ,'154':'Low-technology'
    ,'161':'Low-technology'
    ,'162':'Low-technology'
    ,'171':'Low-technology'
    ,'172':'Low-technology'
    ,'173':'Low-technology'
    ,'174':'Low-technology'
    ,'181':'Low-technology'
    ,'182':'Low-technology'
    ,'183':'Medium-low-technology'
    ,'191':'Medium-low-technology'
    ,'192':'Medium-low-technology'
    ,'193':'Medium-low-technology'
    ,'201':'Medium-high-technology'
    ,'202':'Medium-high-technology'
    ,'203':'Medium-high-technology'
    ,'204':'Medium-high-technology'
    ,'205':'Medium-high-technology'
    ,'206':'Medium-high-technology'
    ,'207':'Medium-high-technology'
    ,'209':'Medium-high-technology'
    ,'211':'High-technology'
    ,'212':'High-technology'
    ,'221':'Medium-low-technology'
    ,'222':'Medium-low-technology'
    ,'231':'Medium-low-technology'
    ,'232':'Medium-low-technology'
    ,'233':'Medium-low-technology'
    ,'234':'Medium-low-technology'
    ,'239':'Medium-low-technology'
    ,'241':'Medium-low-technology'
    ,'242':'Medium-low-technology'
    ,'243':'Medium-low-technology'
    ,'244':'Medium-low-technology'
    ,'245':'Medium-low-technology'
    ,'251':'Medium-low-technology'
    ,'252':'Medium-low-technology'
    ,'253':'Medium-low-technology'
    ,'254':'Medium-high-technology'
    ,'255':'Medium-low-technology'
    ,'259':'Medium-low-technology'
    ,'261':'High-technology'
    ,'262':'High-technology'
    ,'263':'High-technology'
    ,'264':'High-technology'
    ,'265':'High-technology'
    ,'266':'High-technology'
    ,'267':'High-technology'
    ,'268':'High-technology'
    ,'271':'Medium-high-technology'
    ,'272':'Medium-high-technology'
    ,'273':'Medium-high-technology'
    ,'274':'Medium-high-technology'
    ,'275':'Medium-high-technology'
    ,'279':'Medium-high-technology'
    ,'281':'Medium-high-technology'
    ,'282':'Medium-high-technology'
    ,'283':'Medium-high-technology'
    ,'284':'Medium-high-technology'
    ,'285':'Medium-high-technology'
    ,'286':'Medium-high-technology'
    ,'291':'Medium-high-technology'
    ,'292':'Medium-high-technology'
    ,'293':'Medium-high-technology'
    ,'294':'Medium-high-technology'
    ,'295':'Medium-high-technology'
    ,'301':'Medium-low-technology'
    ,'303':'Medium-high-technology'
    ,'304':'High-technology'
    ,'305':'Medium-high-technology'
    ,'309':'Medium-high-technology'
    ,'310':'Low-technology'
    ,'321':'Low-technology'
    ,'322':'Low-technology'
    ,'323':'Low-technology'
    ,'324':'Low-technology'
    ,'325':'Medium-high-technology'
    ,'329':'Low-technology'
    ,'331':'Medium-low-technology'
    ,'332':'Medium-low-technology'
    ,'351':'Without Classification'
    ,'352':'Without Classification'
    ,'353':'Without Classification'
    ,'360':'Without Classification'
    ,'370':'Without Classification'
    ,'381':'Without Classification'
    ,'382':'Without Classification'
    ,'383':'Without Classification'
    ,'390':'Without Classification'
    ,'411':'Without Classification'
    ,'412':'Without Classification'
    ,'421':'Without Classification'
    ,'422':'Without Classification'
    ,'429':'Without Classification'
    ,'431':'Without Classification'
    ,'432':'Without Classification'
    ,'433':'Without Classification'
    ,'439':'Without Classification'
    ,'451':'Less knowledge-intensive market services'
    ,'452':'Less knowledge-intensive market services'
    ,'453':'Less knowledge-intensive market services'
    ,'454':'Less knowledge-intensive market services'
    ,'461':'Less knowledge-intensive market services'
    ,'462':'Less knowledge-intensive market services'
    ,'463':'Less knowledge-intensive market services'
    ,'464':'Less knowledge-intensive market services'
    ,'465':'Less knowledge-intensive market services'
    ,'466':'Less knowledge-intensive market services'
    ,'467':'Less knowledge-intensive market services'
    ,'468':'Less knowledge-intensive market services'
    ,'469':'Less knowledge-intensive market services'
    ,'471':'Less knowledge-intensive market services'
    ,'472':'Less knowledge-intensive market services'
    ,'473':'Less knowledge-intensive market services'
    ,'474':'Less knowledge-intensive market services'
    ,'475':'Less knowledge-intensive market services'
    ,'476':'Less knowledge-intensive market services'
    ,'477':'Less knowledge-intensive market services'
    ,'478':'Less knowledge-intensive market services'
    ,'479':'Less knowledge-intensive market services'
    ,'491':'Less knowledge-intensive market services'
    ,'492':'Less knowledge-intensive market services'
    ,'493':'Less knowledge-intensive market services'
    ,'494':'Less knowledge-intensive market services'
    ,'495':'Less knowledge-intensive market services'
    ,'501':'Knowledge-intensive market services'
    ,'502':'Knowledge-intensive market services'
    ,'503':'Knowledge-intensive market services'
    ,'509':'Knowledge-intensive market services'
    ,'511':'Knowledge-intensive market services'
    ,'512':'Knowledge-intensive market services'
    ,'513':'Knowledge-intensive market services'
    ,'521':'Less knowledge-intensive market services'
    ,'522':'Less knowledge-intensive market services'
    ,'523':'Less knowledge-intensive market services'
    ,'524':'Less knowledge-intensive market services'
    ,'525':'Less knowledge-intensive market services'
    ,'531':'Other less knowledge-intensive services'
    ,'532':'Other less knowledge-intensive services'
    ,'551':'Less knowledge-intensive market services'
    ,'559':'Less knowledge-intensive market services'
    ,'561':'Less knowledge-intensive market services'
    ,'562':'Less knowledge-intensive market services'
    ,'581':'Other knowledge-intensive services'
    ,'582':'Other knowledge-intensive services'
    ,'591':'High-tech knowledge-intensive services'
    ,'592':'High-tech knowledge-intensive services'
    ,'601':'High-tech knowledge-intensive services'
    ,'602':'High-tech knowledge-intensive services'
    ,'611':'High-tech knowledge-intensive services'
    ,'612':'High-tech knowledge-intensive services'
    ,'613':'High-tech knowledge-intensive services'
    ,'614':'High-tech knowledge-intensive services'
    ,'619':'High-tech knowledge-intensive services'
    ,'620':'High-tech knowledge-intensive services'
    ,'631':'High-tech knowledge-intensive services'
    ,'639':'High-tech knowledge-intensive services'
    ,'641':'Knowledge-intensive financial services'
    ,'642':'Knowledge-intensive financial services'
    ,'643':'Knowledge-intensive financial services'
    ,'644':'Knowledge-intensive financial services'
    ,'645':'Knowledge-intensive financial services'
    ,'646':'Knowledge-intensive financial services'
    ,'647':'Knowledge-intensive financial services'
    ,'649':'Knowledge-intensive financial services'
    ,'651':'Knowledge-intensive financial services'
    ,'652':'Knowledge-intensive financial services'
    ,'653':'Knowledge-intensive financial services'
    ,'654':'Knowledge-intensive financial services'
    ,'655':'Knowledge-intensive financial services'
    ,'661':'Knowledge-intensive financial services'
    ,'662':'Knowledge-intensive financial services'
    ,'663':'Knowledge-intensive financial services'
    ,'681':'Less knowledge-intensive market services'
    ,'682':'Less knowledge-intensive market services'
    ,'691':'Knowledge-intensive market services'
    ,'692':'Knowledge-intensive market services'
    ,'701':'Knowledge-intensive market services'
    ,'702':'Knowledge-intensive market services'
    ,'711':'Knowledge-intensive market services'
    ,'712':'Knowledge-intensive market services'
    ,'721':'High-tech knowledge-intensive services'
    ,'722':'High-tech knowledge-intensive services'
    ,'731':'Knowledge-intensive market services'
    ,'732':'Knowledge-intensive market services'
    ,'741':'Knowledge-intensive market services'
    ,'742':'Knowledge-intensive market services'
    ,'749':'Knowledge-intensive market services'
    ,'750':'Other knowledge-intensive services'
    ,'771':'Less knowledge-intensive market services'
    ,'772':'Less knowledge-intensive market services'
    ,'773':'Less knowledge-intensive market services'
    ,'774':'Less knowledge-intensive market services'
    ,'781':'Knowledge-intensive market services'
    ,'782':'Knowledge-intensive market services'
    ,'783':'Knowledge-intensive market services'
    ,'791':'Less knowledge-intensive market services'
    ,'799':'Less knowledge-intensive market services'
    ,'801':'Knowledge-intensive market services'
    ,'802':'Knowledge-intensive market services'
    ,'803':'Knowledge-intensive market services'
    ,'811':'Less knowledge-intensive market services'
    ,'812':'Less knowledge-intensive market services'
    ,'813':'Less knowledge-intensive market services'
    ,'821':'Less knowledge-intensive market services'
    ,'822':'Less knowledge-intensive market services'
    ,'823':'Less knowledge-intensive market services'
    ,'829':'Less knowledge-intensive market services'
    ,'841':'Other knowledge-intensive services'
    ,'842':'Other knowledge-intensive services'
    ,'843':'Other knowledge-intensive services'
    ,'851':'Other knowledge-intensive services'
    ,'852':'Other knowledge-intensive services'
    ,'853':'Other knowledge-intensive services'
    ,'854':'Other knowledge-intensive services'
    ,'855':'Other knowledge-intensive services'
    ,'859':'Other knowledge-intensive services'
    ,'861':'Other knowledge-intensive services'
    ,'862':'Other knowledge-intensive services'
    ,'863':'Other knowledge-intensive services'
    ,'864':'Other knowledge-intensive services'
    ,'865':'Other knowledge-intensive services'
    ,'866':'Other knowledge-intensive services'
    ,'869':'Other knowledge-intensive services'
    ,'871':'Other knowledge-intensive services'
    ,'872':'Other knowledge-intensive services'
    ,'873':'Other knowledge-intensive services'
    ,'880':'Other knowledge-intensive services'
    ,'900':'Other knowledge-intensive services'
    ,'910':'Other knowledge-intensive services'
    ,'920':'Other knowledge-intensive services'
    ,'931':'Other knowledge-intensive services'
    ,'932':'Other knowledge-intensive services'
    ,'941':'Other less knowledge-intensive services'
    ,'942':'Other less knowledge-intensive services'
    ,'943':'Other less knowledge-intensive services'
    ,'949':'Other less knowledge-intensive services'
    ,'951':'Less knowledge-intensive market services'
    ,'952':'Less knowledge-intensive market services'
    ,'960':'Other less knowledge-intensive services'
    ,'970':'Other less knowledge-intensive services'
    ,'990':'Other less knowledge-intensive services'
}

## 1. Verificação das Correspondências entre Classificações CNAE10 e CNAE20 em 2017

In [6]:
%%time
df_dict = dict()
for uf in list_ufs:
    df = pd.read_csv(
        f'app_rais/data/rais_original/2017/{uf}2017.txt'
        , encoding='latin-1'
        , sep=';'
        , usecols=['CNAE 95 Classe','CNAE 2.0 Classe', 'Vínculo Ativo 31/12']
        , dtype={'CNAE 95 Classe':'category','CNAE 2.0 Classe':'category', 'Vínculo Ativo 31/12':'category'}
    )
    df = df.loc[df['Vínculo Ativo 31/12'] == '1']
    df = df[['CNAE 95 Classe','CNAE 2.0 Classe']]
    df = df[df['CNAE 95 Classe'].isin(cnaeproblemacompleto)].groupby(['CNAE 95 Classe','CNAE 2.0 Classe'], observed=True).size().reset_index().rename(columns={0:'Pessoal'})
    df_dict[uf] = df
    print(f'{uf} done.')
df_2017 = pd.concat(df_dict.values(), keys=df_dict.keys())

AM done.
BA done.
CE done.
DF done.
ES done.
GO done.
MG done.
PA done.
PE done.
PR done.
RJ done.
RS done.
SC done.
SP done.
Wall time: 6min 2s


In [7]:
df_2017 = df_2017.reset_index().rename(columns={'level_0':'UF'}).drop(columns='level_1')

df_2017 = df_2017.groupby(['CNAE 95 Classe','CNAE 2.0 Classe'], observed=True).sum().reset_index()

df_2017.insert(2, 'class', df_2017['CNAE 2.0 Classe'].str.slice(start=0, stop=3).map(dict_tec_know))

In [8]:
df_2017['Pessoal'].sum()

3398931

In [9]:
df_2017.sum()

CNAE 95 Classe     0161901619016190162715814176471822820230221952...
CNAE 2.0 Classe    0161001636813030162810911135453292216234581915...
class              Without ClassificationWithout ClassificationLe...
Pessoal                                                      3398931
dtype: object

In [10]:
df_2017.pivot_table(values='Pessoal', index=['CNAE 95 Classe', 'CNAE 2.0 Classe'], columns='class', aggfunc='sum').fillna(0).astype(np.int32).head()

Unnamed: 0_level_0,class,High-tech knowledge-intensive services,High-technology,Knowledge-intensive financial services,Knowledge-intensive market services,Less knowledge-intensive market services,Low-technology,Medium-high-technology,Medium-low-technology,Other knowledge-intensive services,Without Classification
CNAE 95 Classe,CNAE 2.0 Classe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1619,1610,0,0,0,0,0,0,0,0,0,91536
1619,1636,0,0,0,0,0,0,0,0,0,5494
1619,81303,0,0,0,0,24672,0,0,0,0,0
1627,1628,0,0,0,0,0,0,0,0,0,22544
15814,10911,0,0,0,0,0,117622,0,0,0,0


In [19]:
df_2017.pivot_table(values='Pessoal', index=['CNAE 95 Classe'], columns='class', aggfunc='sum').fillna(0).astype(np.int32)

class,High-tech knowledge-intensive services,High-technology,Knowledge-intensive financial services,Knowledge-intensive market services,Less knowledge-intensive market services,Low-technology,Medium-high-technology,Medium-low-technology,Other knowledge-intensive services,Without Classification
CNAE 95 Classe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1619,0,0,0,0,24672,0,0,0,0,97030
1627,0,0,0,0,0,0,0,0,0,22544
15814,0,0,0,0,0,117622,0,0,0,0
17647,0,0,0,0,0,12487,0,0,0,0
18228,0,0,0,0,0,11114,0,0,0,0
20230,0,0,0,0,0,13284,0,0,0,0
22195,0,0,0,0,0,0,0,0,18320,0
23302,0,0,0,0,0,0,1412,0,0,0
24295,0,0,0,0,0,0,13082,78382,0,0
24546,0,3224,0,0,0,0,18252,0,0,0


In [31]:
%%time
df_dict = dict()
for uf in list_ufs:
    df = pd.read_csv(
        f'app_rais/data/rais_original/2017/{uf}2017.txt'
        , encoding='latin-1'
        , sep=';'
        , usecols=['Vínculo Ativo 31/12']
    )
    df = df.loc[df['Vínculo Ativo 31/12'] == 1]
    df_dict[uf] = df['Vínculo Ativo 31/12'].sum()
    print(f'{uf} done.')

AM done.
BA done.
CE done.
DF done.
ES done.
GO done.
MG done.
PA done.
PE done.
PR done.
RJ done.
RS done.
SC done.
SP done.
Wall time: 5min 54s


In [35]:
np.sum(list(df_dict.values()))

40583183

## 2. Verificação das Correspondências entre Classificações CNAE10 e CNAE20 em 2007aggfunc=

In [12]:
%%time
df_dict = dict()
for uf in list_ufs:
    df = pd.read_csv(
        f'app_rais/data/rais_original/2007/{uf}2007.txt'
        , encoding='latin-1'
        , sep=';'
        , usecols=['CNAE 95 Classe','CNAE 2.0 Classe', 'Vínculo Ativo 31/12']
        , dtype={'CNAE 95 Classe':'category','CNAE 2.0 Classe':'category', 'Vínculo Ativo 31/12':'category'}
    )
    df = df.loc[df['Vínculo Ativo 31/12'] == '1']
    df = df[['CNAE 95 Classe','CNAE 2.0 Classe']]
    df = df[df['CNAE 95 Classe'].isin(cnaeproblemacompleto)].groupby(['CNAE 95 Classe','CNAE 2.0 Classe'], observed=True).size().reset_index().rename(columns={0:'Pessoal'})
    df_dict[uf] = df
    print(f'{uf} done.')
df_2007 = pd.concat(df_dict.values(), keys=df_dict.keys())

AM done.
BA done.
CE done.
DF done.
ES done.
GO done.
MG done.
PA done.
PE done.
PR done.
RJ done.
RS done.
SC done.
SP done.
Wall time: 3min 18s


In [13]:
df_2007 = df_2007.reset_index().rename(columns={'level_0':'UF'}).drop(columns='level_1')

df_2007 = df_2007.groupby(['CNAE 95 Classe','CNAE 2.0 Classe'], observed=True).sum().reset_index()

df_2007.insert(2, 'class', df_2007['CNAE 2.0 Classe'].str.slice(start=0, stop=3).map(dict_tec_know))

In [14]:
df_2007['Pessoal'].sum()

2819578

In [15]:
df_2007.head()

Unnamed: 0,CNAE 95 Classe,CNAE 2.0 Classe,class,Pessoal
0,1619,1610,Without Classification,85467
1,1619,1636,Without Classification,2703
2,1619,81303,Less knowledge-intensive market services,6983
3,1627,1628,Without Classification,27366
4,15814,10911,Low-technology,68727


In [16]:
df_2007.pivot_table(values='Pessoal', index=['CNAE 95 Classe', 'CNAE 2.0 Classe'], columns='class', aggfunc='sum').fillna(0).astype(np.int32).head()

Unnamed: 0_level_0,class,High-tech knowledge-intensive services,High-technology,Knowledge-intensive financial services,Knowledge-intensive market services,Less knowledge-intensive market services,Low-technology,Medium-high-technology,Medium-low-technology,Other knowledge-intensive services,Without Classification
CNAE 95 Classe,CNAE 2.0 Classe,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1619,1610,0,0,0,0,0,0,0,0,0,85467
1619,1636,0,0,0,0,0,0,0,0,0,2703
1619,81303,0,0,0,0,6983,0,0,0,0,0
1627,1628,0,0,0,0,0,0,0,0,0,27366
15814,10911,0,0,0,0,0,68727,0,0,0,0


In [21]:
df_2007.pivot_table(values='Pessoal', index=['CNAE 95 Classe'], columns='class', aggfunc='sum').fillna(0).astype(np.int32)

class,High-tech knowledge-intensive services,High-technology,Knowledge-intensive financial services,Knowledge-intensive market services,Less knowledge-intensive market services,Low-technology,Medium-high-technology,Medium-low-technology,Other knowledge-intensive services,Without Classification
CNAE 95 Classe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1619,0,0,0,0,6983,0,0,0,0,88170
1627,0,0,0,0,0,0,0,0,0,27366
15814,0,0,0,0,57915,68727,0,0,0,0
17647,0,0,0,0,0,10406,127,0,0,0
18228,0,0,0,0,0,9306,0,0,0,0
20230,0,0,0,0,0,13787,0,0,0,0
22195,0,0,0,0,0,0,0,0,33057,0
23302,0,0,0,0,0,0,526,0,0,0
24295,0,0,0,0,0,0,10729,410,0,0
24546,0,1747,0,0,0,0,13870,0,0,0


In [36]:
%%time
df_dict = dict()
for uf in list_ufs:
    df = pd.read_csv(
        f'app_rais/data/rais_original/2007/{uf}2007.txt'
        , encoding='latin-1'
        , sep=';'
        , usecols=['Vínculo Ativo 31/12']
    )
    df = df.loc[df['Vínculo Ativo 31/12'] == 1]
    df_dict[uf] = df['Vínculo Ativo 31/12'].sum()
    print(f'{uf} done.')

AM done.
BA done.
CE done.
DF done.
ES done.
GO done.
MG done.
PA done.
PE done.
PR done.
RJ done.
RS done.
SC done.
SP done.
Wall time: 3min 9s


In [37]:
np.sum(list(df_dict.values()))

33403573

In [10]:
df = pd.read_csv(
    f'app_rais/data/rais_original/2017/DF2017.txt'
    , encoding='latin-1'
    , sep=';'
    , usecols=['CNAE 95 Classe','CNAE 2.0 Classe']
    , dtype={'CNAE 95 Classe':'category','CNAE 2.0 Classe':'category'}
)

In [11]:
df

Unnamed: 0,CNAE 2.0 Classe,CNAE 95 Classe
0,47440,52442
1,47440,52442
2,10121,15121
3,10121,15121
4,41204,45217
...,...,...
1618602,84116,75116
1618603,99008,99007
1618604,99008,99007
1618605,84116,75116


In [21]:
df[df['CNAE 95 Classe'].isin(cnae10problema)].groupby(['CNAE 95 Classe', 'CNAE 2.0 Classe'], observed=True).size().reset_index().rename(columns={0:'Pessoal'})

Unnamed: 0,CNAE 2.0 Classe,CNAE 95 Classe,Pessoal
0,41107,70106,5569
1,82300,74993,2371
2,81303,1619,425
3,82113,74993,23822
4,23991,26999,138
5,23991,36994,1
6,68102,70106,545
7,1610,1619,381
8,82997,74993,11515
9,82997,92622,1361


In [6]:
    df_dict[uf] = df

Unnamed: 0,Unnamed: 1,CNAE 95 Classe,CNAE 2.0 Classe,Pessoal
AM,0,01112,01113,0
AM,1,01112,01130,0
AM,2,01112,01164,0
AM,3,01112,01211,0
AM,4,01112,01229,0
...,...,...,...,...
DF,239215,32107,91023,0
DF,239216,32107,32302,0
DF,239217,32107,35204,0
DF,239218,32107,09106,0


In [32]:
df_concat['class'] = df_concat['CNAE 2.0 Classe'].str.slice(start=0, stop=3).map(dict_tec_know)

In [33]:
df_concat = df_concat.reset_index().rename(columns={'level_0':'UF'}).drop(columns='level_1')

In [37]:
df_concat['Pessoal']

0          0
1          0
2          0
3          0
4          0
          ..
4413704    0
4413705    0
4413706    0
4413707    0
4413708    0
Name: Pessoal, Length: 4413709, dtype: int64

In [33]:
pr_2006 = pd.read_csv('app_rais/data/rais_original/2017/PR2017.txt', encoding='latin-1', sep=';')

In [44]:
pr_2006[pr_2006['CNAE 95 Classe'].isin(cnae10problema)].groupby(['CNAE 95 Classe','CNAE 2.0 Classe']).size().reset_index().rename(columns={0:'Pessoal'})

Unnamed: 0,CNAE 95 Classe,CNAE 2.0 Classe,Pessoal
0,1619,1610,17498
1,1619,1636,1478
2,1619,81303,3045
3,1627,1628,6236
4,15814,10911,12393
5,17647,13545,2173
6,18228,32922,3200
7,20230,16234,2687
8,22195,58191,513
9,22195,58298,1830


In [16]:
dc.interest_columns_for_year(2006)

array(['Faixa Etária', 'Faixa Remun Dezem (SM)', 'Faixa Remun Média (SM)',
       'Município', 'Nacionalidade', 'Sexo Trabalhador',
       'Tamanho Estabelecimento', 'Tipo Estab', 'Tipo Estab.1',
       'Vl Remun Dezembro (SM)', 'Vl Remun Média (SM)',
       'Vínculo Ativo 31/12', 'CNAE 95 Classe', 'Idade',
       'Natureza Jurídica', 'Vl Remun Dezembro Nom', 'Vl Remun Média Nom',
       'Ind Simples', 'CBO Ocupação 2002', 'CNAE 2.0 Classe',
       'Escolaridade após 2005', 'Raça Cor'], dtype=object)

In [17]:
dc.interest_columns_for_year_treat(2006)

array(['Sexo Trabalhador', 'Tamanho Estabelecimento', 'Tipo Estab',
       'Tipo Estab.1', 'Natureza Jurídica', 'Ind Simples',
       'Escolaridade após 2005'], dtype=object)

In [20]:
dc.interest_columns_for_year_treat??

[1;31mSignature:[0m [0mdc[0m[1;33m.[0m[0minterest_columns_for_year_treat[0m[1;33m([0m[0myear[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mSource:[0m   
[1;32mdef[0m [0minterest_columns_for_year_treat[0m[1;33m([0m[0myear[0m[1;33m)[0m[1;33m:[0m[1;33m
[0m    [1;34m"""
    From 'columns_to_load_each_year_treated.csv', gets the columns that must be loaded to create DataFrames with fewer attributes for better performance.
    The result of this function is usued as an argument when loading data with generate_rais_dataframe function from treat.py module. 
    """[0m[1;33m
[0m    [0myear[0m [1;33m=[0m [0mstr[0m[1;33m([0m[0myear[0m[1;33m)[0m[1;33m
[0m    [1;33m
[0m    [0mlist_columns[0m [1;33m=[0m [0mpd[0m[1;33m.[0m[0mread_csv[0m[1;33m([0m[0mos[0m[1;33m.[0m[0mpath[0m[1;33m.[0m[0mjoin[0m[1;33m([0m[0mmodulepath[0m[1;33m,[0m [1;34m'columns_to_load_each_year_treated.csv'[0m[1;33m)[0m[1;33m,[0m [0msep[0m[1;33m=[0