In [1]:
def generate_rais_dataframe(list_ufs, year, data_format='wide', filter_metarea=False, territorio='arranjo'):
    """
    Build an aggregated DataFrame of formally employed personnel from pretreated RAIS data.

    For every Brazilian federation unit (UF) in ``list_ufs`` the pretreated pickle
    ``data/rais_treated/{year}/{uf}{year}.zip`` (resolved against ``modulepath``) is loaded,
    grouped by municipality, territory, establishment size, legal status, knowledge-services
    class, industry technology class, education level and POTEC code, and the group sizes
    are stored in the ``'Pessoal'`` column. The per-UF results are concatenated and enriched
    with derived columns for economic sector, legal-status group, education groupings and
    scientific and technical (S&T) personnel (POTEC) group.

    Parameters
    ----------
    list_ufs : iterable of str
        UF identifiers exactly as they appear in the data file names. Each one is
        upper-cased to fill the ``'UF'`` column. Repeated identifiers are loaded again
        but only counted once.
    year : int
        Reference year. Selects the data folder/file and the education column:
        ``'Escolaridade após 2005'`` when ``year > 2005``, otherwise
        ``'Grau Instrução 2005-1985'``.
    data_format : str, default 'wide'
        Currently unused; kept so existing callers keep working.
    filter_metarea : bool, default False
        Currently has NO effect. It was meant to keep only the metropolitan areas of the
        selected UFs, but that filtering is disabled, so all municipalities are always
        returned.
    territorio : str, default 'arranjo'
        Name of the territorial column to group by; it is renamed to ``'Território'`` in
        the result. ``'arranjo'`` refers to the population arrangements of Brazilian
        metropolises, while ``'territorio_tese'`` is the territorial reference used in the
        thesis at https://acervodigital.ufpr.br/handle/1884/58421?show=full

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``'UF'``, ``'Município'``, ``'Território'``, ``'Sectors'``,
        ``'Tamanho Estabelecimento'``, ``'Natureza Jurídica Grupo'``, ``'Natureza Jurídica'``,
        ``'knowledge_services'``, ``'technology_industries'``, ``'Potec Grupo'``, ``'potec'``,
        ``'Escolaridade1'``, ``'Escolaridade2'``, ``'Pessoal'``.
        ``'Pessoal'`` is the number of source rows in each group (presumably one row per
        employment record -- confirm against the pretreatment step). The index is not reset
        after concatenation, so index labels repeat across UFs.

    Raises
    ------
    ValueError
        If ``list_ufs`` is empty.
    """
    if year > 2005:
        escolaridade = 'Escolaridade após 2005'
    else:
        escolaridade = 'Grau Instrução 2005-1985'

    group_columns = [
        'UF', 'Município', territorio, 'Tamanho Estabelecimento', 'Natureza Jurídica',
        'knowledge_services', 'technology_industries', escolaridade, 'potec',
    ]

    # Aggregate each UF separately so only the (much smaller) grouped frames are kept in
    # memory. Keyed by UF so a repeated UF in list_ufs is not counted twice.
    # NOTE: filter_metarea is intentionally not applied here; the metropolitan-area filter
    # was disabled and every municipality is kept.
    dict_df = {}
    for uf in list_ufs:
        # NOTE(review): read_pickle executes pickle payloads; these files are assumed to be
        # trusted artifacts generated by the project's own pretreatment.
        uf_df = pd.read_pickle(os.path.join(modulepath, f'data/rais_treated/{year}/{uf}{year}.zip'))

        uf_df['UF'] = uf.upper()
        uf_df['UF'] = uf_df['UF'].astype('category')

        # size() counts rows per group; name the count column directly instead of renaming
        # the implicit ``0`` column afterwards.
        dict_df[uf] = (
            uf_df.groupby(by=group_columns, observed=True)
            .size()
            .reset_index(name='Pessoal')
        )

    if not dict_df:
        raise ValueError('list_ufs must contain at least one federation unit')

    # Concatenates the per-UF aggregated DataFrames
    df = pd.concat(list(dict_df.values()), axis=0)

    df[territorio] = df[territorio].astype('category')
    df.rename(columns={territorio: 'Território'}, inplace=True)

    # Creates new columns for organization size, organization legal status, economic sectors,
    # level of education and scientific and technical (S&T) personnel (POTEC)
    df['Tamanho Estabelecimento'] = df['Tamanho Estabelecimento'].map(dc.dict_porte).astype(dc.type_porte)
    # The first character of the legal-status code identifies its group.
    df['Natureza Jurídica Grupo'] = df['Natureza Jurídica'].astype('category').str.slice(start=0, stop=1).map(dc.dict_nat_jur).astype('category')
    df['Natureza Jurídica'] = df['Natureza Jurídica'].map(dc.dict_nat_jur_detail).fillna('OUTROS').astype('category')

    # Sector classification, vectorised (no row-by-row iteration):
    #   Services -> row has a knowledge-services class;
    #   Industry -> not Services, but has an industry technology class;
    #   Others   -> neither.
    # Masks are plain arrays so the assignment is positional and unaffected by the repeated
    # index labels produced by the concatenation. Services is assigned last so it wins.
    is_service = (df['knowledge_services'] != 'Without Classification').to_numpy()
    is_industry = (df['technology_industries'] != 'Without Classification').to_numpy()
    df['Sectors'] = 'Others'
    df.loc[is_industry, 'Sectors'] = 'Industry'
    df.loc[is_service, 'Sectors'] = 'Services'
    df['Sectors'] = df['Sectors'].astype('category')

    df['Escolaridade1'] = df[escolaridade].map(dc.dict_escolaridade)
    df['Escolaridade2'] = df[escolaridade].map(dc.dict_escolaridade1)
    df['Potec Grupo'] = df['potec'].map(dc.dict_potec).astype('category')

    # Reorder columns (this also drops the raw education column, which is replaced by
    # 'Escolaridade1' and 'Escolaridade2')
    df = df[[
        'UF',
        'Município',
        'Território',
        'Sectors',
        'Tamanho Estabelecimento',
        'Natureza Jurídica Grupo',
        'Natureza Jurídica',
        'knowledge_services',
        'technology_industries',
        'Potec Grupo',
        'potec',
        'Escolaridade1',
        'Escolaridade2',
        'Pessoal',
    ]]

    return df