## First Hypotheses:

O próximo município de atuação deve estar inserido em um contexto similar aos atuais.

### 1) Importing Libraries

In [112]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import warnings

# Ou suprimir todos os warnings
warnings.filterwarnings("ignore")

### 2) Data Cleaning and Loading:

In [3]:
# UF codes 21..29, used below to keep only Northeast rows.
codigos_ne = [*range(21, 30)]

# Names of the model municipalities, upper-case with accents.
lista_municipios_modelo = [
    "MAURITI",
    "INAJÁ",
    "SÃO JOSÉ DA TAPERA",
    "BUÍQUE",
]

#### 2.1) Dados dos Municípios

In [4]:
# Raw strings keep the backslash separators literal: in a normal literal
# "\D", "\M", "\l" and "\m" are invalid escape sequences (SyntaxWarning on
# Python 3.12+, slated to become an error).
df_municipios_raw_um = pd.read_excel(
    r"..\Dados\Municipios\lista-1262municipios-semiarido-2017.xlsx",
    header=2,  # column names are taken from the third row of the sheet
)
df_municipios_raw_dois = pd.read_csv(r"..\Dados\Municipios\municipios.csv")

In [5]:
# Work on copies so the frames read from disk stay untouched and the
# cleaning cells can be re-run without reloading the files.
df_municipios_um, df_municipios_dois = (
    bruto.copy() for bruto in (df_municipios_raw_um, df_municipios_raw_dois)
)

In [6]:
# DataFrame.info() prints its report and returns None, so wrapping it in
# display() only added a stray "None" after each summary.
df_municipios_um.info()
df_municipios_dois.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262 entries, 0 to 1261
Data columns (total 5 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   #                          1262 non-null   int64 
 1   Cod IBGE                   1262 non-null   int64 
 2   UF                         1262 non-null   object
 3   Município                  1262 non-null   object
 4   População Estimada - 2017  1262 non-null   object
dtypes: int64(2), object(3)
memory usage: 49.4+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5570 entries, 0 to 5569
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   codigo_ibge  5570 non-null   int64  
 1   nome         5570 non-null   object 
 2   latitude     5570 non-null   float64
 3   longitude    5570 non-null   float64
 4   codigo_uf    5570 non-null   int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 217.7+ KB


None

In [8]:
# Drop the "#" column (int64; presumably the sheet's row counter).
# Rebinding instead of inplace=True, together with errors="ignore", makes the
# cell safe to re-run: a second execution no longer raises KeyError.
df_municipios_um = df_municipios_um.drop(columns=["#"], errors="ignore")

In [9]:
# List the available columns before choosing which ones to merge.
df_municipios_dois.columns

Index(['codigo_ibge', 'nome', 'latitude', 'longitude', 'codigo_uf'], dtype='object')

In [10]:
# Attach UF code and coordinates to the semi-arid municipality list, matching
# on the IBGE code; the inner join keeps only codes present in both files.
colunas_geo = ["codigo_ibge", "codigo_uf", "latitude", "longitude"]
df_municipios = (
    df_municipios_um
    .merge(df_municipios_dois[colunas_geo], how="inner",
           left_on="Cod IBGE", right_on="codigo_ibge")
    .drop(columns=["Cod IBGE"])  # same key as codigo_ibge after the merge
)

In [12]:
# Municipalities whose UF code is in codigos_ne.
# .copy() detaches the result from df_municipios so later edits are safe.
municipios_ne = df_municipios[df_municipios["codigo_uf"].isin(codigos_ne)].copy()

# IBGE codes of the model municipalities only. The previous version took the
# code of every row in municipios_ne, so lista_municipios_modelo was never
# applied and any filter on these codes matched the whole region.
# NOTE(review): matching is by name; confirm none of the four names is shared
# by another municipality in this subset.
nomes_municipios = municipios_ne["Município"].str.strip().str.upper()
codigos_municipios_modelos = municipios_ne.loc[
    nomes_municipios.isin(lista_municipios_modelo), "codigo_ibge"
].values

In [44]:
# Preview the first rows of the Northeast subset.
municipios_ne.head()

Unnamed: 0,UF,Município,População Estimada - 2017,codigo_ibge,codigo_uf,latitude,longitude
0,AL,ÁGUA BRANCA,20467,2700102,27,-9.262,-37.938
1,AL,ARAPIRACA,234185,2700300,27,-9.75487,-36.6615
2,AL,BATALHA,18757,2700706,27,-9.6742,-37.133
3,AL,BELO MONTE,6797,2700904,27,-9.82272,-37.277
4,AL,CACIMBINHAS,10897,2701209,27,-9.40121,-36.9911


#### 2.2) Dados de Meteorologia:

In [15]:
# One INMET file per year. Raw strings keep the backslashes literal: "\D" and
# "\I" are invalid escape sequences in a normal string literal.
met1_raw = pd.read_csv(r"..\Dados\INMET\INMET_2020.csv")
met2_raw = pd.read_csv(r"..\Dados\INMET\INMET_2021.csv")
met3_raw = pd.read_csv(r"..\Dados\INMET\INMET_2022.csv")

In [32]:
# Independent working copies of the three yearly tables; the *_raw frames
# stay as loaded.
met_2020, met_2021, met_2022 = (
    bruto.copy() for bruto in (met1_raw, met2_raw, met3_raw)
)

In [33]:
# Stack the yearly tables (2022 first) into one frame. Row labels are kept
# as-is, so index values repeat across years.
tabelas_met = [met_2022, met_2021, met_2020]
met_data = pd.concat(tabelas_met)

In [34]:
# Column names and dtypes of the combined weather table.
met_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15273576 entries, 0 to 5173775
Data columns (total 26 columns):
 #   Column                                                 Dtype  
---  ------                                                 -----  
 0   Data                                                   object 
 1   Hora UTC                                               object 
 2   PRECIPITAÇÃO TOTAL, HORÁRIO (mm)                       float64
 3   PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)  float64
 4   PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)        float64
 5   PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)       float64
 6   RADIACAO GLOBAL (Kj/m²)                                float64
 7   TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)           float64
 8   TEMPERATURA DO PONTO DE ORVALHO (°C)                   float64
 9   TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)             float64
 10  TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)             float64
 11  TE

In [35]:
# Full column list, used to pick the columns to discard in the next cell.
met_data.columns

Index(['Data', 'Hora UTC', 'PRECIPITAÇÃO TOTAL, HORÁRIO (mm)',
       'PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)',
       'PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)',
       'PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)',
       'RADIACAO GLOBAL (Kj/m²)',
       'TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)',
       'TEMPERATURA DO PONTO DE ORVALHO (°C)',
       'TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)',
       'TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)',
       'TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)',
       'TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)',
       'UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)',
       'UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)',
       'UMIDADE RELATIVA DO AR, HORARIA (%)',
       'VENTO, DIREÇÃO HORARIA (gr) (° (gr))', 'VENTO, RAJADA MAXIMA (m/s)',
       'VENTO, VELOCIDADE HORARIA (m/s)', 'REGIAO', 'UF', 'ESTACAO',
       'CODIGO (WMO)', 'LATITUDE', 'LONGITUDE', 'ALTITUDE'],
      dtype='object')

In [37]:
# Discard station metadata and the measurements not used in this analysis,
# leaving date, precipitation, air temperature, UF and coordinates.
# The original list named the two dew-point MAX/MIN columns twice; each label
# now appears once. The frame is rebound instead of mutated with inplace=True.
colunas_met_descartadas = [
    "ESTACAO",
    "CODIGO (WMO)",
    "Hora UTC",
    "REGIAO",
    "RADIACAO GLOBAL (Kj/m²)",
    "PRESSAO ATMOSFERICA AO NIVEL DA ESTACAO, HORARIA (mB)",
    "TEMPERATURA DO PONTO DE ORVALHO (°C)",
    "TEMPERATURA ORVALHO MAX. NA HORA ANT. (AUT) (°C)",
    "TEMPERATURA ORVALHO MIN. NA HORA ANT. (AUT) (°C)",
    "PRESSÃO ATMOSFERICA MAX.NA HORA ANT. (AUT) (mB)",
    "PRESSÃO ATMOSFERICA MIN. NA HORA ANT. (AUT) (mB)",
    "TEMPERATURA MÁXIMA NA HORA ANT. (AUT) (°C)",
    "TEMPERATURA MÍNIMA NA HORA ANT. (AUT) (°C)",
    "UMIDADE RELATIVA DO AR, HORARIA (%)",
    "UMIDADE REL. MAX. NA HORA ANT. (AUT) (%)",
    "UMIDADE REL. MIN. NA HORA ANT. (AUT) (%)",
    "VENTO, DIREÇÃO HORARIA (gr) (° (gr))",
    "VENTO, RAJADA MAXIMA (m/s)",
    "ALTITUDE",
    "VENTO, VELOCIDADE HORARIA (m/s)",
]
met_data = met_data.drop(columns=colunas_met_descartadas)

In [38]:
# Use the observation date as the row index.
met_data = met_data.set_index("Data")

In [39]:
# Preview the reduced weather table.
met_data.head()

Unnamed: 0_level_0,"PRECIPITAÇÃO TOTAL, HORÁRIO (mm)","TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)",UF,LATITUDE,LONGITUDE
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01,0.0,26.4,AL,-9.622222,-37.767222
2022-01-01,0.0,25.8,AL,-9.622222,-37.767222
2022-01-01,0.0,25.4,AL,-9.622222,-37.767222
2022-01-01,0.0,25.1,AL,-9.622222,-37.767222
2022-01-01,0.0,24.4,AL,-9.622222,-37.767222


In [40]:
# Count missing values per column.
met_data.isna().sum()

PRECIPITAÇÃO TOTAL, HORÁRIO (mm)                5917264
TEMPERATURA DO AR - BULBO SECO, HORARIA (°C)    4932723
UF                                                    0
LATITUDE                                              0
LONGITUDE                                             0
dtype: int64

#### 2.3) Dados de Logística

In [54]:
# Load the transport-cost spreadsheet (first sheet, default header row).
logistica_raw = pd.read_excel(r"..\Dados\Transportes\transport_cost.xlsx")

In [55]:
# Working copy; logistica_raw stays as loaded.
df_logistica = logistica_raw.copy(deep=True)

In [141]:
# Column overview of the logistics table.
df_logistica.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4155 entries, 0 to 4154
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   codigo_ibge          4155 non-null   int64  
 1   nome_porto           4155 non-null   object 
 2   transportation_cost  4155 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 97.5+ KB


In [56]:
# Preview the logistics table before removing the descriptive columns.
df_logistica.head()

Unnamed: 0,codigo_ibge,nome,codigo_uf,nome_porto,transportation_cost
0,2300101,Abaiara,23,suape,26769.167437
1,2300101,Abaiara,23,aratu,26247.386651
2,2300101,Abaiara,23,natal,27553.406755
3,2900108,Abaíra,29,suape,25204.17079
4,2900108,Abaíra,29,aratu,24126.590792


In [57]:
# Keep only the IBGE code, port name and cost.
# Rebinding with errors="ignore" lets the cell be re-run after the columns are
# already gone; the in-place version raised KeyError on a second execution.
df_logistica = df_logistica.drop(columns=["nome", "codigo_uf"], errors="ignore")

In [58]:
# Confirm that only codigo_ibge, nome_porto and transportation_cost remain.
df_logistica.head()

Unnamed: 0,codigo_ibge,nome_porto,transportation_cost
0,2300101,suape,26769.167437
1,2300101,aratu,26247.386651
2,2300101,natal,27553.406755
3,2900108,suape,25204.17079
4,2900108,aratu,24126.590792


#### 2.4) Registros Administrativos:

Buscar, no banco de dados de Registros Administrativos, informações sobre educação que façam sentido com o que a ONG Amigos do Bem busca em um município modelo:

- Taxa de evasão do ensino fundamental (TTREVA_EF_TOTAL) (sem dados)
- Taxa de evasão do ensino médio (TTREVA_EM_TOTAL) (sem dados)
- Taxa de evasão do ensino médio em Escolas públicas (TTREVA_EM_PUB). (sem dados)
- Taxa de evasão do ensino fundamental em Escolas públicas (TTREVA_EF_PUB). (sem dados)
- Percentual de matrículas da rede pública no ensino fundamental (PMATPUB_EF).
- Percentual de matrículas da rede pública no ensino médio (PMATPUB_EM)

In [41]:
# Load the "MUNICÍPIO" sheet of the 2012-2017 administrative-records workbook.
df_adm_total_raw = pd.read_excel(r"..\Dados\Registros Administrativos\dados_registros_administrativos_total_2012_2017.xlsx", sheet_name="MUNICÍPIO")

In [74]:
# Working copy; df_adm_total_raw stays as loaded.
df_adm_total = df_adm_total_raw.copy(deep=True)

In [83]:
# Year, municipality identifiers and the education indicators of interest.
lista_colunas_uteis_adm = [
    "ANO",
    "IBGE7",
    "NOME",
    "TTREVA_EF_TOTAL",
    "TTREVA_EM_TOTAL",
    "TTREVA_EF_PUB",
    "TTREVA_EM_PUB",
    "PMATPUB_EF",
    "PMATPUB_EM",
]

In [84]:
# Rows for 2017, restricted to the useful columns.
# A single .loc call plus .copy() replaces the chained [cols][mask] lookup, so
# the result is an independent frame that later cells may modify safely.
mascara_2017 = df_adm_total["ANO"] == 2017
df_util_adm = df_adm_total.loc[mascara_2017, lista_colunas_uteis_adm].copy()

In [85]:
# ANO is constant after the 2017 filter, so it carries no information.
df_util_adm = df_util_adm.drop(columns=["ANO"])

In [86]:
# Check non-null counts to see which indicators actually have data.
df_util_adm.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5570 entries, 22280 to 27849
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   IBGE7            5570 non-null   int64  
 1   NOME             5570 non-null   object 
 2   TTREVA_EF_TOTAL  0 non-null      float64
 3   TTREVA_EM_TOTAL  0 non-null      float64
 4   TTREVA_EF_PUB    0 non-null      float64
 5   TTREVA_EM_PUB    0 non-null      float64
 6   PMATPUB_EF       5570 non-null   float64
 7   PMATPUB_EM       5561 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 391.6+ KB


In [88]:
# Keep the municipality code and the two public-enrolment percentages.
# .copy() makes the selection an independent frame; the imputation cell assigns
# into it and would otherwise be writing to a frame pandas flags as a copy.
df_util_adm = df_util_adm[["IBGE7", "PMATPUB_EF", "PMATPUB_EM"]].copy()

Unnamed: 0,IBGE7,PMATPUB_EF,PMATPUB_EM
22280,1100015,100.0,100.0
22281,1100023,89.42,95.84
22282,1100031,100.0,100.0
22283,1100049,82.46,80.47
22284,1100056,96.31,97.47


In [89]:
# Count missing values per column before imputing.
df_util_adm.isna().sum()

IBGE7         0
PMATPUB_EF    0
PMATPUB_EM    9
dtype: int64

In [91]:
# Replace the missing PMATPUB_EM values with the column mean.
imputer = SimpleImputer(strategy="mean")
pmatpub_em_imputado = imputer.fit_transform(df_util_adm[["PMATPUB_EM"]])
df_util_adm["PMATPUB_EM"] = pmatpub_em_imputado

In [93]:
# Index by the 7-digit IBGE code so the table can be joined on its index.
df_util_adm = df_util_adm.set_index("IBGE7")

In [94]:
# Convert the two percentage columns to fractions by dividing by 100.
# NOTE(review): running this cell twice divides by 100 twice.
colunas_percentuais = ["PMATPUB_EF", "PMATPUB_EM"]
df_util_adm[colunas_percentuais] = df_util_adm[colunas_percentuais].div(100)

In [96]:
# Final table for the administrative records:
df_util_adm.head()

Unnamed: 0_level_0,PMATPUB_EF,PMATPUB_EM
IBGE7,Unnamed: 1_level_1,Unnamed: 2_level_1
1100015,1.0,1.0
1100023,0.8942,0.9584
1100031,1.0,1.0
1100049,0.8246,0.8047
1100056,0.9631,0.9747


#### 2.5) Exportação (COMEX):

Anos 2020, 2021 e 2022

In [153]:
# One semicolon-separated COMEX export file per year. Raw strings keep the
# backslashes literal: "\D", "\C" and "\E" are invalid escape sequences in a
# normal string literal.
df_expo_2020_raw = pd.read_csv(r"..\Dados\COMEX\EXP_2020_MUN.csv", delimiter=";")
df_expo_2021_raw = pd.read_csv(r"..\Dados\COMEX\EXP_2021_MUN.csv", delimiter=";")
df_expo_2022_raw = pd.read_csv(r"..\Dados\COMEX\EXP_2022_MUN.csv", delimiter=";")

In [154]:
# Independent working copies of the three yearly export tables.
df_expo_2020, df_expo_2021, df_expo_2022 = (
    bruto.copy()
    for bruto in (df_expo_2020_raw, df_expo_2021_raw, df_expo_2022_raw)
)

In [156]:
# Stack the yearly export tables (2022 first); row labels are kept as-is.
tabelas_expo = [df_expo_2022, df_expo_2021, df_expo_2020]
df_expo = pd.concat(tabelas_expo)

In [203]:
# Keep only export rows whose municipality UF is one of these nine states.
ufs_nordeste = ["PE", "PB", "AL", "MA", "RN", "CE", "BA", "SE", "PI"]
exporta_do_nordeste = df_expo["SG_UF_MUN"].isin(ufs_nordeste)
df_expo_nordeste = df_expo[exporta_do_nordeste]

In [204]:
# Preview the filtered export rows.
df_expo_nordeste.head()

Unnamed: 0,CO_ANO,CO_MES,SH4,CO_PAIS,SG_UF_MUN,CO_MUN,KG_LIQUIDO,VL_FOB
34,2022,12,703,399,MA,2111300,126,325
68,2022,8,6402,337,CE,2312908,5685,69774
110,2022,7,703,301,MA,2111300,420,622
115,2022,6,808,741,MA,2111300,238,579
117,2022,1,807,434,MA,2111300,1474,1286


In [205]:
# Column dtypes and non-null counts of the filtered export table.
df_expo_nordeste.info()

<class 'pandas.core.frame.DataFrame'>
Index: 219271 entries, 34 to 977451
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   CO_ANO      219271 non-null  int64 
 1   CO_MES      219271 non-null  int64 
 2   SH4         219271 non-null  int64 
 3   CO_PAIS     219271 non-null  int64 
 4   SG_UF_MUN   219271 non-null  object
 5   CO_MUN      219271 non-null  int64 
 6   KG_LIQUIDO  219271 non-null  int64 
 7   VL_FOB      219271 non-null  int64 
dtypes: int64(7), object(1)
memory usage: 15.1+ MB


#### 2.6) Censo

Censos dos anos 1991, 2000 e 2010 dos Municípios Brasileiros.

In [7]:
# Municipal census sheet ("MUN 91-00-10"). The raw string keeps the
# backslashes literal: "\D" and "\C" are invalid escape sequences in a normal
# string literal.
censo_raw_municipal = pd.read_excel(
    r"..\Dados\Censo\Censo_municipal_estadual_nacional.xlsx",
    sheet_name="MUN 91-00-10",
)

In [169]:
# Working copy; censo_raw_municipal stays as loaded.
censo_municipal = censo_raw_municipal.copy(deep=True)

In [199]:
# Size and dtype summary of the municipal census table.
censo_municipal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16695 entries, 0 to 16694
Columns: 237 entries, ANO to IDHM_R
dtypes: float64(167), int64(69), object(1)
memory usage: 30.2+ MB


In [186]:
# Census rows whose UF code is in codigos_ne.
mascara_censo_ne = censo_municipal["UF"].isin(codigos_ne)
censo_municipal_nordeste = censo_municipal.loc[mascara_censo_ne]
censo_municipal_nordeste

Unnamed: 0,ANO,UF,Codmun6,Codmun7,Município,ESPVIDA,FECTOT,MORT1,MORT5,RAZDEP,...,PIA1517,PIA18M,POP,POPT,I_ESCOLARIDADE,I_FREQ_PROP,IDHM,IDHM_E,IDHM_L,IDHM_R
449,1991,21,210005,2100055,AÇAILÂNDIA,57.69,4.23,82.36,106.51,85.23,...,,,63132,62622,0.146,0.125,0.344,0.132,0.545,0.565
450,1991,21,210010,2100105,AFONSO CUNHA,55.35,5.93,96.05,123.67,118.20,...,,,4579,4579,0.055,0.181,0.278,0.122,0.506,0.349
451,1991,21,210015,2100154,ÁGUA DOCE DO MARANHÃO,56.96,5.81,86.50,111.72,106.06,...,,,10714,10628,0.025,0.060,0.210,0.045,0.533,0.388
452,1991,21,210020,2100204,ALCÂNTARA,60.43,5.35,67.87,83.26,99.24,...,,,19435,19403,0.083,0.075,0.271,0.078,0.591,0.430
453,1991,21,210030,2100303,ALDEIAS ALTAS,57.91,5.93,81.10,104.92,99.63,...,,,19441,19252,0.041,0.107,0.257,0.078,0.549,0.397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13368,2010,29,293330,2933307,VITÓRIA DA CONQUISTA,72.30,2.01,21.24,17.13,46.97,...,16824.0,213429.0,304599,303343,0.493,0.630,0.678,0.581,0.788,0.681
13369,2010,29,293340,2933406,WAGNER,69.20,2.61,29.90,32.26,59.36,...,568.0,5830.0,8971,8880,0.343,0.566,0.587,0.479,0.737,0.573
13370,2010,29,293345,2933455,WANDERLEY,70.82,2.79,25.00,26.93,56.00,...,799.0,8137.0,12474,12158,0.327,0.614,0.600,0.498,0.764,0.569
13371,2010,29,293350,2933505,WENCESLAU GUIMARÃES,70.11,2.94,27.10,29.21,57.74,...,1414.0,13920.0,21893,21522,0.263,0.463,0.544,0.383,0.752,0.558


In [197]:
# Census rows of the municipalities listed in codigos_municipios_modelos.
eh_municipio_modelo = censo_municipal_nordeste["Codmun7"].isin(codigos_municipios_modelos)
censo_modelo = censo_municipal_nordeste[eh_municipio_modelo]

# Census rows with IDHM at or below the cut-off.
# NOTE(review): ANO is not filtered here, so rows from every census year in
# the table are compared against the same threshold - confirm this is intended.
IDHM_LIMITE = 0.52
idhm_abaixo_do_limite = censo_municipal_nordeste["IDHM"] <= IDHM_LIMITE
censo_nodeste_threshold = censo_municipal_nordeste[idhm_abaixo_do_limite]

In [198]:
# Show the rows that passed the IDHM cut-off.
censo_nodeste_threshold

Unnamed: 0,ANO,UF,Codmun6,Codmun7,Município,ESPVIDA,FECTOT,MORT1,MORT5,RAZDEP,...,PIA1517,PIA18M,POP,POPT,I_ESCOLARIDADE,I_FREQ_PROP,IDHM,IDHM_E,IDHM_L,IDHM_R
449,1991,21,210005,2100055,AÇAILÂNDIA,57.69,4.23,82.36,106.51,85.23,...,,,63132,62622,0.146,0.125,0.344,0.132,0.545,0.565
450,1991,21,210010,2100105,AFONSO CUNHA,55.35,5.93,96.05,123.67,118.20,...,,,4579,4579,0.055,0.181,0.278,0.122,0.506,0.349
451,1991,21,210015,2100154,ÁGUA DOCE DO MARANHÃO,56.96,5.81,86.50,111.72,106.06,...,,,10714,10628,0.025,0.060,0.210,0.045,0.533,0.388
452,1991,21,210020,2100204,ALCÂNTARA,60.43,5.35,67.87,83.26,99.24,...,,,19435,19403,0.083,0.075,0.271,0.078,0.591,0.430
453,1991,21,210030,2100303,ALDEIAS ALTAS,57.91,5.93,81.10,104.92,99.63,...,,,19441,19252,0.041,0.107,0.257,0.078,0.549,0.397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13215,2010,29,292150,2921500,MONTE SANTO,66.96,2.52,37.60,40.56,62.35,...,3225.0,33944.0,52294,50646,0.189,0.494,0.506,0.359,0.699,0.515
13252,2010,29,292420,2924207,PEDRO ALEXANDRE,71.68,2.21,22.60,24.35,58.73,...,1092.0,10520.0,16621,16188,0.229,0.440,0.513,0.354,0.778,0.489
13254,2010,29,292440,2924405,PILÃO ARCADO,69.19,3.10,29.90,32.29,64.49,...,2305.0,19967.0,32809,31594,0.204,0.444,0.506,0.343,0.737,0.514
13281,2010,29,292650,2926509,RIBEIRA DO AMPARO,66.66,2.54,38.80,41.77,63.14,...,1048.0,9002.0,14223,13857,0.218,0.509,0.512,0.384,0.694,0.503


#### 2.7) Dados Educacionais:

In [310]:
# QEdu analytical table for 2020 (semicolon-separated CSV).
edu_raw_2020 = pd.read_csv(r"..\Dados\Q EDU\Dados_QEdu_Analitico_2020.csv", delimiter=";")
# 2020 population estimates, "Municípios" sheet, column names on the second row.
# COD. MUNIC is read as text so its leading zeros are preserved.
censo_raw_2020 = pd.read_excel(r"..\Dados\Censo\estimativa_dou_2020.xls", header=1, sheet_name="Municípios", dtype={"COD. MUNIC": str})

In [311]:
# Working copies; the *_raw_2020 frames stay as loaded.
edu_2020, censo_2020 = (bruto.copy() for bruto in (edu_raw_2020, censo_raw_2020))

In [312]:
# Column names, dtypes and non-null counts of the QEdu table.
edu_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1795 entries, 0 to 1794
Data columns (total 76 columns):
 #   Column                                                                                                                  Non-Null Count  Dtype  
---  ------                                                                                                                  --------------  -----  
 0   COD Municipio                                                                                                           1795 non-null   object 
 1   Cidade                                                                                                                  1794 non-null   object 
 2   COD UF                                                                                                                  1794 non-null   float64
 3   UF                                                                                                                      1794 non-null   object 

In [313]:
# QEdu rows whose UF code is in codigos_ne.
# .copy() makes this an independent frame: the following cells assign new and
# converted columns into it, which on a bare boolean-mask slice triggers
# SettingWithCopyWarning and may not behave as intended.
edu_2020_nordeste = edu_2020[edu_2020["COD UF"].isin(codigos_ne)].copy()

In [314]:
# Swap the decimal comma for a dot in both school-count columns so the text
# can be parsed as numbers afterwards.
for coluna_escolas in ("Total de escolas estaduais", "Total de escolas municipais"):
    edu_2020_nordeste[coluna_escolas] = edu_2020_nordeste[coluna_escolas].str.replace(",", ".")

In [315]:
# Parse the school counts as floats, then store them as nullable integers
# (Int64 keeps missing values as <NA>).
colunas_escolas = ["Total de escolas estaduais", "Total de escolas municipais"]
edu_2020_nordeste[colunas_escolas] = (
    edu_2020_nordeste[colunas_escolas].astype('float').astype('Int64')
)

In [316]:
# Fill missing school counts with the column mean, rounded to a whole number.
colunas_escolas = ["Total de escolas estaduais", "Total de escolas municipais"]
imputer = SimpleImputer(strategy="mean")
dados_imputados = imputer.fit_transform(edu_2020_nordeste[colunas_escolas])

edu_2020_nordeste[colunas_escolas] = dados_imputados.round()

##### 2.7.1) Número de escolas municipais e estaduais em um município

In [317]:
# Public schools per municipality = state schools + municipal schools.
edu_2020_nordeste["Total_escolas_publicas_mun_est"] = (
    edu_2020_nordeste['Total de escolas estaduais']
    + edu_2020_nordeste['Total de escolas municipais']
)

In [318]:
# Start the education table: public-school total, indexed by municipality code.
colunas_escolas_publicas = ["COD Municipio", "Total_escolas_publicas_mun_est"]
df_edu_2020 = edu_2020_nordeste[colunas_escolas_publicas].set_index("COD Municipio")

In [319]:
# Preview the school totals per municipality.
df_edu_2020.head()

Unnamed: 0_level_0,Total_escolas_publicas_mun_est
COD Municipio,Unnamed: 1_level_1
2507507,289.0
2207702,108.0
2412005,65.0
2302800,88.0
2110104,74.0


##### 2.7.2) Número de matrículas por população jovem:

In [320]:
# Total enrolments, indexed by municipality code.
colunas_matriculas = ["COD Municipio", "Total de matrículas"]
total_matriculas = edu_2020_nordeste[colunas_matriculas].set_index("COD Municipio")

In [321]:
# Confirm there are no missing enrolment totals.
total_matriculas.isna().sum()

Total de matrículas    0
dtype: int64

In [322]:
# Add the enrolment totals, matching on the municipality-code index.
# NOTE(review): re-running this cell merges the column in a second time.
df_edu_2020 = pd.merge(
    df_edu_2020,
    total_matriculas,
    how="inner",
    left_index=True,
    right_index=True,
)

In [323]:
# Preview school totals together with enrolment totals.
df_edu_2020.head()

Unnamed: 0_level_0,Total_escolas_publicas_mun_est,Total de matrículas
COD Municipio,Unnamed: 1_level_1,Unnamed: 2_level_1
2507507,289.0,169336.0
2207702,108.0,38350.0
2412005,65.0,24709.0
2302800,88.0,20928.0
2110104,74.0,8861.0


In [324]:
# Column dtypes and non-null counts of the 2020 population estimates.
censo_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5586 entries, 0 to 5585
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   UF                  5584 non-null   object 
 1   COD. UF             5570 non-null   float64
 2   COD. MUNIC          5570 non-null   object 
 3   NOME DO MUNICÍPIO   5570 non-null   object 
 4   POPULAÇÃO ESTIMADA  5570 non-null   object 
dtypes: float64(1), object(4)
memory usage: 218.3+ KB


In [325]:
# Population-estimate rows whose UF code is in codigos_ne.
# .copy() makes this an independent frame: the following cells add, convert
# and drop columns on it, which on a bare boolean-mask slice triggers
# SettingWithCopyWarning and may not behave as intended.
censo_2020_ne = censo_2020[censo_2020["COD. UF"].isin(codigos_ne)].copy()

In [326]:
# Inspect the filtered population estimates.
censo_2020_ne

Unnamed: 0,UF,COD. UF,COD. MUNIC,NOME DO MUNICÍPIO,POPULAÇÃO ESTIMADA
450,MA,21.0,00055,Açailândia,113121
451,MA,21.0,00105,Afonso Cunha,6578
452,MA,21.0,00154,Água Doce do Maranhão,12652
453,MA,21.0,00204,Alcântara,22112
454,MA,21.0,00303,Aldeias Altas,26757
...,...,...,...,...,...
2239,BA,29.0,33307,Vitória da Conquista,341128
2240,BA,29.0,33406,Wagner,9344
2241,BA,29.0,33455,Wanderley,12180
2242,BA,29.0,33505,Wenceslau Guimarães,20978


In [327]:
# COD. UF is read as float (21.0); cast to int first so the text form is "21".
cod_uf_inteiro = censo_2020_ne["COD. UF"].astype(int)
censo_2020_ne["COD. UF"] = cod_uf_inteiro.astype(str)

In [328]:
# Check that COD. UF is now shown without the decimal part.
censo_2020_ne

Unnamed: 0,UF,COD. UF,COD. MUNIC,NOME DO MUNICÍPIO,POPULAÇÃO ESTIMADA
450,MA,21,00055,Açailândia,113121
451,MA,21,00105,Afonso Cunha,6578
452,MA,21,00154,Água Doce do Maranhão,12652
453,MA,21,00204,Alcântara,22112
454,MA,21,00303,Aldeias Altas,26757
...,...,...,...,...,...
2239,BA,29,33307,Vitória da Conquista,341128
2240,BA,29,33406,Wagner,9344
2241,BA,29,33455,Wanderley,12180
2242,BA,29,33505,Wenceslau Guimarães,20978


In [329]:
# Build the 7-digit municipality code: 2-digit UF code + 5-digit local code.
prefixo_uf = censo_2020_ne["COD. UF"]
sufixo_municipio = censo_2020_ne["COD. MUNIC"]
censo_2020_ne["CODIGO_MUNIC"] = prefixo_uf + sufixo_municipio

In [330]:
# Only the combined code and the population estimate are needed from here on.
colunas_censo_descartadas = ["COD. UF", "COD. MUNIC", "NOME DO MUNICÍPIO", "UF"]
censo_2020_ne = censo_2020_ne.drop(columns=colunas_censo_descartadas)

In [331]:
# Index by the combined municipality code so the table can be joined on it.
censo_2020_ne = censo_2020_ne.set_index("CODIGO_MUNIC")

In [332]:
# Inspect the population estimates indexed by municipality code.
censo_2020_ne

Unnamed: 0_level_0,POPULAÇÃO ESTIMADA
CODIGO_MUNIC,Unnamed: 1_level_1
2100055,113121
2100105,6578
2100154,12652
2100204,22112
2100303,26757
...,...
2933307,341128
2933406,9344
2933455,12180
2933505,20978


In [333]:
# Split each text population value at "(". Entries that are not strings come
# back as NaN from the .str accessor.
populacao_bruta = censo_2020_ne["POPULAÇÃO ESTIMADA"]
resultado = populacao_bruta.str.split("(").to_frame()

In [334]:
# For the rows that were split, keep their labels and the text before "(".
resultado_valido = resultado.dropna()
index = resultado_valido.index
valores = resultado_valido["POPULAÇÃO ESTIMADA"].str[0].tolist()

In [335]:
# Write the cleaned text values back into the population column.
# A single .loc[rows, column] assignment replaces the chained
# df[column].loc[rows] = ... form, which may write to a temporary object and
# leave the frame unchanged.
valores_serie = pd.Series(valores, index=index).str.strip()
censo_2020_ne.loc[index, "POPULAÇÃO ESTIMADA"] = valores_serie.values

In [336]:
# Store the population estimate as integers.
populacao_inteira = censo_2020_ne["POPULAÇÃO ESTIMADA"].astype(int)
censo_2020_ne["POPULAÇÃO ESTIMADA"] = populacao_inteira

In [337]:
# Join the education dataset with the population estimates, matching on the
# municipality-code index of both frames.
df_edu_2020 = pd.merge(
    df_edu_2020,
    censo_2020_ne,
    how="inner",
    left_index=True,
    right_index=True,
)

In [339]:
# Label the index so the municipality code has a column header on display/export.
df_edu_2020.index = df_edu_2020.index.rename("COD_MUNIC")

In [340]:
# Inspect schools, enrolments and population side by side.
df_edu_2020

Unnamed: 0_level_0,Total_escolas_publicas_mun_est,Total de matrículas,POPULAÇÃO ESTIMADA
COD_MUNIC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2507507,289.0,169336.0,817511
2207702,108.0,38350.0,153482
2412005,65.0,24709.0,103672
2302800,88.0,20928.0,77244
2110104,74.0,8861.0,25764
...,...,...,...
2917003,60.0,11398.0,36116
2103505,58.0,17674.0,41312
2402006,46.0,15571.0,68343
2916401,45.0,17693.0,76795


In [345]:
# Normalise by the estimated population: public schools per inhabitant and
# enrolments per inhabitant.
populacao_estimada = df_edu_2020["POPULAÇÃO ESTIMADA"]
df_edu_2020["TOT_ESC_POR_POP"] = df_edu_2020["Total_escolas_publicas_mun_est"].div(populacao_estimada)
df_edu_2020["TOT_MAT_POR_POP"] = df_edu_2020["Total de matrículas"].div(populacao_estimada)

In [346]:
# Inspect the table with the two per-inhabitant ratios added.
df_edu_2020

Unnamed: 0_level_0,Total_escolas_publicas_mun_est,Total de matrículas,POPULAÇÃO ESTIMADA,TOT_ESC_POR_POP,TOT_MAT_POR_POP
COD_MUNIC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2507507,289.0,169336.0,817511,0.000354,0.207136
2207702,108.0,38350.0,153482,0.000704,0.249866
2412005,65.0,24709.0,103672,0.000627,0.238338
2302800,88.0,20928.0,77244,0.001139,0.270934
2110104,74.0,8861.0,25764,0.002872,0.343930
...,...,...,...,...,...
2917003,60.0,11398.0,36116,0.001661,0.315594
2103505,58.0,17674.0,41312,0.001404,0.427818
2402006,46.0,15571.0,68343,0.000673,0.227836
2916401,45.0,17693.0,76795,0.000586,0.230393


In [349]:
# Convert the municipality codes in the index to integers.
codigos_inteiros = df_edu_2020.index.astype(int)
df_edu_2020.index = codigos_inteiros

Dataframe final com as informações de educação:

In [354]:
# Add the public-enrolment shares from the administrative records, matching
# on the municipality-code index of both frames.
df_edu_2020 = pd.merge(
    df_edu_2020,
    df_util_adm,
    how="inner",
    left_index=True,
    right_index=True,
)

In [355]:
# Drop the raw totals that were only needed to compute the ratios.
colunas_intermediarias = [
    "POPULAÇÃO ESTIMADA",
    "Total_escolas_publicas_mun_est",
    "Total de matrículas",
]
df_edu_2020_final = df_edu_2020.drop(columns=colunas_intermediarias)

In [356]:
# Inspect the final education feature table.
df_edu_2020_final

Unnamed: 0,TOT_ESC_POR_POP,TOT_MAT_POR_POP,PMATPUB_EF,PMATPUB_EM
2507507,0.000354,0.207136,0.6217,0.6889
2207702,0.000704,0.249866,0.7635,0.7195
2412005,0.000627,0.238338,0.7413,0.9674
2302800,0.001139,0.270934,0.8577,1.0000
2110104,0.002872,0.343930,0.9543,1.0000
...,...,...,...,...
2917003,0.001661,0.315594,0.9377,1.0000
2103505,0.001404,0.427818,0.8585,0.9709
2402006,0.000673,0.227836,0.6535,0.8181
2916401,0.000586,0.230393,0.8152,0.8877


In [357]:
# Save the final education features as CSV; with to_csv defaults the index is
# written as the first column.
df_edu_2020_final.to_csv(r"..\Dados\Q EDU\dados_edu_2020.csv")