# Sinopse Estatistica

In [1]:
import pandas as pd

In [None]:
# Convert the corrected string to a DataFrame
df = pd.read_csv("../raw_data/turmas/turmas_2022.csv")
df.info()

In [None]:
# Melt the dataframe
melted_df = df.melt(id_vars=["Região Geográfica", "Unidade da Federação", "Município", "Código do Município"],
                                        value_vars=["Urbana", "Rural"],
                                        var_name="Rede",
                                        value_name="Docentes")

# Sort the dataframe for better visualization
melted_df = melted_df.sort_values(by=["Município", "Rede"]).reset_index(drop=True)

melted_df

## Teachers dataset

In [2]:
import pandas as pd
import glob
import re


# Path to the folder containing the CSV files
path = '../raw_data/teachers/' 
all_files = glob.glob(path + "/*.csv")

# Create a list of dataframes
dfs = []

for filename in all_files:
    df = pd.read_csv(filename)
      
    match = re.search(r'(?<!\d)\d{4}(?!\d)', filename)
    # If match is found, get the matched string, otherwise set year to None
    year = match.group() if match else None
    
    # Melt the dataframe    
    melted_df = df.melt(id_vars=["Região Geográfica", "Unidade da Federação", "Município", "Código do Município"],
                                        value_vars=["Urbana", "Rural"],
                                        var_name="Rede",
                                        value_name="Docentes")

    # Sort the dataframe for better visualization
    melted_df = melted_df.sort_values(by=["Município", "Rede"]).reset_index(drop=True)
    
    melted_df["Ano"] = year


    dfs.append(melted_df)

# Now, dfs is a list where each element is a DataFrame loaded from a CSV file in the folder.


# Concatenate all DataFrames together
teachers_df = pd.concat(dfs, ignore_index=True)

# Write the combined DataFrame to a new CSV file
# file_path = "../transformed_data/cleaned_teachers.csv"
# combined_df.to_csv(file_path, index=False)

#drop columns
columns_to_drop = ['Região Geográfica', 'Unidade da Federação', 'Município']
teachers_df = teachers_df.drop(columns=columns_to_drop)

#rename columns
teachers_df = teachers_df.rename(columns={
    'Código do Município': 'Código_IBGE'
})

In [3]:
teachers_df

Unnamed: 0,Código_IBGE,Rede,Docentes,Ano
0,5200050,Rural,-,2018
1,5200050,Urbana,27,2018
2,3100104,Rural,-,2018
3,3100104,Urbana,15,2018
4,5200100,Rural,-,2018
...,...,...,...,...
178173,2900504,Urbana,166,2015
178174,1505106,Rural,19,2015
178175,1505106,Urbana,662,2015
178176,3533809,Rural,0,2015


In [None]:
teachers_df['Código_IBGE'] = teachers_df['Código_IBGE'].astype(str).str.replace(',', '').astype(int)


In [4]:
len(teachers_df.Código_IBGE.unique()), len(sorted(list(teachers_df.Ano.unique())))

(5570, 16)

## Students dataset

In [5]:
import pandas as pd
import glob
import re


# Path to the folder containing the CSV files
path = '../raw_data/students/' 
all_files = glob.glob(path + "/*.csv")

# Create a list of dataframes
dfs = []

for filename in all_files:
    df = pd.read_csv(filename)
      
    match = re.search(r'(?<!\d)\d{4}(?!\d)', filename)
    # If match is found, get the matched string, otherwise set year to None
    year = match.group() if match else None
    
    # Melt the dataframe    
    melted_df = df.melt(id_vars=["Região Geográfica", "Unidade da Federação", "Município", "Código do Município"],
                                        value_vars=["Urbana", "Rural"],
                                        var_name="Rede",
                                        value_name="Matrículas")

    # Sort the dataframe for better visualization
    melted_df = melted_df.sort_values(by=["Município", "Rede"]).reset_index(drop=True)
    
    melted_df["Ano"] = year


    dfs.append(melted_df)

# Now, dfs is a list where each element is a DataFrame loaded from a CSV file in the folder.


# Concatenate all DataFrames together
students_df = pd.concat(dfs, ignore_index=True)

#drop columns
columns_to_drop = ['Região Geográfica', 'Unidade da Federação', 'Município']
students_df = students_df.drop(columns=columns_to_drop)

#rename columns
students_df = students_df.rename(columns={
    'Código do Município': 'Código_IBGE'
})

# Write the combined DataFrame to a new CSV file
# file_path = "../transformed_data/cleaned_students.csv"
# combined_df.to_csv(file_path, index=False)

In [None]:
students_df

In [6]:
len(students_df.Código_IBGE.unique()), len(sorted(list(students_df.Ano.unique())))

(5570, 16)

## Schools dataset

In [7]:
import pandas as pd
import glob
import re


# Path to the folder containing the CSV files
path = '../raw_data/schools/' 
all_files = glob.glob(path + "/*.csv")

# Create a list of dataframes
dfs = []

for filename in all_files:
    df = pd.read_csv(filename)
      
    match = re.search(r'(?<!\d)\d{4}(?!\d)', filename)
    # If match is found, get the matched string, otherwise set year to None
    year = match.group() if match else None
    
    # Melt the dataframe    
    melted_df = df.melt(id_vars=["Região Geográfica", "Unidade da Federação", "Município", "Código do Município"],
                                        value_vars=["Urbana", "Rural"],
                                        var_name="Rede",
                                        value_name="Estabelecimentos")

    # Sort the dataframe for better visualization
    melted_df = melted_df.sort_values(by=["Município", "Rede"]).reset_index(drop=True)
    
    melted_df["Ano"] = year


    dfs.append(melted_df)

# Now, dfs is a list where each element is a DataFrame loaded from a CSV file in the folder.


# Concatenate all DataFrames together
schools_df = pd.concat(dfs, ignore_index=True)

#drop columns
columns_to_drop = ['Região Geográfica', 'Unidade da Federação', 'Município']
schools_df = schools_df.drop(columns=columns_to_drop)

#rename columns
schools_df = schools_df.rename(columns={
    'Código do Município': 'Código_IBGE'
})

# Write the combined DataFrame to a new CSV file
# file_path = "../transformed_data/cleaned_schools.csv"
# combined_df.to_csv(file_path, index=False)

In [8]:
len(schools_df.Código_IBGE.unique()), len(sorted(list(schools_df.Ano.unique())))

(5570, 16)

## Turmas dataset

In [9]:
import pandas as pd
import glob
import re


# Path to the folder containing the CSV files
path = '../raw_data/turmas/' 
all_files = glob.glob(path + "/*.csv")

# Create a list of dataframes
dfs = []

for filename in all_files:
    df = pd.read_csv(filename)
      
    match = re.search(r'(?<!\d)\d{4}(?!\d)', filename)
    # If match is found, get the matched string, otherwise set year to None
    year = match.group() if match else None
    
    # Melt the dataframe    
    melted_df = df.melt(id_vars=["Região Geográfica", "Unidade da Federação", "Município", "Código do Município"],
                                        value_vars=["Urbana", "Rural"],
                                        var_name="Rede",
                                        value_name="Turmas")

    # Sort the dataframe for better visualization
    melted_df = melted_df.sort_values(by=["Município", "Rede"]).reset_index(drop=True)
    
    melted_df["Ano"] = year


    dfs.append(melted_df)

# Now, dfs is a list where each element is a DataFrame loaded from a CSV file in the folder.


# Concatenate all DataFrames together
batch_df = pd.concat(dfs, ignore_index=True)

#drop columns
columns_to_drop = ['Região Geográfica', 'Unidade da Federação', 'Município']
batch_df = batch_df.drop(columns=columns_to_drop)

#rename columns
batch_df = batch_df.rename(columns={
    'Código do Município': 'Código_IBGE'
})

# Write the combined DataFrame to a new CSV file
# file_path = "../transformed_data/cleaned_turmas.csv"
# combined_df.to_csv(file_path, index=False)

In [None]:
batch_df

In [10]:
len(batch_df.Código_IBGE.unique()), len(sorted(list(batch_df.Ano.unique())))

(5570, 16)

## Combinine all datasets

In [14]:
from functools import reduce

# List of dataframes to merge
dfs = [students_df, teachers_df, schools_df, batch_df]

# Merging all dataframes in the list based on the common columns

merged_all_df = reduce(lambda left, right: pd.merge(left, right, on=['Ano', 'Código_IBGE', 'Rede'], how='outer'), dfs)

#rename columns
merged_all_df = merged_all_df.rename(columns={
    'Rede': 'Localização'
})

merged_all_df.head()  # Displaying the first few rows of the merged dataframe

Unnamed: 0,Código_IBGE,Localização,Matrículas,Ano,Docentes,Estabelecimentos,Turmas
0,5200050,Rural,0,2009,0,0,0
1,5200050,Urbana,302,2009,19,1,12
2,3100104,Rural,0,2009,0,0,0
3,3100104,Urbana,211,2009,16,1,7
4,5200100,Rural,0,2009,0,0,0


In [15]:
len(list(merged_all_df.Código_IBGE.unique()))

5570

In [16]:
# Write the combined DataFrame to a new CSV file
file_path = "../transformed_data/school_infrastructure_per_municipality.csv"
merged_all_df.to_csv(file_path, index=False)