In [11]:
import pandas as pd

# Load the expected-years-of-schooling dataset and preview its first rows.
schooling_csv_path = "datasets/expected-years-of-schooling.csv"
schooling_data = pd.read_csv(schooling_csv_path)
schooling_data.head()

Unnamed: 0,Entity,Code,Year,Expected Years of Schooling (years)
0,Afghanistan,AFG,1990,2.6
1,Afghanistan,AFG,1991,2.9
2,Afghanistan,AFG,1992,3.2
3,Afghanistan,AFG,1993,3.6
4,Afghanistan,AFG,1994,3.9


In [12]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
schooling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5142 entries, 0 to 5141
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               5142 non-null   object 
 1   Code                                 5124 non-null   object 
 2   Year                                 5142 non-null   int64  
 3   Expected Years of Schooling (years)  5142 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 160.8+ KB


In [13]:
# Keep only the rows that carry a value in the "Code" column.
# NOTE(review): this rebinds `schooling_data`, so the unfiltered frame is no
# longer available to later cells unless the CSV is read again.
code_present = schooling_data["Code"].notna()
schooling_data = schooling_data[code_present]
schooling_data.head()


Unnamed: 0,Entity,Code,Year,Expected Years of Schooling (years)
0,Afghanistan,AFG,1990,2.6
1,Afghanistan,AFG,1991,2.9
2,Afghanistan,AFG,1992,3.2
3,Afghanistan,AFG,1993,3.6
4,Afghanistan,AFG,1994,3.9


In [14]:
# Re-check dtypes and non-null counts; meant to follow the cell that drops rows without a Code.
schooling_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5124 entries, 0 to 5141
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               5124 non-null   object 
 1   Code                                 5124 non-null   object 
 2   Year                                 5124 non-null   int64  
 3   Expected Years of Schooling (years)  5124 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 200.2+ KB


In [15]:
# Save the cleaned data to CSV without the index column.
# Expects the earlier cell that drops rows with a missing Code to have run.
schooling_data.to_csv("datasets/expected-years-of-schooling-cleaned.csv", index=False)

In [4]:
from statistics_calc import descriptors, translate_descriptors

# Build the descriptor table for the dataset and pass it straight through
# translate_descriptors (Spanish labels).
desc = translate_descriptors(descriptors(schooling_data))

# Write the table to CSV using a comma as the decimal mark.
# NOTE(review): the field separator is left at its default, which is also a
# comma -- confirm that consumers of this file expect that combination.
desc.to_csv("descriptors/schooling-quantitative.csv", decimal=",")

desc

Unnamed: 0,Año,Años de escolaridad esperados
Total de valores,5142.0,5142.0
Media,2003.810968,11.748269
Desviación estándar,8.03765,3.346839
Valor mínimo,1990.0,2.1
Q1,1997.0,9.8
Q2,2004.0,12.1
Q3,2011.0,13.975
Q4,2017.0,23.3
Asimetría,-0.049536,-0.356944
Curtosis,-1.185758,0.06449


In [5]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest
# NOTE(review): these are Spanish labels, while the frame loaded in this
# notebook exposes "Entity" and "Code" columns -- confirm that
# qualitative_stats maps between them, otherwise the lookup will not match.
cols_cualitativas = ["Entidad", "Código"]

# Generate statistics
estadisticas = qualitative_stats(schooling_data, cols_cualitativas)

# Save the data to a CSV file
estadisticas.to_csv("descriptors/schooling-qualitative.csv", index=False, decimal=",")

# Display the statistics DataFrame
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entidad,Afghanistan,28,0.544535,5142,193,3.753403,0.0
2,Código,AFG,28,0.544535,5124,192,3.733956,0.350058


In [None]:
# Count the missing values in each column, keep only the columns that have at
# least one, and list them in ascending order of missing count.
# NOTE(review): `schooling_data` is rebound by the earlier cell that drops rows
# without a Code; on a top-to-bottom run this result may therefore be empty.
null_counts = schooling_data.isnull().sum()
# sort_values() returns a new Series, so no in-place mutation of a derived
# (boolean-indexed) Series is needed.
missing = null_counts[null_counts > 0].sort_values()
missing

Code    18
dtype: int64

In [None]:
# Number of distinct values per column.
unique_values = schooling_data.nunique()

# Turn the Series into a two-column table: the column name ("Columna") and its
# count of distinct values ("Dominio").
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Persist the table as CSV without the index.
unique_values_df.to_csv("descriptors/schooling_unique_values.csv", index=False, decimal=".")
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,193
1,Code,192
2,Year,28
3,Expected Years of Schooling (years),187


In [None]:
# Total number of missing cells: sum the per-column null counts.
nulls_per_column = schooling_data.isna().sum()
missing_values = nulls_per_column.sum()
missing_values

18

In [None]:
# Keep only the rows whose "Code" value is present.
# NOTE(review): an earlier cell applies the same filter to `schooling_data`
# itself, so this may be a no-op on a top-to-bottom run.
rows_with_code = schooling_data["Code"].notna()
schooling_data_clean = schooling_data[rows_with_code]

# Report how many cells (rows x columns) remain.
total_cells = schooling_data_clean.size
print("Total number of cells:", total_cells)

Total number of cells: 20496
