In [1]:
import pandas as pd

# Read the raw expected-years-of-schooling dataset from its CSV file.
raw_schooling_csv = "datasets/expected-years-of-schooling.csv"
schooling_data = pd.read_csv(raw_schooling_csv)

# Preview the first rows as the cell output.
schooling_data.head()

Unnamed: 0,Entity,Code,Year,Expected Years of Schooling (years)
0,Afghanistan,AFG,1990,2.6
1,Afghanistan,AFG,1991,2.9
2,Afghanistan,AFG,1992,3.2
3,Afghanistan,AFG,1993,3.6
4,Afghanistan,AFG,1994,3.9


In [2]:
# Summarize the freshly loaded frame: index, column dtypes and non-null counts
schooling_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5142 entries, 0 to 5141
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               5142 non-null   object 
 1   Code                                 5124 non-null   object 
 2   Year                                 5142 non-null   int64  
 3   Expected Years of Schooling (years)  5142 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 160.8+ KB


In [3]:
# Keep only the rows whose "Code" value is present, i.e. discard rows
# with a missing value in the Code column.
has_code = schooling_data["Code"].notna()
schooling_data = schooling_data[has_code]

# Preview the first rows of the filtered frame.
schooling_data.head()

Unnamed: 0,Entity,Code,Year,Expected Years of Schooling (years)
0,Afghanistan,AFG,1990,2.6
1,Afghanistan,AFG,1991,2.9
2,Afghanistan,AFG,1992,3.2
3,Afghanistan,AFG,1993,3.6
4,Afghanistan,AFG,1994,3.9


In [4]:
# Re-check dtypes and non-null counts after dropping the rows without a Code
schooling_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5124 entries, 0 to 5141
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Entity                               5124 non-null   object 
 1   Code                                 5124 non-null   object 
 2   Year                                 5124 non-null   int64  
 3   Expected Years of Schooling (years)  5124 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 200.2+ KB


In [5]:
# Save the cleaned data (rows without a Code removed) to CSV, without the index column
schooling_data.to_csv("datasets/expected-years-of-schooling-cleaned.csv", index=False)

In [6]:
from statistics_calc import descriptors, translate_descriptors

# Compute the descriptors for the data and translate them to Spanish in one step.
desc = translate_descriptors(descriptors(schooling_data))

# Export the table to CSV, keeping the index and writing "," as the decimal mark.
# NOTE(review): the field separator is left at its default — confirm that the
# consumers of this file handle comma-decimal values as expected.
desc.to_csv("descriptors/schooling-quantitative.csv", decimal=",")

# Show the translated descriptors as the cell output.
desc

Unnamed: 0,Year,Expected Years of Schooling (years)
Total de valores,5124.0,5124.0
Media,2003.794496,11.750195
Desviación estándar,8.041064,3.352382
Valor mínimo,1990.0,2.1
Q1,1997.0,9.8
Q2,2004.0,12.1
Q3,2011.0,14.0
Q4,2017.0,23.3
Asimetría,-0.046755,-0.358071
Curtosis,-1.187365,0.055759


In [7]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest: the columns stored with the object dtype
cols_cualitativas = schooling_data.select_dtypes(include=["object"]).columns

# Generate statistics for those columns
estadisticas = qualitative_stats(schooling_data, cols_cualitativas)

# Save the data to a CSV file (index omitted, "," as the decimal mark)
estadisticas.to_csv("descriptors/schooling-qualitative.csv", index=False, decimal=",")

# Display the statistics DataFrame
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entity,Afghanistan,28,0.546448,5124,192,3.747073,0.0
2,Code,AFG,28,0.546448,5124,192,3.747073,0.0


In [8]:
from statistics_calc import generate_domain_df

# Build the per-column domain table from the cleaned data
domain_df = generate_domain_df(schooling_data)

# Save the data to a CSV file (index omitted)
domain_df.to_csv("descriptors/schooling-domain.csv", index=False, decimal=",")

# Display the resulting DataFrame
domain_df


Unnamed: 0,Columna,Dominio
0,Entity,192
1,Code,192
2,Year,28
3,Expected Years of Schooling (years),187


In [9]:
from statistics_calc import generate_range_df

# Build the range table from the cleaned data
range_df = generate_range_df(schooling_data)

# Save the table to a CSV file (index omitted)
range_df.to_csv("descriptors/schooling-range.csv", index=False, decimal=",")

# Display the resulting DataFrame
range_df


Unnamed: 0,Columna,Rango
0,Year,"(1990, 2017)"
1,Expected Years of Schooling (years),"(2.1, 23.3)"
