In [1]:
import pandas as pd

# Read the raw GNI-per-capita CSV into a DataFrame and preview its first rows.
income_csv_path = "datasets/gross-national-income-per-capita.csv"
national_income_data = pd.read_csv(income_csv_path)
national_income_data.head()

Unnamed: 0,Entity,Code,Year,"GNI per capita, PPP (constant 2017 international $)"
0,Afghanistan,AFG,2017,2085.487571
1,Africa Eastern and Southern,,1990,2554.595257
2,Africa Eastern and Southern,,1991,2495.966953
3,Africa Eastern and Southern,,1992,2421.787983
4,Africa Eastern and Southern,,1993,2397.01279


In [2]:
# Keep only the rows whose "Code" value is present. Rows with a missing code
# (e.g. the "Africa Eastern and Southern" rows in the first preview) are removed.
has_code = national_income_data["Code"].notna()
national_income_data = national_income_data[has_code]
national_income_data.head()

Unnamed: 0,Entity,Code,Year,"GNI per capita, PPP (constant 2017 international $)"
0,Afghanistan,AFG,2017,2085.487571
63,Albania,ALB,2008,9955.158909
64,Albania,ALB,2009,10152.389131
65,Albania,ALB,2010,10642.626833
66,Albania,ALB,2011,11056.513384


In [3]:
# Print the filtered frame's summary (index, columns, non-null counts, dtypes)
# to confirm that no missing values remain after dropping rows without a code.
national_income_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3930 entries, 0 to 5107
Data columns (total 4 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Entity                                               3930 non-null   object 
 1   Code                                                 3930 non-null   object 
 2   Year                                                 3930 non-null   int64  
 3   GNI per capita, PPP (constant 2017 international $)  3930 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 153.5+ KB


In [4]:
# Persist the filtered table as a new CSV; index=False keeps the row index
# out of the written file.
cleaned_csv_path = "datasets/gross-national-income-per-capita-cleaned.csv"
national_income_data.to_csv(cleaned_csv_path, index=False)

In [5]:
from statistics_calc import descriptors, translate_descriptors

# Compute the descriptors for the cleaned data and translate their labels to
# Spanish in a single step.
desc = translate_descriptors(descriptors(national_income_data))

# Write the descriptor table (index included) using "," as the decimal mark.
quantitative_csv_path = "descriptors/income-quantitative.csv"
desc.to_csv(quantitative_csv_path, decimal=",")

# Final expression: the notebook renders the table.
desc

Unnamed: 0,Year,"GNI per capita, PPP (constant 2017 international $)"
Total de valores,3930.0,3930.0
Media,2006.465394,17120.115534
Desviación estándar,8.580785,17718.549166
Valor mínimo,1990.0,416.03266
Q1,1999.0,3843.455479
Q2,2007.0,10366.682001
Q3,2014.0,25037.446405
Q4,2020.0,112322.24818
Asimetría,-0.232431,1.541397
Curtosis,-1.094449,2.282072


In [6]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest: every column stored with the object dtype.
object_columns_frame = national_income_data.select_dtypes(include="object")
cols_cualitativas = object_columns_frame.columns

# Build the statistics table for those columns.
estadisticas = qualitative_stats(national_income_data, cols_cualitativas)

# Save the table to a CSV file without its index, using "," as the decimal mark.
qualitative_csv_path = "descriptors/income-qualitative.csv"
estadisticas.to_csv(qualitative_csv_path, index=False, decimal=",")

# Final expression: the notebook renders the statistics DataFrame.
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entity,Algeria,31,0.788804,3930,194,4.936387,0.0
2,Code,ARG,31,0.788804,3930,194,4.936387,0.0


In [7]:
from statistics_calc import generate_domain_df

# Build the per-column domain table.
# NOTE(review): in the rendered output "Dominio" looks like the number of
# distinct values per column - confirm against generate_domain_df.
domain_df = generate_domain_df(national_income_data)

# Save the table to a CSV file without its index, using "," as the decimal mark.
domain_csv_path = "descriptors/income-domain.csv"
domain_df.to_csv(domain_csv_path, index=False, decimal=",")

# Final expression: the notebook renders the domain DataFrame.
domain_df


Unnamed: 0,Columna,Dominio
0,Entity,194
1,Code,194
2,Year,31
3,"GNI per capita, PPP (constant 2017 internation...",3930


In [8]:
from statistics_calc import generate_range_df

# Build the per-column range table.
# NOTE(review): in the rendered output each "Rango" entry looks like a
# (min, max) pair for a numeric column - confirm against generate_range_df.
range_df = generate_range_df(national_income_data)

# Save the table to a CSV file without its index, using "," as the decimal mark.
range_csv_path = "descriptors/income-range.csv"
range_df.to_csv(range_csv_path, index=False, decimal=",")

# Final expression: the notebook renders the range DataFrame.
range_df


Unnamed: 0,Columna,Rango
0,Year,"(1990, 2020)"
1,"GNI per capita, PPP (constant 2017 internation...","(416.032659857963, 112322.248179827)"
