In [1]:
import pandas as pd

hdi_data = pd.read_csv("datasets/human-development-index.csv")
hdi_data.head()

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
2,Afghanistan,AFG,2002,0.373
3,Afghanistan,AFG,2003,0.383
4,Afghanistan,AFG,2004,0.398


In [2]:
# get all entries with nan values in Code
hdi_data[hdi_data["Code"].isna()]


Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
2390,Korea,,1990,0.728
2391,Korea,,1991,0.739
2392,Korea,,1992,0.746
2393,Korea,,1993,0.756
2394,Korea,,1994,0.767
2395,Korea,,1995,0.778
2396,Korea,,1996,0.789
2397,Korea,,1997,0.8
2398,Korea,,1998,0.797
2399,Korea,,1999,0.808


In [3]:
# get all entries with "Korea" in the country name
hdi_data[hdi_data["Entity"].str.contains("Korea")]


Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
2390,Korea,,1990,0.728
2391,Korea,,1991,0.739
2392,Korea,,1992,0.746
2393,Korea,,1993,0.756
2394,Korea,,1994,0.767
2395,Korea,,1995,0.778
2396,Korea,,1996,0.789
2397,Korea,,1997,0.8
2398,Korea,,1998,0.797
2399,Korea,,1999,0.808


In [4]:
# Replace all entries with exactly "Korea" in Entity with "North Korea"
mask = hdi_data["Entity"] == "Korea"
hdi_data.loc[mask, "Entity"] = "North Korea"

# Get all entries with "Korea" in the country name
hdi_data[hdi_data["Entity"].str.contains("Korea")]


Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
2390,North Korea,,1990,0.728
2391,North Korea,,1991,0.739
2392,North Korea,,1992,0.746
2393,North Korea,,1993,0.756
2394,North Korea,,1994,0.767
2395,North Korea,,1995,0.778
2396,North Korea,,1996,0.789
2397,North Korea,,1997,0.8
2398,North Korea,,1998,0.797
2399,North Korea,,1999,0.808


In [5]:
# add "PRK" as the code for North Korea
mask = hdi_data["Entity"] == "North Korea"
hdi_data.loc[mask, "Code"] = "PRK"

# Get all entries with nan values in code
hdi_data[hdi_data["Code"].isna()]


Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)


In [6]:
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          5001 non-null   object 
 1   Code                            5001 non-null   object 
 2   Year                            5001 non-null   int64  
 3   Human Development Index (UNDP)  5001 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB


In [7]:
# get all rows prior to 1990
old_data = hdi_data[hdi_data["Year"] < 1990]
old_data

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
18,Albania,ALB,1980,0.625
19,Albania,ALB,1985,0.623
126,Argentina,ARG,1980,0.675
...,...,...,...,...
4884,Vietnam,VNM,1985,0.479
4941,Zambia,ZMB,1980,0.418
4942,Zambia,ZMB,1985,0.409
4971,Zimbabwe,ZWE,1980,0.437


In [8]:
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          5001 non-null   object 
 1   Code                            5001 non-null   object 
 2   Year                            5001 non-null   int64  
 3   Human Development Index (UNDP)  5001 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB


In [9]:
# save the translated data to a new file
hdi_data.to_csv("datasets/human-development-index-cleaned.csv", index=False)

In [10]:
from statistics_calc import descriptors, translate_descriptors

# Generate descriptors
desc = descriptors(hdi_data)

# Translate the descriptors to Spanish
desc = translate_descriptors(desc)

# Export the DataFrame to a CSV file
desc.to_csv("descriptors/hdi-quantitative.csv", decimal=",")

desc

Unnamed: 0,Year,Human Development Index (UNDP)
Total de valores,5001.0,5001.0
Media,2003.407119,0.651761
Desviación estándar,8.954743,0.167121
Valor mínimo,1980.0,0.19
Q1,1997.0,0.524
Q2,2004.0,0.677
Q3,2011.0,0.781
Q4,2017.0,0.953
Asimetría,-0.422293,-0.394236
Curtosis,-0.513055,-0.694236


In [11]:
from statistics_calc import qualitative_stats

# Columnas cualitativas de interés
cols_cualitativas = hdi_data.select_dtypes(include=["object"]).columns

# Generate statistics
estadisticas = qualitative_stats(hdi_data, cols_cualitativas)

# Guardar datos en un archivo CSV
estadisticas.to_csv("descriptors/hdi-qualitative.csv", index=False, decimal=",")

# Mostrar el DataFrame de estadísticas
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entity,Albania,30,0.59988,5001,190,3.79924,0.0
2,Code,ALB,30,0.59988,5001,190,3.79924,0.0


In [12]:
from statistics_calc import generate_domain_df

# Generate statistics
domain_df = generate_domain_df(hdi_data)

# Guardar datos en un archivo CSV
domain_df.to_csv("descriptors/hdi-domain.csv", index=False, decimal=",")

# Mostrar el DataFrame de estadísticas
domain_df

Unnamed: 0,Columna,Dominio
0,Entity,190
1,Code,190
2,Year,30
3,Human Development Index (UNDP),702


In [13]:
from statistics_calc import generate_range_df

range_df = generate_range_df(hdi_data)

# save to file
range_df.to_csv("descriptors/hdi-range.csv", index=False, decimal=",")

range_df

Unnamed: 0,Columna,Rango
0,Year,"(1980, 2017)"
1,Human Development Index (UNDP),"(0.19, 0.953)"
