In [27]:
import pandas as pd

# Load the HDI vs. HIHD dataset from disk and preview its first five rows.
raw_csv_path = "datasets/hdi-vs-hihd.csv"
hihd_data = pd.read_csv(raw_csv_path)
hihd_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP),Historical Index of Human Development (Prados de la Escosura),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1980,0.228,0.11,13356500.0,
2,Afghanistan,AFG,1985,0.273,0.11,11938204.0,
3,Afghanistan,AFG,2002,0.373,,22600774.0,
4,Afghanistan,AFG,2003,0.383,,23680871.0,


In [28]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
hihd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55738 entries, 0 to 55737
Data columns (total 7 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   Entity                                                         55738 non-null  object 
 1   Code                                                           54084 non-null  object 
 2   Year                                                           55738 non-null  int64  
 3   Human Development Index (UNDP)                                 5001 non-null   float64
 4   Historical Index of Human Development (Prados de la Escosura)  3210 non-null   float64
 5   Population (historical estimates)                              55656 non-null  float64
 6   Continent                                                      285 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 3.0+ MB


In [29]:
# Drop the two development-index columns and the Continent column.
# hihd_data is overwritten here, so without errors="ignore" a second run of
# this cell would raise KeyError on the columns that are already gone.
columns_to_drop = [
    "Human Development Index (UNDP)",
    "Historical Index of Human Development (Prados de la Escosura)",
    "Continent",
]
hihd_data = hihd_data.drop(columns=columns_to_drop, errors="ignore")
hihd_data.head()


Unnamed: 0,Entity,Code,Year,Population (historical estimates)
0,Abkhazia,OWID_ABK,2015,
1,Afghanistan,AFG,1980,13356500.0
2,Afghanistan,AFG,1985,11938204.0
3,Afghanistan,AFG,2002,22600774.0
4,Afghanistan,AFG,2003,23680871.0


In [30]:
# Keep only the rows that carry a value in Population (historical estimates).
has_population = hihd_data["Population (historical estimates)"].notna()
hihd_data = hihd_data[has_population]
hihd_data.head()


Unnamed: 0,Entity,Code,Year,Population (historical estimates)
1,Afghanistan,AFG,1980,13356500.0
2,Afghanistan,AFG,1985,11938204.0
3,Afghanistan,AFG,2002,22600774.0
4,Afghanistan,AFG,2003,23680871.0
5,Afghanistan,AFG,2004,24726689.0


In [31]:
# Keep only observations from the year 1800 onwards.
from_1800_onwards = hihd_data["Year"].ge(1800)
hihd_data = hihd_data.loc[from_1800_onwards]
hihd_data.head()


Unnamed: 0,Entity,Code,Year,Population (historical estimates)
1,Afghanistan,AFG,1980,13356500.0
2,Afghanistan,AFG,1985,11938204.0
3,Afghanistan,AFG,2002,22600774.0
4,Afghanistan,AFG,2003,23680871.0
5,Afghanistan,AFG,2004,24726689.0


In [32]:
# Keep only the rows whose Code value is present.
has_code = hihd_data["Code"].notna()
hihd_data = hihd_data.loc[has_code]
hihd_data.head()


Unnamed: 0,Entity,Code,Year,Population (historical estimates)
1,Afghanistan,AFG,1980,13356500.0
2,Afghanistan,AFG,1985,11938204.0
3,Afghanistan,AFG,2002,22600774.0
4,Afghanistan,AFG,2003,23680871.0
5,Afghanistan,AFG,2004,24726689.0


In [33]:
# Print the column names, non-null counts and dtypes after the filtering steps above.
hihd_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 46966 entries, 1 to 55736
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Entity                             46966 non-null  object 
 1   Code                               46966 non-null  object 
 2   Year                               46966 non-null  int64  
 3   Population (historical estimates)  46966 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.8+ MB


In [34]:
# Write the cleaned frame to a new CSV file, leaving the row index out.
cleaned_csv_path = "datasets/hdi-vs-hihd-cleaned.csv"
hihd_data.to_csv(cleaned_csv_path, index=False)

In [5]:
from statistics_calc import descriptors, translate_descriptors

# Build the descriptor table for hihd_data and pass it through
# translate_descriptors in a single step.
# NOTE(review): the output saved with this cell reports 55738 values and
# includes the IDH columns, which matches the unfiltered CSV rather than
# hihd_data as filtered by the earlier cells; the execution counts are also
# out of order. Re-run top to bottom and confirm which frame is intended.
desc = translate_descriptors(descriptors(hihd_data))

# Export the table to CSV, using "," as the decimal separator.
quantitative_csv_path = "descriptors/hihd-quantitative.csv"
desc.to_csv(quantitative_csv_path, decimal=",")

desc

Unnamed: 0,Año,IDH,IDH Histórico Ajustado,Población
Total de valores,55738.0,5001.0,3210.0,55656.0
Media,1604.466055,0.651761,0.287542,32463520.0
Desviación estándar,1415.644065,0.167121,0.202527,250302800.0
Valor mínimo,-10000.0,0.19,0.02,1.0
Q1,1832.0,0.524,0.1125,133874.0
Q2,1899.0,0.677,0.25,1218570.0
Q3,1964.0,0.781,0.44,5396250.0
Q4,2021.0,0.953,0.9,7874966000.0
Asimetría,-5.962559,-0.394236,0.589366,17.44683
Curtosis,37.528641,-0.694236,-0.445863,386.1772


In [6]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest.
# NOTE(review): "Continente" presumably refers to the Continent column, which
# an earlier cell drops from hihd_data; the saved output (55738 values) matches
# the unfiltered data. Confirm this cell still works on a fresh top-to-bottom run.
cols_cualitativas = ["Entidad", "Código", "Continente"]

# Compute the statistics for those columns.
estadisticas = qualitative_stats(hihd_data, cols_cualitativas)

# Save the result to a CSV file, without the index and with "," as the decimal separator.
qualitative_csv_path = "descriptors/hihd-qualitative.csv"
estadisticas.to_csv(qualitative_csv_path, index=False, decimal=",")

# Display the statistics DataFrame.
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entidad,Afghanistan,259,0.464674,55738,294,0.527468,0.0
2,Código,AFG,259,0.464674,54084,286,0.513115,2.967455
3,Continente,Europe,75,0.134558,285,7,0.012559,99.488679


In [5]:
# Count the missing values in each column, keep only the columns that have
# at least one, and order them from fewest to most missing.
# Written as one chained expression so nothing is sorted in place and the
# cell produces the same result however many times it is re-run.
missing = (
    hihd_data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
missing

Population (historical estimates)                                   82
Code                                                              1654
Human Development Index (UNDP)                                   50737
Historical Index of Human Development (Prados de la Escosura)    52528
Continent                                                        55453
dtype: int64

In [2]:
# Number of distinct values per column.
unique_values = hihd_data.nunique()

# Turn the Series into a two-column table: the original column name goes
# under "Columna" and its distinct-value count under "Dominio".
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Persist the table to CSV (no index, "." as decimal separator) and display it.
unique_values_csv_path = "descriptors/hihd_unique_values.csv"
unique_values_df.to_csv(unique_values_csv_path, index=False, decimal=".")
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,294
1,Code,286
2,Year,259
3,Human Development Index (UNDP),702
4,Historical Index of Human Development (Prados ...,89
5,Population (historical estimates),51142
6,Continent,7


In [7]:
# Total number of missing cells: per-column null counts summed over all columns.
null_counts_per_column = hihd_data.isna().sum()
missing_cells = null_counts_per_column.sum()

# Total number of cells (rows x columns) in the frame.
total_cells = hihd_data.size

missing_cells

160454

In [8]:
# Build a copy of the frame that excludes rows whose Code is NaN.
code_present = hihd_data["Code"].notna()
hihd_data_clean = hihd_data[code_present]

# Number of cells (rows x columns) remaining in that filtered frame.
total_cells = hihd_data_clean.size
total_cells

378588