In [1]:
import pandas as pd

# Load the HDI vs. historical-HDI dataset from the local CSV file and preview
# the first rows to sanity-check the parse.
hihd_data = pd.read_csv("datasets/hdi-vs-hihd.csv")
hihd_data.head()

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP),Historical Index of Human Development (Prados de la Escosura),Population (historical estimates),Continent
0,Abkhazia,OWID_ABK,2015,,,,Asia
1,Afghanistan,AFG,1980,0.228,0.11,13356500.0,
2,Afghanistan,AFG,1985,0.273,0.11,11938204.0,
3,Afghanistan,AFG,2002,0.373,,22600774.0,
4,Afghanistan,AFG,2003,0.383,,23680871.0,


In [4]:
# Translate the column names into Spanish on a separate frame.
# `hihd_data` deliberately keeps its original English headers: the cells below
# select columns such as "Entity", "Code" and "Continent" by those names, so
# renaming the shared frame in place makes them fail on Restart & Run All.
COLUMN_TRANSLATIONS = {
    "Entity": "Entidad",
    "Code": "Código",
    "Year": "Año",
    "Human Development Index (UNDP)": "IDH",
    "Historical Index of Human Development (Prados de la Escosura)": "IDH Histórico Ajustado",
    "Population (historical estimates)": "Población",
    "Continent": "Continente",
}

# Renaming by name (not by position) cannot mislabel a column if the CSV's
# column order changes; errors="raise" makes a missing or renamed source header
# fail loudly instead of exporting a partially translated file.
hihd_data_es = hihd_data.rename(columns=COLUMN_TRANSLATIONS, errors="raise")

# Save the translated data into a new CSV file (row index omitted).
hihd_data_es.to_csv("datasets/hdi-vs-hihd-spanish.csv", index=False)

# Preview the translated frame as the cell output.
hihd_data_es.head()

In [2]:
# Summarize each column's dtype and non-null count to see how sparse the data is.
hihd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55738 entries, 0 to 55737
Data columns (total 7 columns):
 #   Column                                                         Non-Null Count  Dtype  
---  ------                                                         --------------  -----  
 0   Entity                                                         55738 non-null  object 
 1   Code                                                           54084 non-null  object 
 2   Year                                                           55738 non-null  int64  
 3   Human Development Index (UNDP)                                 5001 non-null   float64
 4   Historical Index of Human Development (Prados de la Escosura)  3210 non-null   float64
 5   Population (historical estimates)                              55656 non-null  float64
 6   Continent                                                      285 non-null    object 
dtypes: float64(3), int64(1), object(3)
memory usage: 3.0+ MB


In [3]:
from statistics_calc import descriptors

# Generate descriptors
# NOTE(review): `descriptors` comes from the project-local `statistics_calc`
# module; judging by the rendered table it appears to return describe()-style
# statistics plus skewness and kurtosis for the numeric columns — confirm
# against its source.
desc = descriptors(hihd_data)

# Export the DataFrame to a CSV file, writing decimals with a comma
desc.to_csv("descriptors/hihd.csv", decimal=",")

# Display the descriptor table as the cell output
desc

Unnamed: 0,Year,Human Development Index (UNDP),Historical Index of Human Development (Prados de la Escosura),Population (historical estimates)
count,55738.0,5001.0,3210.0,55656.0
mean,1604.466055,0.651761,0.287542,32463520.0
std,1415.644065,0.167121,0.202527,250302800.0
min,-10000.0,0.19,0.02,1.0
25%,1832.0,0.524,0.1125,133874.0
50%,1899.0,0.677,0.25,1218570.0
75%,1964.0,0.781,0.44,5396250.0
max,2021.0,0.953,0.9,7874966000.0
skewness,-5.962559,-0.394236,0.589366,17.44683
kurtosis,37.528641,-0.694236,-0.445863,386.1772


In [6]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest (these are the original English headers of
# the loaded CSV, so `hihd_data` must still carry them when this cell runs)
cols_cualitativas = ["Entity", "Code", "Continent"]

# Generate statistics
# NOTE(review): `qualitative_stats` is a project-local helper from
# `statistics_calc`; the rendered table suggests it reports mode, unique-value
# and null-percentage figures per column — confirm against its source.
estadisticas = qualitative_stats(hihd_data, cols_cualitativas)

# Save the data to a CSV file (no index column, comma as decimal separator)
estadisticas.to_csv("descriptors/hihd_quality.csv", index=False, decimal=",")

# Show the statistics DataFrame
estadisticas

  estadisticas = pd.concat([estadisticas, nueva_fila], ignore_index=True)


Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Valores unicos (#),Valores unicos (%),Valores nulos (%)
0,Entity,Afghanistan,259,0.464674,294,0.527468,0.0
1,Code,AFG,259,0.464674,286,0.513115,2.967455
2,Continent,Europe,75,0.134558,7,0.012559,99.488679


In [5]:
# Per-column count of missing values, keeping only the columns that actually
# have gaps and ordering them from fewest to most missing.
missing = (
    hihd_data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
missing

Population (historical estimates)                                   82
Code                                                              1654
Human Development Index (UNDP)                                   50737
Historical Index of Human Development (Prados de la Escosura)    52528
Continent                                                        55453
dtype: int64

In [2]:
# Domain size of every column: how many distinct non-null values it holds
unique_values = hihd_data.nunique()

# Turn the Series into a two-column table: the former index (column names)
# becomes "Columna" and the counts become "Dominio"
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Write the table to CSV without the row index, then display it
unique_values_df.to_csv("descriptors/hihd_unique_values.csv", index=False, decimal=".")
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,294
1,Code,286
2,Year,259
3,Human Development Index (UNDP),702
4,Historical Index of Human Development (Prados ...,89
5,Population (historical estimates),51142
6,Continent,7


In [7]:
# Total number of cells (rows x columns) in the full dataset
total_cells = hihd_data.size
# Missing cells = all cells minus the non-null cells counted per column
missing_cells = total_cells - hihd_data.count().sum()
missing_cells

160454

In [8]:
# Keep only the rows whose "Code" value is present (non-null)
hihd_data_clean = hihd_data.loc[hihd_data["Code"].notna()]

# Number of cells (rows x columns) left after the filter
total_cells = hihd_data_clean.size
total_cells

378588