In [19]:
import pandas as pd

# Load the Human Development Index table from the project's datasets folder.
hdi_data = pd.read_csv(filepath_or_buffer="datasets/human-development-index.csv")

# Preview the first five rows as the cell output.
hdi_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
2,Afghanistan,AFG,2002,0.373
3,Afghanistan,AFG,2003,0.383
4,Afghanistan,AFG,2004,0.398


In [20]:
# Translate the column names to Spanish.
#
# An explicit old-name -> new-name mapping is used instead of assigning a bare
# list to `.columns`: each translation stays attached to the column it belongs
# to, so a change in the CSV's column order cannot silently mislabel the data.
# Re-running the cell is harmless (already-translated names are left alone).
#
# NOTE(review): this rebinds `hdi_data` with Spanish column names, while later
# cells in this notebook still look columns up by their English names
# ("Entity", "Code"). Those cells need a freshly loaded frame to run.
column_translations = {
    "Entity": "Entidad",
    "Code": "Código",
    "Year": "Año",
    "Human Development Index (UNDP)": "Índice de Desarrollo Humano (PNUD)",
}
hdi_data = hdi_data.rename(columns=column_translations)

# Fail loudly if the frame does not end up with exactly the translated columns
# (e.g. the source file gained, lost or renamed a column).
unexpected_columns = set(hdi_data.columns) ^ set(column_translations.values())
if unexpected_columns:
    raise ValueError(
        f"Column translation is out of sync with the data: {sorted(unexpected_columns)}"
    )

hdi_data.head()

Unnamed: 0,Entidad,Código,Año,Índice de Desarrollo Humano (PNUD)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
2,Afghanistan,AFG,2002,0.373
3,Afghanistan,AFG,2003,0.383
4,Afghanistan,AFG,2004,0.398


In [21]:
# Blank out every row whose year is 1980 or 1985: all columns of those rows
# become null.
years_to_blank = [1980, 1985]

# Nulls cannot be stored in an int64 column. Cast "Año" to float64 explicitly
# before the assignment instead of relying on pandas to upcast it silently
# (recent pandas versions deprecate that implicit upcast on assignment).
# The resulting dtype is the same float64 the implicit upcast produced, so the
# years still display as e.g. 2002.0.
hdi_data["Año"] = hdi_data["Año"].astype("float64")

rows_to_blank = hdi_data["Año"].isin(years_to_blank)
hdi_data.loc[rows_to_blank, :] = None

hdi_data.head()

Unnamed: 0,Entidad,Código,Año,Índice de Desarrollo Humano (PNUD)
0,,,,
1,,,,
2,Afghanistan,AFG,2002.0,0.373
3,Afghanistan,AFG,2003.0,0.383
4,Afghanistan,AFG,2004.0,0.398


In [22]:
# Write the translated frame to a new CSV file, leaving out the row index.
hdi_data.to_csv(
    path_or_buf="datasets/human-development-index-spanish.csv",
    index=False,
)

In [2]:
# Print a structural summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage.
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          5001 non-null   object 
 1   Code                            4973 non-null   object 
 2   Year                            5001 non-null   int64  
 3   Human Development Index (UNDP)  5001 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB


In [3]:
from statistics_calc import descriptors

# Build the descriptive-statistics table for the HDI frame with the project's
# `descriptors` helper.
desc = descriptors(hdi_data)

# Persist the table, writing floats with a comma as the decimal mark.
# NOTE(review): the field separator is left at its default (","), the same
# character used for decimals here -- confirm downstream readers cope with it.
desc.to_csv(
    path_or_buf="descriptors/hdi.csv",
    decimal=",",
)

# Last expression of the cell: render the table in the notebook.
desc

Unnamed: 0,Year,Human Development Index (UNDP)
count,5001.0,5001.0
mean,2003.407119,0.651761
std,8.954743,0.167121
min,1980.0,0.19
25%,1997.0,0.524
50%,2004.0,0.677
75%,2011.0,0.781
max,2017.0,0.953
skewness,-0.422293,-0.394236
kurtosis,-0.513055,-0.694236


In [6]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest.
cols_cualitativas = ["Entity", "Code"]

# Compute the per-column summary with the project's `qualitative_stats` helper.
estadisticas = qualitative_stats(hdi_data, cols_cualitativas)

# Save the summary to a CSV file: no index column, comma as the decimal mark.
estadisticas.to_csv(
    path_or_buf="descriptors/hdi_quality.csv",
    decimal=",",
    index=False,
)

# Show the statistics DataFrame as the cell output.
estadisticas

  estadisticas = pd.concat([estadisticas, nueva_fila], ignore_index=True)


Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Valores unicos (#),Valores unicos (%),Valores nulos (%)
0,Entity,Albania,30,0.59988,190,3.79924,0.0
1,Code,ALB,30,0.59988,189,3.779244,0.559888


In [5]:
# Per-column count of missing values, keeping only the columns that have at
# least one, ordered from fewest to most missing.
missing = (
    hdi_data
    .isna()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values()
)
missing

Code    28
dtype: int64

In [2]:
# Number of distinct values per column (the Series is indexed by column name).
unique_values = hdi_data.nunique()

# Turn the Series into a two-column table: the former index becomes "Columna"
# and the counts become "Dominio".
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Persist the table without the row index.
unique_values_df.to_csv(
    path_or_buf="descriptors/hdi_unique_values.csv",
    decimal=".",
    index=False,
)

# Render the table as the cell output.
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,190
1,Code,189
2,Year,30
3,Human Development Index (UNDP),702


In [2]:
# Total number of missing cells across the whole frame.
missing_cells = hdi_data.isna().to_numpy().sum()

# Total number of cells (rows x columns); kept alongside for reference.
total_cells = hdi_data.size

missing_cells

28

In [3]:
# Discard every row that has no value in the "Code" column.
# NOTE(review): this looks the column up by its English name, so it needs a
# frame whose columns have not been renamed to Spanish ("Código") by the
# translation cell earlier in this notebook.
hdi_data = hdi_data.dropna(axis="index", how="any", subset=["Code"])

# Number of cells (rows x columns) left after the drop.
total_cells = hdi_data.size
total_cells

19892