In [1]:
import pandas as pd

# Load the raw Human Development Index dataset from the local CSV file
hdi_data = pd.read_csv(filepath_or_buffer="datasets/human-development-index.csv")

# Preview the first five rows to check the columns were parsed as expected
hdi_data.head(n=5)

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
2,Afghanistan,AFG,2002,0.373
3,Afghanistan,AFG,2003,0.383
4,Afghanistan,AFG,2004,0.398


In [2]:
# Summarize the loaded frame: row count, column dtypes and non-null counts
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          5001 non-null   object 
 1   Code                            4973 non-null   object 
 2   Year                            5001 non-null   int64  
 3   Human Development Index (UNDP)  5001 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 156.4+ KB


In [3]:
# Keep only the rows whose "Code" value is present (discard rows where it is missing)
hdi_data = hdi_data[hdi_data["Code"].notna()]

# Confirm that every remaining column is fully populated
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4973 entries, 0 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          4973 non-null   object 
 1   Code                            4973 non-null   object 
 2   Year                            4973 non-null   int64  
 3   Human Development Index (UNDP)  4973 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 194.3+ KB


In [4]:
# Collect every observation whose year is earlier than 1990
old_data = hdi_data.loc[hdi_data["Year"].lt(1990)]

# Display the selected rows
old_data

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
0,Afghanistan,AFG,1980,0.228
1,Afghanistan,AFG,1985,0.273
18,Albania,ALB,1980,0.625
19,Albania,ALB,1985,0.623
126,Argentina,ARG,1980,0.675
...,...,...,...,...
4884,Vietnam,VNM,1985,0.479
4941,Zambia,ZMB,1980,0.418
4942,Zambia,ZMB,1985,0.409
4971,Zimbabwe,ZWE,1980,0.437


In [5]:
# Keep only the observations from 1990 onward (drops all rows prior to 1990).
# Filtering on "Year" directly, rather than dropping `old_data.index`, makes this
# cell idempotent: re-running it no longer raises a KeyError for index labels
# that were already removed, and it does not depend on `old_data` being current.
hdi_data = hdi_data[hdi_data["Year"] >= 1990]
hdi_data

Unnamed: 0,Entity,Code,Year,Human Development Index (UNDP)
2,Afghanistan,AFG,2002,0.373
3,Afghanistan,AFG,2003,0.383
4,Afghanistan,AFG,2004,0.398
5,Afghanistan,AFG,2005,0.408
6,Afghanistan,AFG,2006,0.417
...,...,...,...,...
4996,Zimbabwe,ZWE,2013,0.516
4997,Zimbabwe,ZWE,2014,0.525
4998,Zimbabwe,ZWE,2015,0.529
4999,Zimbabwe,ZWE,2016,0.532


In [6]:
# Re-check the row count and non-null counts after the preceding filtering steps
hdi_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4757 entries, 2 to 5000
Data columns (total 4 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          4757 non-null   object 
 1   Code                            4757 non-null   object 
 2   Year                            4757 non-null   int64  
 3   Human Development Index (UNDP)  4757 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 185.8+ KB


In [7]:
# Save the cleaned data (rows with a missing "Code" and rows prior to 1990 removed)
# to a new file; index=False keeps the DataFrame index out of the CSV.
hdi_data.to_csv("datasets/human-development-index-cleaned.csv", index=False)

In [8]:
from statistics_calc import descriptors, translate_descriptors

# Build the descriptor table for hdi_data and translate its labels to Spanish
# in a single step, so `desc` only ever holds the final (translated) table.
desc = translate_descriptors(descriptors(hdi_data))

# Write the table to CSV, using a comma as the decimal separator
desc.to_csv(
    "descriptors/hdi-quantitative.csv",
    decimal=",",
)

# Show the resulting table
desc

Unnamed: 0,Year,Human Development Index (UNDP)
Total de valores,4757.0,4757.0
Media,2004.351692,0.655067
Desviación estándar,7.941341,0.165734
Valor mínimo,1990.0,0.199
Q1,1998.0,0.529
Q2,2005.0,0.68
Q3,2011.0,0.783
Q4,2017.0,0.953
Asimetría,-0.14185,-0.39243
Curtosis,-1.133726,-0.702126


In [9]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest: every column stored with the object dtype
cols_cualitativas = hdi_data.select_dtypes(include="object").columns

# Compute the statistics for those columns
estadisticas = qualitative_stats(hdi_data, cols_cualitativas)

# Save the statistics to a CSV file (no index column, comma as decimal separator)
estadisticas.to_csv(
    "descriptors/hdi-qualitative.csv",
    index=False,
    decimal=",",
)

# Display the statistics DataFrame
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entity,Albania,28,0.588606,4757,189,3.973092,0.0
2,Code,ALB,28,0.588606,4757,189,3.973092,0.0


In [10]:
# Count the missing values in each column, keep only the columns that actually
# have missing values, and sort them in ascending order of that count.
# Written as one chain without `inplace=True`, so `missing` is assigned exactly
# once and re-running the cell never mutates a previously displayed object.
missing = (
    hdi_data.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
missing

Series([], dtype: int64)

In [11]:
# Number of distinct values found in each column of hdi_data
unique_values = hdi_data.nunique()

# Build a two-column table directly: the column name ("Columna") next to its
# distinct-value count ("Dominio")
unique_values_df = pd.DataFrame(
    {
        "Columna": unique_values.index,
        "Dominio": unique_values.to_numpy(),
    }
)

# Persist the table as CSV without the index, then display it
unique_values_df.to_csv(
    "descriptors/hdi_unique_values.csv",
    index=False,
    decimal=".",
)
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,189
1,Code,189
2,Year,28
3,Human Development Index (UNDP),692
