In [3]:
import pandas as pd

# Load the GNI-per-capita dataset and preview its first rows.
income_csv_path = "datasets/gross-national-income-per-capita.csv"
national_income_data = pd.read_csv(income_csv_path)
national_income_data.head()

Unnamed: 0,Entity,Code,Year,"GNI per capita, PPP (constant 2017 international $)"
0,Afghanistan,AFG,2017,2085.487571
1,Africa Eastern and Southern,,1990,2554.595257
2,Africa Eastern and Southern,,1991,2495.966953
3,Africa Eastern and Southern,,1992,2421.787983
4,Africa Eastern and Southern,,1993,2397.01279


In [5]:
# Relabel the columns in Spanish. The assignment is positional: the list must
# hold one name per existing column, in the current column order.
# NOTE(review): this changes the frame in place, so any later cell that selects
# columns by name sees the Spanish labels.
spanish_column_names = [
    "Entidad",
    "Código",
    "Año",
    "Ingreso Nacional Bruto per cápita",
]
national_income_data.columns = spanish_column_names
national_income_data.head()

Unnamed: 0,Entidad,Código,Año,Ingreso Nacional Bruto per cápita
0,Afghanistan,AFG,2017,2085.487571
1,Africa Eastern and Southern,,1990,2554.595257
2,Africa Eastern and Southern,,1991,2495.966953
3,Africa Eastern and Southern,,1992,2421.787983
4,Africa Eastern and Southern,,1993,2397.01279


In [6]:
# Write the relabelled frame to its own CSV file, leaving out the row index.
spanish_csv_path = "datasets/gross-national-income-per-capita-spanish.csv"
national_income_data.to_csv(spanish_csv_path, index=False)

In [11]:
# Count the distinct entities (the "Entidad" column).
entity_names = national_income_data["Entidad"]
entity_names.nunique()

235

In [10]:
# Tally the rows recorded for each year, most frequent first. Assuming one row
# per entity and year, a year whose tally equals the number of distinct
# entities has an entry for every entity.
rows_per_year = national_income_data["Año"].value_counts()
rows_per_year

Año
2017    235
2016    195
2015    195
2014    195
2013    194
2012    193
2018    192
2011    192
2010    190
2009    185
2008    183
2019    181
2007    180
2006    177
2005    170
2004    166
2003    162
2002    162
2001    161
2000    158
1998    149
1999    148
1997    148
1996    145
1995    144
1994    125
2020    122
1993    118
1992    116
1991    115
1990    112
Name: count, dtype: int64

In [3]:
# Print the frame's structural summary: index range, per-column non-null
# counts, dtypes and memory usage.
# NOTE(review): the saved output lists the English column names, so it appears
# to come from a kernel where the Spanish relabelling had not been applied —
# confirm by re-running the notebook top to bottom.
national_income_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5108 entries, 0 to 5107
Data columns (total 4 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   Entity                                               5108 non-null   object 
 1   Code                                                 3930 non-null   object 
 2   Year                                                 5108 non-null   int64  
 3   GNI per capita, PPP (constant 2017 international $)  5108 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 159.8+ KB


In [4]:
from statistics_calc import descriptors

# Build the descriptive-statistics table with the project helper.
desc = descriptors(national_income_data)

# Persist the table as CSV, writing decimals with a comma.
descriptors_csv_path = "descriptors/income.csv"
desc.to_csv(descriptors_csv_path, decimal=",")

# Show the table as the cell's output.
desc


Unnamed: 0,Year,"GNI per capita, PPP (constant 2017 international $)"
count,5108.0,5108.0
mean,2006.272905,16214.757852
std,8.573295,16924.432425
min,1990.0,416.03266
25%,1999.0,3594.229805
50%,2007.0,10029.822354
75%,2014.0,23289.224641
max,2020.0,112322.24818
skewness,-0.194268,1.59091
kurtosis,-1.108965,2.484584


In [5]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest: every non-numeric column. They are selected
# by dtype rather than by hard-coded name so the cell works whether the frame
# still carries the original English labels or the Spanish ones assigned
# earlier in the notebook.
cols_cualitativas = (
    national_income_data.select_dtypes(exclude="number").columns.tolist()
)

# Compute the per-column statistics with the project helper.
estadisticas = qualitative_stats(national_income_data, cols_cualitativas)

# Save the statistics to a CSV file (decimal comma, no index column).
estadisticas.to_csv("descriptors/income_quality.csv", index=False, decimal=",")

# Display the statistics DataFrame.
estadisticas


  estadisticas = pd.concat([estadisticas, nueva_fila], ignore_index=True)


Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Valores unicos (#),Valores unicos (%),Valores nulos (%)
0,Entity,Africa Eastern and Southern,31,0.606891,235,4.600626,0.0
1,Code,ARG,31,0.606891,194,3.797964,23.061864


In [9]:
# Null count per column, keeping only the columns that have at least one
# missing value, ordered from fewest to most.
null_counts = national_income_data.isnull().sum()
missing = null_counts[null_counts > 0].sort_values()
missing

Code    1178
dtype: int64

In [2]:
# Number of distinct values per column.
unique_values = national_income_data.nunique()

# Reshape the Series into a two-column table: the column name ("Columna") and
# its count of distinct values ("Dominio").
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Write the table to CSV without the row index.
unique_values_csv_path = "descriptors/income_unique_values.csv"
unique_values_df.to_csv(unique_values_csv_path, index=False, decimal=".")
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,235
1,Code,194
2,Year,31
3,"GNI per capita, PPP (constant 2017 internation...",5056


In [11]:
# Total number of cells in the frame (rows x columns).
total_cells = national_income_data.size

# Sum the per-column null counts to get the number of empty cells overall.
nulls_per_column = national_income_data.isnull().sum()
missing_cells = nulls_per_column.sum()
missing_cells

1178

In [9]:
# Drop the rows that have no country code. The column is called "Code" in the
# raw file and "Código" once the Spanish labels have been assigned, so use
# whichever is present instead of failing with a KeyError.
code_column = "Code" if "Code" in national_income_data.columns else "Código"

# NOTE(review): this rebinds national_income_data to the filtered frame, so
# earlier cells show different results if they are re-run after this one.
national_income_data = national_income_data.dropna(subset=[code_column])

# Count the cells (rows x columns) left after the drop.
total_cells = national_income_data.size
total_cells

15720