In [1]:
import pandas as pd

# Load the GNI-per-capita dataset from the project's datasets folder
national_income_data = pd.read_csv(
    "datasets/gross-national-income-per-capita.csv"
)

# Preview the first five rows as the cell output
national_income_data.head(n=5)

Unnamed: 0,Entity,Code,Year,"GNI per capita, PPP (constant 2017 international $)"
0,Afghanistan,AFG,2017,2085.487571
1,Africa Eastern and Southern,,1990,2554.595257
2,Africa Eastern and Southern,,1991,2495.966953
3,Africa Eastern and Southern,,1992,2421.787983
4,Africa Eastern and Southern,,1993,2397.01279


In [3]:
# Write the current frame to a separate CSV file, leaving out the row index.
# NOTE(review): the target file is named "cleaned", but no cleaning step is
# visible ahead of this cell — confirm the intended cell order.
national_income_data.to_csv(
    path_or_buf="datasets/gross-national-income-per-capita-cleaned.csv",
    index=False,
)

In [8]:
# Print the frame summary: index range, column names, non-null counts, dtypes and memory usage
national_income_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5108 entries, 0 to 5107
Data columns (total 4 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Entidad                            5108 non-null   object 
 1   Código                             3930 non-null   object 
 2   Año                                5108 non-null   int64  
 3   Ingreso Nacional Bruto per cápita  5108 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 159.8+ KB


In [7]:
from statistics_calc import descriptors, translate_descriptors

# Build the descriptor table and pass it straight through the helper that
# translates the descriptors to Spanish
desc = translate_descriptors(descriptors(national_income_data))

# Persist the table as CSV, writing a comma as the decimal mark
desc.to_csv("descriptors/income-quantitative.csv", decimal=",")

# Show the table as the cell output
desc

Unnamed: 0,Año,Ingreso Nacional Bruto per cápita
Total de valores,5108.0,5108.0
Media,2006.272905,16214.757852
Desviación estándar,8.573295,16924.432425
Valor mínimo,1990.0,416.03266
Q1,1999.0,3594.229805
Q2,2007.0,10029.822354
Q3,2014.0,23289.224641
Q4,2020.0,112322.24818
Asimetría,-0.194268,1.59091
Curtosis,-1.108965,2.484584


In [6]:
from statistics_calc import qualitative_stats

# Qualitative columns of interest.
# NOTE(review): these Spanish column names must already exist on the frame;
# the raw CSV preview shows English headers (Entity, Code) — confirm that a
# rename step runs before this cell.
cols_cualitativas = ["Entidad", "Código"]

# Compute the statistics for the selected columns
estadisticas = qualitative_stats(national_income_data, cols_cualitativas)

# Save the result to a CSV file (no index column, comma as decimal mark)
estadisticas.to_csv(
    "descriptors/income-qualitative.csv",
    index=False,
    decimal=",",
)

# Display the statistics DataFrame
estadisticas

Unnamed: 0,Columna,Moda,Moda (#),Moda (%),Total de valores,Valores unicos (#),Valores unicos (%),Valores nulos (%)
1,Entidad,Africa Eastern and Southern,31,0.606891,5108,235,4.600626,0.0
2,Código,ARG,31,0.606891,3930,194,3.797964,23.061864


In [9]:
# Per-column count of missing values, restricted to the columns that have at
# least one gap and ordered ascending by that count
missing = (
    national_income_data.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values()
)
missing

Code    1178
dtype: int64

In [2]:
# Number of distinct values found in each column
unique_values = national_income_data.nunique()

# Turn the Series into a two-column DataFrame: the former index becomes
# "Columna" and the counts become "Dominio"
unique_values_df = unique_values.rename_axis("Columna").reset_index(name="Dominio")

# Persist the table as CSV without the row index
unique_values_df.to_csv(
    "descriptors/income_unique_values.csv",
    index=False,
    decimal=".",
)
unique_values_df

Unnamed: 0,Columna,Dominio
0,Entity,235
1,Code,194
2,Year,31
3,"GNI per capita, PPP (constant 2017 internation...",5056


In [11]:
# Total number of cells (rows x columns) in the frame
total_cells = national_income_data.size

# Number of individual cells holding a missing value, summed over all columns
missing_cells = national_income_data.isna().sum().sum()
missing_cells

1178

In [9]:
# Discard every row whose "Code" value is missing.
# NOTE(review): this expects the English column name "Code"; the info() output
# earlier in the notebook lists Spanish column names — confirm which naming is
# in effect when this cell runs.
national_income_data = national_income_data.dropna(
    axis="index",
    subset=["Code"],
)

# Cell count (rows x columns) of the frame after the drop
total_cells = national_income_data.size
total_cells

15720