In [1]:
import os, pandas as pd
import numpy as np

# Load every input file into its own DataFrame.
# The files are addressed by bare filename (relative to the working directory).
country_isos = pd.read_excel('country_isos.xlsx')

# NOTE: 'poblacion' comes from co2.csv; later cells read its 'year',
# 'country' and 'population' columns.
poblacion = pd.read_csv("co2.csv")
obesidad = pd.read_csv("obesidad.csv")
gripe = pd.read_csv("annual-mortality-rate-from-seasonal-influenza-ages-65.csv")

In [2]:
# DATA CLEANING 1: the selected variable is each country's population in 2020.
#
# The source holds rows for many years, so the pipeline below:
#   1. keeps only the 2020 rows and only the 'country' / 'population' columns,
#   2. gives both columns their final names ('Country', 'Population_in_2020'),
#   3. renumbers the rows from 0,
#   4. upper-cases the country names.
poblacion = (
    poblacion
    .loc[poblacion['year'] == 2020, ['country', 'population']]
    .rename(columns={'country': 'Country', 'population': 'Population_in_2020'})
    .reset_index(drop=True)
    .assign(Country=lambda tabla: tabla['Country'].str.upper())
)



In [3]:
# Check that the first cleaned table looks right; this is the result.
poblacion

Unnamed: 0,Country,Population_in_2020
0,AFGHANISTAN,3.897224e+07
1,AFRICA,1.360677e+09
2,AFRICA (GCP),
3,ALAND ISLANDS,
4,ALBANIA,2.866850e+06
...,...,...
266,WESTERN SAHARA,5.560600e+05
267,WORLD,7.840953e+09
268,YEMEN,3.228404e+07
269,ZAMBIA,1.892772e+07


In [4]:
# DATA CLEANING 2: adult obesity prevalence.
#
# NOTE(review): the original comment said the index is reset "after sorting",
# but nothing is sorted in this cell; the reset is kept as-is so the result
# is unchanged.
obesidad = (
    obesidad
    .reset_index(drop=True)
    # 'Pos' is dropped.
    .drop(columns=['Pos'])
    # 'Name' becomes 'Country'; 'Value' takes the name of this source's variable.
    .rename(columns={'Name': 'Country', 'Value': 'Obesity_adult_prevalence_rate'})
    # Country names are upper-cased.
    .assign(Country=lambda tabla: tabla['Country'].str.upper())
)

# Show the cleaned table.
obesidad

Unnamed: 0,Country,Obesity_adult_prevalence_rate
0,AMERICAN SAMOA,74.6
1,NAURU,71.1
2,COOK ISLANDS,63.7
3,TOKELAU,63.4
4,TONGA,57.6
...,...,...
186,MADAGASCAR,1.6
187,ERITREA,1.5
188,NEPAL,1.4
189,BANGLADESH,1.1


In [5]:
# DATA CLEANING 3: annual number of deaths per 100,000 people in the
# over-65 age group (seasonal influenza).
#
# NOTE(review): 'Year' is dropped without filtering on it first, which
# presumably relies on the file having a single row per country - confirm.
gripe = (
    gripe
    # 'Entity' becomes 'Country'; 'rate over65' becomes 'numero_anual_muertes'
    # (the original comment wrongly said it was renamed to "Country").
    .rename(columns={'Entity': 'Country', 'rate over65': 'numero_anual_muertes'})
    # Country names are upper-cased.
    .assign(Country=lambda tabla: tabla['Country'].str.upper())
    # 'Year' and 'Code' are not kept in the cleaned table.
    .drop(columns=['Year', 'Code'])
)



In [6]:
# Display the cleaned influenza table.
gripe

Unnamed: 0,Country,numero_anual_muertes
0,AFGHANISTAN,62.57
1,AFRICA,63.92
2,ALBANIA,29.59
3,ALGERIA,54.98
4,AMERICAS,53.60
...,...,...
195,WESTERN PACIFIC,43.45
196,WORLD,53.40
197,YEMEN,58.73
198,ZAMBIA,54.65


In [7]:
# With most of the data cleaned, take a quick look at it.
# Not strictly required, but the (rows, columns) of each table gives an idea
# of how the three sources ended up after cleaning.
tuple(tabla.shape for tabla in (poblacion, obesidad, gripe))

((271, 2), (191, 2), (200, 2))

In [8]:
# First merge: population with obesity, joined on the shared 'Country' column.
# An outer join keeps the countries that appear in only one of the two tables.
merge_data1 = pd.merge(poblacion, obesidad, how='outer', on='Country')

# Inspect the result of the first merge.
merge_data1

Unnamed: 0,Country,Population_in_2020,Obesity_adult_prevalence_rate
0,AFGHANISTAN,3.897224e+07,2.2
1,AFRICA,1.360677e+09,
2,AFRICA (GCP),,
3,ALAND ISLANDS,,
4,ALBANIA,2.866850e+06,21.3
...,...,...,...
280,"CONGO, REPUBLIC OF THE",,4.7
281,BURMA,,4.0
282,"KOREA, NORTH",,3.9
283,TIMOR-LESTE,,2.7


In [9]:
# Second merge: attach the influenza table to the result of the first merge.
# A left join keeps every row of merge_data1 and adds the mortality rate
# where the 'Country' value matches.
merge_data2 = pd.merge(merge_data1, gripe, how='left', on='Country')

# Inspect the result of the second merge.
merge_data2

Unnamed: 0,Country,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,3.897224e+07,2.2,62.57
1,AFRICA,1.360677e+09,,63.92
2,AFRICA (GCP),,,
3,ALAND ISLANDS,,,
4,ALBANIA,2.866850e+06,21.3,29.59
...,...,...,...,...
280,"CONGO, REPUBLIC OF THE",,4.7,
281,BURMA,,4.0,
282,"KOREA, NORTH",,3.9,
283,TIMOR-LESTE,,2.7,


In [10]:
# Final merge: the ISO table provided by the professor is the left side, so
# every one of its rows is kept and the three collected variables are added
# where the 'Country' value matches.
# NOTE(review): rows without a match get NaN in the three variables -
# presumably because the country is spelled differently across sources; confirm.
merge_final = pd.merge(country_isos, merge_data2, how='left', on='Country')

# Inspect the result.
merge_final

Unnamed: 0,Country,Officialstatename,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The Islamic Republic of Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The Republic of Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The People's Democratic Republic of Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The Republic of Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA AND BARBUDA,Antigua and Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The Republic of Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET NAM,The Socialist Republic of Viet Nam,.vn,VN,VNM,,,
168,YEMEN,The Republic of Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The Republic of Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [11]:
# Build an alternative version of the data in which every country that has a
# NaN in any of its cells is removed, and show which rows are affected.

# Boolean mask: True for rows with at least one NaN.
mascara_nan = merge_final.isna().any(axis=1)

# The rows (countries) that contain at least one missing value.
filas_con_nan = merge_final[mascara_nan]

# Independent copy of the data without those rows; merge_final itself is left
# untouched. The original index labels are kept.
merge_final_copia = merge_final.dropna(axis=0, how='any').copy()

# Only the last expression of a cell is rendered, so the rows with NaN are
# displayed here at the end (a bare expression mid-cell shows nothing).
filas_con_nan

In [12]:
# Final cleaning of the merged data.
#
# The value columns should hold one coherent kind of value (numbers), so every
# NaN in the frame is replaced by 0.
# NOTE(review): after this step a 0 can mean "no data" rather than a real
# measurement - keep that in mind when using these columns.
merge_final = merge_final.replace(np.nan, 0)

# Inspect the final merge.
merge_final

Unnamed: 0,Country,Officialstatename,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The Islamic Republic of Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The Republic of Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The People's Democratic Republic of Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The Republic of Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA AND BARBUDA,Antigua and Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The Republic of Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET NAM,The Socialist Republic of Viet Nam,.vn,VN,VNM,0.0,0.0,0.00
168,YEMEN,The Republic of Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The Republic of Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [13]:
# Convert the three value columns to text and verify their content.
columnas_numericas = [
    'Obesity_adult_prevalence_rate',
    'Population_in_2020',
    'numero_anual_muertes',
]

# Change the dtype of the cells of these columns to str.
for columna in columnas_numericas:
    merge_final[columna] = merge_final[columna].astype(str)

# Check that every cell contains at least one digit. Previously the result of
# str.contains was computed and thrown away, so nothing was actually verified
# (and no whitespace was removed, despite what the old comment said).
for columna in columnas_numericas:
    sin_digitos = ~merge_final[columna].str.contains(pat=r'\d', regex=True)
    assert not sin_digitos.any(), (
        f"Column {columna!r} has {int(sin_digitos.sum())} value(s) without any digit"
    )

# Display the data after the conversion.
merge_final

Unnamed: 0,Country,Officialstatename,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The Islamic Republic of Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The Republic of Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The People's Democratic Republic of Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The Republic of Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA AND BARBUDA,Antigua and Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The Republic of Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET NAM,The Socialist Republic of Viet Nam,.vn,VN,VNM,0.0,0.0,0.0
168,YEMEN,The Republic of Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The Republic of Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [14]:
# Give the official-name column a readable name and replace the blank spaces
# inside the official names and the country names with underscores.
merge_final = (
    merge_final
    .rename(columns={'Officialstatename': 'Official_State_Name'})
    .assign(
        Official_State_Name=lambda tabla: tabla['Official_State_Name'].str.replace(' ', '_', regex=True),
        Country=lambda tabla: tabla['Country'].str.replace(' ', '_', regex=True),
    )
)

# Review the data.
merge_final

Unnamed: 0,Country,Official_State_Name,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The_Islamic_Republic_of_Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The_Republic_of_Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The_People's_Democratic_Republic_of_Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The_Republic_of_Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA_AND_BARBUDA,Antigua_and_Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The_Republic_of_Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET_NAM,The_Socialist_Republic_of_Viet_Nam,.vn,VN,VNM,0.0,0.0,0.0
168,YEMEN,The_Republic_of_Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The_Republic_of_Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [15]:
# Strip leading/trailing whitespace from every column name.
merge_final = merge_final.rename(columns=str.strip)

# Final result, in the variant where the NaN values have become 0.
merge_final

Unnamed: 0,Country,Official_State_Name,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The_Islamic_Republic_of_Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The_Republic_of_Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The_People's_Democratic_Republic_of_Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The_Republic_of_Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA_AND_BARBUDA,Antigua_and_Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The_Republic_of_Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET_NAM,The_Socialist_Republic_of_Viet_Nam,.vn,VN,VNM,0.0,0.0,0.0
168,YEMEN,The_Republic_of_Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The_Republic_of_Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [16]:
# Print the column dtypes and non-null counts of merge_final.
merge_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Country                        171 non-null    object
 1   Official_State_Name            171 non-null    object
 2   InternetccTLD                  171 non-null    object
 3   iso2                           171 non-null    object
 4   iso3                           171 non-null    object
 5   Population_in_2020             171 non-null    object
 6   Obesity_adult_prevalence_rate  171 non-null    object
 7   numero_anual_muertes           171 non-null    object
dtypes: object(8)
memory usage: 10.8+ KB


In [17]:
# Fix the dtype of the three value columns: cast them to float64 in one call.
merge_final = merge_final.astype({
    'Population_in_2020': 'float64',
    'Obesity_adult_prevalence_rate': 'float64',
    'numero_anual_muertes': 'float64',
})

In [18]:
# Print the column dtypes and non-null counts of merge_final again, after the float64 cast.
merge_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171 entries, 0 to 170
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        171 non-null    object 
 1   Official_State_Name            171 non-null    object 
 2   InternetccTLD                  171 non-null    object 
 3   iso2                           171 non-null    object 
 4   iso3                           171 non-null    object 
 5   Population_in_2020             171 non-null    float64
 6   Obesity_adult_prevalence_rate  171 non-null    float64
 7   numero_anual_muertes           171 non-null    float64
dtypes: float64(3), object(5)
memory usage: 10.8+ KB


In [20]:
# Apply to the NaN-free copy the same final formatting used for merge_final:
# readable official-name column, underscores instead of blank spaces in the
# names, and float64 for the three value columns.
merge_final_copia = (
    merge_final_copia
    .rename(columns={'Officialstatename': 'Official_State_Name'})
    .assign(
        Official_State_Name=lambda tabla: tabla['Official_State_Name'].str.replace(' ', '_', regex=True),
        Country=lambda tabla: tabla['Country'].str.replace(' ', '_', regex=True),
    )
    .astype({
        'Population_in_2020': 'float64',
        'Obesity_adult_prevalence_rate': 'float64',
        'numero_anual_muertes': 'float64',
    })
)

# Print the column dtypes and non-null counts of the result.
merge_final_copia.info()

<class 'pandas.core.frame.DataFrame'>
Index: 138 entries, 0 to 170
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Country                        138 non-null    object 
 1   Official_State_Name            138 non-null    object 
 2   InternetccTLD                  138 non-null    object 
 3   iso2                           138 non-null    object 
 4   iso3                           138 non-null    object 
 5   Population_in_2020             138 non-null    float64
 6   Obesity_adult_prevalence_rate  138 non-null    float64
 7   numero_anual_muertes           138 non-null    float64
dtypes: float64(3), object(5)
memory usage: 9.7+ KB


In [21]:
# Display the copy from which the rows containing NaN were dropped.
merge_final_copia

Unnamed: 0,Country,Official_State_Name,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The_Islamic_Republic_of_Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The_Republic_of_Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The_People's_Democratic_Republic_of_Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The_Republic_of_Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA_AND_BARBUDA,Antigua_and_Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
165,URUGUAY,The_Oriental_Republic_of_Uruguay,.uy,UY,URY,3429087.0,24.8,61.77
166,UZBEKISTAN,The_Republic_of_Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
168,YEMEN,The_Republic_of_Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The_Republic_of_Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [22]:
# There are two results: merge_final_copia and merge_final

# Version that removes the countries that contained NaN
merge_final_copia

Unnamed: 0,Country,Official_State_Name,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The_Islamic_Republic_of_Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The_Republic_of_Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The_People's_Democratic_Republic_of_Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The_Republic_of_Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA_AND_BARBUDA,Antigua_and_Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
165,URUGUAY,The_Oriental_Republic_of_Uruguay,.uy,UY,URY,3429087.0,24.8,61.77
166,UZBEKISTAN,The_Republic_of_Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
168,YEMEN,The_Republic_of_Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The_Republic_of_Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65


In [23]:
# Version that treats the NaN values as 0
merge_final

Unnamed: 0,Country,Official_State_Name,InternetccTLD,iso2,iso3,Population_in_2020,Obesity_adult_prevalence_rate,numero_anual_muertes
0,AFGHANISTAN,The_Islamic_Republic_of_Afghanistan,.af,AF,AFG,38972236.0,2.2,62.57
1,ALBANIA,The_Republic_of_Albania,.al,AL,ALB,2866850.0,21.3,29.59
2,ALGERIA,The_People's_Democratic_Republic_of_Algeria,.dz,DZ,DZA,43451668.0,16.0,54.98
3,ANGOLA,The_Republic_of_Angola,.ao,AO,AGO,33428490.0,6.4,73.83
4,ANTIGUA_AND_BARBUDA,Antigua_and_Barbuda,.ag,AG,ATG,92672.0,25.6,42.39
...,...,...,...,...,...,...,...,...
166,UZBEKISTAN,The_Republic_of_Uzbekistan,.uz,UZ,UZB,33526662.0,15.1,37.22
167,VIET_NAM,The_Socialist_Republic_of_Viet_Nam,.vn,VN,VNM,0.0,0.0,0.00
168,YEMEN,The_Republic_of_Yemen,.ye,YE,YEM,32284044.0,14.5,58.73
169,ZAMBIA,The_Republic_of_Zambia,.zm,ZM,ZMB,18927716.0,3.6,54.65
