In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

El objetivo de este notebook es juntar los CSV de PM10 y PM2.5 de los años 2020, 2021, 2022, 2023 y 2024 en un solo CSV. También se busca verificar la calidad de los datos: unicidad, consistencia, validez y completitud.

In [2]:
# Load the PM10 files, one CSV per year (2020-2024).
pm10_paths = (
    "raw/pm10-2020.csv",
    "raw/pm10-2021.csv",
    "raw/pm10-2022.csv",
    "raw/pm10-2023.csv",
    "raw/pm10-2024.csv",
)
# Keep the per-year path names available for any cell that refers to them.
pm10_2020, pm10_2021, pm10_2022, pm10_2023, pm10_2024 = pm10_paths

# Read the files in the order listed above; per-year frame names are preserved too.
pm10_frames = [pd.read_csv(path, sep=',') for path in pm10_paths]
df_2020, df_2021, df_2022, df_2023, df_2024 = pm10_frames

# Stack the yearly frames into a single table with a fresh 0..n-1 index.
df_10 = pd.concat(pm10_frames, ignore_index=True)
df_10.head()

Unnamed: 0,Estacion,Fecha inicial,Fecha final,PM10
0,"""USME""",12/31/2020 22:00,12/31/2020 22:59,39.5
1,"""USME""",12/31/2020 21:00,12/31/2020 21:59,35.5
2,"""USME""",12/31/2020 20:00,12/31/2020 20:59,38.3
3,"""USME""",12/31/2020 19:00,12/31/2020 19:59,34.7
4,"""USME""",12/31/2020 18:00,12/31/2020 18:59,38.7


De una vez vamos a procesar la fecha inicial, que es la que vamos a tomar: dejaremos la columna "Fecha inicial" renombrada a "FECHA" y sacaremos de ahí "ANIO", "HORA", "MES" y "DIA".

In [3]:
# Process the PM10 timestamp column in a single, non-mutating chain:
#   1. rename 'Fecha inicial' -> 'FECHA' and 'Estacion' -> 'ESTACION' (uppercase names),
#   2. drop 'Fecha final', which is not used afterwards,
#   3. parse FECHA from 'month/day/year hour:minute' text into datetime,
#   4. derive ANIO, HORA, MES and DIA from the parsed timestamp.
# df_10 is only rebound once every step has succeeded, so a failure (e.g. a
# timestamp that does not match the format) cannot leave it half-transformed.
# The resulting column order matches the step-by-step in-place version:
# ESTACION, FECHA, PM10, ANIO, HORA, MES, DIA.
df_10 = (
    df_10
    .rename(columns={'Fecha inicial': 'FECHA', 'Estacion': 'ESTACION'})
    .drop(columns=['Fecha final'])
    # Overwriting an existing column with assign keeps its position in the frame.
    .assign(FECHA=lambda d: pd.to_datetime(d['FECHA'], format='%m/%d/%Y %H:%M'))
    .assign(
        ANIO=lambda d: d['FECHA'].dt.year,
        HORA=lambda d: d['FECHA'].dt.hour,
        MES=lambda d: d['FECHA'].dt.month,
        DIA=lambda d: d['FECHA'].dt.day,
    )
)

In [4]:
# Preview the first five rows of the processed PM10 table.
df_10.head(n=5)

Unnamed: 0,ESTACION,FECHA,PM10,ANIO,HORA,MES,DIA
0,"""USME""",2020-12-31 22:00:00,39.5,2020,22,12,31
1,"""USME""",2020-12-31 21:00:00,35.5,2020,21,12,31
2,"""USME""",2020-12-31 20:00:00,38.3,2020,20,12,31
3,"""USME""",2020-12-31 19:00:00,34.7,2020,19,12,31
4,"""USME""",2020-12-31 18:00:00,38.7,2020,18,12,31


Ahora vamos a cargar y procesar los datos para PM2.5

In [5]:
# Load the PM2.5 files, one CSV per year (2020-2024).
pm25_paths = (
    "raw/pm25-2020.csv",
    "raw/pm25-2021.csv",
    "raw/pm25-2022.csv",
    "raw/pm25-2023.csv",
    "raw/pm25-2024.csv",
)
# Keep the per-year path names available for any cell that refers to them.
pm25_2020, pm25_2021, pm25_2022, pm25_2023, pm25_2024 = pm25_paths

# Read the files in the order listed above; per-year frame names are preserved too.
pm25_frames = [pd.read_csv(path, sep=',') for path in pm25_paths]
df_2020_25, df_2021_25, df_2022_25, df_2023_25, df_2024_25 = pm25_frames

# Stack the yearly frames into a single table with a fresh 0..n-1 index.
df_25 = pd.concat(pm25_frames, ignore_index=True)
df_25.head()

Unnamed: 0,Estacion,Fecha inicial,Fecha final,PM2.5
0,"""USME""",12/31/2020 22:00,12/31/2020 22:59,24.0
1,"""USME""",12/31/2020 21:00,12/31/2020 21:59,28.8
2,"""USME""",12/31/2020 20:00,12/31/2020 20:59,21.8
3,"""USME""",12/31/2020 19:00,12/31/2020 19:59,17.1
4,"""USME""",12/31/2020 18:00,12/31/2020 18:59,10.6


In [6]:
# Process the PM2.5 timestamp column in a single, non-mutating chain:
#   1. rename 'Fecha inicial' -> 'FECHA' and 'Estacion' -> 'ESTACION' (uppercase names),
#   2. drop 'Fecha final', which is not used afterwards,
#   3. parse FECHA from 'month/day/year hour:minute' text into datetime,
#   4. derive ANIO, HORA, MES and DIA from the parsed timestamp.
# df_25 is only rebound once every step has succeeded, so a failure (e.g. a
# timestamp that does not match the format) cannot leave it half-transformed.
# The resulting column order matches the step-by-step in-place version:
# ESTACION, FECHA, PM2.5, ANIO, HORA, MES, DIA.
df_25 = (
    df_25
    .rename(columns={'Fecha inicial': 'FECHA', 'Estacion': 'ESTACION'})
    .drop(columns=['Fecha final'])
    # Overwriting an existing column with assign keeps its position in the frame.
    .assign(FECHA=lambda d: pd.to_datetime(d['FECHA'], format='%m/%d/%Y %H:%M'))
    .assign(
        ANIO=lambda d: d['FECHA'].dt.year,
        HORA=lambda d: d['FECHA'].dt.hour,
        MES=lambda d: d['FECHA'].dt.month,
        DIA=lambda d: d['FECHA'].dt.day,
    )
)

In [7]:
# Preview the first five rows of the processed PM2.5 table.
df_25.head(n=5)

Unnamed: 0,ESTACION,FECHA,PM2.5,ANIO,HORA,MES,DIA
0,"""USME""",2020-12-31 22:00:00,24.0,2020,22,12,31
1,"""USME""",2020-12-31 21:00:00,28.8,2020,21,12,31
2,"""USME""",2020-12-31 20:00:00,21.8,2020,20,12,31
3,"""USME""",2020-12-31 19:00:00,17.1,2020,19,12,31
4,"""USME""",2020-12-31 18:00:00,10.6,2020,18,12,31


In [8]:
# Combine PM10 and PM2.5 into a single DataFrame. Rows are matched on station and
# timestamp (plus the year/hour/month/day columns derived from it), so each output
# row carries the PM10 and PM2.5 concentrations for the same station and date.
claves_merge = ['ESTACION', 'FECHA', 'ANIO', 'HORA', 'MES', 'DIA']

# Inner join (the pandas default): only key combinations present in both tables are kept.
df_final = df_10.merge(df_25, how='inner', on=claves_merge)
df_final.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702661 entries, 0 to 702660
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   ESTACION  702661 non-null  object        
 1   FECHA     702661 non-null  datetime64[ns]
 2   PM10      702661 non-null  float64       
 3   ANIO      702661 non-null  int32         
 4   HORA      702661 non-null  int32         
 5   MES       702661 non-null  int32         
 6   DIA       702661 non-null  int32         
 7   PM2.5     702661 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int32(4), object(1)
memory usage: 32.2+ MB


In [None]:
# Export the consolidated PM10 / PM2.5 table as CSV (without the index column).
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing; otherwise a fresh checkout fails with OSError.
ruta_salida = Path('consolidados/pm_consolidado.csv')
ruta_salida.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(ruta_salida, index=False)