In [50]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

El objetivo de este notebook es juntar los CSV de PM10 —y de los demás contaminantes (PM2.5, CO, NO2, O3 y SO2)— de los años 2020, 2021, 2022 y 2023 en un solo CSV. También se busca verificar la calidad de los datos: Unicidad, Consistencia, Validez y Completitud.

In [51]:
# Load the yearly PM10 exports (one CSV per year, 2020-2023).
pm10_2020, pm10_2021, pm10_2022, pm10_2023 = (
    f"raw/pm10-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)

# A comma is already read_csv's default separator, so the paths are passed as-is.
df_2020, df_2021, df_2022, df_2023 = map(
    pd.read_csv, (pm10_2020, pm10_2021, pm10_2022, pm10_2023)
)

# Stack the four years into a single frame with a fresh 0..n-1 index.
df_10 = pd.concat((df_2020, df_2021, df_2022, df_2023), ignore_index=True)
df_10.head()

Unnamed: 0,Estacion,Fecha inicial,Fecha final,PM10
0,"""USME""",12/31/2020 22:00,12/31/2020 22:59,39.5
1,"""USME""",12/31/2020 21:00,12/31/2020 21:59,35.5
2,"""USME""",12/31/2020 20:00,12/31/2020 20:59,38.3
3,"""USME""",12/31/2020 19:00,12/31/2020 19:59,34.7
4,"""USME""",12/31/2020 18:00,12/31/2020 18:59,38.7


De una vez vamos a procesar la fecha inicial, que es la que vamos a tomar: dejaremos la columna "Fecha inicial" renombrada a "FECHA" y sacaremos de ahí "ANIO", "HORA", "MES" y "DIA".

In [52]:
# Normalise the PM10 frame in one pass:
#   1. parse 'Fecha inicial' (month/day/year hour:minute) into timestamps,
#   2. rename it to FECHA and upper-case the station column,
#   3. derive ANIO / HORA / MES / DIA from FECHA,
#   4. discard 'Fecha final'.
df_10 = (
    df_10
    .assign(**{
        'Fecha inicial': lambda tabla: pd.to_datetime(
            tabla['Fecha inicial'], format='%m/%d/%Y %H:%M'
        )
    })
    .rename(columns={'Fecha inicial': 'FECHA', 'Estacion': 'ESTACION'})
    .assign(
        ANIO=lambda tabla: tabla['FECHA'].dt.year,
        HORA=lambda tabla: tabla['FECHA'].dt.hour,
        MES=lambda tabla: tabla['FECHA'].dt.month,
        DIA=lambda tabla: tabla['FECHA'].dt.day,
    )
    .drop(columns=['Fecha final'])
)

In [53]:
# Preview the first five rows of the normalised PM10 frame.
df_10.head(n=5)

Unnamed: 0,ESTACION,FECHA,PM10,ANIO,HORA,MES,DIA
0,"""USME""",2020-12-31 22:00:00,39.5,2020,22,12,31
1,"""USME""",2020-12-31 21:00:00,35.5,2020,21,12,31
2,"""USME""",2020-12-31 20:00:00,38.3,2020,20,12,31
3,"""USME""",2020-12-31 19:00:00,34.7,2020,19,12,31
4,"""USME""",2020-12-31 18:00:00,38.7,2020,18,12,31


Ahora vamos a cargar y procesar los datos para PM2.5

In [54]:
# Load the yearly PM2.5 exports (one CSV per year, 2020-2023).
pm25_2020, pm25_2021, pm25_2022, pm25_2023 = (
    f"raw/pm25-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)

# A comma is already read_csv's default separator, so the paths are passed as-is.
df_2020_25, df_2021_25, df_2022_25, df_2023_25 = map(
    pd.read_csv, (pm25_2020, pm25_2021, pm25_2022, pm25_2023)
)

# Stack the four years into a single frame with a fresh 0..n-1 index.
df_25 = pd.concat(
    (df_2020_25, df_2021_25, df_2022_25, df_2023_25), ignore_index=True
)
df_25.head()

Unnamed: 0,Estacion,Fecha inicial,Fecha final,PM2.5
0,"""USME""",12/31/2020 22:00,12/31/2020 22:59,24.0
1,"""USME""",12/31/2020 21:00,12/31/2020 21:59,28.8
2,"""USME""",12/31/2020 20:00,12/31/2020 20:59,21.8
3,"""USME""",12/31/2020 19:00,12/31/2020 19:59,17.1
4,"""USME""",12/31/2020 18:00,12/31/2020 18:59,10.6


In [55]:
# Normalise the PM2.5 frame in one pass:
#   1. parse 'Fecha inicial' (month/day/year hour:minute) into timestamps,
#   2. rename it to FECHA and upper-case the station column,
#   3. derive ANIO / HORA / MES / DIA from FECHA,
#   4. discard 'Fecha final'.
df_25 = (
    df_25
    .assign(**{
        'Fecha inicial': lambda tabla: pd.to_datetime(
            tabla['Fecha inicial'], format='%m/%d/%Y %H:%M'
        )
    })
    .rename(columns={'Fecha inicial': 'FECHA', 'Estacion': 'ESTACION'})
    .assign(
        ANIO=lambda tabla: tabla['FECHA'].dt.year,
        HORA=lambda tabla: tabla['FECHA'].dt.hour,
        MES=lambda tabla: tabla['FECHA'].dt.month,
        DIA=lambda tabla: tabla['FECHA'].dt.day,
    )
    .drop(columns=['Fecha final'])
)

In [56]:
# Preview the first five rows of the normalised PM2.5 frame.
df_25.head(n=5)

Unnamed: 0,ESTACION,FECHA,PM2.5,ANIO,HORA,MES,DIA
0,"""USME""",2020-12-31 22:00:00,24.0,2020,22,12,31
1,"""USME""",2020-12-31 21:00:00,28.8,2020,21,12,31
2,"""USME""",2020-12-31 20:00:00,21.8,2020,20,12,31
3,"""USME""",2020-12-31 19:00:00,17.1,2020,19,12,31
4,"""USME""",2020-12-31 18:00:00,10.6,2020,18,12,31


In [57]:
# Load the CO, NO2, O3 and SO2 exports for 2020-2023 (one CSV per gas and year).
co_2020, co_2021, co_2022, co_2023 = (
    f"raw/co-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)
no2_2020, no2_2021, no2_2022, no2_2023 = (
    f"raw/no2-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)
o3_2020, o3_2021, o3_2022, o3_2023 = (
    f"raw/o3-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)
so2_2020, so2_2021, so2_2022, so2_2023 = (
    f"raw/so2-{anio}.csv" for anio in (2020, 2021, 2022, 2023)
)

# A comma is already read_csv's default separator, so the paths are passed as-is.
df_co_2020, df_co_2021, df_co_2022, df_co_2023 = map(
    pd.read_csv, (co_2020, co_2021, co_2022, co_2023)
)
df_no2_2020, df_no2_2021, df_no2_2022, df_no2_2023 = map(
    pd.read_csv, (no2_2020, no2_2021, no2_2022, no2_2023)
)
df_o3_2020, df_o3_2021, df_o3_2022, df_o3_2023 = map(
    pd.read_csv, (o3_2020, o3_2021, o3_2022, o3_2023)
)
df_so2_2020, df_so2_2021, df_so2_2022, df_so2_2023 = map(
    pd.read_csv, (so2_2020, so2_2021, so2_2022, so2_2023)
)

# Stack the four years of each gas into one frame per pollutant.
df_co = pd.concat(
    (df_co_2020, df_co_2021, df_co_2022, df_co_2023), ignore_index=True
)
df_no2 = pd.concat(
    (df_no2_2020, df_no2_2021, df_no2_2022, df_no2_2023), ignore_index=True
)
df_o3 = pd.concat(
    (df_o3_2020, df_o3_2021, df_o3_2022, df_o3_2023), ignore_index=True
)
df_so2 = pd.concat(
    (df_so2_2020, df_so2_2021, df_so2_2022, df_so2_2023), ignore_index=True
)


def _agregar_partes_de_fecha(gas):
    """Return a copy of `gas` with the date columns normalised.

    'Fecha inicial' is parsed with the year-month-day format these gas files
    are read with (the PM frames are parsed month/day/year instead), renamed
    to FECHA, and used to derive ANIO, HORA, MES and DIA. 'Estacion' becomes
    ESTACION and 'Fecha final' is removed.
    """
    return (
        gas
        .assign(**{
            'Fecha inicial': pd.to_datetime(
                gas['Fecha inicial'], format='%Y-%m-%d %H:%M'
            )
        })
        .rename(columns={'Fecha inicial': 'FECHA', 'Estacion': 'ESTACION'})
        .assign(
            ANIO=lambda tabla: tabla['FECHA'].dt.year,
            HORA=lambda tabla: tabla['FECHA'].dt.hour,
            MES=lambda tabla: tabla['FECHA'].dt.month,
            DIA=lambda tabla: tabla['FECHA'].dt.day,
        )
        .drop(columns=['Fecha final'])
    )


df_co, df_no2, df_o3, df_so2 = map(
    _agregar_partes_de_fecha, (df_co, df_no2, df_o3, df_so2)
)

# Stack all six pollutant frames. Each row still carries a single pollutant,
# so the other pollutant columns are NaN until the rows are grouped later.
data_frames = [df_10, df_25, df_co, df_no2, df_o3, df_so2]
tabla_final = pd.concat(data_frames, ignore_index=True)
tabla_final.head()

Unnamed: 0,ESTACION,FECHA,PM10,ANIO,HORA,MES,DIA,PM2.5,CO,NO2,O3,SO2
0,"""USME""",2020-12-31 22:00:00,39.5,2020,22,12,31,,,,,
1,"""USME""",2020-12-31 21:00:00,35.5,2020,21,12,31,,,,,
2,"""USME""",2020-12-31 20:00:00,38.3,2020,20,12,31,,,,,
3,"""USME""",2020-12-31 19:00:00,34.7,2020,19,12,31,,,,,
4,"""USME""",2020-12-31 18:00:00,38.7,2020,18,12,31,,,,,


## Consolidación de contaminantes por estación y fecha

In [58]:
# Collapse the stacked table so each (station, timestamp) pair becomes one row
# holding every pollutant: `first()` keeps the first non-null value per column
# within each group.
#
# NOTE(review): missing pollutant readings are imputed with 0, which downstream
# consumers cannot tell apart from a genuine zero concentration — confirm this
# is the intended treatment before aggregating.
#
# FECHA is then overwritten with a YYYYMMDD integer. That key is sortable but
# identifies the day only; the hour lives in HORA, so FECHA alone is not unique
# per row.
df_final = (
    tabla_final
    .groupby(['ESTACION', 'FECHA', 'ANIO', 'HORA', 'MES', 'DIA'], as_index=False)
    .first()
    .fillna(0)
    .assign(FECHA=lambda tabla: tabla['FECHA'].dt.strftime('%Y%m%d').astype(int))
    .sort_values(by=['ESTACION', 'FECHA'])
)


In [59]:
# Rows per station, most frequent first (completeness check per station).
df_final["ESTACION"].value_counts(sort=True, ascending=False)

ESTACION
USAQUEN                       34958
MINAMBIENTE                   34958
CENTRO DE ALTO RENDIMIENTO    34922
TUNAL                         34865
PUENTE ARANDA                 34833
SAN CRISTOBAL                 34793
LAS FERIAS                    34634
GUAYMARAL                     34601
P_CAMI - FONTIBÓN             34548
SUBA                          34276
KENNEDY                       33771
BOGOTA RURAL - MOCHUELO       32956
EL JAZMÍN                     28104
"USME"                        27858
CIUDAD BOLÍVAR                27828
MÓVIL 7MA                     26745
BOLIVIA                       26300
CARVAJAL - SEVILLANA          24859
COLINA                        24658
MÓVIL FONTIBÓN                23034
BOSA                           6838
MOCHUELO - COLEGIO             3561
Name: count, dtype: int64

In [60]:
# Display the consolidated frame; the rendered output shows only the first and
# last rows of the table.
df_final

Unnamed: 0,ESTACION,FECHA,ANIO,HORA,MES,DIA,PM10,PM2.5,CO,NO2,O3,SO2
0,"""USME""",20201001,2020,0,10,1,26.8,10.3,114.49,14.29104,38.6514,13.87858
1,"""USME""",20201001,2020,1,10,1,23.6,7.7,228.98,14.10300,42.5754,22.51996
2,"""USME""",20201001,2020,2,10,1,24.5,0.0,114.49,7.70964,46.8918,8.37952
3,"""USME""",20201001,2020,3,10,1,18.0,0.0,114.49,14.85516,39.6324,10.21254
4,"""USME""",20201001,2020,4,10,1,13.6,0.0,228.98,29.89836,25.8984,38.23156
...,...,...,...,...,...,...,...,...,...,...,...,...
633895,USAQUEN,20231231,2023,18,12,31,31.8,19.0,1259.39,0.00000,6.4750,1.83300
633896,USAQUEN,20231231,2023,19,12,31,29.7,22.0,1144.90,0.00000,7.2590,2.35700
633897,USAQUEN,20231231,2023,20,12,31,29.2,18.0,1030.41,0.00000,7.2590,2.09500
633898,USAQUEN,20231231,2023,21,12,31,36.5,14.0,1144.90,0.00000,3.1390,1.83300


In [61]:
import geopandas as gpd

# Load the localities layer (a TopoJSON file, per the original author's note).
ruta_localidades = "raw/bta_localidades.json"
gdf_localidades = gpd.read_file(ruta_localidades)

# List the available columns.
print(gdf_localidades.columns)

# Preview the first five rows.
gdf_localidades.head(n=5)


Index(['id', 'OBJECTID', 'NOMBRE', 'CODIGO_LOC', 'DECRETO', 'LINK', 'SIMBOLO',
       'ESCALA_CAP', 'FECHA_CAPT', 'SHAPE_AREA', 'SHAPE_LEN', 'geometry'],
      dtype='object')


Unnamed: 0,id,OBJECTID,NOMBRE,CODIGO_LOC,DECRETO,LINK,SIMBOLO,ESCALA_CAP,FECHA_CAPT,SHAPE_AREA,SHAPE_LEN,geometry
0,,1,Santa Fe,3,Acuerdo 117 de 2003,,,,,45170650.0,43779.90544,"POLYGON ((-74.06841 4.6288, -74.06807 4.62847,..."
1,,11,Puente Aranda,16,Acuerdo 8 de 1977,,,,,17311150.0,17854.555403,"POLYGON ((-74.1183 4.63741, -74.11504 4.64053,..."
2,,13,Ciudad Bolívar,19,Acuerdo 14 de 1983,,,,,129986400.0,77732.027669,"POLYGON ((-74.15216 4.59976, -74.15218 4.59925..."
3,,6,Barrios Unidos,12,Acuerdo 8 de 1977,,,,,11903450.0,13426.542795,"POLYGON ((-74.05725 4.68684, -74.06249 4.65594..."
4,,2,Suba,11,Acuerdo 8 de 1977,,,,,100560600.0,65665.349126,"POLYGON ((-74.0345 4.82547, -74.03478 4.82418,..."


In [62]:
# Summarise the localities layer: dtype and non-null count for each column.
gdf_localidades.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   id          0 non-null      object  
 1   OBJECTID    20 non-null     int32   
 2   NOMBRE      20 non-null     object  
 3   CODIGO_LOC  20 non-null     object  
 4   DECRETO     20 non-null     object  
 5   LINK        0 non-null      object  
 6   SIMBOLO     0 non-null      object  
 7   ESCALA_CAP  0 non-null      object  
 8   FECHA_CAPT  0 non-null      object  
 9   SHAPE_AREA  20 non-null     float64 
 10  SHAPE_LEN   20 non-null     float64 
 11  geometry    20 non-null     geometry
dtypes: float64(2), geometry(1), int32(1), object(8)
memory usage: 1.9+ KB


In [63]:
# Station name (as it appears in the ESTACION column) -> locality name.
# NOTE(review): the assignments for "MINAMBIENTE" and "EL JAZMÍN" should be
# double-checked against the monitoring network's official station catalogue;
# they may belong to a different locality than the one listed here.
map_estacion_localidad = {
    "USAQUEN": "Usaquén",
    "MINAMBIENTE": "Teusaquillo",
    "CENTRO DE ALTO RENDIMIENTO": "Barrios Unidos",
    "TUNAL": "Tunjuelito",
    "PUENTE ARANDA": "Puente Aranda",
    "SAN CRISTOBAL": "San Cristóbal",
    "LAS FERIAS": "Engativá",
    "GUAYMARAL": "Suba",
    "P_CAMI - FONTIBÓN": "Fontibón",
    "SUBA": "Suba",
    "KENNEDY": "Kennedy",
    "EL JAZMÍN": "Antonio Nariño",
    "CIUDAD BOLÍVAR": "Ciudad Bolívar",
    "MÓVIL 7MA": "Chapinero",
    "BOSA": "Bosa",
    "CARVAJAL - SEVILLANA": "Kennedy",
    "COLINA": "Suba",
    "MÓVIL FONTIBÓN": "Fontibón",
    "BOLIVIA": "Engativá",
    "BOGOTA RURAL - MOCHUELO": "Ciudad Bolívar",
    "MOCHUELO - COLEGIO": "Ciudad Bolívar",
    # The raw files carry this station with literal double quotes around the
    # name; the unquoted spelling is accepted too so the lookup keeps working
    # if the source files are ever cleaned.
    "\"USME\"": "Usme",
    "USME": "Usme",
}

# Locality name -> numeric locality code (1-20).
# Sumapaz (20) is not used by any station above; it is included so the table
# covers all 20 rows of the localities layer — verify the codes against
# CODIGO_LOC in gdf_localidades.
map_localidad_id = {
    "Usaquén": 1, "Chapinero": 2, "Santa Fe": 3, "San Cristóbal": 4,
    "Usme": 5, "Tunjuelito": 6, "Bosa": 7, "Kennedy": 8, "Fontibón": 9,
    "Engativá": 10, "Suba": 11, "Barrios Unidos": 12, "Teusaquillo": 13,
    "Los Mártires": 14, "Antonio Nariño": 15, "Puente Aranda": 16,
    "La Candelaria": 17, "Rafael Uribe": 18, "Ciudad Bolívar": 19,
    "Sumapaz": 20,
}


In [64]:
# Derive the locality name and its numeric code from the station name.
localidad = df_final["ESTACION"].map(map_estacion_localidad)
id_localidad = localidad.map(map_localidad_id)

# Series.map returns NaN for keys missing from the dict. Left unchecked, a new
# or renamed station would be exported without a locality and ID_LOCALIDAD
# would silently turn into floats, so stop before touching df_final.
estaciones_sin_localidad = (
    df_final.loc[localidad.isna(), "ESTACION"].unique().tolist()
)
if estaciones_sin_localidad:
    raise ValueError(
        f"Estaciones sin localidad asignada: {estaciones_sin_localidad}"
    )

localidades_sin_id = localidad[id_localidad.isna()].unique().tolist()
if localidades_sin_id:
    raise ValueError(
        f"Localidades sin ID asignado: {localidades_sin_id}"
    )

df_final["LOCALIDAD"] = localidad
df_final["ID_LOCALIDAD"] = id_localidad


In [65]:
# Rows per locality, most frequent first.
df_final["LOCALIDAD"].value_counts(sort=True, ascending=False)

LOCALIDAD
Suba              93535
Ciudad Bolívar    64345
Engativá          60934
Kennedy           58630
Fontibón          57582
Teusaquillo       34958
Usaquén           34958
Barrios Unidos    34922
Tunjuelito        34865
Puente Aranda     34833
San Cristóbal     34793
Antonio Nariño    28104
Usme              27858
Chapinero         26745
Bosa               6838
Name: count, dtype: int64

In [66]:
# Rows per station, most frequent first, to compare with the locality totals.
df_final["ESTACION"].value_counts(sort=True, ascending=False)

ESTACION
USAQUEN                       34958
MINAMBIENTE                   34958
CENTRO DE ALTO RENDIMIENTO    34922
TUNAL                         34865
PUENTE ARANDA                 34833
SAN CRISTOBAL                 34793
LAS FERIAS                    34634
GUAYMARAL                     34601
P_CAMI - FONTIBÓN             34548
SUBA                          34276
KENNEDY                       33771
BOGOTA RURAL - MOCHUELO       32956
EL JAZMÍN                     28104
"USME"                        27858
CIUDAD BOLÍVAR                27828
MÓVIL 7MA                     26745
BOLIVIA                       26300
CARVAJAL - SEVILLANA          24859
COLINA                        24658
MÓVIL FONTIBÓN                23034
BOSA                           6838
MOCHUELO - COLEGIO             3561
Name: count, dtype: int64

In [67]:
# Remove the station column in place now that LOCALIDAD / ID_LOCALIDAD exist.
# NOTE(review): several stations map to the same locality, so after this the
# exported rows for one locality and hour can no longer be told apart by
# station — confirm the consumer aggregates per locality.
del df_final["ESTACION"]

In [68]:
# Exportar el dataframe final a un archivo CSV
df_final.to_csv(
    "consolidados/contaminantes_2020_2023.csv",
    encoding="utf-8-sig",   # incluye BOM compatible con Power BI
    index=False,            # sin índice extra
    sep=",",                # separador por comas
    decimal="."             # punto decimal
)


In [69]:
# Preview the first five rows of the exported table.
df_final.head(n=5)

Unnamed: 0,FECHA,ANIO,HORA,MES,DIA,PM10,PM2.5,CO,NO2,O3,SO2,LOCALIDAD,ID_LOCALIDAD
0,20201001,2020,0,10,1,26.8,10.3,114.49,14.29104,38.6514,13.87858,Usme,5
1,20201001,2020,1,10,1,23.6,7.7,228.98,14.103,42.5754,22.51996,Usme,5
2,20201001,2020,2,10,1,24.5,0.0,114.49,7.70964,46.8918,8.37952,Usme,5
3,20201001,2020,3,10,1,18.0,0.0,114.49,14.85516,39.6324,10.21254,Usme,5
4,20201001,2020,4,10,1,13.6,0.0,228.98,29.89836,25.8984,38.23156,Usme,5
