In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df_2023 = pd.read_csv("Datasets\dataset_2023.csv")
df_2024 = pd.read_csv("Datasets\dataset_2024.csv")

In [3]:
df_2023['Date'] = pd.to_datetime(df_2023['Date'])
df_2024['Date'] = pd.to_datetime(df_2024['Date'])

# Paso 2: Añadir columnas de año y mes para agrupación
df_2023['Year'] = df_2023['Date'].dt.year
df_2023['Month'] = df_2023['Date'].dt.month

df_2024['Year'] = df_2024['Date'].dt.year
df_2024['Month'] = df_2024['Date'].dt.month

# Paso 3: Unir ambos datasets
df = pd.concat([df_2023, df_2024], ignore_index=True)

# Paso 4: Agrupar por año y mes y calcular promedio de concentración
df_monthly = df.groupby(['Year', 'Month'])['Daily Max 1-hour NO2 Concentration'].mean().reset_index()

# Paso 5: Crear columnas indicadoras para años
df_monthly['Year_2023'] = (df_monthly['Year'] == 2023).astype(int)
df_monthly['Year_2024'] = (df_monthly['Year'] == 2024).astype(int)

# Paso 6: Crear columnas indicadoras para cada mes
for m in range(1, 13):
    df_monthly[f'Month_{m:02d}'] = (df_monthly['Month'] == m).astype(int)

# Paso 7: Seleccionar solo las columnas que quieres (promedio, años y meses)
columnas_finales = ['Daily Max 1-hour NO2 Concentration', 'Year_2023', 'Year_2024'] + [f'Month_{m:02d}' for m in range(1, 13)]
df_final = df_monthly[columnas_finales]

print(df_final.head())

   Daily Max 1-hour NO2 Concentration  Year_2023  Year_2024  Month_01  \
0                           23.591096          1          0         1   
1                           20.048881          1          0         0   
2                           19.052349          1          0         0   
3                           16.834459          1          0         0   
4                           17.949843          1          0         0   

   Month_02  Month_03  Month_04  Month_05  Month_06  Month_07  Month_08  \
0         0         0         0         0         0         0         0   
1         1         0         0         0         0         0         0   
2         0         1         0         0         0         0         0   
3         0         0         1         0         0         0         0   
4         0         0         0         1         0         0         0   

   Month_09  Month_10  Month_11  Month_12  
0         0         0         0         0  
1         0         0 

In [4]:
df_final.to_csv('datasets/datos_NO2_promedio_2023_2024.csv', index=False)

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
df_final = pd.read_csv('datasets/datos_NO2_promedio_2023_2024.csv')


In [7]:
X = df_final.drop(columns=['Daily Max 1-hour NO2 Concentration'])
y = df_final['Daily Max 1-hour NO2 Concentration']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
model = LinearRegression()
model.fit(X_train, y_train)


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [10]:
y_pred = model.predict(X_test)


In [11]:
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R^2 Score:", r2_score(y_test, y_pred))

Mean Absolute Error (MAE): 2.702201119904851
Mean Squared Error (MSE): 10.395185318068954
Root Mean Squared Error (RMSE): 3.2241565281587916
R^2 Score: 0.018306818680668968


In [12]:
# Paso 1: Convertir 'Date' a datetime
df_2025 = pd.read_csv("Datasets\dataset_2025.csv")
df_2025['Date'] = pd.to_datetime(df_2025['Date'])

# Paso 2: Extraer año y mes
df_2025['Year'] = df_2025['Date'].dt.year
df_2025['Month'] = df_2025['Date'].dt.month

# Paso 3: Agrupar por año y mes y calcular promedio mensual
df_2025_monthly = df_2025.groupby(['Year', 'Month'])['Daily Max 1-hour NO2 Concentration'].mean().reset_index()

# Paso 4: Crear columnas indicadoras para año (solo 2025)
df_2025_monthly['Year_2025'] = 1  # Todos 1 porque solo es 2025

# Paso 5: Crear columnas indicadoras para cada mes
for m in range(1, 13):
    df_2025_monthly[f'Month_{m:02d}'] = (df_2025_monthly['Month'] == m).astype(int)

# Paso 6: Seleccionar columnas necesarias (similar a 2023-2024, pero solo año 2025)
columnas_finales_2025 = ['Daily Max 1-hour NO2 Concentration', 'Year_2025'] + [f'Month_{m:02d}' for m in range(1, 13)]
df_2025_final = df_2025_monthly[columnas_finales_2025]

In [13]:
df_2025_final.to_csv('datasets/datos_promedio_2025.csv', index=False)

In [16]:
import pandas as pd
import matplotlib.pyplot as plt

# Cargar dataset 2025 ya procesado
df_2025_final = pd.read_csv('datasets/datos_promedio_2025.csv')

# Agregar las columnas de años que faltan (2023 y 2024) con ceros
df_2025_final['Year_2023'] = 0
df_2025_final['Year_2024'] = 0

# Reordenar columnas para que coincidan con las que usó el modelo
columnas_orden = ['Year_2023', 'Year_2024', 'Year_2025'] + [f'Month_{m:02d}' for m in range(1, 13)]

# Seleccionar solo las columnas de características para la predicción
X_2025 = df_2025_final[columnas_orden]

# Predecir usando el modelo entrenado
y_2025_pred = model.predict(X_2025)

# Valores reales
y_2025_real = df_2025_final['Daily Max 1-hour NO2 Concentration']

# Crear gráfico comparativo
meses = range(1, len(y_2025_real) + 1)  # Por ejemplo, 1 a 12

plt.figure(figsize=(12, 6))
plt.plot(meses, y_2025_real, marker='o', label='Valores Reales 2025')
plt.plot(meses, y_2025_pred, marker='x', label='Valores Predichos 2025')
plt.xticks(meses, [f'Mes {m}' for m in meses])
plt.xlabel('Mes')
plt.ylabel('Promedio Concentración NO2')
plt.title('Comparación de Valores Reales y Predichos para 2025')
plt.legend()
plt.grid(True)
plt.show()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Year_2025
