In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# =====================
# 1. Cargar datasets
# =====================
df_2022 = pd.read_csv("2022.csv")
df_2023 = pd.read_csv("2023.csv")
df_2024 = pd.read_csv("2024.csv")

# Agregar columna Year
df_2022["Year"] = 2022
df_2023["Year"] = 2023
df_2024["Year"] = 2024

# =====================
# 2. Unir datos de entrenamiento
# =====================
df_total = pd.concat([df_2022, df_2023], ignore_index=True)

# Filtrar Puerto Rico
df_pr = df_total[df_total["State"] == "Alaska"].copy()
df_2024_pr = df_2024[df_2024["State"] == "Alaska"].copy()

# =====================
# 3. Procesar fechas
# =====================
df_pr["Date"] = pd.to_datetime(df_pr["Date"])
df_pr["Month"] = df_pr["Date"].dt.month

df_2024_pr["Date"] = pd.to_datetime(df_2024_pr["Date"])
df_2024_pr["Month"] = df_2024_pr["Date"].dt.month

# =====================
# 4. Calcular MODA mensual de PM2.5 (2022-2023)
# =====================
monthly_mode = (
    df_pr.groupby(["Year", "Month"])["Daily Mean PM2.5 Concentration"]
    .apply(lambda x: x.mode()[0])
    .reset_index()
)

# y = variable objetivo
y = monthly_mode["Daily Mean PM2.5 Concentration"]

# Crear variables dummies (para año y mes)
X = pd.get_dummies(monthly_mode[["Year", "Month"]], columns=["Year", "Month"], prefix=["Year", "Month"])

print("Shape de X:", X.shape)
print("Columnas de X:", X.columns.tolist())

# =====================
# 5. Entrenar modelo
# =====================
model = LinearRegression()
model.fit(X, y)

# =====================
# 6. Dataset dummy para predicciones 2024
# =====================
months_2024 = pd.DataFrame({
    "Year_2022": [0]*12,
    "Year_2023": [0]*12,
    "Month_1": [1,0,0,0,0,0,0,0,0,0,0,0],
    "Month_2": [0,1,0,0,0,0,0,0,0,0,0,0],
    "Month_3": [0,0,1,0,0,0,0,0,0,0,0,0],
    "Month_4": [0,0,0,1,0,0,0,0,0,0,0,0],
    "Month_5": [0,0,0,0,1,0,0,0,0,0,0,0],
    "Month_6": [0,0,0,0,0,1,0,0,0,0,0,0],
    "Month_7": [0,0,0,0,0,0,1,0,0,0,0,0],
    "Month_8": [0,0,0,0,0,0,0,1,0,0,0,0],
    "Month_9": [0,0,0,0,0,0,0,0,1,0,0,0],
    "Month_10":[0,0,0,0,0,0,0,0,0,1,0,0],
    "Month_11":[0,0,0,0,0,0,0,0,0,0,1,0],
    "Month_12":[0,0,0,0,0,0,0,0,0,0,0,1],
})

# Reindexar para que coincida con columnas de X
months_2024 = months_2024.reindex(columns=X.columns, fill_value=0)

# Predicciones para 2024
pred_2024 = model.predict(months_2024)

# =====================
# 7. Dataset real 2024 (moda de PM2.5 por mes)
# =====================
real_2024 = (
    df_2024_pr.groupby("Month")["Daily Mean PM2.5 Concentration"]
    .apply(lambda x: x.mode()[0])
    .reset_index()
)
real_2024["Year"] = 2024
real_2024.rename(columns={"Daily Mean PM2.5 Concentration": "Real_PM25"}, inplace=True)

# Unir con predicciones
comp_2024 = real_2024.copy()
comp_2024["Pred_PM25"] = pred_2024

print("\nComparación Real vs Predicho (2024, con MODA - PM2.5):")
print(comp_2024)

# =====================
# 8. Visualización
# =====================
plt.figure(figsize=(12,6))
plt.plot(comp_2024["Month"], comp_2024["Real_PM25"], "bo-", label="Real 2024 (MODA)")
plt.plot(comp_2024["Month"], comp_2024["Pred_PM25"], "ro--", label="Predicho 2024")
plt.xticks(np.arange(1,13))
plt.xlabel("Mes")
plt.ylabel("Nivel moda PM2.5")
plt.title("Comparación Real vs Predicho - PM2.5 en Puerto Rico (2024, mensual con MODA)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


Shape de X: (0, 0)
Columnas de X: []


ValueError: at least one array or dtype is required

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# =====================
# 1. Cargar datasets
# =====================
df_2022 = pd.read_csv("2022.csv")
df_2023 = pd.read_csv("2023.csv")
df_2024 = pd.read_csv("2024.csv")

# Agregar columna Year
df_2022["Year"] = 2022
df_2023["Year"] = 2023
df_2024["Year"] = 2024

# =====================
# 2. Unir datos de entrenamiento
# =====================
df_total = pd.concat([df_2022, df_2023], ignore_index=True)

# Filtrar Alaska
df_ak = df_total[df_total["State"] == "Alaska"].copy()
df_2024_ak = df_2024[df_2024["State"] == "Alaska"].copy()

# =====================
# 3. Procesar fechas
# =====================
df_ak["Date"] = pd.to_datetime(df_ak["Date"])
df_ak["Month"] = df_ak["Date"].dt.month

df_2024_ak["Date"] = pd.to_datetime(df_2024_ak["Date"])
df_2024_ak["Month"] = df_2024_ak["Date"].dt.month

# =====================
# 4. Calcular MODA mensual de PM2.5 (2022-2023)
# =====================
monthly_mode = (
    df_ak.groupby(["Year", "Month"])["Daily Mean PM2.5 Concentration"]
    .apply(lambda x: x.mode()[0])
    .reset_index()
)

# y = variable objetivo
y = monthly_mode["Daily Mean PM2.5 Concentration"]

# Crear variables dummies (para año y mes)
X = pd.get_dummies(monthly_mode[["Year", "Month"]], columns=["Year", "Month"], prefix=["Year", "Month"])

# =====================
# 5. Entrenar modelo
# =====================
model = LinearRegression()
model.fit(X, y)

# =====================
# 6. Dataset dummy para predicciones 2024
# =====================
months_2024 = pd.DataFrame({
    "Year_2022": [0]*12,
    "Year_2023": [0]*12,
    "Month_1": [1,0,0,0,0,0,0,0,0,0,0,0],
    "Month_2": [0,1,0,0,0,0,0,0,0,0,0,0],
    "Month_3": [0,0,1,0,0,0,0,0,0,0,0,0],
    "Month_4": [0,0,0,1,0,0,0,0,0,0,0,0],
    "Month_5": [0,0,0,0,1,0,0,0,0,0,0,0],
    "Month_6": [0,0,0,0,0,1,0,0,0,0,0,0],
    "Month_7": [0,0,0,0,0,0,1,0,0,0,0,0],
    "Month_8": [0,0,0,0,0,0,0,1,0,0,0,0],
    "Month_9": [0,0,0,0,0,0,0,0,1,0,0,0],
    "Month_10":[0,0,0,0,0,0,0,0,0,1,0,0],
    "Month_11":[0,0,0,0,0,0,0,0,0,0,1,0],
    "Month_12":[0,0,0,0,0,0,0,0,0,0,0,1],
})

# Reindexar para que coincida con columnas de X
months_2024 = months_2024.reindex(columns=X.columns, fill_value=0)

# Predicciones para 2024
pred_2024 = model.predict(months_2024)

# =====================
# 7. Dataset real 2024 (moda de PM2.5 por mes)
# =====================
real_2024 = (
    df_2024_ak.groupby("Month")["Daily Mean PM2.5 Concentration"]
    .apply(lambda x: x.mode()[0])
    .reset_index()
)
real_2024["Year"] = 2024
real_2024.rename(columns={"Daily Mean PM2.5 Concentration": "Real_PM25"}, inplace=True)

# Unir con predicciones
comp_2024 = real_2024.copy()
comp_2024["Pred_PM25"] = pred_2024

print("\nComparación Real vs Predicho (2024, con MODA - PM2.5 en Alaska):")
print(comp_2024)

# =====================
# 8. Visualización
# =====================
plt.figure(figsize=(12,6))
plt.plot(comp_2024["Month"], comp_2024["Real_PM25"], "bo-", label="Real 2024 (MODA)")
plt.plot(comp_2024["Month"], comp_2024["Pred_PM25"], "ro--", label="Predicho 2024")
plt.xticks(np.arange(1,13))
plt.xlabel("Mes")
plt.ylabel("Nivel moda PM2.5")
plt.title("Comparación Real vs Predicho - PM2.5 en Alaska (2024, mensual con MODA)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.6)
plt.show()


ValueError: at least one array or dtype is required