In [None]:
import os
# Work from the project root so that `src` is importable and relative paths
# such as the predictions output resolve.
# Assumes the notebook lives one level below the directory that contains
# `src` -- TODO confirm if the notebook is moved.
# The guard keeps this cell idempotent: an unconditional chdir('..') climbs one
# directory higher on every re-run.
if not os.path.isdir("src"):
    os.chdir('..')

In [None]:
from src.data_peparation import get_q3data
import xgboost as xgb
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, f1_score
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

# Train one XGBoost classifier per forecast target, report hold-out metrics,
# predict the requested forecast window for the target's station and write
# all predictions to a single JSON file.
task3_data = get_q3data()

RANDOM_STATE = 42     # shared seed for the split, SMOTE and XGBoost
EVAL_LOG_EVERY = 100  # print the eval metric every N boosting rounds instead of every round

encoder = LabelEncoder()

# (station code, pollutant prefix of the target column,
#  forecast window start [inclusive], forecast window end [exclusive])
forecast_targets = [
    (205, "SO2", "2023-11-01", "2023-12-01"),
    (209, "NO2", "2023-09-01", "2023-10-01"),
    (223, "O3", "2023-07-01", "2023-08-01"),
    (224, "CO", "2023-10-01", "2023-11-01"),
    (226, "PM10", "2023-08-01", "2023-09-01"),
    (227, "PM2.5", "2023-12-01", "2024-01-01"),
]

predictions = {"target": {}}

# --- Loop-invariant preparation -------------------------------------------
# None of this depends on the target, so it is built once instead of copying
# and filtering the full dataset on every iteration.
anomalies_columns = [col for col in task3_data.columns if "anomalies" in col]
drop_columns = ["Measurement date"] + anomalies_columns

# SMOTE cannot handle missing values, so only complete rows are kept.
# NOTE(review): this drops a row when ANY column is NaN, including the anomaly
# columns of the other targets -- confirm that this is intended.
# The training rows are not filtered by station: every model is fit on the
# rows of all stations.
df_model = task3_data.dropna()
features = df_model.drop(columns=drop_columns)
feature_columns = features.columns.tolist()
X = features.values

for station, pollutant, start_date, end_date in forecast_targets:
    target = pollutant + "anomalies"

    # Re-fit the encoder per target; the same fitted encoder maps the
    # predicted class indices back to the original labels further down.
    y = encoder.fit_transform(df_model[target].astype(int))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # Oversample only the training split so the hold-out keeps its original
    # class balance.
    smote = SMOTE(sampling_strategy='auto', random_state=RANDOM_STATE)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # NOTE(review): early stopping monitors the same split that is used for
    # the metrics printed below, so those metrics are optimistic.
    model = xgb.XGBClassifier(
        n_estimators=1500,
        learning_rate=0.2,
        max_depth=3,
        random_state=RANDOM_STATE,
        enable_categorical=True,
        early_stopping_rounds=50,
        objective='multi:softprob',
        eval_metric="mlogloss",
    )
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=EVAL_LOG_EVERY)

    print(f"Station: {station}")
    print(f"Target: {target}")
    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

    # Rows of this station that fall inside the forecast window [start, end).
    mask = (
        (task3_data["Measurement date"] >= start_date)
        & (task3_data["Measurement date"] < end_date)
        & (task3_data["Station code"] == station)
    )
    df_filtered = task3_data.loc[mask].reset_index(drop=True)
    future_dates = df_filtered["Measurement date"]

    # Select the training columns explicitly so the prediction input has the
    # same columns, in the same order, as the matrix the model was fit on.
    future_pred = model.predict(df_filtered[feature_columns]).flatten().tolist()
    future_labels = encoder.inverse_transform(future_pred)

    predictions["target"][str(station)] = {
        str(date): int(val) for date, val in zip(future_dates, future_labels)
    }

output_path = "predictions/predictions_task_3_.json"
# Create the output directory up front; otherwise open() raises
# FileNotFoundError after all models have already been trained.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(predictions, f, indent=2)

print(f"Predicciones guardadas en {output_path}")

In [None]:
# Rank the features of `model` by importance and plot the ten highest.
# `model` and `features` are the globals left by the training cell, so this
# shows the model of the last target processed in that loop.
importances = pd.DataFrame(
    {"Feature": features.columns, "Importance": model.feature_importances_}
).sort_values(by="Importance", ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importances")
sns.barplot(x="Importance", y="Feature", data=importances.head(10), ax=ax)
fig.tight_layout()
plt.show()
