In [1]:
import json
import joblib
import numpy as np
import pandas as pd


from models.classification import train_weather_classifier
from models.metrics import classification_metrics
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the processed classification dataset, parse DATE as datetime, and order
# the rows chronologically with a fresh 0..n-1 index.
df = (
    pd.read_csv("../data/processed/classification_data.csv", index_col=0)
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))
    .sort_values("DATE")
    .reset_index(drop=True)
)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,DATE,weather_type,PRCP,PRCP_roll_3,PRCP_roll_7,SNOW,AWND,month,dayofyear,TAVG
0,1960-01-01,Sunny,0.0,,,0.0,10.51,1,1,33.5
1,1960-01-02,Rainy,0.08,,,0.0,10.51,1,2,34.5
2,1960-01-03,Rainy,0.68,,,0.0,10.51,1,3,44.5
3,1960-01-04,Sunny,0.0,0.253333,,0.0,10.51,1,4,37.0
4,1960-01-05,Sunny,0.0,0.253333,,0.0,10.51,1,5,34.5


In [3]:
# Column to predict.
target = "weather_type"

# Predictor columns, in the order they are fed to the model.
features = [
    "PRCP", "PRCP_roll_3", "PRCP_roll_7",
    "SNOW", "AWND",
    "month", "dayofyear",
    "TAVG",
]

# Keep only the modelling columns and drop every row with a missing value
# (the leading rows have NaN rolling-precipitation features).
df_model = df.loc[:, [*features, target]].dropna()

X, y = df_model[features], df_model[target]

# Show the feature-matrix shape alongside the per-class row counts.
X.shape, y.value_counts()


((20082, 8),
 weather_type
 Sunny     11219
 Rainy      4701
 Foggy      2147
 Stormy     1180
 Snowy       835
 Name: count, dtype: int64)

In [4]:
# Relative frequency of each target class (fractions of all rows), shown to
# make the class imbalance in `y` visible.
y.value_counts(normalize=True)


weather_type
Sunny     0.558659
Rainy     0.234090
Foggy     0.106912
Stormy    0.058759
Snowy     0.041580
Name: proportion, dtype: float64

In [5]:
# Map the string weather labels to integer codes; the fitted encoder is kept
# so predictions can be decoded back to label names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Display the label -> integer code mapping learned by the encoder.
{
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}


{'Foggy': np.int64(0),
 'Rainy': np.int64(1),
 'Snowy': np.int64(2),
 'Stormy': np.int64(3),
 'Sunny': np.int64(4)}

In [6]:
# Chronological hold-out split: `df` was sorted by DATE before `df_model` was
# built, so the last 20% of rows are the most recent observations and form
# the test set. No shuffling, so the model is never evaluated on data older
# than what it was trained on.
test_size = int(len(df_model) * 0.2)

# Slice with an explicit positive boundary instead of `[:-test_size]` /
# `[-test_size:]`: when test_size is 0 (very small datasets) the negative
# form degenerates to `[:0]` / `[0:]`, i.e. an empty train set and a test set
# containing every row.
split_idx = len(df_model) - test_size

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_train = y_encoded[:split_idx]
y_test  = y_encoded[split_idx:]

# Sanity checks: features and labels stay aligned and no row is lost or duplicated.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
assert len(X_train) + len(X_test) == len(X)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (16066, 8)
Test size: (4016, 8)


In [7]:
# Fit the weather classifier on the training portion only, using the XGBoost
# model type with hyperparameter tuning switched on.
clf = train_weather_classifier(X_train, y_train, model_type="xgboost", tune_hyperparams=True)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [8]:
# Predict integer class codes for the held-out rows.
y_pred_encoded = clf.predict(X_test)

# Decode both the predictions and the ground truth back to label names so the
# metrics are reported per weather type rather than per integer code.
y_pred, y_test_labels = (
    label_encoder.inverse_transform(codes)
    for codes in (y_pred_encoded, y_test)
)


In [9]:
# `classification_metrics` is already imported in the notebook's import cell,
# so it is not re-imported here (keeps all dependencies declared in one place).
# Evaluate on the held-out set using the decoded label names.
metrics = classification_metrics(y_test_labels, y_pred)
metrics


{'accuracy': 0.8164840637450199,
 'precision_macro': 0.7085626092219652,
 'recall_macro': 0.6254067625334946,
 'f1_macro': 0.6567519048165014,
 'confusion_matrix': [[70, 0, 0, 0, 266],
  [0, 826, 0, 78, 1],
  [14, 14, 156, 6, 45],
  [2, 136, 5, 111, 17],
  [153, 0, 0, 0, 2116]],
 'classification_report': {'Foggy': {'precision': 0.2928870292887029,
   'recall': 0.20833333333333334,
   'f1-score': 0.24347826086956523,
   'support': 336.0},
  'Rainy': {'precision': 0.8463114754098361,
   'recall': 0.912707182320442,
   'f1-score': 0.8782562466772993,
   'support': 905.0},
  'Snowy': {'precision': 0.968944099378882,
   'recall': 0.6638297872340425,
   'f1-score': 0.7878787878787878,
   'support': 235.0},
  'Stormy': {'precision': 0.5692307692307692,
   'recall': 0.4095940959409594,
   'f1-score': 0.47639484978540775,
   'support': 271.0},
  'Sunny': {'precision': 0.865439672801636,
   'recall': 0.9325694138386955,
   'f1-score': 0.8977513788714467,
   'support': 2269.0},
  'accuracy': 0.81

In [18]:
# Persist the fitted classifier.
joblib.dump(
    clf,
    "../models/artifacts/models/weather_classifier_xgb.joblib"
)

# Persist the fitted label encoder so integer predictions can be decoded
# back to weather-type names at inference time.
joblib.dump(
    label_encoder,
    "../models/artifacts/encoders/weather_label_encoder.joblib"
)

# Persist the evaluation metrics computed on the held-out set.
with open("../models/artifacts/metrics/classification_metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Describe the saved model. The target and feature list are taken from the
# variables actually used to build X / y rather than re-typed by hand: the
# previous hard-coded list named "TMAX_lag_1" and "TMAX_roll_7" (never used
# for training) and omitted "TAVG", so the metadata contradicted the model.
classifier_info = {
    "target": target,
    "features": list(features),
    "model": "XGBoostClassifier",
    "notes": "WT-codes excluded to prevent information leakage"
}

with open("../models/artifacts/metadata/classifier_info.json", "w") as f:
    json.dump(classifier_info, f, indent=2)
