In [None]:
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

from models.regression import train_temperature_model
from models.metrics import regression_metrics



In [13]:
# Load the processed regression table (the first CSV column is used as the
# index), convert DATE to datetime, then order the rows chronologically and
# replace the index with a fresh 0..n-1 range.
df = (
    pd.read_csv("../data/processed/regression_data.csv", index_col=0)
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))
    .sort_values("DATE")
    .reset_index(drop=True)
)

# Show the last rows as a quick sanity check of the date range.
df.tail()


Unnamed: 0,DATE,TMAX,TMIN,TAVG,PRCP,SNOW,AWND,TMAX_lag_1,TMAX_roll_7,dayofyear,month
20084,2014-12-27,54.0,37.0,44.0,0.0,0.0,6.26,51.0,47.857143,361,12
20085,2014-12-28,54.0,42.0,47.0,0.05,0.0,10.51,54.0,50.714286,362,12
20086,2014-12-29,46.0,35.0,43.0,0.0,0.0,10.96,54.0,53.0,363,12
20087,2014-12-30,36.0,28.0,34.0,0.0,0.0,9.4,46.0,52.714286,364,12
20088,2014-12-31,34.0,28.0,30.0,0.0,0.0,11.86,36.0,50.714286,365,12


In [14]:
# Column the model predicts.
target = "TMAX"

# Predictor columns, in the order they are passed to the model.
features = [
    "TMIN", "PRCP", "SNOW", "AWND",
    "TMAX_lag_1", "TMAX_roll_7",
    "dayofyear", "month",
]

# Keep only the modelling columns and discard any row with a missing value
# in one of them.
modelling_columns = [*features, target]
df_model = df[modelling_columns].dropna()

# Design matrix and response vector share df_model's row order.
X, y = df_model[features], df_model[target]


In [15]:
# Chronological hold-out: the last 20% of rows form the test set and
# everything before them is used for training (no shuffling).
test_size = int(len(df_model) * 0.2)

# Slice on an explicit boundary instead of a negative offset. With a negative
# offset, test_size == 0 (fewer than 5 rows) would silently yield an empty
# training set (`iloc[:-0]`) and put every row in the test set (`iloc[-0:]`).
split_idx = len(df_model) - test_size

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_train = y.iloc[:split_idx]
y_test  = y.iloc[split_idx:]

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (16066, 8)
Test size: (4016, 8)


In [16]:
# Fit the linear baseline on the training window.
baseline_model = train_temperature_model(X_train, y_train, model_type="linear")

# Score the baseline on the held-out rows.
baseline_pred = baseline_model.predict(X_test)
baseline_metrics = regression_metrics(y_test, baseline_pred)

# Display the metrics dict as the cell output.
baseline_metrics


{'MAE': 3.73174735507421,
 'RMSE': np.float64(4.764529193089308),
 'R2': 0.9256156192511199}

In [17]:
# Fit the XGBoost variant on the same training window as the baseline.
xgb_model = train_temperature_model(X_train, y_train, model_type="xgboost")

# Score it on the held-out rows.
y_pred = xgb_model.predict(X_test)
xgb_metrics = regression_metrics(y_test, y_pred)

# Display the metrics dict as the cell output.
xgb_metrics


[0]	validation_0-rmse:16.77072
[1]	validation_0-rmse:16.05907
[2]	validation_0-rmse:15.33472
[3]	validation_0-rmse:14.65138
[4]	validation_0-rmse:14.00560
[5]	validation_0-rmse:13.39743
[6]	validation_0-rmse:12.82074
[7]	validation_0-rmse:12.27626
[8]	validation_0-rmse:11.76068
[9]	validation_0-rmse:11.30346


[10]	validation_0-rmse:10.84451
[11]	validation_0-rmse:10.41266
[12]	validation_0-rmse:10.03248
[13]	validation_0-rmse:9.67572
[14]	validation_0-rmse:9.31119
[15]	validation_0-rmse:8.96884
[16]	validation_0-rmse:8.64749
[17]	validation_0-rmse:8.34874
[18]	validation_0-rmse:8.08826
[19]	validation_0-rmse:7.84656
[20]	validation_0-rmse:7.59727
[21]	validation_0-rmse:7.36485
[22]	validation_0-rmse:7.14916
[23]	validation_0-rmse:6.96484
[24]	validation_0-rmse:6.79408
[25]	validation_0-rmse:6.63803
[26]	validation_0-rmse:6.47352
[27]	validation_0-rmse:6.31793
[28]	validation_0-rmse:6.17340
[29]	validation_0-rmse:6.04152
[30]	validation_0-rmse:5.91964
[31]	validation_0-rmse:5.80633
[32]	validation_0-rmse:5.70235
[33]	validation_0-rmse:5.60463
[34]	validation_0-rmse:5.51405
[35]	validation_0-rmse:5.43204
[36]	validation_0-rmse:5.35543
[37]	validation_0-rmse:5.29230
[38]	validation_0-rmse:5.22697
[39]	validation_0-rmse:5.16686
[40]	validation_0-rmse:5.11638
[41]	validation_0-rmse:5.06564
[42]	

Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[120]	validation_0-rmse:4.38245
[121]	validation_0-rmse:4.38175
[122]	validation_0-rmse:4.38004
[123]	validation_0-rmse:4.37754
[124]	validation_0-rmse:4.37589
[125]	validation_0-rmse:4.37387
[126]	validation_0-rmse:4.37247
[127]	validation_0-rmse:4.37059
[128]	validation_0-rmse:4.36973
[129]	validation_0-rmse:4.36770
[130]	validation_0-rmse:4.36672
[131]	validation_0-rmse:4.36461
[132]	validation_0-rmse:4.36364
[133]	validation_0-rmse:4.36120
[134]	validation_0-rmse:4.35937
[135]	validation_0-rmse:4.35792
[136]	validation_0-rmse:4.35715
[137]	validation_0-rmse:4.35619
[138]	validation_0-rmse:4.35531
[139]	validation_0-rmse:4.35328
[140]	validation_0-rmse:4.35193
[141]	validation_0-rmse:4.35017
[142]	validation_0-rmse:4.34907
[143]	validation_0-rmse:4.34819
[144]	validation_0-rmse:4.34645
[145]	validation_0-rmse:4.34479
[146]	validation_0-rmse:4.34364
[147]	validation_0-rmse:4.34288
[148]	validation_0-rmse:4.34216
[149]	validation_0-rmse:4.34018
[150]	validation_0-rmse:4.33829
[151]	va

{'MAE': 3.4392292000857956,
 'RMSE': np.float64(4.431217270226271),
 'R2': 0.9356589925491391}

In [18]:
# Side-by-side comparison: one row per model, one column per metric.
comparison_rows = [baseline_metrics, xgb_metrics]
comparison_labels = ["Baseline (Linear)", "XGBoosting"]

pd.DataFrame(comparison_rows, index=comparison_labels)


Unnamed: 0,MAE,RMSE,R2
Baseline (Linear),3.731747,4.764529,0.925616
XGBoosting,3.439229,4.431217,0.935659


In [21]:
# Persist the fitted XGBoost model.
joblib.dump(xgb_model, "../models/artifacts/models/tmax_xgboost.joblib")

# Persist its hold-out metrics.
with open("../models/artifacts/metrics/regression_metrics.json", "w") as f:
    json.dump(xgb_metrics, f, indent=2)

# Persist the estimator parameters reported by the model.
with open("../models/artifacts/params/xgboost_params.json", "w") as f:
    json.dump(xgb_model.get_params(), f, indent=2)

# Describe the model's inputs using the same `target` / `features` variables
# that built the training matrix. A hand-written copy of the list previously
# included "monthly_avg_TMAX", a column the model was never trained on, so
# consumers of this file would have prepared inputs with the wrong columns.
feature_config = {
    "target": target,
    "features": list(features),
}

with open("../models/artifacts/metadata/xgboost_info.json", "w") as f:
    json.dump(feature_config, f, indent=2)

