In [1]:
import pandas as pd

In [3]:
from functools import reduce

In [19]:
#Define file names and key columns
files={
    "temperature":"temperature.csv",
    "humidity":"humidity.csv",
    "pressure":"pressure.csv",
    "wind_speed":"wind_speed.csv",
    "wind_direction":"wind_direction.csv",
    "weather_description":"weather_description.csv",
}

In [37]:
import pandas as pd
from functools import reduce

# 0. Map each feature name to its wide‑format CSV
files = {
    "temperature":         "temperature.csv",
    "humidity":            "humidity.csv",
    "pressure":            "pressure.csv",
    "wind_speed":          "wind_speed.csv",
    "wind_direction":      "wind_direction.csv",
    "weather_description": "weather_description.csv"
}

# 1. Load & unpivot each CSV into long form
dfs = []
for feature, fn in files.items():
    # parse_dates turns 'datetime' from text → datetime64
    df_wide = pd.read_csv(fn, parse_dates=["datetime"])
    
    # melt so each row is (datetime, city, <feature> value)
    df_long = df_wide.melt(
        id_vars=["datetime"],
        var_name="city",
        value_name=feature          # ← use the loop variable here
    )
    dfs.append(df_long)

# 2. Merge all feature‑DataFrames on (city, datetime)
master_df = reduce(
    lambda left, right: pd.merge(left, right, on=["city","datetime"], how="inner"),
    dfs
)

# 3. Attach static city attributes
city_attr = pd.read_csv("city_attributes.csv")  # assume it has 'city' plus metadata
master_df = master_df.merge(city_attr, on="city", how="left")

# 4. (Optional) tidy up
master_df = master_df.sort_values(["city","datetime"]).reset_index(drop=True)

# 5. Inspect
print(master_df.head(5))


             datetime         city  temperature  humidity  pressure  \
0 2012-10-01 12:00:00  Albuquerque          NaN       NaN       NaN   
1 2012-10-01 13:00:00  Albuquerque   285.120000      50.0    1024.0   
2 2012-10-01 14:00:00  Albuquerque   285.154558      49.0    1024.0   
3 2012-10-01 15:00:00  Albuquerque   285.233952      49.0    1024.0   
4 2012-10-01 16:00:00  Albuquerque   285.313345      49.0    1024.0   

   wind_speed  wind_direction weather_description        Country   Latitude  \
0         NaN             NaN                 NaN  United States  35.084492   
1         4.0           360.0        sky is clear  United States  35.084492   
2         4.0           360.0        sky is clear  United States  35.084492   
3         4.0           360.0        sky is clear  United States  35.084492   
4         4.0           360.0        sky is clear  United States  35.084492   

    Longitude  
0 -106.651138  
1 -106.651138  
2 -106.651138  
3 -106.651138  
4 -106.651138  


In [39]:
import numpy as np
df=master_df.copy()

In [41]:
#1.Temporal features
df["hour"]=df["datetime"].dt.hour
df["dow"]=df["datetime"].dt.dayofweek
df["month"]=df["datetime"].dt.month

In [43]:
#cyclic encoding
df["hour_sin"]=np.sin(2*np.pi*df["hour"]/24)
df["hour_cos"]=np.cos(2*np.pi*df["hour"]/24)
df["dow_sin"]=np.sin(2*np.pi*df["dow"]/7)
df["dow_cos"]=np.cos(2*np.pi*df["dow"]/7)
df["month_sin"]=np.sin(2*np.pi*(df["month"]-1)/12)
df["hour_cos"]=np.cos(2*np.pi*(df["month"]-1)/12)

In [45]:
#2.Lag Features(past temperatures)
lags=[1,2,3,6,12,24]  #hours back

for lag in lags:
    df[f"temp_lag_{lag}"]=(
        df.groupby("city")["temperature"].shift(lag))

In [49]:
#3.Rolling Window statistics (3h&6h means/std)
windows=[3,6]

for w in windows:
    #1.exclude the current hour so the window only looks at the past w hours
    grp=df.groupby("city")["temperature"].shift(1)
    #2.Compute the rolling mean over last w values
    df[f"temp_roll_mean_{w}h"]=(grp.rolling(window=w).mean().reset_index(0,drop=True))
    #3.Compute the rolling std over last w values
    df[f"temp_roll_std_{w}h"]=(grp.rolling(window=w).std().reset_index(0,drop=True))

In [51]:
# Finally, drop any rows with NaNs introduced by shifting/rolling (or handle via imputation)
df = df.dropna(subset=[f"temp_lag_{lags[-1]}", f"temp_roll_mean_{windows[-1]}h"])

# Now df is enriched and ready for modeling!
print(df.columns.tolist())
print(df.head())

['datetime', 'city', 'temperature', 'humidity', 'pressure', 'wind_speed', 'wind_direction', 'weather_description', 'Country', 'Latitude', 'Longitude', 'hour', 'dow', 'month', 'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos', 'month_sin', 'temp_lag_1', 'temp_lag_2', 'temp_lag_3', 'temp_lag_6', 'temp_lag_12', 'temp_lag_24', 'temp_roll_mean_3h', 'temp_roll_std_3h', 'temp_roll_mean_6h', 'temp_roll_std_6h']
              datetime         city  temperature  humidity  pressure  \
25 2012-10-02 13:00:00  Albuquerque   286.980607      47.0    1024.0   
26 2012-10-02 14:00:00  Albuquerque   287.060000      47.0    1025.0   
27 2012-10-02 15:00:00  Albuquerque   289.910000      41.0    1026.0   
28 2012-10-02 16:00:00  Albuquerque   291.470000      40.0    1016.0   
29 2012-10-02 17:00:00  Albuquerque   292.860000       NaN       NaN   

    wind_speed  wind_direction weather_description        Country   Latitude  \
25         4.0           360.0        sky is clear  United States  35.084492   
26   

In [65]:
#Model training 
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 1. Define features & target (same as before)
exclude_cols = ["city", "datetime", "temperature","weather_description","Country"]
feature_cols = [c for c in df.columns if c not in exclude_cols]
# After feature engineering, before train/test split:
# Drop any row where temperature (target) or any predictor is NaN
keep_cols = ["temperature"] + feature_cols
df_model = df.dropna(subset=keep_cols).reset_index(drop=True)

# Then redefine X and y
X = df_model[feature_cols]
y = df_model["temperature"]

# 2. Time‑based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# 3. Train a Random Forest
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# 4. Predict & evaluate
y_pred = rf.predict(X_test)
mae  = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"MAE:  {mae:.3f} °C")
print(f"RMSE: {rmse:.3f} °C")


MAE:  0.663 °C
RMSE: 1.029 °C




In [67]:
import joblib
joblib.dump(rf,"rf_hourly_temp.pkl")

['rf_hourly_temp.pkl']