<a href="https://colab.research.google.com/github/ananyascodehq/chennai-weather-prediction/blob/main/chennai_weather_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd, numpy as np
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [None]:
# Location of the competition CSV under Kaggle's input directory; point this
# somewhere else when running the notebook outside Kaggle.
DATA_PATH = "/kaggle/input/heat-code-by-fodse/chennai_weather_ml_competition_dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Datetime + cyclical
# Combine the `date` and `time` string columns into one timestamp per row.
df['dt'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# Order rows chronologically before building any lag/rolling feature: shift()
# and rolling() work on row position, and the later 80/20 split is positional
# too, so both are only meaningful when row order is time order. A stable sort
# keeps rows with equal timestamps in their original relative order, which
# makes this a no-op for data that is already sorted.
df = df.sort_values('dt', kind='stable', ignore_index=True)

# Calendar parts plus sin/cos encodings of hour-of-day (period 24) and
# month-of-year (period 12).
df = df.assign(hour=df['dt'].dt.hour, dow=df['dt'].dt.dayofweek, month=df['dt'].dt.month,
               hour_sin=np.sin(2*np.pi*df['dt'].dt.hour/24),
               hour_cos=np.cos(2*np.pi*df['dt'].dt.hour/24),
               month_sin=np.sin(2*np.pi*df['dt'].dt.month/12),
               month_cos=np.cos(2*np.pi*df['dt'].dt.month/12))

# Lags + rolling
# For each base signal: values from 1, 6, 12 and 24 rows earlier, and the mean
# of the previous 6 / 24 rows. The rolling mean is taken on shift(1), so the
# current row's own value is never part of its window.
for c in ['temperature','humidity','wind_speed']:
    for l in [1,6,12,24]:
        df[f'{c}_lag{l}'] = df[c].shift(l)
    for w in [6,24]:
        df[f'{c}_roll{w}'] = df[c].shift(1).rolling(w).mean()

# Safe interactions
df = df.assign(
    humid_temp = df['humidity'] * df['temperature_lag1'],  # use lagged temp
    wind_press = df['wind_speed'] * df['pressure']
)

# Cleanup + features (drop raw hour/month + direct target use)
# dropna() removes every row with a missing value in any column, which includes
# the leading rows whose lag/rolling windows are incomplete.
df = df.dropna().reset_index(drop=True)
# Model inputs: every remaining column except the raw date/time fields, the
# target itself, and the un-encoded hour/month/dow helpers.
features = [c for c in df.columns if c not in ['date','time','dt','temperature','hour','month', 'dow']]


In [None]:
X, y = df[features], df['temperature']

# Positional hold-out: the first 80% of rows train the models and the last 20%
# are kept for evaluation.
split_idx = int(len(df)*0.8)
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# Fit on the training split only. Fitting on the full (X, y) would put the
# hold-out rows into the training data, so any metric computed on
# (X_test, y_test) would be an in-sample score rather than a hold-out estimate.
lgb = LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=6,
                    subsample=0.8, colsample_bytree=0.8, random_state=42, verbose=-1).fit(X_train, y_train)
xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6,
                   subsample=0.8, colsample_bytree=0.8, random_state=42).fit(X_train, y_train)


In [None]:
# Select the 10:00-21:00 rows (inclusive) of the most recent Sunday in the data.
# The calendar date is read from the already-parsed `dt` column instead of
# overwriting `df['date']` with datetimes: `dow` was derived from `dt` as well,
# and leaving `date` as loaded keeps the feature cell (which concatenates
# `date` and `time` as strings) re-runnable.
is_sunday = df['dow'] == 6  # dayofweek: Monday=0 ... Sunday=6
last_sunday = df.loc[is_sunday, 'dt'].max().date()
sunday = df[is_sunday & (df['dt'].dt.date == last_sunday) & df['hour'].between(10, 21)]
# Keep one row per hour, ordered by hour.
sunday = sunday.drop_duplicates('hour').sort_values('hour')

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hold-out predictions from each model, plus a fixed 80/20 blend weighted
# towards XGBoost.
lgb_pred = lgb.predict(X_test)
xgb_pred = xgb.predict(X_test)
ens_pred = 0.8*xgb_pred + 0.2*lgb_pred

# One row of MAE / RMSE / R2 per predictor, all scored against y_test.
df_metrics = pd.DataFrame([
    {
        "Model": label,
        "MAE": mean_absolute_error(y_test, pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, pred)),
        "R2": r2_score(y_test, pred),
    }
    for label, pred in (
        ("LGBRegressor", lgb_pred),
        ("XGBRegressor", xgb_pred),
        ("Ensemble", ens_pred),
    )
])

df_metrics

In [None]:
# Build submission.csv: blended predictions for each hour in the 10:00-21:00
# window (inclusive) of the most recent Sunday in the data.
first_hour, last_hour = 10, 21
n_expected = last_hour - first_hour + 1  # one submission row per hour in the window

# The calendar date is read from the already-parsed `dt` column instead of
# overwriting `df['date']` with datetimes: `dow` was derived from `dt` as well,
# and leaving `date` as loaded keeps the feature cell (which concatenates
# `date` and `time` as strings) re-runnable.
is_sunday = df['dow'] == 6  # dayofweek: Monday=0 ... Sunday=6
last_sunday = df.loc[is_sunday, 'dt'].max().date()
sunday = df[is_sunday & (df['dt'].dt.date == last_sunday) & df['hour'].between(first_hour, last_hour)]
sunday = sunday.drop_duplicates('hour').sort_values('hour')

# The ID column is tied to the hour window, so a missing hour would otherwise
# only show up as an opaque length-mismatch error from the DataFrame
# constructor. Check up front and say which hours were found.
if len(sunday) != n_expected:
    raise ValueError(
        f"Expected {n_expected} hourly rows ({first_hour}-{last_hour}) for {last_sunday}, "
        f"found {len(sunday)}: hours {sunday['hour'].tolist()}"
    )

# Same 80/20 XGBoost/LightGBM blend as used for evaluation.
lgb_pred_sub = lgb.predict(sunday[features])
xgb_pred_sub = xgb.predict(sunday[features])
ens_pred_sub = 0.8*xgb_pred_sub + 0.2*lgb_pred_sub
submission = pd.DataFrame({
    "ID": range(1, n_expected + 1),
    "temperature_prediction": ens_pred_sub.round(2)
})
submission.to_csv("submission.csv", index=False)