In [None]:
import pandas as pd
import re

# LOAD DATA
df = pd.read_csv("/content/Dubai_weather_1990_today.csv")

# --- FIX COLUMN NAMES ---
# Trim, lower-case and collapse runs of spaces so later name lookups are stable.
df.columns = [re.sub(' +', ' ', col.strip().lower()) for col in df.columns]

# --- CONVERT DATE ---
df["time"] = pd.to_datetime(df["time"])

# --- SORT BY DATE ---
# Chronological order matters: interpolation and forward/backward fill below
# propagate values between neighbouring rows.
df = df.sort_values("time")

# --- DROP MOSTLY EMPTY COLUMNS ---
# These columns often contain too many NaNs in this specific dataset
mostly_empty = ["wind gust (km/h)", "sunshine duration (minutes)", "snow depth (cm)"]
df = df.drop(columns=[c for c in mostly_empty if c in df.columns], errors="ignore")

# --- LOCATE THE PRECIPITATION COLUMN ---
# Found before imputation so it can be excluded from it: the rain label must be
# built from measured precipitation only.
precip_col = [c for c in df.columns if "precip" in c]
label_source = precip_col[0] if precip_col else None

# --- FILL REMAINING MISSING VALUES (everything except the label source) ---
# interpolate() fills gaps between readings, ffill() carries the last reading
# forward, and bfill() copies the first available reading back over a leading
# gap. Applying that to precipitation would turn days without a measurement
# into copies of some other day's rainfall, so that column is left untouched.
impute_cols = [c for c in df.columns if c != label_source]
df[impute_cols] = df[impute_cols].interpolate().ffill().bfill()

# --- CREATE RAIN TODAY ---
if label_source is not None:
    observed = df[label_source].notna()
    # 1 = measurable rain, 0 = dry, NaN = no precipitation reading for that day.
    rain_flag = (df[label_source] > 0).astype(float).where(observed)
    # Keep the plain integer dtype whenever every day has a reading; fall back
    # to float only when NaN is needed to mark unknown days.
    df["rain_today"] = rain_flag.astype(int) if observed.all() else rain_flag
    print("Days without a precipitation reading:", int((~observed).sum()))
else:
    print("Warning: No precipitation column found.")

print("Cleaned columns:", df.columns.tolist())
print("Remaining rows:", len(df))
display(df.head())

Cleaned columns: ['time', 'average temperature (°c)', 'minimum temperature (°c)', 'maximum temperature (°c)', 'precipitation (mm)', 'wind direction (°)', 'wind speed (km/h)', 'air pressure (hpa)', 'rain_today']
Remaining rows: 12824


Unnamed: 0,time,average temperature (°c),minimum temperature (°c),maximum temperature (°c),precipitation (mm),wind direction (°),wind speed (km/h),air pressure (hpa),rain_today
0,1990-01-01,18.3,13.6,23.4,135.9,229.0,17.8,996.1,1
1,1990-01-02,17.8,9.9,24.7,135.9,229.0,17.8,996.1,1
2,1990-01-03,17.9,14.0,22.8,135.9,229.0,17.8,996.1,1
3,1990-01-04,19.3,10.8,27.8,135.9,229.0,17.8,996.1,1
4,1990-01-05,21.2,17.5,26.2,135.9,229.0,17.8,996.1,1


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Work on a copy so the engineered columns below are not added to `df`
# (this also avoids SettingWithCopy warnings).
df_model = df.copy()

# --- TARGET = RAIN TOMORROW ---
# shift(-1) moves the next row's label onto the current row; the final row has
# no successor and therefore receives NaN.
df_model["rain_tomorrow"] = df_model["rain_today"].shift(-1)

# --- FEATURE ENGINEERING ---
# Every feature looks at the current row or earlier rows only (positive shifts
# and a trailing rolling window).
df_model["temp_range"] = df_model["maximum temperature (\u00b0c)"] - df_model["minimum temperature (\u00b0c)"]
df_model["avg_temp_lag1"] = df_model["average temperature (\u00b0c)"].shift(1)
df_model["avg_temp_lag3"] = df_model["average temperature (\u00b0c)"].shift(3)
df_model["rain_lag1"] = df_model["rain_today"].shift(1)
# Count of rainy rows among the current row and the two before it.
df_model["rain_last3"] = df_model["rain_today"].rolling(3).sum()
df_model["month"] = df_model["time"].dt.month
df_model["dayofyear"] = df_model["time"].dt.dayofyear

# --- DROP NaN FROM SHIFTS/ROLLING ---
df_model = df_model.dropna()

# shift() had to upcast the target to float to hold the trailing NaN. With the
# NaN rows gone, restore integer class labels so the classifier and the report
# work with classes 0/1 instead of 0.0/1.0.
df_model["rain_tomorrow"] = df_model["rain_tomorrow"].astype(int)

# --- BUILD X AND y ---
feature_cols = [
    "minimum temperature (\u00b0c)",
    "maximum temperature (\u00b0c)",
    "average temperature (\u00b0c)",
    "temp_range",
    "avg_temp_lag1",
    "avg_temp_lag3",
    "rain_lag1",
    "rain_last3",
    "month",
    "dayofyear"
]

# --- TIME SPLIT ---
# Chronological hold-out: rows before the split date train the model, rows on
# or after it are used for evaluation only.
split_date = pd.Timestamp("2022-01-01")
train = df_model[df_model["time"] < split_date]
test  = df_model[df_model["time"] >= split_date]

X_train, y_train = train[feature_cols], train["rain_tomorrow"]
X_test, y_test = test[feature_cols], test["rain_tomorrow"]

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

# --- TRAIN MODEL ---
# class_weight="balanced" reweights the classes by inverse frequency;
# random_state pins the forest's randomness so reruns give the same model.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)

# --- EVALUATE ---
pred = model.predict(X_test)
print(classification_report(y_test, pred))

Training samples: 11685, Testing samples: 1135
              precision    recall  f1-score   support

         0.0       0.96      0.99      0.98      1089
         1.0       0.36      0.11      0.17        46

    accuracy                           0.96      1135
   macro avg       0.66      0.55      0.57      1135
weighted avg       0.94      0.96      0.94      1135

