# Get data

In [33]:
import pandas as pd

data = pd.read_csv("final_new.csv")
data = data[data['next_day_weather_status'] != 'Drizzle']

X = data.drop(['next_day_weather_status', 'date','id', 'weather_code', 'weather_status'], axis=1)
y = data['next_day_weather_status']

print(X.columns)

print(y.value_counts())

Index(['month', 'day', 'temperature_2m_mean', 'temperature_2m_max',
       'temperature_2m_min', 'wind_speed_10m_max', 'wind_gusts_10m_max',
       'wind_direction_10m_dominant', 'shortwave_radiation_sum',
       'et0_fao_evapotranspiration'],
      dtype='object')
Cloudy       99658
Clear sky    31551
Rain         27549
Snow         17688
Name: next_day_weather_status, dtype: int64


# data is Unbalanced so use Synthetic Data Generation

In [34]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE,SVMSMOTE,KMeansSMOTE 
# smote = SMOTE(random_state=42)
smote = KMeansSMOTE(random_state=42) # the best
X_resampled, y_resampled = smote.fit_resample(X, y)
print(y_resampled.value_counts())

Rain         99661
Clear sky    99660
Snow         99659
Cloudy       99658
Name: next_day_weather_status, dtype: int64


# Train model 

In [None]:
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


model = RandomForestClassifier(random_state=42)  

model.fit(X_train, y_train)


y_pred = model.predict(X_test)



# metrics

In [39]:
from sklearn.metrics import accuracy_score, classification_report

print(f"Accuracy: {accuracy_score(y_test, y_pred)*100:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 86.7173

Classification Report:
              precision    recall  f1-score   support

   Clear sky       0.87      0.88      0.87     20153
      Cloudy       0.74      0.73      0.73     19935
        Rain       0.96      0.96      0.96     19763
        Snow       0.90      0.90      0.90     19877

    accuracy                           0.87     79728
   macro avg       0.87      0.87      0.87     79728
weighted avg       0.87      0.87      0.87     79728

