In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense # type: ignore
from tensorflow.keras.callbacks import EarlyStopping # type: ignore

print("--- Step 1: Loading and Preprocessing Data ---")
try:
    dataset = pd.read_csv('Extracted-dataset.csv')
except FileNotFoundError:
    print("Error: 'Extracted-dataset.csv' not found. Please check the file path.")
    exit()

print("Original Dataset Shape:", dataset.shape)

# --- STEP 1: Encode binary target/label columns ---
dataset['RainToday'] = dataset['RainToday'].map({'No': 0, 'Yes': 1})
dataset['RainTomorrow'] = dataset['RainTomorrow'].map({'No': 0, 'Yes': 1})

# Check for fully missing columns
fully_missing_cols = dataset.columns[dataset.isnull().all()]
print("Columns with all missing values:", fully_missing_cols.tolist())

# --- STEP 2: Handle Missing Values ---
print("--- Step 2: Handling Missing Values ---")

# Isolate columns for different imputation strategies
categorical_cols = dataset.select_dtypes(include=['object']).columns
numerical_cols = dataset.select_dtypes(include=['float64', 'int64']).columns
columns_to_impute = [col for col in dataset.columns if col not in ['Date', 'Location']]

# Step 2.1: Impute object/categorical columns with mode
for col in categorical_cols:
    mode_value = dataset[col].mode()[0]
    print(f"Imputing missing values in column '{col}' with mode value: '{mode_value}'")
    dataset[col] = dataset[col].fillna(mode_value)

# Step 2.2: Label Encode categorical columns
print("Label encoding categorical columns...")
lencoders = {}
for col in categorical_cols:
    lencoders[col] = LabelEncoder()
    dataset[col] = lencoders[col].fit_transform(dataset[col])

# Prepare data for MICE imputation
dataset_for_mice = dataset[columns_to_impute].copy()

# Step 2.3: Impute numerical columns using Iterative Imputer (MICE)
print("Running MICE imputation on numerical features...")
mice_imputer = IterativeImputer(random_state=42)
dataset_imputed_array = mice_imputer.fit_transform(dataset_for_mice)

# Convert the imputed array back to a DataFrame with correct column names
dataset_imputed = pd.DataFrame(dataset_imputed_array, columns=dataset_for_mice.columns)

# Final check
print("Shape after imputation:", dataset_imputed.shape)
print(dataset_imputed.head(6))

# Step 1: Keep only numeric columns (needed for Pearson correlation)
numeric_df = dataset_imputed.select_dtypes(include=['float64', 'int64'])

# Step 2: Get correlation of each numeric column with 'RainTomorrow'
correlation_with_target = numeric_df.corr()['RainTomorrow'].sort_values(ascending=False)

# Step 3: Print correlation values
print("\n--- Pearson Correlation with 'RainTomorrow' ---")
print(correlation_with_target)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=correlation_with_target.drop('RainTomorrow').values,
    y=correlation_with_target.drop('RainTomorrow').index
)
plt.title("Pearson Correlation with 'RainTomorrow'")
plt.xlabel("Correlation Coefficient")
plt.ylabel("Features")
plt.tight_layout()
plt.savefig('correlation_barplot.png')
plt.close()

# Make a copy to avoid changing the original data unintentionally
data = dataset_imputed.copy()

# Remove 'RISK_MM' if it exists
if 'RISK_MM' in data.columns:
    data.drop('RISK_MM', axis=1, inplace=True)
    print("'RISK_MM' feature removed to prevent data leakage.")
else:
    print("'RISK_MM' already removed or not present.")

# Display the shape and the first few rows to confirm
print("\nUpdated dataset shape:", data.shape)
print("\nFirst 5 rows after removing 'RISK_MM':")
print(data.head())

# X = all features except 'RainTomorrow', y = target variable
X = data.drop('RainTomorrow', axis=1)
y = data['RainTomorrow']

# Split data before scaling to prevent data leakage from the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

# Apply StandardScaler only on the training data, then transform both train and test
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Check class balance
print("Original Class Distribution:\n", y.value_counts(normalize=True))
print("Training Set Class Distribution:\n", y_train.value_counts(normalize=True))
print("Testing Set Class Distribution:\n", y_test.value_counts(normalize=True))

# Plot class distribution in original training data
y_train_df = pd.DataFrame(y_train)

plt.figure(figsize=(6, 4))
sns.countplot(x=y_train_df['RainTomorrow'], hue=y_train_df['RainTomorrow'], palette='pastel', legend=False)
plt.title("Class Distribution Before Oversampling")
plt.xlabel("RainTomorrow (0 = No, 1 = Yes)")
plt.ylabel("Count")
plt.savefig('class_distribution_before_oversampling.png')
plt.close()

print("\n--- Step 3: Oversampling and Handling Outliers on Training Data ---")
df_train_combined = pd.concat([X_train_scaled, y_train.reset_index(drop=True)], axis=1)
minority_class = df_train_combined[df_train_combined['RainTomorrow'] == 1]
majority_class = df_train_combined[df_train_combined['RainTomorrow'] == 0]

if not minority_class.empty:
    minority_oversampled = resample(minority_class,
                                    replace=True,
                                    n_samples=len(majority_class),
                                    random_state=42)
    df_train_resampled = pd.concat([majority_class, minority_oversampled])
else:
    print("⚠️ Minority class is empty. Oversampling not performed.")
    df_train_resampled = df_train_combined.copy()

print("Class distribution after oversampling:\n", df_train_resampled['RainTomorrow'].value_counts())
plt.figure(figsize=(6, 4))
sns.countplot(x=df_train_resampled['RainTomorrow'], hue=df_train_resampled['RainTomorrow'], palette='pastel', legend=False)
plt.title("Class Distribution After Oversampling", fontsize=14)
plt.xlabel("Rain Tomorrow (0 = No, 1 = Yes)", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks([0, 1], ['No', 'Yes'])
plt.savefig('class_distribution_after_oversampling.png')
plt.close()

# Now, remove outliers from the BALANCED training data
features_to_clean = df_train_resampled.drop('RainTomorrow', axis=1)
Q1 = features_to_clean.quantile(0.25)
Q3 = features_to_clean.quantile(0.75)
IQR = Q3 - Q1

outlier_mask = ((features_to_clean < (Q1 - 1.5 * IQR)) | (features_to_clean > (Q3 + 1.5 * IQR))).any(axis=1)
cleaned_resampled = df_train_resampled[~outlier_mask]

X_train_final = cleaned_resampled.drop('RainTomorrow', axis=1)
y_train_final = cleaned_resampled['RainTomorrow']

print("Final shape after outlier removal:", cleaned_resampled.shape)
print("Final class distribution:\n", y_train_final.value_counts(normalize=True))

# Combine features and target
final_cleaned_data = pd.concat([X_train_final, y_train_final], axis=1)

print("\n--- Final Cleaned Training Dataset (Preview) ---")
print(final_cleaned_data.head())

# Save to CSV
final_cleaned_data.to_csv('Cleaned-dataset.csv', index=False)
print("\n✅ Final cleaned dataset saved as 'Cleaned-dataset.csv'")

# Model Training and Evaluation
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=500, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}
results = {}

for model_name, model in models.items():
    print(f"\n--- Training {model_name} ---")
    model.fit(X_train_final, y_train_final)

    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    results[model_name] = {
        'ROC_AUC': roc_auc_score(y_test, y_pred_proba),
        'Classification Report': classification_report(y_test, y_pred, output_dict=True)
    }

    print(f"--- {model_name} Evaluation on Test Data ---")
    print(f"ROC AUC Score: {results[model_name]['ROC_AUC']:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Rain', 'Rain'])
    fig, ax = plt.subplots(figsize=(6, 6))
    disp.plot(cmap=plt.cm.Blues, ax=ax)
    disp.ax_.set_title(f"Confusion Matrix for {model_name}")
    plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_")}.png')
    plt.close(fig)

# --- Add Neural Network Model ---
print("\n--- Training Neural Network ---")

# Define the Neural Network model
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_final.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
nn_model.compile(optimizer='adam',
                 loss='binary_crossentropy',
                 metrics=[tf.keras.metrics.AUC(name='auc')])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = nn_model.fit(X_train_final, y_train_final,
                        epochs=100,
                        batch_size=32,
                        validation_split=0.2,
                        callbacks=[early_stopping],
                        verbose=0)

# Evaluate the Neural Network
y_pred_proba_nn = nn_model.predict(X_test_scaled).ravel()
y_pred_nn = (y_pred_proba_nn > 0.5).astype(int)

results['Neural Network'] = {
    'ROC_AUC': roc_auc_score(y_test, y_pred_proba_nn),
    'Classification Report': classification_report(y_test, y_pred_nn, output_dict=True)
}

print(f"--- Neural Network Evaluation on Test Data ---")
print(f"ROC AUC Score: {results['Neural Network']['ROC_AUC']:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nn))

cm_nn = confusion_matrix(y_test, y_pred_nn)
disp_nn = ConfusionMatrixDisplay(confusion_matrix=cm_nn, display_labels=['No Rain', 'Rain'])
fig_nn, ax_nn = plt.subplots(figsize=(6, 6))
disp_nn.plot(cmap=plt.cm.Blues, ax=ax_nn)
disp_nn.ax_.set_title(f"Confusion Matrix for Neural Network")
plt.savefig(f'confusion_matrix_Neural_Network.png')
plt.close(fig_nn)

print("\n--- Step 5: Final Model Comparison Summary ---")

performance_data = []

for model_name, metrics in results.items():
    report = metrics['Classification Report']
    if '1' in report:
        precision_1 = report['1']['precision']
        recall_1 = report['1']['recall']
        f1_score_1 = report['1']['f1-score']
    else:
        precision_1 = recall_1 = f1_score_1 = 0.0

    performance_data.append({
        'Model': model_name,
        'ROC_AUC': metrics['ROC_AUC'],
        'Precision_1': precision_1,
        'Recall_1': recall_1,
        'F1-Score_1': f1_score_1
    })

performance_df = pd.DataFrame(performance_data)
performance_df = performance_df.sort_values(by='ROC_AUC', ascending=False)
print(performance_df)

# Step 3: Plot ROC curves
plt.figure(figsize=(10, 8))
models['Neural Network'] = nn_model

for model_name, model in models.items():
    if model_name == 'Neural Network':
        y_pred_proba = model.predict(X_test_scaled).ravel()
    else:
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.4f})')

plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.savefig('roc_curve_comparison.png')
plt.close()

# Step 4: Conclusion
print("\n--- Final Conclusion ---")
best_model = performance_df.iloc[0]['Model']
print(f"Based on ROC AUC and F1-Score for the minority class, the best performing model is likely: {best_model}")

--- Step 1: Loading and Preprocessing Data ---
Original Dataset Shape: (121575, 24)
Columns with all missing values: []
--- Step 2: Handling Missing Values ---
Imputing missing values in column 'Date' with mode value: '2013-03-02'
Imputing missing values in column 'Location' with mode value: 'Canberra'
Imputing missing values in column 'WindGustDir' with mode value: 'W'
Imputing missing values in column 'WindDir9am' with mode value: 'N'
Imputing missing values in column 'WindDir3pm' with mode value: 'W'
Label encoding categorical columns...
Running MICE imputation on numerical features...




Shape after imputation: (121575, 22)
   MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustDir  \
0     20.4     25.8       0.0          6.0      12.4          6.0   
1     20.9     26.7       0.2          8.0      10.3          3.0   
2     22.3     26.3       0.0          3.2       2.0          0.0   
3     21.6     22.2       1.2          2.8       0.0          2.0   
4     20.4     23.5       2.6          2.2       2.9          2.0   
5     20.4     24.4       0.0          3.0       8.7          0.0   

   WindGustSpeed  WindDir9am  WindDir3pm  WindSpeed9am  ...  Humidity3pm  \
0           31.0         6.0         7.0          13.0  ...         71.0   
1           31.0         5.0         4.0          15.0  ...         77.0   
2           35.0         1.0         9.0           6.0  ...         90.0   
3           41.0         2.0         2.0          20.0  ...         95.0   
4           52.0         2.0         2.0          24.0  ...         86.0   
5           48.0       

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost Evaluation on Test Data ---
ROC AUC Score: 0.9143

Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.87      0.90     28226
         1.0       0.65      0.78      0.71      8247

    accuracy                           0.85     36473
   macro avg       0.79      0.83      0.80     36473
weighted avg       0.87      0.85      0.86     36473


--- Training Neural Network ---


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 462us/step
--- Neural Network Evaluation on Test Data ---
ROC AUC Score: 0.9243

Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      0.90      0.92     28226
         1.0       0.69      0.78      0.73      8247

    accuracy                           0.87     36473
   macro avg       0.81      0.84      0.82     36473
weighted avg       0.88      0.87      0.87     36473


--- Step 5: Final Model Comparison Summary ---
                 Model   ROC_AUC  Precision_1  Recall_1  F1-Score_1
4       Neural Network  0.924309          0.0       0.0         0.0
1        Random Forest  0.915586          0.0       0.0         0.0
3              XGBoost  0.914254          0.0       0.0         0.0
2       MLP Classifier  0.904645          0.0       0.0         0.0
0  Logistic Regression  0.879546          0.0       0.0         0.0
[1m1140/1140[0m [32m━━━━━━━━━━━━━━━━━━━━[