In [None]:
# Machine Learning Model Training and Comparison

## Step 1: Import Libraries

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
```

## Step 2: Load and Prepare the Data

```python
# The scaled splits and labels must already exist in the kernel namespace;
# nothing visible in this section creates X_train_scaled / X_test_scaled.
# Fail fast with a message naming what is missing instead of relying on no-op
# self-assignments that surface as a bare NameError.
_required_inputs = ("X_train_scaled", "X_test_scaled", "y_train", "y_test")
_missing_inputs = [name for name in _required_inputs if name not in globals()]
if _missing_inputs:
    raise NameError(
        "Run the preprocessing/scaling steps first; missing variables: "
        + ", ".join(_missing_inputs)
    )
```

## Step 3: Define and Train Models

### Logistic Regression
```python
# Logistic-regression baseline; the solver's iteration cap is raised to 1000.
logistic_params = {"max_iter": 1000}
log_model = LogisticRegression(**logistic_params)
log_model.fit(X=X_train_scaled, y=y_train)
```

### Random Forest
```python
# Random forest with 100 trees; the fixed random_state keeps the fit repeatable.
forest_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**forest_params)
rf_model.fit(X=X_train_scaled, y=y_train)
```

### XGBoost
```python
# `use_label_encoder` is deprecated in XGBoost and only triggers an
# "unused parameter" warning on current releases, so it is not passed here.
# Log-loss is used as the evaluation metric.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train_scaled, y_train)
```

### Neural Network with Keras
```python
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# and batch shuffling are repeatable, in line with the fixed random_state=42
# used for the tree-based models.
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers (128 and 64 units), each followed by 30% dropout, and a
# single sigmoid unit that outputs the positive-class probability.
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen.
es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 20% of the training data is held out for validation; progress output is silenced.
nn_model.fit(X_train_scaled, y_train, validation_split=0.2, epochs=50, batch_size=64, callbacks=[es], verbose=0)
```

## Step 4: Evaluate Models

```python
def evaluate_model(model, X_test, y_test, is_keras=False):
    """Score a fitted binary classifier on held-out data and print a summary.

    Args:
        model: Fitted estimator. When ``is_keras`` is False it must provide
            ``predict_proba`` and ``predict``. When True only ``predict`` is
            used; its output is flattened and treated as the positive-class
            probability.
        X_test: Features to score.
        y_test: True labels for ``X_test``.
        is_keras: Select the Keras prediction path, where hard labels are
            obtained by thresholding the probabilities at 0.5.

    Returns:
        Tuple ``(y_pred_proba, report_dict, roc_auc)``: the positive-class
        probabilities, the classification report as a dict, and the ROC AUC.
    """
    if is_keras:
        y_pred_proba = model.predict(X_test).ravel()
        y_pred = (y_pred_proba > 0.5).astype(int)
    else:
        # Column 1 of predict_proba is taken as the positive-class probability.
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        y_pred = model.predict(X_test)

    # Compute the ROC AUC once and reuse it for both the printout and the
    # return value instead of scoring the same predictions twice.
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print("ROC AUC Score:", roc_auc)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    return y_pred_proba, report_dict, roc_auc

# Evaluate every fitted model with evaluate_model and store its report dict and
# ROC AUC under a display name. Only the Keras network uses the is_keras path.
results = {}

fitted_models = [
    ('Logistic Regression', log_model, False),
    ('Random Forest', rf_model, False),
    ('XGBoost', xgb_model, False),
    ('Neural Network (Keras)', nn_model, True),
]

for display_name, fitted, uses_keras in fitted_models:
    _, cr, auc = evaluate_model(fitted, X_test_scaled, y_test, is_keras=uses_keras)
    results[display_name] = {'Classification Report': cr, 'ROC_AUC': auc}
```

## Step 5: Final Comparison

```python
# Compare all models on ROC AUC and on the positive class (label 1).
# classification_report(output_dict=True) keys each class by the string form of
# its label, so the positive class is '1.0' for float labels and '1' for integer
# labels. Accept either instead of raising KeyError on integer-labelled targets.
performance_data = []

for model_name, metrics in results.items():
    report = metrics['Classification Report']
    positive_key = next((key for key in ('1.0', '1') if key in report), None)
    if positive_key is None:
        raise KeyError(
            f"No positive-class entry ('1.0' or '1') in the classification report for {model_name}"
        )

    precision_1 = report[positive_key]['precision']
    recall_1 = report[positive_key]['recall']
    f1_score_1 = report[positive_key]['f1-score']

    performance_data.append({
        'Model': model_name,
        'ROC_AUC': metrics['ROC_AUC'],
        'Precision_1': precision_1,
        'Recall_1': recall_1,
        'F1-Score_1': f1_score_1
    })

# One row per model, best ROC AUC first.
performance_df = pd.DataFrame(performance_data)
performance_df = performance_df.sort_values(by='ROC_AUC', ascending=False)
print(performance_df)
```

## Step 6: Plot ROC Curves

```python
plt.figure(figsize=(10, 8))

# Look the scikit-learn style estimators up explicitly by display name. Deriving
# a variable name from the label and eval()-ing it yields names such as
# `logistic_regression_model`, which do not exist: the fitted objects are
# log_model, rf_model and xgb_model.
fitted_estimators = {
    'Logistic Regression': log_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
}

for model_name in results.keys():
    if model_name == 'Neural Network (Keras)':
        # The Keras model's predict output is flattened and used as the score.
        y_pred_proba = nn_model.predict(X_test_scaled).ravel()
    else:
        y_pred_proba = fitted_estimators[model_name].predict_proba(X_test_scaled)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.4f})')

# Dashed diagonal: reference line for a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()
```

## Step 7: Conclusion

```python
# performance_df is sorted by ROC AUC in descending order, so its first row
# holds the top-ranked model.
print("\n--- Final Conclusion ---")
best_model = performance_df['Model'].iloc[0]
print(f"Based on ROC AUC and F1-Score for the minority class, the best performing model is likely: {best_model}")
```


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
print("--- Step 1: Loading and Preprocessing Data ---")
dataset = pd.read_csv('Extracted-dataset.csv')
print("Original Dataset Shape:", dataset.shape)
dataset.head()
# --- STEP 1: Encode binary target/label columns ---
# 'No' becomes 0 and 'Yes' becomes 1; values outside the mapping become NaN.
yes_no_codes = {'No': 0, 'Yes': 1}
for binary_col in ('RainToday', 'RainTomorrow'):
    dataset[binary_col] = dataset[binary_col].map(yes_no_codes)
dataset.head()
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

# Check for fully missing columns
fully_missing_cols = dataset.columns[dataset.isnull().all()]
print("Columns with all missing values:", fully_missing_cols.tolist())

# --- STEP 2: Handle Missing Values ---
print("--- Step 2: Handling Missing Values ---")

# Step 2.1: Impute object/categorical columns with mode
categorical_cols = dataset.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # mode() returns the most frequent value(s); the first one is used.
    mode_value = dataset[col].mode()[0]
    print(f"Imputing missing values in column '{col}' with mode value: '{mode_value}'")
    # fillna has to be called with the fill value. Assigning the bare method
    # would overwrite the whole column with the method object.
    dataset[col] = dataset[col].fillna(mode_value)


# Step 2.2: Label Encode categorical columns
# Each column gets its own fitted encoder, kept in `lencoders` by column name.
print("Label encoding categorical columns...")
lencoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    dataset[col] = encoder.fit_transform(dataset[col])
    lencoders[col] = encoder

# Step 2.3: Impute numerical columns using Iterative Imputer (MICE)
print("Running MICE imputation on numerical features...")
mice_imputer = IterativeImputer(random_state=42)
dataset_imputed_array = mice_imputer.fit_transform(dataset)

# By default IterativeImputer discards columns that have no observed values, so
# the result can be narrower than the input. Keep exactly the columns with at
# least one observed value so the names stay aligned with the imputed array;
# taking the first N columns would mislabel data whenever a discarded column is
# not the last one.
if dataset_imputed_array.shape[1] != dataset.shape[1]:
    print("Warning: Imputed array shape does not match original dataset. Fixing...")
    dataset = dataset.loc[:, dataset.notna().any(axis=0)]

assert dataset_imputed_array.shape[1] == dataset.shape[1], (
    "Imputed array and column labels are still misaligned"
)

dataset_imputed = pd.DataFrame(dataset_imputed_array, columns=dataset.columns)

# Final check
print("Shape after imputation:", dataset_imputed.shape)
dataset_imputed.head(6)
# Pearson correlation needs numeric data, so restrict to float64/int64 columns.
numeric_df = dataset_imputed.select_dtypes(include=['float64', 'int64'])

# Correlate every numeric column with 'RainTomorrow', ordered from the largest
# to the smallest coefficient.
correlation_matrix = numeric_df.corr()
correlation_with_target = correlation_matrix['RainTomorrow'].sort_values(ascending=False)

print("\n--- Pearson Correlation with 'RainTomorrow' ---")
print(correlation_with_target)

# Bar chart of the coefficients without the target's own entry; the index
# supplies the feature names for the y axis.
feature_correlations = correlation_with_target.drop('RainTomorrow')
sns.barplot(
    x=feature_correlations.values,
    y=feature_correlations.index,
)
# Work on a copy so dataset_imputed itself is left untouched.
data = dataset_imputed.copy()

# Drop 'RISK_MM' when it is present, and report which case applied.
if 'RISK_MM' not in data.columns:
    print("'RISK_MM' already removed or not present.")
else:
    data = data.drop(columns='RISK_MM')
    print("'RISK_MM' feature removed to prevent data leakage.")

# Show the resulting shape and the first rows as a visual confirmation.
print("\nUpdated dataset shape:", data.shape)
print("\nFirst 5 rows after removing 'RISK_MM':")
display(data.head())
cleaned = data.copy()
# X = every column except the target 'RainTomorrow'
X = cleaned.drop(columns='RainTomorrow')
# y = the target column
y = cleaned['RainTomorrow']

from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed, stratified on the target.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Check class balance: print class proportions for the full data and each split.
for heading, target in (
    ("Original Class Distribution:\n", y),
    ("Training Set Class Distribution:\n", y_train),
    ("Testing Set Class Distribution:\n", y_test),
):
    print(heading, target.value_counts(normalize=True))

# Class counts in the training split before any resampling.

# Wrap the target in a one-column DataFrame named 'RainTomorrow' for countplot.
y_train_df = pd.DataFrame(y_train, columns=['RainTomorrow'])

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(
    data=y_train_df,
    x='RainTomorrow',
    hue='RainTomorrow',
    palette='pastel',
    legend=False,
    ax=ax,
)
ax.set_title("Class Distribution Before Oversampling")
ax.set_xlabel("RainTomorrow (0 = No, 1 = Yes)")
ax.set_ylabel("Count")
plt.show()
from sklearn.utils import resample

# Requires X_train and y_train from the train/test split.
print("\n--- Step 3: Oversampling and Handling Outliers on Training Data ---")
df_train_combined = pd.concat([X_train, y_train], axis=1)

# Rows are selected by exact label value: 1 is the minority, 0 the majority.
target_values = df_train_combined['RainTomorrow']
minority_class = df_train_combined[target_values == 1]
majority_class = df_train_combined[target_values == 0]

if minority_class.empty:
    print("⚠️ Minority class is empty. Oversampling not performed.")
    df_train_resampled = df_train_combined.copy()
else:
    # Sample the minority rows with replacement until they match the majority count.
    minority_oversampled = resample(
        minority_class,
        replace=True,
        n_samples=len(majority_class),
        random_state=42,
    )
    df_train_resampled = pd.concat([majority_class, minority_oversampled])

resampled_counts = df_train_resampled['RainTomorrow'].value_counts()
print("Class distribution after oversampling:\n", resampled_counts)

plt.figure(figsize=(6, 4))
sns.countplot(
    data=df_train_resampled,
    x='RainTomorrow',
    hue='RainTomorrow',
    palette='pastel',
    legend=False,
)

plt.title("Class Distribution After Oversampling", fontsize=14)
plt.xlabel("Rain Tomorrow (0 = No, 1 = Yes)", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.xticks([0, 1], ['No', 'Yes'])

plt.show()
# IQR-based outlier removal on the resampled training data: a row is dropped
# when any of its columns lies more than 1.5 * IQR outside that column's
# quartiles.
Q1 = df_train_resampled.quantile(0.25)
Q3 = df_train_resampled.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outside_fences = (df_train_resampled < lower_fence) | (df_train_resampled > upper_fence)
cleaned_resampled = df_train_resampled[~outside_fences.any(axis=1)]

# Split the filtered frame back into features and target.
X_train_final = cleaned_resampled.drop('RainTomorrow', axis=1)
y_train_final = cleaned_resampled['RainTomorrow']

print("Final shape after outlier removal:", cleaned_resampled.shape)
print("Final class distribution:\n", y_train_final.value_counts(normalize=True))

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_curve

print("\n--- Step 4: Training and Comparing Models ---")
# Candidate classifiers, all seeded with random_state=42.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear', class_weight='balanced'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42,  eval_metric='logloss')
}

# Fit each model on the resampled, outlier-filtered training data and score it
# on the untouched test split. `results` keeps the ROC AUC and the report dict
# per model.
results = {}
for model_name, model in models.items():
    print(f"\n--- Training {model_name} ---")
    model.fit(X_train_final, y_train_final)

    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class probability.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    results[model_name] = {
        'ROC_AUC': roc_auc_score(y_test, y_pred_proba),
        'Classification Report': classification_report(y_test, y_pred, output_dict=True)
    }

    print(f"--- {model_name} Evaluation on Test Data ---")
    print(f"ROC AUC Score: {results[model_name]['ROC_AUC']:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Rain', 'Rain'])
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(f"Confusion Matrix for {model_name}")
    plt.show()

# Printed once after all models are evaluated, not once per loop iteration.
print("\n--- Step 5: Final Model Comparison Summary ---")

# Step 1: Build one summary row per model from the positive-class ('1.0') entry
# of its classification report plus its ROC AUC.
performance_data = []

for model_name, metrics in results.items():
    positive_report = metrics['Classification Report']['1.0']
    summary_row = {
        'Model': model_name,
        'ROC_AUC': metrics['ROC_AUC'],
        'Precision_1': positive_report['precision'],
        'Recall_1': positive_report['recall'],
        'F1-Score_1': positive_report['f1-score'],
    }
    performance_data.append(summary_row)

# Step 2: Turn the rows into a table ordered by ROC AUC, best first.
performance_df = pd.DataFrame(performance_data).sort_values(by='ROC_AUC', ascending=False)
print(performance_df)

# Step 3: Draw one ROC curve per fitted model on a shared figure, with the AUC
# shown in the legend.
plt.figure(figsize=(10, 8))
for model_name, model in models.items():
    positive_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    auc_value = roc_auc_score(y_test, positive_scores)
    curve_label = f'{model_name} (AUC = {auc_value:.4f})'
    plt.plot(fpr, tpr, label=curve_label)

# Dashed black diagonal as the reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('Receiver Operating Characteristic (ROC) Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.grid(True)
plt.legend(loc='lower right')
plt.show()

# Step 4: Conclusion
# performance_df is sorted by ROC AUC in descending order, so its first row
# holds the top-ranked model.
print("\n--- Final Conclusion ---")
best_model = performance_df['Model'].iloc[0]
print(f"Based on ROC AUC and F1-Score for the minority class, the best performing model is likely: {best_model}")
