# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (confusion_matrix, roc_curve, auc, accuracy_score,
                             f1_score, log_loss, recall_score)
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier,
                              ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier)
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as BayesianLDA, QuadraticDiscriminantAnalysis as BayesianQDA
from sklearn.naive_bayes import GaussianNB

# For CNN model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Keyword arguments unpacked into lgb.LGBMClassifier when the model table is built
lgb_params = dict(
    objective="binary",
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=31,
    max_depth=-1,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=5,
)

# Data preparation function
def prepare_data(input_file, show_info=True):
    """Load the CSV, encode and scale the features, and split into train/test sets.

    The file must contain the columns "Geology", "LandCover" (one-hot
    encoded here) and "LS" (the target). Every column is coerced to numeric;
    values that cannot be parsed become NaN.

    Args:
        input_file: Path (or any source accepted by ``pd.read_csv``) of the data.
        show_info: Accepted for backward compatibility; currently not used.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, df)`` where the X parts
        are standardized NumPy arrays, the y parts are the integer category
        codes of "LS", and ``df`` is the full frame after one-hot encoding.
    """
    df = pd.read_csv(input_file)

    # Coerce every column to numeric; unparseable entries turn into NaN
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # One-hot encode categorical columns
    categorical_cols = ["Geology", "LandCover"]
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_columns = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded_columns,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index,  # keep rows aligned with df during the concat
    )
    df = pd.concat([df.drop(categorical_cols, axis=1), encoded_df], axis=1)

    # Separate target variable and features
    X = df.drop(columns='LS')
    y = df['LS'].astype('category').cat.codes

    # Stratified train-test split, done BEFORE scaling so that the test rows
    # never influence the scaler's mean/variance (avoids data leakage)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=5, shuffle=True, stratify=y
    )

    # Standardize features: fit on the training rows only, then apply the
    # same transformation to the test rows
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, df

# Load and prepare data
input_file = "Landslides.csv"
X_train, X_test, y_train, y_test, df = prepare_data(input_file)

# Model Definitions (added NaiveBayes)
# Every estimator whose fit involves randomness gets a fixed random_state so
# that repeated runs of the comparison produce the same scores.
models = {
    "RandomForestClassifier": RandomForestClassifier(random_state=0),
    "GradientBoostingClassifier": GradientBoostingClassifier(random_state=0),
    "MultiLayerPerceptronClassifier": MLPClassifier(hidden_layer_sizes=(100,), max_iter=600, batch_size=32, alpha=0.05, learning_rate_init=0.001, random_state=0),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=0),
    "LogisticClassifier": LogisticRegression(max_iter=1000),
    "LightGBMClassifier": lgb.LGBMClassifier(**lgb_params),
    "SVMClassifier": SVC(probability=True, random_state=0),
    "NaiveBayesClassifier": GaussianNB(),
}

# Fit every non-CNN model on the training split, score it on the test split,
# and collect one row of metrics per model
results = []
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Positive-class probabilities when the estimator offers them;
    # otherwise the hard predictions stand in
    has_proba = hasattr(model, "predict_proba")
    y_probs = model.predict_proba(X_test)[:, 1] if has_proba else y_pred

    # Label-based metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    recall_val = recall_score(y_test, y_pred)

    # Probability-based metrics are only defined with predict_proba
    if has_proba:
        mean_cross_entropy = log_loss(y_test, y_probs)
        fpr, tpr, _ = roc_curve(y_test, y_probs)
        auc_val = auc(fpr, tpr)
    else:
        mean_cross_entropy = np.nan
        auc_val = np.nan

    # Specificity = TN / (TN + FP), taken from the confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    negatives = tn + fp
    specificity_val = tn / negatives if negatives > 0 else np.nan

    row = {
        "Model": model_name,
        "Accuracy": accuracy,
        "Mean Cross Entropy": mean_cross_entropy,
        "F1 Score": f1,
        "Recall": recall_val,
        "AUC": auc_val,
        "Specificity": specificity_val,
    }
    results.append(row)

    # Report: model name first, then each metric to four decimals,
    # followed by a blank separator line
    print(f"Model: {model_name}")
    for metric, value in list(row.items())[1:]:
        print(f"{metric}: {value:.4f}")
    print()

# Tabulate all rows, best accuracy first
results_df = pd.DataFrame(results)
print("\nModel Comparison:")
print(results_df.sort_values(by="Accuracy", ascending=False))

# -------------------- CNN MODEL --------------------

# Conv1D needs a trailing channel axis: (samples, features) -> (samples, features, 1)
X_train_cnn = np.expand_dims(X_train, axis=-1)
X_test_cnn = np.expand_dims(X_test, axis=-1)

# Define CNN model architecture
def create_cnn_model(input_shape):
    """Build and compile the 1-D convolutional binary classifier.

    Args:
        input_shape: Shape of one sample, passed to the first Conv1D layer.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output unit,
        using the Adam optimizer, binary cross-entropy loss and an accuracy
        metric.
    """
    layer_stack = [
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Seed Python, NumPy and TensorFlow before the network is built so that
# weight initialisation and batch shuffling start from the same state on
# every run, in line with the fixed random_state used elsewhere in the script
tf.keras.utils.set_random_seed(5)

cnn_model = create_cnn_model((X_train.shape[1], 1))
# Train the CNN (using validation split from training data)
cnn_model.fit(X_train_cnn, y_train, epochs=50, batch_size=32, verbose=0, validation_split=0.2)

# Predict using CNN: sigmoid outputs flattened to 1-D, thresholded at 0.5 for labels
y_pred_cnn_prob = cnn_model.predict(X_test_cnn).ravel()
y_pred_cnn = (y_pred_cnn_prob > 0.5).astype(int)

# Compute evaluation metrics for CNN (same set as for the other models)
accuracy = accuracy_score(y_test, y_pred_cnn)
f1 = f1_score(y_test, y_pred_cnn)
mean_cross_entropy = log_loss(y_test, y_pred_cnn_prob)
recall_val = recall_score(y_test, y_pred_cnn)
fpr, tpr, _ = roc_curve(y_test, y_pred_cnn_prob)
auc_val = auc(fpr, tpr)
# Specificity = TN / (TN + FP)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_cnn).ravel()
specificity_val = tn / (tn + fp) if (tn + fp) > 0 else np.nan

results.append({
    "Model": "CNNClassifier",
    "Accuracy": accuracy,
    "Mean Cross Entropy": mean_cross_entropy,
    "F1 Score": f1,
    "Recall": recall_val,
    "AUC": auc_val,
    "Specificity": specificity_val
})

print("Model: CNNClassifier")
print(f"Accuracy: {accuracy:.4f}")
print(f"Mean Cross Entropy: {mean_cross_entropy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Recall: {recall_val:.4f}")
print(f"AUC: {auc_val:.4f}")
print(f"Specificity: {specificity_val:.4f}\n")

# Update results DataFrame to include CNN
results_df = pd.DataFrame(results)
print("\nUpdated Model Comparison:")
print(results_df.sort_values(by="Accuracy", ascending=False))

# Plot ROC curve for all models (only models with probability estimates)
plt.figure(figsize=(10, 8))
for model_name, model in models.items():
    if not hasattr(model, "predict_proba"):
        continue
    y_probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_probs)
    plt.plot(fpr, tpr, label=model_name)

# Add CNN ROC curve, computed from the CNN's own probabilities; reusing
# fpr/tpr here would redraw the curve of the last model in the loop above
fpr_cnn, tpr_cnn, _ = roc_curve(y_test, y_pred_cnn_prob)
plt.plot(fpr_cnn, tpr_cnn, label="CNNClassifier")

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
# Save before show(): once the window is shown and closed the current figure
# is typically released, and a later savefig would write an empty image
plt.savefig("roc_plot.png", dpi=300)
plt.show()