# **DATA CLEANING**

***Handling Missing Values:***

  a) The code first checks for missing values, then imputes numerical columns with the mean and categorical columns with the mode. Median and KNN imputation are alternative options that are not enabled in the cell below.

  b) It also allows you to remove rows or columns with excessive missing values.

***Outlier Detection and Treatment:***

  a) Outliers are detected using Z-scores and the Interquartile Range (IQR) method.

  b) You can choose to remove outliers, cap them to a certain range, or apply transformations like log transformation.

***Removing Inconsistencies:***

  a) Duplicate rows are identified and removed.

  b) Categorical variables are checked for inconsistent formatting or labeling errors.

  c) Coordinates are validated to ensure they fall within the valid range for latitude and longitude.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from scipy import stats

# Load the raw dataset
df = pd.read_csv('dataset_landslide_imputed.csv')

# Display the first few rows of the dataset
print("Initial Dataset:")
print(df.head())

# Step 1: Handle Missing Values
# Check for missing values
print("\nMissing Values Summary:")
print(df.isnull().sum())

# Option 1: Remove rows/columns with excessive missing values.
# This has to happen BEFORE imputation: once the gaps are filled nothing is
# sparse any more and the thresholds below can never trigger.
# `thresh` is the minimum number of non-missing values needed to keep a
# row/column; rounding n/2 up means anything with more than 50% missing is
# dropped even when n is odd.
# Drop rows with more than 50% missing values
df = df.dropna(thresh=(len(df.columns) + 1) // 2, axis=0)
# Drop columns with more than 50% missing values
df = df.dropna(thresh=(len(df) + 1) // 2, axis=1)

# Option 2: Impute the remaining missing values using mean, median, or mode.
# The column lists are computed after the drop so that later steps never
# reference a column that no longer exists.
# NOTE(review): every numeric column is mean-imputed, which presumably
# includes the target column if it is numeric — confirm it has no gaps.
# For numerical columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())  # or .median()

# For categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    # mode() can return several rows on ties; the first row is used
    df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Step 2: Outlier Detection and Treatment
# Screen feature columns only. The target ('label') must stay out of the
# screen: if it is a rare 0/1 class, its own z-score can exceed 3 and the
# removal step below would discard exactly the cases being modelled.
# The intersection also skips any numeric column that is no longer in df.
outlier_cols = numerical_cols.intersection(df.columns).drop('label', errors='ignore')

# Detect outliers using Z-scores: a row is flagged when any screened column
# lies more than 3 standard deviations from that column's mean
z_scores = np.abs(stats.zscore(df[outlier_cols]))
outliers = (z_scores > 3).any(axis=1)
print("\nOutliers detected using Z-scores:")
print(df[outliers])

# Detect outliers using IQR: a row is flagged when any screened column falls
# outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. This result is only reported.
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1
outliers_iqr = ((df[outlier_cols] < (Q1 - 1.5 * IQR)) | (df[outlier_cols] > (Q3 + 1.5 * IQR))).any(axis=1)
print("\nOutliers detected using IQR:")
print(df[outliers_iqr])

# Decide on treatment: Remove, cap, or transform outliers
# Option 1: Remove the rows flagged by the Z-score rule
df = df[~outliers]

# Step 3: Remove Inconsistencies
# Report rows that exactly repeat an earlier row, then drop the repeats
duplicate_mask = df.duplicated()
print("\nDuplicate Rows:")
print(df[duplicate_mask])
df = df.drop_duplicates()

# List the distinct values of each categorical column so that inconsistent
# spellings or labels can be spotted by eye (nothing is corrected here)
print("\nUnique Values in Categorical Columns:")
for col in categorical_cols:
    distinct_values = df[col].unique()
    print(f"{col}: {distinct_values}")

# Names of the coordinate columns
latitude_col = 'latitude'
longitude_col = 'longitude'

# Accepted (min, max) bounds for each coordinate
valid_lat_range = (11.27, 11.98)
valid_lon_range = (75.80, 76.44)

# Keep only the rows whose latitude AND longitude both lie within the bounds
lat_in_range = df[latitude_col].between(*valid_lat_range)
lon_in_range = df[longitude_col].between(*valid_lon_range)
df = df[lat_in_range & lon_in_range]

# Preview the final cleaned dataset
print("\nCleaned Dataset:")
print(df.head())

# Write the cleaned dataset to a new file (row index omitted)
df.to_csv('cleaned_dataset.csv', index=False)

# **STRATIFIED SAMPLING**

a) ***Stratified Sampling:***

  The train_test_split function is used with the stratify=y parameter to ensure that the proportion of landslide/no-landslide cases is the same in both the training and testing sets as in the original dataset.

b) ***Train-Test Split:***

  The dataset is split into 80% training and 20% testing (test_size=0.2).

  A random seed (random_state=42) is set for reproducibility.

c) ***Distribution of Target Variable:***

  The distribution of the target variable (label) is checked in both the training and testing sets to ensure balanced representation.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned dataset
df = pd.read_csv('cleaned_dataset.csv')

# Preview the data before it is split
print("Dataset before splitting:")
print(df.head())

# Target variable (y) is the 'label' column; every other column is a feature (X)
y = df['label']
X = df.drop(columns=['label'])

# Step 1: Stratified 80/20 train/test split.
# stratify=y keeps the class proportions of y in both parts;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Train size: {X_train.shape}, Test size: {X_test.shape}")

# Display the shape of the resulting datasets
print("\nShape of Training Data:", X_train.shape)
print("Shape of Testing Data:", X_test.shape)

# Report the class proportions in each part to confirm the stratification
for split_name, target in (("Training", y_train), ("Testing", y_test)):
    print(f"\nDistribution of Target Variable in {split_name} Set:")
    print(target.value_counts(normalize=True))

# Write the four parts to their own CSV files (row index omitted)
for part, file_name in (
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
):
    part.to_csv(file_name, index=False)

print("\nTraining and testing sets saved to files.")

# **DATA TRANSFORMATION**

***Feature Scaling:***

  a) *Standardization (Z-score normalization):*

        Transforms features to have a mean of 0 and a standard deviation of 1.

        Formula: $z = \frac{x - \mu}{\sigma}$

        Suitable for features with Gaussian (normal) distribution.
        

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the train/test parts written by the splitting step
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
# squeeze() turns each single-column frame into a Series
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Step 1: Feature Scaling
# Scale every numeric column except 'id', 'latitude' and 'longitude'
# (errors='ignore' tolerates any of the three being absent)
numerical_cols = (
    X_train.select_dtypes(include=[np.number])
    .columns
    .drop(['id', 'latitude', 'longitude'], errors='ignore')
)

# Learn mean/std on the training part only and apply them to both parts
scaler = StandardScaler()
scaler.fit(X_train[numerical_cols])
X_train[numerical_cols] = scaler.transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

print("\nDataset after feature scaling:")
print("X_train head:\n", X_train.head())
print("X_test head:\n", X_test.head())

# Step 2: Convert categorical columns to the 'category' data type
categorical_cols = ['geology', 'geomorphology', 'lulc']

for frame in (X_train, X_test):
    for col in categorical_cols:
        frame[col] = frame[col].astype('category')

print("\nDataset after converting categorical columns to 'category' type:")
print("X_train head:\n", X_train.head())
print("X_test head:\n", X_test.head())

# Save the transformed datasets to new files.
# NOTE: CSV stores no dtype information, so the 'category' dtype set above is
# not recorded in these files.
X_train.to_csv('transformed_X_train.csv', index=False)
X_test.to_csv('transformed_X_test.csv', index=False)

# **DIMENSIONALITY REDUCTION**

1. ***Correlation Analysis:***

    a) A heatmap is created to visualize the correlation matrix of the dataset.

    b) This helps identify highly correlated features, which can be redundant for modeling.


2. ***Feature Selection:***

    a) SHAP Feature Importance:

            SHAP is an explainability technique, based on game theory, that quantifies how much each feature contributes to a model's predictions. A LightGBM model is trained, and the SHAP TreeExplainer then analyses the contribution of each feature to its predictions.
            Features are ranked based on their importance, and the results are visualized using a summary violin plot.

    b) Categorical columns are converted to the 'category' data type. LightGBM can handle categorical features directly only if they are of type 'category'.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb  # Import LightGBM
import shap

# Load the transformed feature sets and the matching targets
X_train = pd.read_csv('transformed_X_train.csv')
X_test = pd.read_csv('transformed_X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Step 1: Correlation analysis.
# The coordinates and the three categorical layers are left out of it.
excluded_columns_for_corr = ['latitude', 'longitude', 'lulc', 'geology', 'geomorphology']
X_for_corr = X_train.drop(columns=excluded_columns_for_corr)

# Pairwise correlation between the remaining columns
corr_matrix = X_for_corr.corr()

# Draw the annotated heatmap and save it to disk before showing it
heatmap_options = dict(
    annot=True,
    cmap='Reds',
    fmt=".2f",
    linewidths=0.5,
    annot_kws={"size": 10},
)
plt.figure(figsize=(20, 16))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title("Correlation Heatmap", fontsize=14, pad=20)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(fontsize=12)
plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

# Step 2: Remove highly correlated features
threshold = 0.83
highly_correlated_features = set()

# Visit each unordered pair once (lower triangle of the matrix). When the
# absolute correlation exceeds the threshold, the column that appears EARLIER
# in the matrix is the one marked for removal.
corr_columns = corr_matrix.columns
for i, colname_i in enumerate(corr_columns):
    for j, colname_j in enumerate(corr_columns[:i]):
        pair_corr = corr_matrix.iloc[i, j]
        if abs(pair_corr) > threshold:
            print(f"Highly correlated: {colname_i} and {colname_j} (corr = {pair_corr:.2f})")
            highly_correlated_features.add(colname_j)

# Remove the marked columns from both parts
X_train_reduced = X_train.drop(columns=highly_correlated_features)
X_test_reduced = X_test.drop(columns=highly_correlated_features)

print("\nFeatures dropped due to high correlation:")
print(highly_correlated_features)

# Step 3: Set aside the coordinates and the categorical layers; they are not
# scaled and are not passed to the feature-selection model
excluded_columns_for_reduction = ['latitude', 'longitude', 'lulc', 'geology', 'geomorphology']
X_train_reduced = X_train_reduced.drop(columns=excluded_columns_for_reduction)
X_test_reduced = X_test_reduced.drop(columns=excluded_columns_for_reduction)

# Feature Scaling: statistics come from the training part only.
# The new frames get a fresh default (0..n-1) row index.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train_reduced)
test_scaled_values = scaler.transform(X_test_reduced)
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train_reduced.columns)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test_reduced.columns)

# Step 4: Train LightGBM model
print("\nTraining LightGBM model...")
lgb_model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_model.fit(X_train_scaled, y_train)

# Step 5: Feature Selection using SHAP TreeExplainer
print("\nComputing feature importance using SHAP TreeExplainer...")
explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_train_scaled)

# Debugging: Check shape of SHAP values
print("Shape of shap_values:", np.array(shap_values).shape)

# Get SHAP feature importance = mean absolute SHAP value per feature.
# The container returned by shap_values() is not the same in every SHAP
# release, so each layout is reduced along its non-feature axes.
abs_shap = np.abs(np.asarray(shap_values))
if isinstance(shap_values, list):
    # List with one (n_samples, n_features) array per class:
    # stacked shape is (n_classes, n_samples, n_features)
    shap_importance = abs_shap.mean(axis=(0, 1))
elif abs_shap.ndim == 3:
    # Single 3-D array — assumed layout (n_samples, n_features, n_outputs);
    # the length check below rejects the result if the layout is different
    shap_importance = abs_shap.mean(axis=(0, 2))
else:
    # Single (n_samples, n_features) array
    shap_importance = abs_shap.mean(axis=0)

# Debugging: Check shapes
print("Shape of shap_importance:", shap_importance.shape)
print("Number of features in X_train_scaled:", len(X_train_scaled.columns))
print("Features in X_train_scaled:", X_train_scaled.columns)

# Ensure shap_importance is 1-dimensional
if len(shap_importance.shape) > 1:
    raise ValueError("shap_importance is not 1-dimensional. Check the shape of shap_values.")

# Ensure the lengths match
if len(shap_importance) != len(X_train_scaled.columns):
    raise ValueError(f"Mismatch in lengths: shap_importance has {len(shap_importance)} values, but X_train_scaled has {len(X_train_scaled.columns)} features.")

# Create a DataFrame for feature importance, most important feature first
feature_importance_df = pd.DataFrame({'Feature': X_train_scaled.columns, 'Importance': shap_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance DataFrame:")
print(feature_importance_df)

# Step 6: Select top N features (including forced ones)
top_n = 6
selected_features = feature_importance_df['Feature'].iloc[:top_n].tolist()

# Features that must be kept whatever their rank. A forced feature is added
# only if it is still a column of X_train_scaled and is not already selected.
force_include_features = ['slope', 'ndvi', 'flow accumulation', 'twi', 'dist to riv', 'spi']
for feature in force_include_features:
    if feature not in X_train_scaled.columns:
        continue
    if feature in selected_features:
        continue
    selected_features.append(feature)

# Restrict both parts to the selected columns, in selection order
X_train_selected = X_train_scaled.loc[:, selected_features]
X_test_selected = X_test_scaled.loc[:, selected_features]

print("\nSelected Features using SHAP (including forced features):")
print(selected_features)

# Step 7: Plot SHAP summary plot
# The same call is used whether shap_values is a list or a single array
# (the previous if/else had two identical branches).
# show=False keeps the figure open so it can be saved.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_train_scaled, feature_names=X_train_scaled.columns, show=False)
plt.savefig("shap_summary_plot.png", dpi=300, bbox_inches="tight")
print("\nSHAP summary plot saved as 'shap_summary_plot.png'")

# Step 8: Add the set-aside columns (latitude, longitude and the categorical
# layers) back next to the selected features. The values are taken from the
# X_train / X_test frames as loaded, and rows are matched on the row index.
X_train_final = pd.concat([X_train_selected, X_train[excluded_columns_for_reduction]], axis=1)
X_test_final = pd.concat([X_test_selected, X_test[excluded_columns_for_reduction]], axis=1)

# Save the final datasets to new files
X_train_final.to_csv('final_X_train.csv', index=False)
X_test_final.to_csv('final_X_test.csv', index=False)

# **MACHINE LEARNING COMPARISON**

  a) Model Used:

      LightGBM

  b) Hyperparameter Tuning:

      Grid Search is used for hyperparameter tuning.

  c) Evaluation Metrics:

      ROC-AUC, Accuracy, Sensitivity (Recall), Specificity, F1-Score, Precision.

  d) Confusion Matrix:

      A confusion matrix is generated for each model and saved as an image.

  e) ROC Curve:

      ROC curves for all models are plotted and saved as an image.

  f) k-Fold Cross-Validation:

      Stratified k-fold cross-validation is used during hyperparameter tuning.

  g) Performance Comparison:

      A table comparing the performance of all models is generated.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, accuracy_score, confusion_matrix, f1_score,
    recall_score, precision_score, roc_curve, auc, ConfusionMatrixDisplay, precision_recall_curve
)
from lightgbm import LGBMClassifier
import joblib


# Load the final feature sets and the matching targets
X_train = pd.read_csv('final_X_train.csv')
X_test = pd.read_csv('final_X_test.csv')
y_train = pd.read_csv('y_train.csv').squeeze()
y_test = pd.read_csv('y_test.csv').squeeze()

# Split the columns by the dtype pandas inferred while reading the CSV.
# NOTE(review): any column stored as numbers in the file is treated as numeric
# here and therefore scaled — confirm that is intended for the coordinate
# columns and for the geology / geomorphology / lulc columns.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
categorical_columns = X_train.select_dtypes(include=['category', 'object']).columns

# Standardise the numeric columns only; statistics come from the training part
scaler = StandardScaler().fit(X_train[numeric_columns])
X_train_numeric = scaler.transform(X_train[numeric_columns])
X_test_numeric = scaler.transform(X_test[numeric_columns])

# Wrap the scaled arrays back into frames that keep the original row index
X_train_scaled = pd.DataFrame(X_train_numeric, index=X_train.index, columns=numeric_columns)
X_test_scaled = pd.DataFrame(X_test_numeric, index=X_test.index, columns=numeric_columns)

# Put the untouched categorical columns back to the right of the scaled ones
X_train = pd.concat([X_train_scaled, X_train[categorical_columns]], axis=1)
X_test = pd.concat([X_test_scaled, X_test[categorical_columns]], axis=1)

# Give the categorical columns the 'category' dtype in both parts
for frame in (X_train, X_test):
    for col in categorical_columns:
        frame[col] = frame[col].astype('category')

# Evaluation metrics shared by the training and testing reports
def evaluate_model(y_true, y_pred, y_pred_proba):
    """Compute the classification metrics reported for one data split.

    Args:
        y_true: True class labels. Presumably 0/1, since specificity is taken
            as the recall of class 0 — confirm against the dataset.
        y_pred: Predicted class labels.
        y_pred_proba: Predicted score for the positive class.

    Returns:
        dict with the keys 'ROC-AUC', 'Accuracy', 'Recall', 'Specificity',
        'F1-Score', 'Precision' and 'Weighted_avg_auc'.
    """
    # One weight per sample, inversely proportional to its class frequency
    balanced_weights = compute_sample_weight(class_weight='balanced', y=y_true)

    roc_auc = roc_auc_score(y_true, y_pred_proba)
    accuracy = accuracy_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred)
    # Specificity = recall with class 0 treated as the positive label
    specificity = recall_score(y_true, y_pred, pos_label=0)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    # NOTE: the weights are constant within each class, and ROC normalises the
    # true/false positive rates per class, so this should equal 'ROC-AUC'
    # apart from floating-point rounding.
    weighted_auc = roc_auc_score(y_true, y_pred_proba, sample_weight=balanced_weights)

    return {
        'ROC-AUC': roc_auc,
        'Accuracy': accuracy,
        'Recall': sensitivity,
        'Specificity': specificity,
        'F1-Score': f1,
        'Precision': precision,
        'Weighted_avg_auc': weighted_auc,
    }

# Base LightGBM classifier; the grid below supplies the other hyperparameters
lgb_model = LGBMClassifier(random_state=42)

# Hyperparameter grid for tuning. Every list holds a single value, so the
# search evaluates exactly one candidate.
param_grid = dict(
    objective=['binary'],
    learning_rate=[0.01],
    max_depth=[4],
    min_data_in_leaf=[200],
    lambda_l1=[1.0],
    lambda_l2=[1.0],
    feature_fraction=[0.6],
)

# 5-fold cross-validated search scored by ROC-AUC, using all CPU cores
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Model selected by the search
best_lgb_model = grid_search.best_estimator_

# Persist the selected model with joblib
joblib.dump(best_lgb_model, 'lgb_model.pkl')
print("Model saved to 'lgb_model.pkl'.")

# Hard class predictions, and the second predict_proba column as the score,
# for each data split
y_pred_train = best_lgb_model.predict(X_train)
y_pred_proba_train = best_lgb_model.predict_proba(X_train)[:, 1]
y_pred_test = best_lgb_model.predict(X_test)
y_pred_proba_test = best_lgb_model.predict_proba(X_test)[:, 1]

# Score both splits with the same metric set
metrics_train = evaluate_model(y_train, y_pred_train, y_pred_proba_train)
metrics_test = evaluate_model(y_test, y_pred_test, y_pred_proba_test)

# Store results: one flat "<Split> <Metric>" -> value mapping for the model.
# Key order matters because the plots below follow it.
lightgbm_scores = {
    'Training AUC': metrics_train['ROC-AUC'],
    'Testing AUC': metrics_test['ROC-AUC'],
    'Overall AUC': metrics_test['Weighted_avg_auc'],
}
for shared_metric in ('Accuracy', 'F1-Score', 'Recall', 'Precision', 'Specificity'):
    lightgbm_scores[f'Training {shared_metric}'] = metrics_train[shared_metric]
    lightgbm_scores[f'Testing {shared_metric}'] = metrics_test[shared_metric]

results = {'LightGBM': lightgbm_scores}

# Convert results to long format: one row per (Metric, Category) pair.
# The first word of each key is the category when it is one of the three
# known prefixes; otherwise the whole key is the metric, filed under 'Overall'.
metrics_data = []
for metric_name, value in results['LightGBM'].items():
    first_word, *other_words = metric_name.split()
    if first_word in ('Training', 'Testing', 'Overall'):
        category, metric = first_word, ' '.join(other_words)
    else:
        category, metric = 'Overall', metric_name
    metrics_data.append({'Metric': metric, 'Category': category, 'Value': value})

df = pd.DataFrame(metrics_data)

# 1. Grouped Bar Plot: one group per metric, one bar per category
plt.figure(figsize=(14, 8))
ax = plt.subplot()
metrics = df['Metric'].unique()
x = np.arange(len(metrics))
width = 0.25

for i, category in enumerate(['Training', 'Testing', 'Overall']):
    # Align this category's values to the metric order BY NAME. A category
    # that reports only some metrics (e.g. 'Overall' has AUC only) still gets
    # its bars at the right positions instead of being dropped entirely.
    per_metric = df[df['Category'] == category].set_index('Metric')['Value'].reindex(metrics)
    values = per_metric.to_numpy()
    present = per_metric.notna().to_numpy()
    if not present.any():
        continue
    ax.bar(x[present] + i * width, values[present], width, label=category)

# Three bar slots per group, so the middle slot (x + width) is the centre
ax.set_xticks(x + width)
ax.set_xticklabels(metrics, rotation=45, ha='right')
plt.legend()
plt.title('Model Performance Metrics Comparison')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.tight_layout()
plt.savefig('metrics_comparison_barplot.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. Training vs Testing Scatter Plot
# One point per metric: x = training score, y = testing score.
labels = ['Accuracy', 'F1-Score', 'Recall', 'Precision', 'Specificity', 'AUC']
train_metrics = [results['LightGBM'][f'Training {name}'] for name in labels]
test_metrics = [results['LightGBM'][f'Testing {name}'] for name in labels]

plt.figure(figsize=(10, 6))
plt.scatter(train_metrics, test_metrics, s=100, edgecolor='k', c='green')
# Dashed diagonal: points on it score the same on both splits
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.title('Training vs Testing Performance Comparison')
plt.xlabel('Training Score')
plt.ylabel('Testing Score')
plt.grid(True)

# Write each metric's name next to its point
for label, train_score, test_score in zip(labels, train_metrics, test_metrics):
    plt.annotate(
        label,
        (train_score, test_score),
        xytext=(5, 5),
        textcoords='offset points',
    )
plt.tight_layout()
plt.savefig('train_test_scatter.png', dpi=300, bbox_inches='tight')
plt.show()

# 3. Radar Chart (this function must be defined before it is called)
def create_radar_chart(categories, values1, values2, title):
    """Draw a two-series radar chart, save it as 'radar_chart.png' and show it.

    Args:
        categories: Axis labels, one per spoke.
        values1: Scores plotted as the 'Training' series, one per category.
        values2: Scores plotted as the 'Testing' series, one per category.
        title: Title placed above the chart.

    The input sequences are not modified.
    """
    N = len(categories)
    angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()

    # Repeat the first point at the end so each outline closes. This is done
    # on copies: an in-place `+=` would lengthen the caller's lists, and on a
    # NumPy array it would add element-wise instead of appending.
    closed1 = list(values1)
    closed1 += closed1[:1]
    closed2 = list(values2)
    closed2 += closed2[:1]
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    ax.plot(angles, closed1, 'o-', linewidth=2, label='Training')
    ax.fill(angles, closed1, alpha=0.25)
    ax.plot(angles, closed2, 'o-', linewidth=2, label='Testing')
    ax.fill(angles, closed2, alpha=0.25)

    # Zero angle at the top, angles increasing clockwise
    ax.set_theta_offset(np.pi/2)
    ax.set_theta_direction(-1)
    # The closing angle duplicates the first one, so it gets no label
    ax.set_thetagrids(np.degrees(angles[:-1]), categories)
    plt.ylim(0, 1)
    plt.title(title, y=1.08)
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    plt.savefig('radar_chart.png', dpi=300, bbox_inches='tight')
    plt.show()

# Create radar chart from the metrics that exist for both splits
metrics_radar = ['Accuracy', 'F1-Score', 'Recall', 'Precision', 'Specificity', 'AUC']
train_values = []
test_values = []
for m in metrics_radar:
    train_values.append(results['LightGBM'][f'Training {m}'])
    test_values.append(results['LightGBM'][f'Testing {m}'])
create_radar_chart(metrics_radar, train_values, test_values, 'Performance Radar Chart')

# 4. Heatmap: categories as rows, metrics as columns.
# Combinations that were never recorded appear as empty cells.
heatmap_data = df.pivot(index='Category', columns='Metric', values='Value')
heatmap_options = dict(annot=True, cmap='Blues', fmt=".2f", vmin=0, vmax=1)
plt.figure(figsize=(12, 4))
sns.heatmap(heatmap_data, **heatmap_options)
plt.title('Performance Metrics Heatmap')
plt.tight_layout()
plt.savefig('metrics_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

# Confusion matrix for the training data.
# Row = true class, column = predicted class, so a 2x2 matrix unpacks row by
# row as (TN, FP) and (FN, TP).
cm_train = confusion_matrix(y_train, y_pred_train)
(tn_train, fp_train), (fn_train, tp_train) = cm_train
print("Training Confusion Matrix for LightGBM:")
print(f"TN: {tn_train}, FP: {fp_train}, FN: {fn_train}, TP: {tp_train}")

# Confusion matrix for the testing data, unpacked the same way
cm_test = confusion_matrix(y_test, y_pred_test)
(tn_test, fp_test), (fn_test, tp_test) = cm_test
print("Testing Confusion Matrix for LightGBM:")
print(f"TN: {tn_test}, FP: {fp_test}, FN: {fn_test}, TP: {tp_test}")

# Feature Importance
# Label each importance with its column name. The model was fitted on X_train,
# so its importances follow X_train's column order. Positional numbers would
# hide which feature is which and give the bar plot a numeric y axis.
feature_importance = best_lgb_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature Importance Table:")
print(feature_importance_df)

# Plot Feature Importance as horizontal bars, most important feature on top
plt.figure(figsize=(10, 6))
sns.barplot(x=feature_importance_df['Importance'], y=feature_importance_df['Feature'], palette='viridis')
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('LightGBM Feature Importance')
plt.savefig('feature_importance_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

# ROC curve on the testing data
fpr, tpr, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc = auc(fpr, tpr)
roc_label = f'LightGBM (AUC = {roc_auc:.2f})'

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label=roc_label)
# Dashed diagonal drawn for reference
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('roc_curve_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

# Precision-Recall curve on the testing data; the legend shows the area
# under this curve
precision, recall, _ = precision_recall_curve(y_test, y_pred_proba_test)
pr_auc = auc(recall, precision)
pr_label = f'LightGBM (AUC = {pr_auc:.2f})'

plt.figure(figsize=(10, 8))
plt.plot(recall, precision, label=pr_label)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='lower left')
plt.savefig('pr_curve_lightgbm.png', dpi=300, bbox_inches='tight')
plt.show()

# ROC curves for the training and testing data on one figure
fpr_train, tpr_train, _ = roc_curve(y_train, y_pred_proba_train)
fpr_test, tpr_test, _ = roc_curve(y_test, y_pred_proba_test)
roc_auc_train = auc(fpr_train, tpr_train)
roc_auc_test = auc(fpr_test, tpr_test)

plt.figure(figsize=(10, 8))
plt.plot(
    fpr_train,
    tpr_train,
    label=f'Training ROC (AUC = {roc_auc_train:.2f})',
    color='blue',
)
plt.plot(
    fpr_test,
    tpr_test,
    label=f'Testing ROC (AUC = {roc_auc_test:.2f})',
    color='green',
)
plt.title('ROC Curve Train-Test Split')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('roc_curve_train_test.png', dpi=300, bbox_inches='tight')
plt.show()

# Confusion-matrix figure for the testing data, with readable class names
class_names = ["No Landslide", "Landslide"]
disp = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=class_names)
disp.plot(cmap='Reds')
plt.title('Confusion Matrix - LightGBM')
plt.savefig('confusion_matrix_lightgbm_test.png', dpi=300, bbox_inches='tight')
plt.show()

# Print results as a table: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Comparison:")
print(results_df)