<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 1 | SETUP AND INITIALIZATION</p>

In [2]:
# Import libraries you'll need

import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from scipy.stats import loguniform 
from xgboost import XGBClassifier
import joblib
import folium


In [None]:
# Read file from github
url = "https://github.com/baariu/Analyzing-and-Predicting-Urbanization-in-Nairobi-using-GIS-and-Data-Mining-Techniques/raw/refs/heads/main/Spatial%20Data.xlsx"

df = pd.read_excel(url)

# Display the first few rows of the dataframe
df.head()

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 2 | DATA EXPLORATION</p>

In [None]:
# Lets observe the dataset.
df.shape

: 

In [None]:
#Distribution of data. Is the data balanced?
df.describe()

: 

In [None]:
df["change_detection_value"].value_counts()

: 

In [None]:
#Remap this column, to enable its use in some models.  1.0 should become 0.0
#2.0 should become 1.0. Models like xgboost only use 0 and 1

df['change_detection_value'] = df['change_detection_value'].apply(lambda x: 1 if x == 2 else 0)

#Observe the change
df["change_detection_value"].value_counts()

: 

In [None]:
# The distribution of the dataset further by ploting a histogram of each feature

df.hist(figsize=(13,15))
plt.show()

: 

<div style="border-radius:10px; padding: 15px; background-color: rgb(142, 183, 190); 
            font-size:110%; text-align:LEFT; width: 95%; color: BLACK;">


## Key Observations
- **Inverse Relationship**:
  - Lower elevation ↔ Closer to urban infrastructure
  - Higher elevation ↔ More remote areas
- **Data Distribution**:
  - Sparse representation at elevation extremes
  - Potential outliers in high-elevation, low-distance points




<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 3 | DATA CLEANING</p>

In [None]:
# Calculate null percentage and add '%' symbol
null_percentage = (df.isna().mean() * 100).round(2).astype(str) + '%'

# Display results
print("Null Values (% of Total):")
print(null_percentage)

: 

In [None]:
#Drop 'Distance_to_industrial_Zones(Metres)'
df = df.drop('Distance_to_industrial_Zones(Metres)', axis=1)
df = df.drop('id', axis=1)

: 

In [None]:
#Impute the slope column with the median value
df['Slope'] = df['Slope'].fillna(df['Slope'].median())

: 

In [None]:
#Confirm if any more missing values are left
df.isna().sum()

: 

In [None]:
#Drop the missing values in persons per pixel 1km2
df = df.dropna(subset=['persons per  pixel 1km2'])

df.isna().sum()

: 

In [None]:
#Drop road names

df = df.drop('Road_Name', axis=1)


: 

In [None]:
df['change_detection_value'] = df['change_detection_value'].astype(str)


: 

In [None]:
# 1. Define your numerical columns (automatically detected)
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

# 2. Calculate grid dimensions
n_cols = len(numerical_cols)
n_rows = (n_cols + 1) // 2  # Round up division

# 3. Create the plot with proper sizing
plt.figure(figsize=(15, 5*n_rows), facecolor='white')

for i, col in enumerate(numerical_cols, 1):
    ax = plt.subplot(n_rows, 2, i)
    
    # Calculate IQR bounds
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    
    # Identify and plot outliers
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    sns.scatterplot(x=df.index, y=df[col], 
                    hue=is_outlier, 
                    palette={False: 'blue', True: 'red'},
                    ax=ax)
    
    # Add IQR reference lines
    ax.axhline(y=lower_bound, color='orange', linestyle='--', alpha=0.7)
    ax.axhline(y=upper_bound, color='orange', linestyle='--', alpha=0.7)
    
    # Formatting
    ax.set_title(f'Outliers in {col} (IQR Method)', pad=10)
    ax.set_ylabel('Value')
    ax.legend(['Normal', 'Outlier', 'IQR Threshold'])
    
plt.tight_layout(pad=2.0)
plt.suptitle('Outlier Detection by Column (IQR Method)', y=1.02, fontsize=14)
plt.show()

: 

In [None]:
def remove_outliers_iqr(df, columns):
    original_rows = df.shape[0]
    
    # Compute IQR for specified columns
    Q1 = df[columns].quantile(0.25)
    Q3 = df[columns].quantile(0.75)
    IQR = Q3 - Q1
    
    # Define bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter out outliers
    df_cleaned = df[~((df[columns] < lower_bound) | (df[columns] > upper_bound)).any(axis=1)]
    cleaned_rows = df_cleaned.shape[0]
    rows_removed = original_rows - cleaned_rows
    removal_percentage = (rows_removed / original_rows) * 100
    
    # Print cleaning report
    print("Cleaning Report:")
    print(f"Original rows: {original_rows}")
    print(f"Cleaned rows: {cleaned_rows}")
    print(f"Rows removed: {rows_removed} ({removal_percentage:.2f}%)")
    
    return df_cleaned

numeric_columns = df.select_dtypes(include=[np.number]).columns  # Select numerical columns
df_cleaned = remove_outliers_iqr(df, numeric_columns)


: 

In [None]:
df = df_cleaned.copy()

: 

In [None]:
df['change_detection_value'] = pd.to_numeric(df['change_detection_value']).astype('Int64')

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 4 | EXPLORATORY DATA ANALYSIS</p>

In [None]:
# Select only numerical columns for correlation
numerical_df = df.select_dtypes(include=['float64', 'int64'])

# Create correlation matrix
corr_matrix = numerical_df.corr()

# Generate heatmap
plt.figure(figsize=(10, 10))
heatmap = sns.heatmap(
    corr_matrix, 
    annot=True, 
    fmt=".2f",  # Show 2 decimal places
    cmap='coolwarm',  # Color scheme
    center=0,  # Center color at 0
    square=True,  # Make cells square
    linewidths=.5,  # Add lines between cells
    cbar_kws={"shrink": 0.8}  # Adjust colorbar size
)

# Improve readability
plt.title('Correlation Heatmap (Numerical Columns Only)', pad=20)
heatmap.set_xticklabels(
    heatmap.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
)

plt.tight_layout()
plt.show()

: 

<div style="border-radius:10px; padding: 15px; background-color: rgb(142, 183, 190); 
            font-size:110%; text-align:LEFT; width: 95%; color: BLACK;">


## Key Observations

### 1. Strong Negative Correlations
- **Elevation ↔ X-coordinate (-0.96)**  
  Indicates higher elevations are associated with decreasing X-values (likely western geographic locations)

- **Distance_to_Road ↔ Persons_per_Pixel (-0.38)**  
  Suggests populated areas tend to be closer to roads

### 2. Moderate Positive Correlations
- **Y-coordinate ↔ Distance_to_Airport (0.40)**  
  Northern locations (higher Y-values) are typically farther from airports

- **Change_Detection_Value ↔ X-coordinate (0.31)**  
  Potential regional pattern in land-use changes

### 3. Weak/No Correlations
- **Slope** shows minimal correlation with all variables  
- **Distance_to_Urban_Areas** appears independent of other features

### 4. Multicollinearity Warning
- **Elevation ↔ X-coordinate (-0.96)**  
  Extreme correlation may require remediation for regression modeling:
  - Consider removing one feature
  - Apply PCA
  - Use regularization techniques




In [None]:

# Calculate map center
mean_lat = df['y'].mean()
mean_lon = df['x'].mean()

# Create base map
m = folium.Map(location=[mean_lat, mean_lon], zoom_start=12.1)

# Add points with hollow circles
for idx, row in df.iterrows():
    color = 'green' if row['change_detection_value'] == 1 else 'red'
    folium.CircleMarker(
        location=[row['y'], row['x']],
        radius=2,
        color=color,
        fill=False,  
        weight=2,    # Border thickness
        popup=f"Change: {'Yes' if row['change_detection_value'] == 1 else 'No'}<br>"
              f"Coordinates: ({row['y']:.6f}, {row['x']:.6f})"
    ).add_to(m)

# Add legend
legend_html = '''
     <div style="position: fixed; 
                 bottom: 50px; left: 50px; width: 150px; height: 80px; 
                 border:2px solid grey; z-index:9999; font-size:14px;
                 background-color:white;
                 ">
     &nbsp; <strong>Legend</strong> <br>
     &nbsp; <span style="color:green">⬤</span> Change (1)<br>
     &nbsp; <span style="color:red">⬤</span> No Change (0)
     </div>
'''
m.get_root().html.add_child(folium.Element(legend_html))
m.save('coordinate_visualization.html')
# Display map
m

: 

>> After Feature Importance consider dropping Elevation since it is highly correlated with X and X is paramount in our modelling

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 5 | FEATURE SELECTION</p>

In [None]:
df.Airport_Name.value_counts()

: 

In [None]:
df['Airport_Name'] = df['Airport_Name'].apply(lambda x: 1 if x == 'Wilson Airport' else 0)

#Observe the change
df["Airport_Name"].value_counts()

: 

In [None]:
#Use Random Forest to get feature importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
df_copy = df

# Separate the target variable from the features
X = df_copy.drop('change_detection_value', axis=1)
y = df_copy['change_detection_value']

#Split your train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Initiate model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

#Train model
rf.fit(X_train, y_train)

#Measure feature Importances
importances = rf.feature_importances_


feature_names = X_train.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

importance_df

: 

In [None]:
# List of features to drop
features_to_drop = [
    'Elevation',
    'Slope', 
    'persons per  pixel 1km2',
    'Airport_Name'  # Extremely low importance (0.000539)
]

# Create new DataFrame without these features
df_filtered = df.drop(columns=features_to_drop)

# Verify the remaining columns
print("Remaining features:")
df_filtered.columns.tolist()

: 

In [None]:
df = df_filtered.copy()

df.head()

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 6 | MODELLING</p>

In [None]:

# Separate features by type
coordinates = df[['x', 'y']]  # Geographic coordinates (preserve spatial relationships)
distance_features = df.filter(regex='Distance_to_')  # All distance columns
binary_target = df['change_detection_value']  # Binary target 

# 1. Scale coordinates using RobustScaler (preserves spatial relationships)
coord_scaler = RobustScaler()
scaled_coords = pd.DataFrame(coord_scaler.fit_transform(coordinates), 
                             columns=['x_scaled', 'y_scaled'],
                             index=df.index)

# Save the coordinate scaler
joblib.dump(coord_scaler, 'coord_scaler.joblib')

# 2. Scale distance features using StandardScaler
dist_scaler = StandardScaler()
scaled_distances = pd.DataFrame(dist_scaler.fit_transform(distance_features), 
                                columns=[f'scaled_{col}' for col in distance_features.columns],
                                index=df.index)

# Save the distance scaler
joblib.dump(dist_scaler, 'dist_scaler.joblib')

# 3. Combine scaled features with original binary target
scaled_df = pd.concat([
    scaled_coords,
    scaled_distances,
    binary_target
], axis=1)

print("Scaled Dataset Preview:")
scaled_df.head()



: 

In [None]:


# Separate the target variable from the features
X = scaled_df.drop('change_detection_value', axis=1)
y = scaled_df['change_detection_value']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



: 

In [None]:
# Check class imbalance first, to be sure
from collections import Counter

print("Original class distribution:", Counter(y_train))

: 

In [None]:

# Apply SMOTETomek to balance classes in training set
smote_tomek = SMOTETomek(sampling_strategy='auto', random_state=42)
X_train_resampled_smtk, y_train_resampled_smtk = smote_tomek.fit_resample(X_train, y_train)

# Check the original and new distribution
print("Original class distribution:", Counter(y_train))
print("Resampled class distribution:", Counter(y_train_resampled_smtk))

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 6.1 | Random Forest</p>

In [None]:
# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Fit the model
rf.fit(X_train_resampled_smtk, y_train_resampled_smtk)

# Generate predictions
y_pred = rf.predict(X_test)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and display confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)


plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Predicted 0', 'Predicted 1'], 
            yticklabels=['Actual 0', 'Actual 1'])
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix Random Forest Base Model')
plt.show()

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 6.1.2| Random Forest with RandomizedSearchCV</p>

In [None]:


# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_dist = {
    'n_estimators': [100, 200, 300],     
    'max_depth': [10, 20, None],          
    'max_features': ['auto', 'sqrt', 'log2'],  
    'bootstrap': [True, False],           
    'criterion': ['gini', 'entropy']      
}

# Set up the Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf,
                                 param_distributions=param_dist,
                                 n_iter=10,  
                                 scoring='recall',
                                 cv=5,     
                                 random_state=42,
                                 n_jobs=-1,  
                                 verbose=2)

# Fit the tuned model
random_search.fit(X_train_resampled_smtk, y_train_resampled_smtk)

# Print best parameters
print("Best parameters found: ", random_search.best_params_)
print("Best recall score: ", random_search.best_score_)

# Get the best estimator
best_rf = random_search.best_estimator_

# Make predictions on test set
y_pred = best_rf.predict(X_test)

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Generate and display confusion matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Visualize confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Predicted Negative', 'Predicted Positive'], 
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Random Forest (Optimized)')
plt.show()

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 6.2| Logistic Regression</p>

In [None]:

# Initialize the model
logreg = LogisticRegression(random_state=42, max_iter=1000)

# Define the parameter grid for Logistic Regression
param_dist = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  
    'C': loguniform(1e-4, 100),                  
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
    'class_weight': [None, 'balanced'],         
    'l1_ratio': [0, 0.25, 0.5, 0.75, 1]         
}

# Set up RandomizedSearchCV with recall scoring
random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    n_iter=20,               
    scoring='recall',         
    cv=5,                     
    random_state=42,
    n_jobs=-1,                
    verbose=2
)

# Fit the tuned model
random_search.fit(X_train_resampled_smtk, y_train_resampled_smtk)

# Print best parameters and score
print("\nBest parameters found: ", random_search.best_params_)
print("Best recall score (CV): ", random_search.best_score_)

# Get the best estimator
best_logreg = random_search.best_estimator_

# Make predictions
y_pred = best_logreg.predict(X_test)
y_pred_proba = best_logreg.predict_proba(X_test)[:, 1]  # Probability scores for ROC curve

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix (Numerical)
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Confusion Matrix (Visual)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix - Logistic Regression (Optimized)')
plt.show()



: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 6.3| XGBoost</p>

In [None]:

# Initialize XGBoost model
xgb = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)

# Define parameter grid for XGBoost
param_dist = {
    'n_estimators': [100, 200, 300, 400],       
    'max_depth': [3, 5, 7, 9],                  
    'learning_rate': [0.01, 0.05, 0.1, 0.2],    
    'subsample': [0.6, 0.8, 1.0],               
    'colsample_bytree': [0.6, 0.8, 1.0],        
    'gamma': [0, 0.1, 0.2],                     
    'min_child_weight': [1, 3, 5],              
    'scale_pos_weight': [1, (len(y_train_resampled_smtk) - sum(y_train_resampled_smtk)) / sum(y_train_resampled_smtk)]  # Handle class imbalance
}

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=25,                  
    scoring='recall',
    cv=5,                       
    random_state=42,
    n_jobs=-1,
    verbose=2
)

# Fit the model
random_search.fit(X_train_resampled_smtk, y_train_resampled_smtk)

# Best parameters and score
print("\nBest Parameters:", random_search.best_params_)
print("Best Recall Score (CV):", random_search.best_score_)

# Get best estimator
best_xgb = random_search.best_estimator_

# Predictions
y_pred = best_xgb.predict(X_test)
y_pred_proba = best_xgb.predict_proba(X_test)[:, 1]  # Probabilities for positive class

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Plot Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.title('XGBoost Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()



: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 7| Evaluation and Model Selection</p>

In [None]:
models = {
    'Random Forest (Baseline)': rf.fit(X_train_resampled_smtk, y_train_resampled_smtk),
    'Random Forest (Optimized)': best_rf.fit(X_train_resampled_smtk, y_train_resampled_smtk),
    'Logistic Regression': best_logreg.fit(X_train_resampled_smtk, y_train_resampled_smtk),
    'XGBoost': best_xgb.fit(X_train_resampled_smtk, y_train_resampled_smtk)
}

# Now calculate metrics
metrics = []
for name, model in models.items():
    try:
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else [0]*len(X_test)
        
        metrics.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC AUC': roc_auc_score(y_test, y_proba) if hasattr(model, "predict_proba") else None
        })
    except Exception as e:
        print(f"Error with {name}: {str(e)}")
        metrics.append({
            'Model': name,
            'Error': str(e)
        })


: 

In [None]:
plt.figure(figsize=(14, 8))
plt.title('Model Performance Comparison (Bar Graph)', fontsize=16, pad=20)

# Metrics to plot
metrics_to_plot = ['Accuracy', 'Precision', 'Recall', 'F1']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']  # Distinct colors

# Create x-axis positions
model_names = [m['Model'] for m in metrics]
x = np.arange(len(model_names))
width = 0.2  # Width of each bar

# Plot each metric as grouped bars
for i, metric in enumerate(metrics_to_plot):
    values = [m[metric] for m in metrics]
    bars = plt.bar(x + i*width, values, width, color=colors[i], label=metric)
    
    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                 f'{height:.2f}',
                 ha='center', va='bottom',
                 fontsize=10)

# Customize plot
plt.xticks(x + 1.5*width, model_names, rotation=45, ha='right', fontsize=12)
plt.yticks(np.linspace(0, 1.1, 12))
plt.xlabel('Machine Learning Models', fontsize=12)
plt.ylabel('Performance Score', fontsize=12)
plt.ylim(0, 1.2)  # Increased upper limit for value labels
plt.grid(True, axis='y', linestyle='--', alpha=0.6)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=12)

# Add horizontal reference lines
for y in np.arange(0.2, 1.1, 0.2):
    plt.axhline(y=y, color='gray', linestyle=':', alpha=0.3)

plt.tight_layout()
plt.show()

: 

<div style="border-radius:10px; padding: 15px; background-color: rgb(142, 183, 190); 
            font-size:110%; text-align:LEFT; width: 95%; color: BLACK;">

### Model Performance Evaluation and Best Model Selection

#### **Key Performance Metrics**
| Metric               | Random Forest (Baseline) | Random Forest (Optimized) | Logistic Regression | XGBoost |
|----------------------|-------------------------|--------------------------|---------------------|---------|
| **Accuracy**         | 0.85                   | 0.88                    | 0.77               | 0.88    |
| **Precision**        | 0.42                   | 0.45                    | 0.28               | 0.45    |
| **Recall**           | 0.63                   | 0.65                    | 0.74               | 0.65    |
| **F1 Score**         | 0.50                   | 0.53                    | 0.41               | 0.53    |

#### **Model Strengths and Weaknesses**

**Random Forest (Optimized)**
- ✅ **Highest accuracy** (tied with XGBoost)
- ✅ **Best precision/F1 balance**
- ❌ Slightly lower recall than Logistic Regression

**XGBoost**  
- ✅ **Equal best accuracy/precision**
- ✅ **Handles class imbalance well**
- ❌ More complex to interpret

**Logistic Regression**
- ✅ **Highest recall** (74%)
- ❌ **Lowest precision/F1**
- ❌ Significant false positives

**Baseline Random Forest**
- ✅ Simple implementation
- ❌ Underperforms optimized versions

#### **Confusion Matrix Insights**

| Model                | True Negatives | False Positives | False Negatives | True Positives |
|----------------------|---------------|-----------------|-----------------|----------------|
| RF (Optimized)       | 1164          | 251             | 55              | 102            |
| XGBoost              | 1163          | 252             | 55              | 102            |
| Logistic Regression  | 970           | 295             | 41              | 116            |
| RF (Baseline)        | 1150          | 265             | 60              | 97             |

#### **Recommended Model Selection**

1. **For Production Deployment**:
   - 🏆 **XGBoost** - Best overall performance
   - 🥈 Random Forest (Optimized) - Close second, more interpretable

2. **When High Recall is Critical** (e.g., medical diagnosis):
   - 🚨 Logistic Regression - Despite lower precision

3. **Baseline Reference**:
   - 🔍 Random Forest (Baseline) - Demonstrates optimization impact



In [None]:
XGbest_model = best_xgb.fit(X_train_resampled_smtk, y_train_resampled_smtk)


: 

In [None]:
# Save your already-trained XGBoost model
joblib.dump(XGbest_model, 'urban_change_model.joblib')

print("Model successfully saved as 'urban_change_model.joblib'")

: 

<a id="setup"></a>
# <p style="background-color:rgb(64, 96, 143); font-family:calibri; color:white; font-size:50%; font-family:Verdana; text-align:center; border-radius:5px 5px;">Step 8| Deployment</p>

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

# Load model and scalers
model = joblib.load('urban_change_model.joblib')
coord_scaler = joblib.load('coord_scaler.joblib')
dist_scaler = joblib.load('dist_scaler.joblib')

def main():
    st.set_page_config(page_title="Urban Change Predictor", layout="wide")
    
    # Custom CSS for background image
    st.markdown(
        f"""
        <style>
            .stApp {{
                background: url("https://imgix.brilliant-africa.com/Nairobi-National-Park-1.jpg?auto=format,enhance,compress&fit=crop&crop=entropy,faces,focalpoint&w=1880&h=740&q=30") no-repeat center center fixed;
                background-size: cover;
            }}
        </style>
        """,
        unsafe_allow_html=True
    )
    
    st.title("🌆 Urban Change Prediction Dashboard")
    
    # Input Selection
    input_method = st.radio("Select Input Method:", ["📍 Manual Entry", "📁 Upload CSV"], horizontal=True)
    
    if input_method == "📍 Manual Entry":
        col1, col2 = st.columns(2)
        
        with col1:
            x = st.number_input("Longitude (x)", value=36.922205, format="%.6f")
            y = st.number_input("Latitude (y)", value=-1.336726, format="%.6f")
        
        with col2:
            road = st.number_input("Distance to Road (m)", value=412.73)
            airport = st.number_input("Distance to Airport (m)", value=1987.40)
            urban = st.number_input("Distance to Urban (m)", value=10142.31)
        
        input_df = pd.DataFrame({
            'x': [x], 'y': [y],
            'Distance_to_Road(Metres)': [road],
            'Distance_to_Airport(Metres)': [airport],
            'Distance_to_Urban_Areas(Metres)': [urban]
        })
    else:
        uploaded_file = st.file_uploader("Upload CSV File", type=["csv"])
        if uploaded_file:
            input_df = pd.read_csv(uploaded_file)
            st.dataframe(input_df.head(3))
    
    if st.button("🚀 Predict") and 'input_df' in locals():
        with st.spinner("Analyzing... Please wait."):
            # Scale features
            coords = coord_scaler.transform(input_df[['x', 'y']])
            dists = dist_scaler.transform(input_df.filter(regex='Distance_to_'))
            X = np.hstack([coords, dists])
            
            # Make predictions
            preds = model.predict(X)
            probas = model.predict_proba(X)[:, 1]  # Probability of Urban Change
            
            # Format results
            results = input_df.copy()
            results['Prediction'] = np.where(preds == 1, 'Urban Change', 'No Change')
            results['Probability'] = [f"{p:.1%}" for p in probas]
            results['Confidence'] = np.select(
                [probas > 0.7, probas > 0.4],
                ['High', 'Medium'],
                default='Low'
            )
            
            st.success("✅ Prediction Complete!")
            
            # Apply styling
            def highlight_urban_change(row):
                return ['background-color: #90EE90' if row['Prediction'] == 'Urban Change' else ''] * len(row)
            
            st.dataframe(
                results.style
                .apply(highlight_urban_change, axis=1)
                .format({
                    'x': '{:.6f}', 'y': '{:.6f}',
                    'Distance_to_Road(Metres)': '{:.2f}',
                    'Distance_to_Airport(Metres)': '{:.2f}',
                    'Distance_to_Urban_Areas(Metres)': '{:.2f}'
                })
            )
            
            # Download button
            csv = results.to_csv(index=False).encode('utf-8')
            st.download_button("💾 Download Results", csv, "urban_change_predictions.csv", "text/csv")

if __name__ == "__main__":
    main()


: 