In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore')

# Step 1: Create a Sample Dataset with 20,000 Rows
def create_sample_dataset(n_rows=20000, file_path='geospatial_dataset.csv', seed=42):
    """Generate a synthetic geospatial land-cover dataset.

    Coordinates, elevation and land-cover labels are drawn independently and
    uniformly at random, so the features carry no information about the label.

    Args:
        n_rows: Number of rows to generate (default 20000).
        file_path: Where to write the dataset as CSV (without the index).
            Pass ``None`` to skip writing and only return the frame.
        seed: Seed applied to NumPy's global random generator before drawing.

    Returns:
        pandas.DataFrame with columns ``Latitude``, ``Longitude``,
        ``Elevation``, ``Landcover_Type`` and ``Landcover_Type_Num``.
    """
    landcover_types = ['Forest', 'Wetland', 'Urban', 'Water', 'Agriculture']

    # Seed the *global* NumPy generator, as this function always has: the
    # draws below (and any later user of the global generator) stay
    # reproducible, and the defaults yield the exact same data as before.
    np.random.seed(seed)

    # The draw order (latitude, longitude, elevation, label) determines the
    # generated values for a given seed; keep it stable.
    df = pd.DataFrame({
        'Latitude': np.random.uniform(-90, 90, n_rows),
        'Longitude': np.random.uniform(-180, 180, n_rows),
        'Elevation': np.random.uniform(0, 3000, n_rows),
        'Landcover_Type': np.random.choice(landcover_types, n_rows),
    })

    # Map Landcover_Type to numerical values. The code of each label is its
    # position in `landcover_types` (Forest=0 ... Agriculture=4), so the
    # label list and the mapping cannot drift apart.
    landcover_codes = {name: code for code, name in enumerate(landcover_types)}
    df['Landcover_Type_Num'] = df['Landcover_Type'].map(landcover_codes)

    if file_path is not None:
        df.to_csv(file_path, index=False)
    return df

# Build the synthetic dataset once when the script runs. Besides returning the
# frame, the call writes 'geospatial_dataset.csv', which the preprocessing
# step reads back from disk.
dataset = create_sample_dataset()
print("Sample dataset created and saved as 'geospatial_dataset.csv'")

# Step 2: Load and Preprocess Dataset
def load_and_preprocess_data(file_path):
    """Read the dataset CSV and turn it into model-ready features and labels.

    The three numeric columns (latitude, longitude, elevation) are
    standardized and then projected onto two principal components.

    Args:
        file_path: Path of a CSV containing the columns ``Latitude``,
            ``Longitude``, ``Elevation`` and ``Landcover_Type_Num``.

    Returns:
        A tuple ``(X_pca, y, scaler, pca)``: the two-component feature matrix,
        the label column, and the fitted ``StandardScaler`` and ``PCA``
        objects so the same transformation can be reapplied later.
    """
    feature_columns = ['Latitude', 'Longitude', 'Elevation']

    frame = pd.read_csv(file_path)
    features = frame[feature_columns]
    y = frame['Landcover_Type_Num']

    # Bring every feature to zero mean / unit variance before PCA.
    # NOTE(review): both transformers are fit on every row of the file; any
    # train/test split done by the caller happens after this fit.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(features)

    # Reduce the three standardized features to two components.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(standardized)

    return X_pca, y, scaler, pca

# Read the CSV back and obtain the PCA-reduced features, the labels, and the
# fitted scaler/PCA objects (kept so they can be persisted next to the model).
X, y, scaler, pca = load_and_preprocess_data('geospatial_dataset.csv')

# Step 3: Train-Test Split
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the scaler and PCA were already fit on all rows before this
# split, so the held-out rows influenced the preprocessing. Consider fitting
# them on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Train Models
def train_models(X_train, y_train):
    """Fit the three baseline classifiers on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A 3-tuple of fitted estimators, in this order: random forest
        (seeded with ``random_state=42``), linear-kernel SVM with probability
        estimates enabled, and k-nearest-neighbours with default settings.
    """
    # The tuple order is the order callers unpack: (rf, svm, knn).
    classifiers = (
        RandomForestClassifier(random_state=42),
        SVC(kernel='linear', probability=True),
        KNeighborsClassifier(),
    )

    for classifier in classifiers:
        classifier.fit(X_train, y_train)

    return classifiers

# Fit the three baseline classifiers on the training split. The random forest
# bound here is later replaced by the grid-searched one in Step 5.
rf_model, svm_model, knn_model = train_models(X_train, y_train)

# Step 5: Hyperparameter Tuning
def hyperparameter_tuning(model, param_grid, X_train, y_train):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Every parameter combination is scored by 5-fold cross-validated accuracy
    on the training data.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter names to the candidate values to try.
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
    )
    # fit() returns the search object itself, so the winner can be read off
    # directly.
    return search.fit(X_train, y_train).best_estimator_

# Candidate random-forest settings: 3 x 3 x 3 = 27 combinations in total.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# Replace the baseline forest with the best estimator found by the search.
rf_model = hyperparameter_tuning(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    X_train,
    y_train,
)

# Step 6: Evaluate Models
def evaluate_model(model, X_test, y_test):
    """Print accuracy and a per-class report for ``model``; return accuracy.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy score of the predictions against ``y_test``.
    """
    predicted = model.predict(X_test)

    acc = accuracy_score(y_test, predicted)
    print(f"Accuracy: {acc}\n")

    report = classification_report(y_test, predicted)
    print("Classification Report:\n", report)

    return acc

# Score every model on the held-out split (each call also prints its report).
rf_acc = evaluate_model(rf_model, X_test, y_test)
svm_acc = evaluate_model(svm_model, X_test, y_test)
knn_acc = evaluate_model(knn_model, X_test, y_test)

# Step 7: Save the Best Model
# max() returns the first of several equal maxima, so listing the candidates
# as KNN, SVM, RF keeps the tie preference: the forest wins only when strictly
# better than both others, and the SVM only when strictly better than KNN.
best_model = max(
    [(knn_acc, knn_model), (svm_acc, svm_model), (rf_acc, rf_model)],
    key=lambda scored: scored[0],
)[1]
joblib.dump(best_model, 'best_model.sav')
print("Best model saved as 'best_model.sav'")

# Additional: Save preprocessing tools
# New data must go through the same fitted scaler and PCA before prediction.
for fitted_step, target_file in ((scaler, 'scaler.sav'), (pca, 'pca.sav')):
    joblib.dump(fitted_step, target_file)

# Step 8: Ready-to-Use Script
if __name__ == "__main__":
    # End-to-end run of the pipeline: preprocess, split, train, tune,
    # evaluate, and persist the winning model with its preprocessing objects.

    # Load data and preprocess
    X, y, scaler, pca = load_and_preprocess_data('geospatial_dataset.csv')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the baseline models, then tune the random forest exactly as
    # Step 5 does. Without the tuning step this block would compare only
    # untuned models and overwrite 'best_model.sav' with that choice,
    # silently discarding the grid-searched forest.
    rf_model, svm_model, knn_model = train_models(X_train, y_train)
    rf_model = hyperparameter_tuning(RandomForestClassifier(random_state=42), rf_param_grid, X_train, y_train)

    # Evaluate every candidate on the held-out split
    rf_acc = evaluate_model(rf_model, X_test, y_test)
    svm_acc = evaluate_model(svm_model, X_test, y_test)
    knn_acc = evaluate_model(knn_model, X_test, y_test)

    # Save the best model (the forest must be strictly best to win; the SVM
    # must strictly beat KNN), together with the scaler and PCA fitted in
    # this run so the persisted artifacts always belong together.
    best_model = rf_model if rf_acc > max(svm_acc, knn_acc) else (svm_model if svm_acc > knn_acc else knn_model)
    joblib.dump(best_model, 'best_model.sav')
    joblib.dump(scaler, 'scaler.sav')
    joblib.dump(pca, 'pca.sav')
    print("Ready-to-use script executed and best model saved.")


Sample dataset created and saved as 'geospatial_dataset.csv'
Accuracy: 0.20125

Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.21      0.20       816
           1       0.22      0.24      0.23       791
           2       0.19      0.18      0.18       783
           3       0.18      0.18      0.18       774
           4       0.21      0.20      0.21       836

    accuracy                           0.20      4000
   macro avg       0.20      0.20      0.20      4000
weighted avg       0.20      0.20      0.20      4000

Accuracy: 0.2015

Classification Report:
               precision    recall  f1-score   support

           0       0.20      0.64      0.31       816
           1       0.20      0.36      0.26       791
           2       0.00      0.00      0.00       783
           3       0.00      0.00      0.00       774
           4       0.00      0.00      0.00       836

    accuracy                           