In [1]:
# ********************************************************************************
# IMPORTANT: BEFORE RUNNING THIS CELL, PERFORM A "FACTORY RESET RUNTIME" (Colab)
# OR THE EQUIVALENT DEEPEST RESTART IN YOUR ENVIRONMENT (e.g., Kaggle Session Restart).
# THEN, RUN THIS CELL AS THE VERY FIRST CODE IN YOUR NOTEBOOK.
# ********************************************************************************
!pip install -U scikit-learn==1.3.2 imbalanced-learn==0.12.3 xgboost==2.0.3
import pandas as pd
import numpy as np
import os
from collections import Counter
import ast
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
import json
import xgboost as xgb
import gc

# SMOTE is imported behind a guard: if the import fails, print guidance about
# resetting the environment and stop instead of failing later in the pipeline.
try:
    from imblearn.over_sampling import SMOTE
except ImportError as e:
    print(f"\nCRITICAL ERROR: Failed to import SMOTE even after aggressive reinstallation: {e}")
    print("This indicates a severe, persistent environment issue.")
    print("Please double-check that you performed a 'Factory reset runtime' (Colab) or equivalent.")
    exit()
else:
    print("\nSuccessfully imported SMOTE.")

# 1. Inspect and Load GeoJSON Files (Modified for Zero Imputation)
data_dir = "/kaggle/input/mar-oct"  # Replace with your folder path

# Accumulators filled while the GeoJSON files are loaded.
all_features, all_labels, invalid_samples = [], [], []
invalid_bands, species_counts = Counter(), Counter()

# Per-patch band names: numbered B* bands, then the index bands, then DEM.
bands = [
    'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12',
    'NDVI', 'EVI', 'SAVI', 'NDWI',
    'DEM',
]
# Column suffix per month: the first month has no suffix, the rest are _1 .. _7.
months = [''] + [f'_{k}' for k in range(1, 8)]

# Month-major ordering: every band of month 0, then every band of month 1, ...
band_columns = []
for suffix in months:
    band_columns.extend(name + suffix for name in bands)

# Inspect first file
# os.listdir() returns entries in arbitrary order and the folder may contain
# non-GeoJSON entries, so pick the first *.geojson explicitly (sorted, so the
# choice is reproducible) instead of skipping the inspection whenever entry
# [0] happens to be something else. The directory is also listed only once.
geojson_names = sorted(name for name in os.listdir(data_dir) if name.endswith(".geojson"))
first_file = os.path.join(data_dir, geojson_names[0]) if geojson_names else None
if first_file:
    gdf = gpd.read_file(first_file)
    print("Inspecting first 2 rows of first GeoJSON file:")
    for idx in range(min(2, len(gdf))):
        print(f"\nRow {idx}:")
        # Spot-check a few columns, including month-suffixed ones.
        for band in ['B1', 'B2', 'B11', 'NDVI', 'DEM', 'B2_1', 'NDVI_7']:
            if band not in gdf.columns:
                print(f"  Band {band}: Not found in GeoJSON file")
                continue
            data = gdf[band].iloc[idx]
            try:
                # String cells are parsed as Python literals (no eval of arbitrary code).
                parsed_data = ast.literal_eval(data) if isinstance(data, str) else data
                array = np.array(parsed_data, dtype=np.float32)
                print(f"  Band {band}: shape={array.shape}, first few values={array.flatten()[:5]}")
            except (ValueError, SyntaxError, TypeError) as e:
                print(f"  Band {band}: Error parsing/converting: {e}")
else:
    print(f"No .geojson files found in {data_dir}; skipping inspection.")

# Load all GeoJSON files
# Every row becomes one (5, 5, len(band_columns)) patch. A band that is absent
# from the file, is None/'none', or cannot be parsed into 25 values is replaced
# by zeros and counted in invalid_bands.
total_samples_attempted = 0
expected_patch_shape = (5, 5, len(band_columns))  # 17 bands * 8 months = 136 channels
for file in os.listdir(data_dir):
    if not file.endswith(".geojson"):
        continue
    try:
        gdf = gpd.read_file(os.path.join(data_dir, file))
        print(f"Processing file: {file}, Rows: {len(gdf)}")
        total_samples_attempted += len(gdf)
        # Column membership is constant for a file, so resolve it once.
        available_columns = set(gdf.columns)
        for idx, row in gdf.iterrows():
            try:
                # Read the label BEFORE storing anything. If 'l3_species' is
                # missing, the KeyError propagates to the per-file handler
                # below; storing the patch first would leave all_features one
                # element longer than all_labels and break X/y alignment.
                label = row['l3_species']
                patch = []
                for col in band_columns:
                    array = None
                    data = row[col] if col in available_columns else None
                    if data is not None and not (isinstance(data, str) and data.lower() == 'none'):
                        try:
                            parsed_data = ast.literal_eval(data) if isinstance(data, str) else data
                            array = np.array(parsed_data, dtype=np.float32).reshape(5, 5)
                        except (ValueError, SyntaxError, TypeError):
                            array = None  # unparseable or wrong size -> impute below
                    if array is None:
                        invalid_bands[col] += 1
                        array = np.zeros((5, 5), dtype=np.float32)  # Impute with zeros
                    patch.append(array)
                patch = np.stack(patch, axis=-1)
                if patch.shape != expected_patch_shape:
                    raise ValueError(f"Unexpected patch shape: {patch.shape}")
                # Count first: an unhashable label raises TypeError here, before
                # the sample is stored, so it is recorded as invalid, not kept.
                species_counts[label] += 1
                all_features.append(patch)
                all_labels.append(label)
            except (ValueError, SyntaxError, TypeError) as e:
                invalid_samples.append((file, idx, str(e)))
    except Exception as e:
        # Best-effort loading: report the file and carry on with the rest.
        print(f"Failed to process file {file}: {e}")
        continue

# Log invalid samples and bands
# Summary of the load: totals, skipped rows, imputed bands, per-species counts.
print(f"\nTotal samples attempted: {total_samples_attempted}")
print(f"Valid samples processed: {len(all_features)}")

if len(invalid_samples) > 0:
    print(f"\nSkipped {len(invalid_samples)} invalid samples:")
    for bad_file, bad_idx, reason in invalid_samples:
        print(f"File: {bad_file}, Row: {bad_idx}, Error: {reason}")

if len(invalid_bands) > 0:
    print("\nBands with None or missing values:")
    for band_name, times in invalid_bands.most_common():
        print(f"  {band_name}: {times} times")

print("\nValid samples per species:")
for species_name, n_samples in species_counts.most_common():
    print(f"  {species_name}: {n_samples}")

# Convert to NumPy arrays
if len(all_features) == 0:
    # Nothing was loaded: fit a Random Forest on random data, print its
    # training-set score, and stop before the real pipeline runs.
    print("\nError: No valid samples loaded. Using Random Forest with dummy data.")
    dummy_X = np.random.rand(100, 5*5*136)  # Updated for 136 channels
    dummy_y = np.random.randint(0, 5, 100)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(dummy_X, dummy_y)
    print("Random Forest dummy accuracy:", rf.score(dummy_X, dummy_y))
    print("Please re-export data with updated GEE code.")
    exit()

# Stack the per-sample patches and their labels.
X = np.array(all_features, dtype=np.float32)  # Shape: (N, 5, 5, 136)
y = np.array(all_labels)

Collecting scikit-learn==1.3.2
  Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.3
  Downloading imbalanced_learn-0.12.3-py3-none-any.whl.metadata (8.3 kB)
Downloading scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.12.3-py3-none-any.whl (258 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-

In [2]:
# 2. Encode Labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 3. Train-Test-Validation Split (Before SMOTE!)
# Flatten each patch to one feature vector per sample.
X_flat = X.reshape(X.shape[0], -1)  # Flatten for ML

# First carve off 15% as the test set, stratified by class ...
X_train, X_test, y_train, y_test = train_test_split(
    X_flat,
    y_encoded,
    test_size=0.15,
    stratify=y_encoded,
    random_state=42,
)
# ... then take 0.1765 of the remaining 85% (about 15% of the total) as validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1765,
    stratify=y_train,
    random_state=42,
)

# 4. Apply SMOTE **only to training set**
# NOTE(review): SMOTE runs on unscaled features here, so neighbour selection is
# presumably dominated by the largest-magnitude columns — consider fitting the
# scaler on X_train first; confirm before changing, results would shift.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# 5. Scale features using StandardScaler
# The scaler is fitted on the resampled training data only and then reused
# unchanged for the validation and test splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled, X_test_scaled = scaler.transform(X_val), scaler.transform(X_test)

n_train_classes = len(np.unique(y_train_resampled))
print(f"\nTraining set after SMOTE: {X_train_scaled.shape}, Classes: {n_train_classes}")
scaled_min, scaled_max = X_train_scaled.min(), X_train_scaled.max()
print(f"Data range after scaling: min={scaled_min:.4f}, max={scaled_max:.4f}")

# 6. Compute Class Weights for resampled training data
# compute_class_weight returns one weight per entry of `classes`, in that
# order. Key the lookup by the class label itself rather than by array position
# so the mapping stays correct even if the labels are not exactly 0..n-1.
train_classes = np.unique(y_train_resampled)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train_resampled)
class_weight_dict = dict(zip(train_classes.tolist(), class_weights))

# Create sample weights array for training samples
# NOTE(review): after SMOTE the classes are presumably equal-sized, which would
# make every weight ~1.0 and this step a no-op — confirm and simplify if so.
sample_weights = np.array([class_weight_dict[label] for label in y_train_resampled])

# Clear memory
gc.collect()

# 7. Define and Train XGBoost Model
# eval_metric and early_stopping_rounds are estimator parameters: passing them
# to fit() is deprecated in the pinned xgboost 2.0.3 and removed in 2.1, so
# they are set on the constructor. Training is otherwise unchanged: it stops
# once validation mlogloss has not improved for 30 rounds.
model5 = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(np.unique(y_encoded)),
    max_depth=6,
    learning_rate=0.1,
    n_estimators=500,  # upper bound; early stopping may end training sooner
    subsample=0.8,
    colsample_bytree=0.8,
    device='cuda',
    eval_metric='mlogloss',
    early_stopping_rounds=30,
    random_state=42
)

# The validation split is used only for monitoring and early stopping.
model5.fit(
    X_train_scaled, y_train_resampled,
    sample_weight=sample_weights,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True
)

# Clear memory
gc.collect()

# 8. Evaluate Model
# NOTE(review): the captured output shows XGBoost's device-mismatch hint,
# presumably because the booster is on CUDA while these inputs are host NumPy
# arrays — confirm; predictions are still produced.
y_pred = model5.predict(X_test_scaled)
test_accuracy = (y_pred == y_test).mean()
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# 9. Additional Metrics
# Per-class report as a dict (reused when saving), plus weighted recall and F1.
report5 = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
    output_dict=True,
)
weighted_recall = recall_score(y_test, y_pred, average='weighted')
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print("\nClassification Report:")
print(json.dumps(report5, indent=4))
print(f"Recall: {weighted_recall:.4f}")
print(f"F1-Score: {weighted_f1:.4f}")

# 10. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as an annotated heatmap with species names on both axes,
# write it to disk, and close the figure so it is not kept open.
class_names = label_encoder.classes_
heatmap_options = dict(annot=True, fmt='d', cmap='Blues')
plt.figure(figsize=(10, 8), dpi=100)
sns.heatmap(cm, xticklabels=class_names, yticklabels=class_names, **heatmap_options)
plt.title('Confusion Matrix')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.savefig('confusion6.png', dpi=100)
plt.close()

gc.collect()

# 11. Save Outputs
# Persist the model together with the fitted label encoder and scaler; the
# final-test cell transforms new data with the same encoder and scaler.
joblib.dump(model5, 'model6.pkl')
# Use a context manager so the report file is flushed and closed
# deterministically instead of leaking an open handle.
with open('report6.json', 'w') as report_file:
    json.dump(report5, report_file, indent=4)
np.save('confusion6.npy', cm)
joblib.dump(label_encoder, 'labelencoder6.pkl')
joblib.dump(scaler, 'scaler6.pkl')

print("Saved: model6.pkl, report6.json, confusion6.npy, confusion6.png, labelencoder6.pkl, scaler6.pkl")

gc.collect()


Training set after SMOTE: (71668, 3400), Classes: 19
Data range after scaling: min=-220.6690, max=229.0108




[0]	validation_0-mlogloss:2.62985
[1]	validation_0-mlogloss:2.44005
[2]	validation_0-mlogloss:2.29696
[3]	validation_0-mlogloss:2.18091
[4]	validation_0-mlogloss:2.08492
[5]	validation_0-mlogloss:2.00165
[6]	validation_0-mlogloss:1.92759
[7]	validation_0-mlogloss:1.86227
[8]	validation_0-mlogloss:1.80609
[9]	validation_0-mlogloss:1.75312
[10]	validation_0-mlogloss:1.70640
[11]	validation_0-mlogloss:1.66339
[12]	validation_0-mlogloss:1.62333
[13]	validation_0-mlogloss:1.58793
[14]	validation_0-mlogloss:1.55441
[15]	validation_0-mlogloss:1.52292
[16]	validation_0-mlogloss:1.49406
[17]	validation_0-mlogloss:1.46715
[18]	validation_0-mlogloss:1.44264
[19]	validation_0-mlogloss:1.41944
[20]	validation_0-mlogloss:1.39646
[21]	validation_0-mlogloss:1.37581
[22]	validation_0-mlogloss:1.35615
[23]	validation_0-mlogloss:1.33703
[24]	validation_0-mlogloss:1.31882
[25]	validation_0-mlogloss:1.30268
[26]	validation_0-mlogloss:1.28725
[27]	validation_0-mlogloss:1.27212
[28]	validation_0-mlogloss:1.2

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





Test Accuracy: 0.7169

Classification Report:
{
    "alder": {
        "precision": 0.6146788990825688,
        "recall": 0.6242236024844721,
        "f1-score": 0.6194144838212635,
        "support": 322.0
    },
    "birch": {
        "precision": 0.5254691689008043,
        "recall": 0.5297297297297298,
        "f1-score": 0.5275908479138627,
        "support": 370.0
    },
    "black pine": {
        "precision": 0.6923076923076923,
        "recall": 0.5806451612903226,
        "f1-score": 0.631578947368421,
        "support": 62.0
    },
    "cherry": {
        "precision": 0.75,
        "recall": 0.4864864864864865,
        "f1-score": 0.5901639344262296,
        "support": 37.0
    },
    "douglas fir": {
        "precision": 0.8412698412698413,
        "recall": 0.8079268292682927,
        "f1-score": 0.8242612752721619,
        "support": 328.0
    },
    "english oak": {
        "precision": 0.6707616707616708,
        "recall": 0.6484560570071259,
        "f1-score": 0.6594

42

In [3]:
# 9. Evaluate Model on Final Test Data
# Accumulators for the separate final-test folder; the attempted-row counter
# is reset for this load.
test_data_dir = "/kaggle/input/final-test-data"
test_features, test_labels, test_invalid_samples = [], [], []
test_invalid_bands = Counter()
total_samples_attempted = 0  # Track total samples processed

# Check if directory exists and list files
if not os.path.exists(test_data_dir):
    print(f"\nError: Test data directory {test_data_dir} does not exist.")
    exit()
geojson_files = [name for name in os.listdir(test_data_dir) if name.endswith(".geojson")]
print(f"\nFound {len(geojson_files)} GeoJSON files in {test_data_dir}")

# Load all GeoJSON files from test data directory
# Same patch construction as for the training data: one
# (5, 5, len(band_columns)) patch per row, with zero-imputation (counted in
# test_invalid_bands) for bands that are absent, None/'none', or unparseable.
expected_patch_shape = (5, 5, len(band_columns))  # 17 bands * 8 months = 136 channels
for file in geojson_files:
    try:
        file_path = os.path.join(test_data_dir, file)
        gdf = gpd.read_file(file_path)
        print(f"Processing file: {file}, Rows: {len(gdf)}")
        total_samples_attempted += len(gdf)  # Count all rows in the file
        # Column membership is constant for a file, so resolve it once.
        available_columns = set(gdf.columns)
        for idx, row in gdf.iterrows():
            try:
                # Read the label BEFORE storing anything. If 'l3_species' is
                # missing, the KeyError propagates to the per-file handler
                # below; storing the patch first would leave test_features one
                # element longer than test_labels and break X/y alignment.
                label = row['l3_species']
                patch = []
                for col in band_columns:
                    array = None
                    data = row[col] if col in available_columns else None
                    if data is not None and not (isinstance(data, str) and data.lower() == 'none'):
                        try:
                            parsed_data = ast.literal_eval(data) if isinstance(data, str) else data
                            array = np.array(parsed_data, dtype=np.float32).reshape(5, 5)
                        except (ValueError, SyntaxError, TypeError):
                            array = None  # unparseable or wrong size -> impute below
                    if array is None:
                        test_invalid_bands[col] += 1
                        array = np.zeros((5, 5), dtype=np.float32)  # Impute with zeros
                    patch.append(array)
                patch = np.stack(patch, axis=-1)
                if patch.shape != expected_patch_shape:
                    raise ValueError(f"Unexpected patch shape: {patch.shape}")
                test_features.append(patch)
                test_labels.append(label)
            except (ValueError, SyntaxError, TypeError) as e:
                test_invalid_samples.append((file, idx, str(e)))
    except Exception as e:
        # Best-effort loading: report the file and carry on with the rest.
        print(f"Failed to process file {file}: {e}")
        continue

# Log invalid samples and bands
# Summary of the test load: totals, skipped rows, and imputed bands.
print(f"\nTotal samples attempted: {total_samples_attempted}")
print(f"Valid samples processed: {len(test_features)}")

if len(test_invalid_samples) > 0:
    print(f"\nSkipped {len(test_invalid_samples)} invalid test samples:")
    for bad_file, bad_idx, reason in test_invalid_samples:
        print(f"File: {bad_file}, Row: {bad_idx}, Error: {reason}")

if len(test_invalid_bands) > 0:
    print("\nBands with missing/None/parsing issues in test data:")
    for band_name, times in test_invalid_bands.most_common():
        print(f"  {band_name}: {times} times")

# Convert to NumPy arrays
if len(test_features) == 0:
    print("\nError: No valid test samples loaded. Cannot evaluate model.")
    exit()

X_test_final = np.array(test_features, dtype=np.float32)  # Shape: (N, 5, 5, 136)
y_test_final = np.array(test_labels)

# Preprocess test data
# Reuse the encoder and scaler fitted on the training data; a label the
# encoder has never seen is reported and stops the cell.
try:
    y_test_final_encoded = label_encoder.transform(y_test_final)  # Use same LabelEncoder
except ValueError as e:
    print(f"Error in label encoding: {e}")
    known_labels = set(label_encoder.classes_)
    unknown_labels = set(y_test_final) - known_labels
    print(f"Unknown labels in test data: {unknown_labels}")
    exit()

n_test_samples = X_test_final.shape[0]
X_test_final_flat = X_test_final.reshape(n_test_samples, -1)  # Flatten for prediction
X_test_final_scaled = scaler.transform(X_test_final_flat)  # Use same StandardScaler

# Diagnostic: Compare class distributions
# Training counts are taken after SMOTE; test counts are the raw labels.
train_species = pd.Series(label_encoder.inverse_transform(y_train_resampled))
test_species = pd.Series(y_test_final)
print("\nTraining class distribution:")
print(train_species.value_counts())
print("\nTest class distribution:")
print(test_species.value_counts())
missing_classes = set(label_encoder.classes_) - set(y_test_final)
print(f"Classes missing in test data: {missing_classes}")

# Diagnostic: Compare feature distributions
# Global mean/std over all scaled features, training set first, then test set.
for heading, matrix in (
    ("\nTraining feature stats (after scaling):", X_train_scaled),
    ("\nTest feature stats (after scaling):", X_test_final_scaled),
):
    print(heading)
    print(f"Mean: {matrix.mean():.4f}, Std: {matrix.std():.4f}")


# Evaluate model on test data
y_pred_final = model5.predict(X_test_final_scaled)
test_accuracy_final = (y_pred_final == y_test_final_encoded).mean()
print(f"\nFinal Test Data Accuracy: {test_accuracy_final:.4f}")

# Get unique labels in test data to avoid mismatch
# The report is restricted to classes that actually occur in the test labels.
unique_test_labels = np.unique(y_test_final_encoded)
unique_test_label_names = label_encoder.inverse_transform(unique_test_labels)

# Additional metrics for test data
report_options = dict(
    labels=unique_test_labels,
    target_names=unique_test_label_names,
    output_dict=True,
)
report_final = classification_report(y_test_final_encoded, y_pred_final, **report_options)
final_recall = recall_score(y_test_final_encoded, y_pred_final, average='weighted')
final_f1 = f1_score(y_test_final_encoded, y_pred_final, average='weighted')

print("\nClassification Report for Final Test Data:")
print(json.dumps(report_final, indent=4))
print(f"Recall (Final Test): {final_recall:.4f}")
print(f"F1-Score (Final Test): {final_f1:.4f}")

# Confusion matrix for test data
# Rows and columns follow unique_test_labels, so the tick names line up with
# the matrix entries.
cm_final = confusion_matrix(y_test_final_encoded, y_pred_final, labels=unique_test_labels)

heatmap_options = dict(annot=True, fmt='d', cmap='Blues')
plt.figure(figsize=(10, 8), dpi=100)
sns.heatmap(
    cm_final,
    xticklabels=unique_test_label_names,
    yticklabels=unique_test_label_names,
    **heatmap_options,
)
plt.title('Confusion Matrix for Final Test Data')
plt.ylabel('True')
plt.xlabel('Predicted')
plt.savefig('confusion_final.png', dpi=100)
plt.close()

# Total number of test points
print(f"\nTotal Number of Test Points: {len(y_test_final)}")

# Save outputs for test data
# Use a context manager so the report file is flushed and closed
# deterministically instead of leaking an open handle.
with open('report_final.json', 'w') as report_file:
    json.dump(report_final, report_file, indent=4)
np.save('confusion_final.npy', cm_final)
print("Saved: report_final.json, confusion_final.npy, confusion_final.png")



Found 19 GeoJSON files in /kaggle/input/final-test-data
Processing file: needleleaf_douglas fir_douglas firmar-oct-2022.geojson, Rows: 506
Processing file: broadleaf_short-lived deciduous_aldermar-oct-2022.geojson, Rows: 420
Processing file: broadleaf_beech_european beechmar-oct-2022.geojson, Rows: 1703
Processing file: needleleaf_larch_japanese larchmar-oct-2022.geojson, Rows: 135
Processing file: broadleaf_short-lived deciduous_poplarmar-oct-2022.geojson, Rows: 77
Processing file: needleleaf_pine_scots pinemar-oct-2022.geojson, Rows: 1202
Processing file: broadleaf_oak_english oakmar-oct-2022.geojson, Rows: 645
Processing file: needleleaf_larch_european larchmar-oct-2022.geojson, Rows: 221
Processing file: broadleaf_long-lived deciduous_sycamore maplemar-oct-2022.geojson, Rows: 725
Processing file: broadleaf_long-lived deciduous_european ashmar-oct-2022.geojson, Rows: 432
Processing file: broadleaf_short-lived deciduous_birchmar-oct-2022.geojson, Rows: 353
Processing file: broadleaf