# In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
import geopandas as gpd
import ast
import os
from collections import Counter

# 1. Inspect and Load GeoJSON Files
data_dir = "/content/drive/MyDrive/TreeSatAI1"  # Replace with your folder path

# Accumulators filled while the GeoJSON files are read.
all_features, all_labels = [], []   # accepted patches and their species labels
invalid_samples = []                # (file, row index, error message) per rejected row
invalid_bands, species_counts = Counter(), Counter()

# Base column names: ten "B*" bands followed by five index columns.
bands = (
    ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12']
    + ['NDVI', 'EVI', 'EVI2', 'SAVI', 'NDWI']
)
# Column-name suffixes: the unsuffixed column plus "_1" .. "_7"
# (presumably one per monthly composite -- confirm against the export script).
months = [''] + [f'_{step}' for step in range(1, 8)]
# Suffix-major ordering: all 15 bands for '', then all 15 for '_1', ...
# which yields 15 * 8 = 120 column names.
band_columns = [f"{band_name}{suffix}" for suffix in months for band_name in bands]

# Inspect first file
# Fail early with an actionable message instead of the bare errno raised by
# os.listdir() when the folder (e.g. an unmounted Drive) is missing.
if not os.path.isdir(data_dir):
    raise FileNotFoundError(
        f"Data directory not found: {data_dir!r}. "
        "Mount the drive or update data_dir before running."
    )

# Pick the first *GeoJSON* entry (sorted, so the choice is reproducible).
# os.listdir() returns entries in arbitrary order, so its first element may
# not be a .geojson file at all, which would silently skip the inspection.
geojson_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".geojson"))
first_file = os.path.join(data_dir, geojson_files[0]) if geojson_files else None
if first_file:
    gdf = gpd.read_file(first_file)
    print("Inspecting first 2 rows of first GeoJSON file:")
    for idx in range(min(2, len(gdf))):
        print(f"\nRow {idx}:")
        # A small sample of columns: unsuffixed and suffixed variants.
        for band in ['B2', 'B11', 'NDVI', 'B2_1', 'NDVI_7']:
            if band not in gdf.columns:
                # Report rather than crash: this is a diagnostic pass only.
                print(f"  Band {band}: column missing from file")
                continue
            data = gdf[band].iloc[idx]
            try:
                # Values stored as strings are parsed as Python literals first.
                parsed_data = ast.literal_eval(data) if isinstance(data, str) else data
                array = np.array(parsed_data, dtype=np.float32)
                print(f"  Band {band}: shape={array.shape}, first few values={array.flatten()[:5]}")
            except (ValueError, SyntaxError, TypeError) as e:
                print(f"  Band {band}: Error parsing/converting: {e}")
else:
    print(f"No .geojson files found in {data_dir!r}; nothing to inspect.")

# Load all GeoJSON files
# Each row becomes one (5, 5, 120) patch: every column in band_columns is
# parsed into a 5x5 array and the arrays are stacked along the last axis.
# Rows that cannot be parsed are recorded in invalid_samples and skipped.
# sorted() fixes the sample order so the seeded splits later are reproducible.
for file in sorted(os.listdir(data_dir)):
    if not file.endswith(".geojson"):
        continue
    gdf = gpd.read_file(os.path.join(data_dir, file))
    for idx, row in gdf.iterrows():
        try:
            label = row['l3_species']
            # Reject unlabeled rows here; a None/NaN label mixed with string
            # labels would otherwise fail later when the labels are encoded.
            if label is None or (isinstance(label, float) and np.isnan(label)):
                raise ValueError("Missing l3_species label")

            patch = []
            for col in band_columns:
                data = row[col]
                # Missing values may show up as None, the string "None",
                # or a float NaN placeholder.
                is_missing = (
                    data is None
                    or (isinstance(data, float) and np.isnan(data))
                    or (isinstance(data, str) and data.lower() == 'none')
                )
                if is_missing:
                    invalid_bands[col] += 1
                    raise ValueError(f"None value in band {col}")
                # Values stored as strings are parsed as Python literals first.
                parsed_data = ast.literal_eval(data) if isinstance(data, str) else data
                array = np.array(parsed_data, dtype=np.float32).reshape(5, 5)
                patch.append(array)

            patch = np.stack(patch, axis=-1)
            if patch.shape != (5, 5, 120):
                raise ValueError(f"Unexpected patch shape: {patch.shape}")
            # NaN/inf pixels would propagate into the loss during training.
            if not np.isfinite(patch).all():
                raise ValueError("Non-finite values in patch")

            all_features.append(patch)
            all_labels.append(label)
            species_counts[label] += 1
        except KeyError as e:
            # A column absent from this file must not abort the whole load.
            invalid_samples.append((file, idx, f"Missing column: {e}"))
        except (ValueError, SyntaxError, TypeError) as e:
            invalid_samples.append((file, idx, str(e)))

# Report what the loader rejected and what it kept.
if invalid_samples:
    print(f"\nSkipped {len(invalid_samples)} invalid samples:")
    # Only the first five failures are listed to keep the output short.
    print("\n".join(
        f"File: {src_file}, Row: {row_idx}, Error: {reason}"
        for src_file, row_idx, reason in invalid_samples[:5]
    ))
if invalid_bands:
    print("\nBands with None values:")
    # most_common() orders the columns from most to least frequently missing.
    print("\n".join(
        f"  {column}: {n_missing} times"
        for column, n_missing in invalid_bands.most_common()
    ))
print("\nValid samples per species:")
for species_name, n_valid in species_counts.most_common():
    print(f"  {species_name}: {n_valid}")

# Convert to NumPy arrays
if not all_features:
    # Nothing usable was loaded: run a Random Forest on random data as a
    # sanity check of the environment, then stop the script.
    print("\nError: No valid samples loaded. Using Random Forest with dummy data.")
    X_dummy = np.random.rand(100, 5*5*120)
    y_dummy = np.random.randint(0, 5, 100)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_dummy, y_dummy)
    print("Random Forest dummy accuracy:", rf.score(X_dummy, y_dummy))
    print("Please re-export data with updated GEE code.")
    # exit() is the interactive helper injected by the `site` module and
    # reports success (status 0); raise SystemExit with a non-zero status so
    # callers and schedulers can see that the run failed.
    raise SystemExit(1)

X = np.array(all_features, dtype=np.float32)  # Shape: (n_samples, 5, 5, 120)
y = np.array(all_labels)

# 2. Preprocess Data
# Map the species labels to integer ids, then expand the ids to one-hot rows
# for the softmax / categorical cross-entropy head.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
num_classes = len(label_encoder.classes_)
y_onehot = tf.keras.utils.to_categorical(y_encoded)

print(f"\nData shape: {X.shape}, Number of classes: {num_classes}")
print(f"Data range: min={X.min():.4f}, max={X.max():.4f}")

# Compute class weights for imbalance
# 'balanced' weights are computed over the full label set and keyed by
# integer class id, the form expected by Keras' class_weight argument.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_encoded),
    y=y_encoded,
)
class_weight_dict = {class_id: weight for class_id, weight in enumerate(class_weights)}

# Train-test-validation split
# First hold out 15 % for testing; then take 0.1765 of the remaining 85 %
# (about 15 % of the total) for validation. Both splits are stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_onehot,
    test_size=0.15,
    random_state=42,
    stratify=y_onehot,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1765,
    random_state=42,
    stratify=y_train,
)

# 3. Define CNN Model
def build_cnn(input_shape=(5, 5, 120), num_classes=num_classes):
    """Build the (uncompiled) patch-classification CNN.

    Three 3x3 same-padded convolutions (32, 64 and 128 filters), each
    followed by batch normalization, with a 2x2 max-pool after the second;
    then a 128-unit dense layer with 50 % dropout and a softmax output.

    Args:
        input_shape: Shape of one input patch, without the batch dimension.
        num_classes: Number of output classes. The default is bound when this
            function is defined, from the module-level ``num_classes``.

    Returns:
        A ``tf.keras`` Sequential model; the caller must compile it.
    """
    # Resolve Keras through the already-imported `tf`: the bare names
    # `layers` and `models` are never imported in this file, so using them
    # directly raises NameError.
    layers = tf.keras.layers
    model = tf.keras.models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=input_shape),
        layers.BatchNormalization(),
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Instantiate the CNN with its default input shape and class count, then
# configure it for one-hot multi-class training with Adam.
model = build_cnn()
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

# 4. Data Augmentation
# Random flips and rotations. The layers are taken from tf.keras.layers
# because the bare name `layers` is never imported in this file.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
])

# 5. Train Model
# Augment per batch inside a tf.data pipeline so every epoch sees freshly
# transformed patches. Calling data_augmentation(X_train) once up front would
# freeze a single random transform for the whole run and push the entire
# training set through the layers as one tensor.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train), seed=42)
    .batch(64)  # Increased for larger dataset
    .map(
        lambda patches, targets: (data_augmentation(patches, training=True), targets),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    .prefetch(tf.data.AUTOTUNE)
)

history = model.fit(
    train_ds,  # already batched, so batch_size must not be passed to fit()
    validation_data=(X_val, y_val),
    epochs=50,
    class_weight=class_weight_dict,
    callbacks=[
        # Stop after 10 epochs without improvement and roll back to the best weights.
        tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
    ]
)

# 6. Evaluate Model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# 7. Additional Metrics
# Collapse softmax outputs and one-hot targets back to integer class ids.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# Pass every encoded class id explicitly. Without `labels`, the report only
# covers ids present in the test targets/predictions and fails when that
# number differs from len(target_names).
all_class_ids = np.arange(len(label_encoder.classes_))
print("\nClassification Report:")
print(classification_report(
    y_test_classes,
    y_pred_classes,
    labels=all_class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # classes never predicted score 0 without a warning
))
precision = precision_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
recall = recall_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
f1 = f1_score(y_test_classes, y_pred_classes, average='weighted', zero_division=0)
print(f"Weighted Precision: {precision:.4f}")
print(f"Weighted Recall: {recall:.4f}")
print(f"Weighted F1-Score: {f1:.4f}")

# 8. Confusion Matrix
import seaborn as sns
import matplotlib.pyplot as plt
# confusion_matrix is not part of the file's top-level sklearn.metrics import,
# so it is imported here; using it without this raises NameError.
from sklearn.metrics import confusion_matrix

# Fix the row/column order to every encoded class id so the matrix always has
# one row and column per tick label, even if a class is absent from the test
# targets and predictions.
cm = confusion_matrix(
    y_test_classes,
    y_pred_classes,
    labels=np.arange(len(label_encoder.classes_)),
)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix (CNN)')
plt.show()

# 9. Save Label Encoder
# Persist the fitted encoder so predicted class ids can be mapped back to
# species names later.
import joblib

joblib.dump(value=label_encoder, filename='label_encoder.pkl')

# Output: FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/TreeSatAI1'