<a href="https://colab.research.google.com/github/ced-sys/.py/blob/main/Untitled51.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Plot styling for the notebook: apply matplotlib's 'default' style sheet,
# then select seaborn's 'husl' colour palette.
# NOTE(review): presumably the palette call has to stay after the style call so
# the style sheet does not override it - confirm before reordering.
plt.style.use('default')
sns.set_palette('husl')

In [None]:
# Mount Google Drive at /content/drive; the project folders below live under it.
from google.colab import drive
import os

drive.mount('/content/drive')

# Project root on Drive and the sub-folders this notebook expects to find there.
base_path = '/content/drive/MyDrive/cropland-mapping'
folders = ['data', 'notebooks', 'submissions', 'src']

# exist_ok=True makes this safe to re-run: existing folders are left untouched.
for folder in folders:
    folder_path = os.path.join(base_path, folder)
    os.makedirs(folder_path, exist_ok=True)
    print("Created/Verified: {}".format(folder_path))

print("\nProject structure ready")

In [None]:
!pip install geopandas fiona shapely pyproj --quiet

In [None]:
data_path='/content/drive/MyDrive/Zindi Hackathons'

#geopandas is imported here rather than at the top because it is installed by
#the pip cell that precedes this one
import geopandas as gpd
import glob
import zipfile

#Training shapefiles are expected under <data_path>/Train
train_folder=f'{data_path}/Train'
train_zip=f'{data_path}/Train.zip'
shapefile_pattern=f'{train_folder}/**/*.shp'

#Find all shapefiles in the Train folder (sorted: glob order is arbitrary and
#would otherwise make the row order of train_gdf vary between runs)
shapefile_paths=sorted(glob.glob(shapefile_pattern, recursive=True))

#Extract Train.zip only when it is actually needed, i.e. no shapefile is on
#disk yet; re-running the cell no longer unpacks the archive every time
if not shapefile_paths and os.path.exists(train_zip):
  print("Extracting train.zip...")
  with zipfile.ZipFile(train_zip, 'r') as zip_ref:
    zip_ref.extractall(data_path)
  shapefile_paths=sorted(glob.glob(shapefile_pattern, recursive=True))

print(f"Foud {len(shapefile_paths)} shapefiles:")
for path in shapefile_paths:
  print(f"  -{path}")

#Load training shapefiles; a file that cannot be read is reported and skipped
#so one bad region does not block the others
training_data_list=[]
for shp_path in shapefile_paths:
  try:
    gdf=gpd.read_file(shp_path)
    #The parent folder name identifies the region the polygons came from
    region_name=os.path.basename(os.path.dirname(shp_path))
    gdf['Region']=region_name
    training_data_list.append(gdf)
    print(f"Loaded {region_name}: {gdf.shape[0]} samples")
    print(f"Columns: {list(gdf.columns)}")
  except Exception as e:
    print(f"Error loading {shp_path}: {e}")

#Combine all training data. Without any loaded shapefile train_gdf cannot be
#built, so stop here with a clear error instead of failing later on an
#undefined name.
if not training_data_list:
  raise FileNotFoundError(
      f"No training shapefiles could be loaded from {train_folder}; "
      f"check that the folder (or {train_zip}) exists on Drive."
  )

train_gdf=pd.concat(training_data_list, ignore_index=True)
print(f"\n Combined trainig data shape: {train_gdf.shape}")
print(f"Training data columns: {list(train_gdf.columns)}")
print(f"Training Shapefiles loaded successfully")

#Load the Sentinel-1 and Sentinel-2 tables
train_sentinel1=pd.read_csv(f'{data_path}/Sentinel1.csv')
train_sentinel2=pd.read_csv(f'{data_path}/Sentinel2.csv')

#load test data (for predictions)
test_data=pd.read_csv(f'{data_path}/Test.csv')

#Template that shows the expected submission layout
sample_submission=pd.read_csv(f'{data_path}/SampleSubmission.csv')

print(f"\nData Overview:")
print(f"Training polygons:{train_gdf.shape[0]} samples")
print(f"Sentinel-1 time series: {train_sentinel1.shape}")
print(f"Sentinel-2 time series: {train_sentinel2.shape}")
print(f"Test data: {test_data.shape}")
print(f"Sample submission: {sample_submission.shape}")

#Peek at the training data structure
print(f"\nTraining Data Structure:")
print(train_gdf.head())
print(f"\nTraining data info:")
#info() writes its report to stdout itself and returns None, so wrapping it in
#print() would append a stray "None" line
train_gdf.info()

#Look for the target/label column in the training data.
#Only label names belong in this list. 'ID' is the join key, 'geometry' is the
#polygon shape and 'Region' is the tag added while loading the shapefiles;
#with 'ID' listed first the search always reported 'ID' as the target.
possible_target_cols=['Cropland']
target_col=next((col for col in possible_target_cols if col in train_gdf.columns), None)

if target_col:
  print(f"\nFound target column: '{target_col}'")
  print(f"Target distribution:")
  print(train_gdf[target_col].value_counts())
else:
  print(f"\nTarget column not found. Available columns: {list(train_gdf.columns)}")
  #Show a few rows so the label column can be identified by eye
  print(f"\nFirst few rows of training data:")
  print(train_gdf.head())

# Preview the first ten column names of each tabular input, then show the
# head of the submission template.
print("\nSentinel Data Structure:")
column_previews = [
    ("\nSentinel-1 columns", train_sentinel1),
    ("Sentinel-2 columns", train_sentinel2),
    ("Test data columns", test_data),
]
for label, frame in column_previews:
    print(f"{label} (first 10): {frame.columns.tolist()[:10]}")

print("\nSample submission format:")
print(sample_submission.head())

In [None]:
# Columns of the polygon table that also appear in either Sentinel table are
# candidates for joining the labels to the time series.
if 'train_gdf' in locals():
  sentinel_columns=set(train_sentinel1.columns) | set(train_sentinel2.columns)
  common_id_cols=[col for col in train_gdf.columns if col in sentinel_columns]
else:
  common_id_cols=[]

if not common_id_cols:
  print(f"\nNo obvious ID matching columns found. We may need to use spatial relationships")
else:
  print(f"\nFound common ID columns:{common_id_cols}")

In [None]:
def _pick_band_column(columns, tokens):
    """Return the column to use for a band, or None when no column matches.

    A column is a candidate when its upper-cased name contains any of
    ``tokens``. A candidate whose upper-cased name equals a token exactly is
    preferred, so 'B8' is chosen over 'B8A' whatever the column order is;
    otherwise the first candidate in column order is returned.
    """
    candidates = [col for col in columns if any(tok in str(col).upper() for tok in tokens)]
    for col in candidates:
        if str(col).upper() in tokens:
            return col
    return candidates[0] if candidates else None


def calculate_ndvi_time_series(df):
    """Calculate an NDVI time series per ID from the NIR and Red bands.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with an 'ID' column, a 'date' column, one column
        whose name contains 'B8' or 'NIR', and one whose name contains 'B4'
        or 'RED' (case-insensitive).

    Returns
    -------
    pandas.DataFrame or None
        One row per ID with columns 'NDVI_t00', 'NDVI_t01', ... holding the
        NDVI values in ascending 'date' order, followed by an 'ID' column.
        Series shorter than the longest one are right-padded with NaN.
        Returns None (after printing a message) when the band columns or the
        'ID'/'date' columns are missing, and an empty frame with only an 'ID'
        column when there is no row to group.
    """
    nir_col = _pick_band_column(df.columns, ('B8', 'NIR'))
    red_col = _pick_band_column(df.columns, ('B4', 'RED'))

    if nir_col is None or red_col is None:
        print("Warning: Could not find NIR or Red band columns.")
        return None

    # Ensure 'ID' and 'date' columns exist
    if 'ID' not in df.columns or 'date' not in df.columns:
        print("Error: 'ID' and 'date' columns are required.")
        return None

    # Sort by ID and date so each group's values come out in time order
    ordered = df.sort_values(by=['ID', 'date'])

    sample_ids = []
    ndvi_series = []
    for sample_id, group in ordered.groupby('ID'):
        nir = group[nir_col]
        red = group[red_col]
        # The small epsilon keeps the ratio finite when nir + red is exactly 0
        ndvi = (nir - red) / (nir + red + 1e-8)
        sample_ids.append(sample_id)
        ndvi_series.append(ndvi.tolist())

    # No groups (empty input, or every ID missing): nothing to pivot
    if not sample_ids:
        print("Warning: No rows available to compute NDVI.")
        return pd.DataFrame({'ID': []})

    # Pivot to one column per time step; series of different lengths are
    # right-padded with NaN up to the longest one
    max_len = max(len(series) for series in ndvi_series)
    padded = [series + [np.nan] * (max_len - len(series)) for series in ndvi_series]
    ndvi_time_series_df = pd.DataFrame(
        padded, columns=[f'NDVI_t{i:02d}' for i in range(max_len)]
    )
    ndvi_time_series_df['ID'] = sample_ids  # Add ID back

    print(f"NDVI calculated: {max_len} time steps per ID (variable length time series padded with NaN)")
    return ndvi_time_series_df

In [None]:
def _row_slopes(values, missing):
  """Least-squares slope of every row of ``values`` against the step index.

  ``missing`` is a boolean array of the same shape marking absent entries.
  Rows are fitted on their observed entries only; a row with fewer than two
  observed entries gets a NaN slope.
  """
  n_steps=values.shape[1]
  steps=np.arange(n_steps)

  if n_steps >= 2 and not missing.any():
    #Complete data: a single polyfit call fits all rows at once
    return np.polyfit(steps, values.T, 1)[0]

  #Closed-form simple linear regression restricted to observed entries
  observed=(~missing).astype(float)
  filled=np.where(missing, 0.0, values).astype(float)
  count=observed.sum(axis=1)
  sum_x=(observed * steps).sum(axis=1)
  sum_xx=(observed * steps * steps).sum(axis=1)
  sum_y=filled.sum(axis=1)
  sum_xy=(filled * steps).sum(axis=1)
  denom=count * sum_xx - sum_x ** 2

  slopes=np.full(values.shape[0], np.nan)
  fit_ok=denom > 0  #needs at least two distinct observed steps
  slopes[fit_ok]=(count[fit_ok] * sum_xy[fit_ok] - sum_x[fit_ok] * sum_y[fit_ok]) / denom[fit_ok]
  return slopes


def create_time_series_features(df, time_cols, prefix=""):
  """Summarise each row's values across ``time_cols`` into a few features.

  Parameters
  ----------
  df : pandas.DataFrame
      Table with an 'ID' column and the columns listed in ``time_cols``.
  time_cols : sequence of column labels
      Columns treated as consecutive steps 0..n-1, in the given order.
  prefix : str, optional
      Prepended to every generated feature name.

  Returns
  -------
  pandas.DataFrame
      One row per input row with 'ID' plus ``{prefix}mean``, ``std``, ``min``,
      ``max``, ``range`` (max - min), ``trend`` (least-squares slope against
      the step index), ``peak_idx`` and ``valley_idx`` (positions of the
      largest and smallest value).

  Raises
  ------
  ValueError
      If ``time_cols`` is empty.

  Notes
  -----
  Missing values are ignored: every feature is computed from the observed
  entries of the row. A row with no observed entry gets NaN statistics and
  index 0 for peak/valley. Rows without missing values produce exactly what
  the plain NumPy reductions give.
  """
  if len(time_cols) == 0:
    raise ValueError("time_cols must contain at least one column")

  features=pd.DataFrame({'ID': df['ID']})
  values=df[time_cols].values
  missing=pd.isna(values)

  if missing.any():
    #NaN-aware reductions skip the gaps; +/-inf fillers keep a gap from ever
    #being selected as the peak or the valley
    values=values.astype(float)
    mean_fn, std_fn, min_fn, max_fn=np.nanmean, np.nanstd, np.nanmin, np.nanmax
    peak_source=np.where(missing, -np.inf, values)
    valley_source=np.where(missing, np.inf, values)
  else:
    mean_fn, std_fn, min_fn, max_fn=np.mean, np.std, np.min, np.max
    peak_source=values
    valley_source=values

  #Statistical features
  features[f'{prefix}mean']=mean_fn(values, axis=1)
  features[f'{prefix}std']=std_fn(values, axis=1)
  features[f'{prefix}min']=min_fn(values, axis=1)
  features[f'{prefix}max']=max_fn(values, axis=1)
  features[f'{prefix}range']=features[f'{prefix}max']-features[f'{prefix}min']
  features[f'{prefix}trend']=_row_slopes(values, missing)

  #Seasonal features
  features[f'{prefix}peak_idx']=np.argmax(peak_source, axis=1)
  features[f'{prefix}valley_idx']=np.argmin(valley_source, axis=1)

  return features



In [None]:
# Training features are derived from the Sentinel-2 table. Every column that is
# not listed below (identifier, date, acquisition metadata, coordinates) is
# handed to the feature builder.
main_data = train_sentinel2
non_spectral_cols = ['ID', 'cloud_pct', 'date', 'solar_azimuth', 'solar_zenith', 'translated_lat', 'translated_lon']
is_spectral = ~main_data.columns.isin(non_spectral_cols)
spectral_cols = main_data.columns[is_spectral].tolist()

# Create spectral features for training data
features_df = create_time_series_features(main_data, spectral_cols, prefix="spectral_")
print(f"Spectral features created for training data: {features_df.shape}")

# TODO: real test features are still missing. According to the earlier notes in
# this notebook, Test.csv carries only ID / location / translated_lat /
# translated_lon and no spectral bands, so the same feature builder cannot be
# applied to it. Until that is resolved the test frame is an ID-only
# placeholder, which the prediction step cannot use as model input.
test_features_df = test_data[['ID']].copy()
print(f"Placeholder created for test features: {test_features_df.shape}")

feature_type = "Spectral-based (Train only for now)"

print(f"\n{feature_type} features created: {features_df.shape} (train), {test_features_df.shape} (test - placeholder)")

In [None]:
# Labels and features are joined on 'ID'; the polygon IDs are cast to str,
# which is intended to match the key type used in features_df.
train_gdf['ID'] = train_gdf['ID'].astype(str)

# Collapse features_df to one row per ID by averaging every feature column.
# The mean is a first choice only; other aggregations may be worth trying.
aggregated_features_df = features_df.groupby('ID', as_index=False).mean()

# Inner join: keep only IDs that have both a label and aggregated features.
labels_df = train_gdf[['ID', 'Cropland']]
train_data = labels_df.merge(aggregated_features_df, on='ID', how='inner')

print(f"Merged training data shape: {train_data.shape}")
print(f"\nMerged training data columns: {list(train_data.columns)}")
print("\nFirst 5 rows of merged training data:")
display(train_data.head())

In [None]:
#Resolve the label column in the merged training table
target_col=None
for col in ['Cropland']:
  if col in train_data.columns:
    target_col=col
    break

if target_col:
  print(f"\nFound target: '{target_col}'")

  #Rows without a label cannot be used for supervised training
  train_data_cleaned = train_data.dropna(subset=[target_col])
  print(f"Training data shape after dropping NaNs: {train_data_cleaned.shape}")

  #Prepare training data: everything except the ID and the label is a feature
  X=train_data_cleaned.drop(['ID', target_col], axis=1)
  y=train_data_cleaned[target_col]

  print(f"Training shape: {X.shape}")
  print(f"Class Distribution: {y.value_counts().to_dict()}")

  #Train Random Forest with cross-validation
  rf=RandomForestClassifier(
      n_estimators=100,
      max_depth=10,
      min_samples_split=10,
      min_samples_leaf=5,
      random_state=42,
      n_jobs=-1
  )

  cv_scores=cross_val_score(
      rf, X, y,
      cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
      scoring='accuracy'
  )
  print(f"\nCross-validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std()* 2:.4f})")

  #train final model on all labelled rows
  rf.fit(X, y)

  #Feature importance, most important first
  feature_importance=pd.DataFrame({
      'feature':X.columns,
      'importance':rf.feature_importances_
  }).sort_values('importance', ascending=False)

  print(f"\nTop 5 important Features:")
  print(feature_importance.head())

  #VISUALIZATION
  fig, axes=plt.subplots(2, 2, figsize=(15, 10))
  fig.suptitle('Cropland Mapping Analysis', fontsize=16)

  #class distribution (autopct is a %-format string, so it needs the leading '%')
  y.value_counts().plot(kind='pie', ax=axes[0,0], autopct='%1.1f%%')
  axes[0, 0].set_title('CLass Distribution')

  #Feature importances; reversed so the most important bar is drawn on top
  top_importances=feature_importance.head(10).iloc[::-1]
  axes[0, 1].barh(top_importances['feature'], top_importances['importance'])
  axes[0, 1].set_title('Top Feature Importances')
  axes[0, 1].set_xlabel('Importance')

  #Distribution of the most important feature, split by class
  top_feature=feature_importance.iloc[0]['feature']
  train_data_cleaned[train_data_cleaned[target_col]==0][top_feature].hist(
      bins=30, alpha=0.7, label='Non-Cropland', ax=axes[1, 0]
  )
  train_data_cleaned[train_data_cleaned[target_col]==1][top_feature].hist(
      bins=30, alpha=0.7, label='Cropland', ax=axes[1, 0]
  )
  axes[1, 0].set_title(f'{top_feature} Distribution')
  axes[1, 0].legend()

  #CV scores, one bar per fold (positions follow the number of scores)
  axes[1, 1].bar(range(1, len(cv_scores) + 1), cv_scores)
  axes[1, 1].axhline(y=cv_scores.mean(), color='r', linestyle='--', label=f'Mean: {cv_scores.mean():.3f}')
  axes[1, 1].set_title('Cross-Validation Scores')
  axes[1, 1].set_xlabel('Fold')
  axes[1, 1].legend()

  plt.tight_layout()
  plt.show()

  #Make predictions
  #The model can only score rows that carry every training feature. Check that
  #up front so an ID-only placeholder fails with an explanatory message
  #instead of an opaque error from inside predict().
  missing_features=[col for col in X.columns if col not in test_features_df.columns]
  if missing_features:
    raise ValueError(
        f"Cannot predict: test features lack {len(missing_features)} of the "
        f"{X.shape[1]} training columns (e.g. {missing_features[:5]}). "
        "Build the same features for the test set before running this step."
    )
  #Select by name so the column order matches the order used for training
  test_X=test_features_df[list(X.columns)]
  predictions=rf.predict(test_X)

  #Create Submission
  submission=pd.DataFrame({
      'ID':test_features_df['ID'],
      'Target':predictions
  })

  #Count rows predicted as class 1 (the value plotted as 'Cropland' above)
  cropland_count=int((predictions==1).sum())
  print(f"\nPredictions completed")
  print(f"Cropland predictions: {cropland_count} ({cropland_count/len(predictions)*100:.1f}%)")

  #Save submission
  submission_path=f'{base_path}/submissions/baseline_submission_day1.csv'
  submission.to_csv(submission_path, index=False)
  print(f"Submission saved to: {submission_path}")
else:
  print(f"\nTarget column not found. Available columns: {list(train_data.columns)}")

In [None]:
# Write a second copy of the submission to a /tmp path on the runtime.
runtime_submission_path = '/tmp/baseline_submission_day1.csv'
submission.to_csv(runtime_submission_path, index=False)
print("Submission also saved to Colab runtime: " + runtime_submission_path)