<a href="https://colab.research.google.com/github/ced-sys/.py/blob/main/Untitled52.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install geopandas folium scikit-learn matplotlib seaborn shapely fiona

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive in the Colab runtime; the input paths set later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
def load_sentinel_data(s1_path, s2_path):
  """Read the Sentinel-1 and Sentinel-2 CSV files and parse their dates.

  Prints the shape and column list of each table as it is loaded.

  Args:
    s1_path: Path of the Sentinel-1 CSV file.
    s2_path: Path of the Sentinel-2 CSV file.

  Returns:
    Tuple ``(s1_data, s2_data)`` of DataFrames whose ``date`` column has been
    converted with ``pd.to_datetime``.

  Raises:
    KeyError: If either table has no ``date`` column.
  """
  print("Loading Sentinel-1 data...")
  radar=pd.read_csv(s1_path)
  print(f"Sentinel-1 shape: {radar.shape}")
  print("Sentinel-1 columns:", radar.columns.tolist())

  print("\nLoading Sentinel-2 data...")
  optical=pd.read_csv(s2_path)
  print(f"Sentinel-2 shape: {optical.shape}")
  print("Sentinel-2 columns:", optical.columns.tolist())

  # Parse the date column of both tables (Sentinel-1 first, then Sentinel-2).
  radar=radar.assign(date=pd.to_datetime(radar['date']))
  optical=optical.assign(date=pd.to_datetime(optical['date']))

  print("\nData loaded successfully")
  return radar, optical

In [None]:
def load_training_shapefiles(train_folder_path):
  """Load every ``*.shp`` file found directly inside a folder.

  For each layer the shape, columns and CRS are printed, together with the
  unique values of any column whose name contains 'class', 'land' or 'crop'.
  A layer that fails to load is reported and skipped.

  Args:
    train_folder_path: Folder to search (not searched recursively).

  Returns:
    Dict mapping the file name with '.shp' removed to the loaded
    GeoDataFrame; empty when the folder holds no shapefiles.
  """
  import glob
  import os

  print(f"Loading shapefiles from: {train_folder_path}")

  found_paths=glob.glob(os.path.join(train_folder_path, "*.shp"))
  if len(found_paths)==0:
    print("No shapefiles found, Please check the folder path.")
    return {}

  loaded_layers={}
  for path in found_paths:
    region_name=os.path.basename(path).replace('.shp', '')
    print(f"Loading {region_name}...")

    try:
      layer=gpd.read_file(path)
      loaded_layers[region_name]=layer
      print(f"   -Shape: {layer.shape}")
      print(f"   -Columns: {list(layer.columns)}")
      print(f"   -CRS: {layer.crs}")

      # Columns whose name hints at a land-cover label.
      label_like=[]
      for column in layer.columns:
        lowered=column.lower()
        if 'class' in lowered or 'land' in lowered or 'crop' in lowered:
          label_like.append(column)
      for column in label_like:
        print(f"  -Unique values in {column}: {layer[column].unique()}")
    except Exception as err:
      # Best effort: report the broken file and carry on with the others.
      print(f"Error loading {path}: {err}")

  print(f"\nLoaded {len(loaded_layers)} shapefiles")
  return loaded_layers

In [None]:
def explore_satellite_data(s1_data, s2_data):
  """Print a textual overview of the raw Sentinel-1 and Sentinel-2 tables.

  Reports date range, number of unique IDs, missing-value counts and summary
  statistics. Spectral-band, cloud-cover and solar-angle sections are printed
  only for the columns that are actually present.

  Args:
    s1_data: Sentinel-1 DataFrame; must contain 'date', 'ID', 'VH' and 'VV'.
    s2_data: Sentinel-2 DataFrame; must contain 'date' and 'ID'.

  Returns:
    None. Everything is written to stdout.
  """
  # ---- Sentinel-1 ----
  print("\n--- SENTINEL-1 DATA ---")
  s1_dates=s1_data['date']
  print(f"Date range: {s1_dates.min()} to {s1_dates.max()}")
  print(f"Unique IDs: {s1_data['ID'].nunique()}")
  orbit_summary='N/A'
  if 'orbit' in s1_data.columns:
    orbit_summary=s1_data['orbit'].nunique()
  print(f"Unique orbits: {orbit_summary}")

  print("\nMissing values:")
  print(s1_data.isnull().sum())

  print("\nVH/VV Statistics:")
  print(s1_data[['VH', 'VV']].describe())

  # ---- Sentinel-2 ----
  print("\n--- SENTINEL-2 DATA ---")
  s2_dates=s2_data['date']
  print(f"Date range: {s2_dates.min()} to {s2_dates.max()}")
  print(f"Unique IDs: {s2_data['ID'].nunique()}")

  print("\nMissing values:")
  print(s2_data.isnull().sum())

  s2_columns=s2_data.columns

  # Spectral bands that exist in this export.
  candidate_bands=('B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12')
  available_bands=[band for band in candidate_bands if band in s2_columns]
  print(f"\nAvailabel spectral bands: {available_bands}")
  if len(available_bands)>0:
    print("\nSpectral band statistics:")
    print(s2_data[available_bands].describe())

  # Cloud cover; an observation counts as clear below 20 %.
  if 'cloud_pct' in s2_columns:
    cloud=s2_data['cloud_pct']
    print("\nCloud coverage statistics:")
    print(cloud.describe())
    clear_share=(cloud<20).mean()*100
    print(f"Clear observations (<20% cloud): {clear_share:.1f}%")

  # Solar geometry.
  available_solar=[col for col in ('solar_azimuth', 'solar_zenith') if col in s2_columns]
  if len(available_solar)>0:
    print("\nSolar angle statistics:")
    print(s2_data[available_solar].describe())


In [None]:
def create_temporal_features(data, id_col='ID', date_col='date', value_cols=None):
  """Summarise each point's time series into one row of statistical features.

  Args:
    data: DataFrame with one row per observation. 'translated_lat' and
      'translated_lon' are copied to 'lat'/'lon' when both are present;
      'cloud_pct', 'solar_azimuth' and 'solar_zenith' are summarised when
      present.
    id_col: Column identifying the point.
    date_col: Datetime column used for ordering and the date-based features.
    value_cols: Columns to summarise. When None, every numeric column other
      than the identifier, date, coordinate and acquisition-metadata columns
      is used.

  Returns:
    DataFrame with one row per point that has at least two observations.
    For every value column ``c`` it holds ``c_mean``, ``c_std``, ``c_min``,
    ``c_max``, ``c_median``, ``c_range``, ``c_p25``, ``c_p75``, ``c_slope``
    (only with 2+ non-null values) and ``c_cv`` (only when the mean is not
    zero). When no point qualifies, an empty frame that still has the
    ``id_col`` column is returned so it can be merged on that key.
  """
  if value_cols is None:
    #Auto-detect value columns: any numeric dtype, not just float64/int64
    exclude_cols={id_col, date_col, 'translated_lat', 'translated_lon',
                  'orbit', 'polarization', 'rel_orbit', 'cloud_pct',
                  'solar_azimuth', 'solar_zenith'}
    numeric_cols=set(data.select_dtypes(include='number').columns)
    value_cols=[col for col in data.columns
                if col in numeric_cols and col not in exclude_cols]

  print(f"Creating temporal features for columns: {value_cols}")

  has_coords='translated_lat' in data.columns and 'translated_lon' in data.columns
  features_list=[]

  #groupby(sort=False) visits IDs in order of first appearance and slices the
  #frame once, instead of re-filtering the whole table for every ID
  for point_id, point_data in data.groupby(id_col, sort=False):
    if len(point_data)<2:
      continue

    point_data=point_data.sort_values(date_col)
    dates=point_data[date_col]

    feature_dict={id_col: point_id}

    #Add location (taken from the earliest observation)
    if has_coords:
      feature_dict['lat']=point_data['translated_lat'].iloc[0]
      feature_dict['lon']=point_data['translated_lon'].iloc[0]

    #Temporal metadata
    feature_dict['n_observations']=len(point_data)
    feature_dict['date_range_days']=(dates.max()-dates.min()).days
    feature_dict['first_obs']=dates.min().dayofyear
    feature_dict['last_obs']=dates.max().dayofyear

    #Statistical features for each value column
    for col in value_cols:
      if col not in point_data.columns:
        continue
      values=point_data[col].dropna()
      if len(values)==0:
        continue

      mean_value=values.mean()
      std_value=values.std()
      min_value=values.min()
      max_value=values.max()

      feature_dict[f'{col}_mean']=mean_value
      feature_dict[f'{col}_std']=std_value
      feature_dict[f'{col}_min']=min_value
      feature_dict[f'{col}_max']=max_value
      feature_dict[f'{col}_median']=values.median()
      feature_dict[f'{col}_range']=max_value-min_value

      #Percentiles
      feature_dict[f'{col}_p25']=values.quantile(0.25)
      feature_dict[f'{col}_p75']=values.quantile(0.75)

      #Temporal trend: slope of a linear fit against the observation index
      #(0, 1, 2, ...), i.e. change per observation step, not per day
      if len(values)>1:
        x=np.arange(len(values))
        feature_dict[f'{col}_slope']=np.polyfit(x, values, 1)[0]

      #Coefficient of variation (undefined for a zero mean)
      if mean_value!=0:
        feature_dict[f'{col}_cv']=std_value/mean_value

    #Add cloud coverage statistics for S2 data
    if 'cloud_pct' in point_data.columns:
      cloud_values=point_data['cloud_pct'].dropna()
      if len(cloud_values)>0:
        feature_dict['cloud_pct_mean']=cloud_values.mean()
        feature_dict['cloud_pct_min']=cloud_values.min()
        feature_dict['cloud_pct_max']=cloud_values.max()
        #Despite the name this is the share of CLEAR observations (<20% cloud)
        feature_dict['cloud_obs_ratio']=(cloud_values<20).mean()

    #Add solar angle statistics for S2 data
    for angle_col in ['solar_azimuth', 'solar_zenith']:
      if angle_col in point_data.columns:
        angle_values=point_data[angle_col].dropna()
        if len(angle_values)>0:
          feature_dict[f'{angle_col}_mean']=angle_values.mean()
          feature_dict[f'{angle_col}_std']=angle_values.std()

    features_list.append(feature_dict)

  if not features_list:
    #Keep the key column so callers can still merge on it
    return pd.DataFrame(columns=[id_col])

  return pd.DataFrame(features_list)


In [None]:
def calculate_vegetation_indices(s2_data):
  """Append spectral indices computed from the Sentinel-2 band columns.

  Each index is added only when every band it needs is present, so the
  function works on partial band sets and always returns a DataFrame.

  Indices (B2 blue, B3 green, B4 red, B5 red edge, B8 NIR, B11/B12 SWIR):
    NDVI, EVI, NDWI, SAVI, NDRE, MCARI, NBR, NIR_Red_ratio, SWIR_SWIR2_ratio.

  Args:
    s2_data: DataFrame with Sentinel-2 band columns named 'B2' ... 'B12'.

  Returns:
    A copy of ``s2_data`` with one extra column per computable index; the
    input frame is not modified.
  """
  print("Calculating vegetation indices...")

  eps=1e-8  #keeps normalised differences and ratios finite on a zero denominator
  present=set(s2_data.columns)

  def has(*bands):
    """True when every listed band is a column of s2_data."""
    return present.issuperset(bands)

  s2_enhanced=s2_data.copy()

  #NDVI (Normalized Difference Vegetation Index)
  if has('B4', 'B8'):
    s2_enhanced['NDVI']=(s2_data['B8']-s2_data['B4'])/ (s2_data['B8']+s2_data['B4']+eps)

  #EVI (Enhanced Vegetation Index)
  #NOTE(review): the "+1" term presumes reflectance scaled to 0-1 - confirm
  #against the band scaling of the input export
  if has('B2', 'B4', 'B8'):
    s2_enhanced['EVI']=2.5*((s2_data['B8']-s2_data['B4'])/
                            (s2_data['B8']+6*s2_data['B4']-7.5*s2_data['B2']+1))

  #NDWI (Normalized Difference Water Index)
  if has('B3', 'B8'):
    s2_enhanced['NDWI']=(s2_data['B3']-s2_data['B8'])/ (s2_data['B3']+s2_data['B8']+eps)

  #SAVI (Soil Adjusted Vegetation Index)
  if has('B4', 'B8'):
    L=0.5
    s2_enhanced['SAVI']=((s2_data['B8']-s2_data['B4'])/
                         (s2_data['B8']+s2_data['B4']+L))* (1+L)

  #NDRE (Normalized Difference Red Edge)
  if has('B5', 'B8'):
    s2_enhanced['NDRE']=(s2_data['B8']-s2_data['B5'])/ (s2_data['B8']+s2_data['B5']+eps)

  #MCARI (Modified Chlorophyll Absorption Index)
  if has('B3', 'B4', 'B5'):
    s2_enhanced['MCARI']=((s2_data['B5']-s2_data['B4'])-0.2*
                          (s2_data['B5']-s2_data['B3']))* (s2_data['B5']/ (s2_data['B4']+eps))

  #NBR (Normalized Burn Ratio) - useful for detecting bare soil
  if has('B8', 'B12'):
    s2_enhanced['NBR']=(s2_data['B8']-s2_data['B12'])/(s2_data['B8']+s2_data['B12']+eps)

  #Simple band ratios
  if has('B4', 'B8'):
    s2_enhanced['NIR_Red_ratio']=s2_data['B8']/ (s2_data['B4']+eps)

  if has('B11', 'B12'):
    s2_enhanced['SWIR_SWIR2_ratio']=s2_data['B11']/ (s2_data['B12']+eps)

  print(f"Added {len(s2_enhanced.columns)- len(s2_data.columns)} vegetation indices")
  return s2_enhanced



In [None]:
def process_all_satellite_data(s1_data, s2_data):
  """Build the per-point feature table from both sensors.

  Sentinel-1 is summarised over 'VH' and 'VV'; Sentinel-2 over whichever of
  the spectral bands, 'cloud_pct', 'solar_azimuth' and 'solar_zenith' exist.
  The two tables are outer-merged on 'ID'; columns present in both get the
  '_s1' / '_s2' suffix, and the duplicated coordinates are collapsed back
  into single 'lat' / 'lon' columns (Sentinel-1 value first).

  Args:
    s1_data: Sentinel-1 observations.
    s2_data: Sentinel-2 observations.

  Returns:
    DataFrame with one row per point ID found in either sensor.
  """
  print("Processing Sentinel-1 features...")
  radar_features=create_temporal_features(s1_data, value_cols=['VH', 'VV'])

  print('Processing Sentinel-2 features...')
  #Spectral bands followed by cloud and solar-angle information
  candidate_cols=['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12']
  candidate_cols=candidate_cols+['cloud_pct', 'solar_azimuth', 'solar_zenith']
  #Keep only what this export actually contains
  available_s2_cols=[]
  for name in candidate_cols:
    if name in s2_data.columns:
      available_s2_cols.append(name)
  print(f"Available S2 columns: {available_s2_cols}")

  optical_features=create_temporal_features(s2_data, value_cols=available_s2_cols)

  print("Merging S1 and S2 features...")
  all_features=radar_features.merge(optical_features, on='ID', how='outer',
                                    suffixes=('_s1', '_s2'))

  #Both inputs carry lat/lon, so the merge suffixed them; fold them back
  merged_cols=all_features.columns
  if 'lat_s1' in merged_cols and 'lat_s2' in merged_cols:
    for coord in ('lat', 'lon'):
      all_features[coord]=all_features[f'{coord}_s1'].fillna(all_features[f'{coord}_s2'])
    all_features=all_features.drop(columns=['lat_s1', 'lat_s2', 'lon_s1', 'lon_s2'])

  print(f"Final feature matrix shape: {all_features.shape}")
  return all_features

In [None]:
def spatial_join_training_data(s1_data, training_shapes, buffer_distance=0.001):
  """Attach training-polygon attributes to the satellite sample points.

  Every training layer is brought into EPSG:4326 (the CRS assumed for the
  point coordinates) before the layers are combined, so layers stored in a
  projected CRS still overlap the points. Points are first matched with
  ``within``; if nothing matches, the polygons are buffered once by
  ``buffer_distance`` and the join is retried.

  Args:
    s1_data: DataFrame with 'ID', 'translated_lat' and 'translated_lon'.
    training_shapes: Dict of region name -> GeoDataFrame of training polygons.
    buffer_distance: Buffer used for the retry, in degrees because the join
      runs in EPSG:4326.

  Returns:
    GeoDataFrame of matched points with the polygon attributes and a 'region'
    column (possibly empty), or None when ``training_shapes`` is empty.
    A point lying inside several polygons yields one row per polygon.
  """
  print("Performing spatial join with training data...")

  #Assuming WGS84 for the point coordinates - adjust if needed
  point_crs='EPSG:4326'

  #Combine all training data, normalised to the points' CRS
  all_training=[]
  for region, gdf in training_shapes.items():
    gdf_copy=gdf.copy()
    gdf_copy['region']=region
    if gdf_copy.crs is None:
      #No CRS recorded: treat the coordinates as already being WGS84
      gdf_copy=gdf_copy.set_crs(point_crs)
    else:
      gdf_copy=gdf_copy.to_crs(point_crs)
    all_training.append(gdf_copy)

  if not all_training:
    print("No training data available")
    return None

  combined_training=gpd.GeoDataFrame(pd.concat(all_training, ignore_index=True))

  #Create point geometries from satellite data coordinates
  unique_points=s1_data[['ID', 'translated_lat', 'translated_lon']].drop_duplicates()
  points_gdf=gpd.GeoDataFrame(
      unique_points,
      geometry=gpd.points_from_xy(unique_points['translated_lon'],
                                  unique_points['translated_lat']),
      crs=point_crs
  )

  #Perform spatial join
  print("Executing spatial join...")
  joined_data=gpd.sjoin(points_gdf, combined_training, how='inner', predicate='within')

  if len(joined_data)==0:
    print("No Spatial matches found! Trying with buffer...")
    #Buffer the training polygons
    combined_training['geometry']=combined_training.geometry.buffer(buffer_distance)
    joined_data=gpd.sjoin(points_gdf, combined_training, how='inner', predicate='within')

  print(f"Spatial join completed: {len(joined_data)} matches found")
  return joined_data

In [None]:
def prepare_training_dataset(processed_features, spatial_joined_data):
  """Join the per-point features with the labels from the spatial join.

  The label column is the first column of ``spatial_joined_data`` whose name
  contains 'class', 'land', 'crop', 'type' or 'cover'.

  Args:
    processed_features: Feature table with an 'ID' column.
    spatial_joined_data: Result of the spatial join; must contain 'region'
      and the point identifier as 'ID_left' (present when the training layer
      also had an 'ID' column) or plain 'ID'. May be None or empty.

  Returns:
    ``(training_data, class_col)``: the features inner-merged with the label
    and region columns plus the label column name, or ``(None, None)`` when
    there is nothing to join or no label column is found.
  """
  if spatial_joined_data is None or len(spatial_joined_data)==0:
    print("No spatially joined training points available")
    return None, None

  #Find the class column in spatial joined data
  potential_class_cols=[col for col in spatial_joined_data.columns
                        if any (keyword in col.lower() for keyword in
                                ['class', 'land', 'crop', 'type', 'cover'])]

  if not potential_class_cols:
    print("No class column found in training data")
    return None, None

  class_col=potential_class_cols[0]
  print(f"Using '{class_col}' as the class column")

  #The join only suffixes the point ID to 'ID_left' when both inputs had an
  #'ID' column; otherwise it keeps the plain name
  id_col='ID_left' if 'ID_left' in spatial_joined_data.columns else 'ID'
  labels=spatial_joined_data[[id_col, class_col, 'region']].rename(columns={id_col: 'ID'})

  #Merge features with labels
  training_data=pd.merge(processed_features, labels, on='ID', how='inner')

  print(f"Training dataset shape: {training_data.shape}")
  print("Class Distribution:")
  print(training_data[class_col].value_counts())

  return training_data, class_col

In [None]:
def train_cropland_model(training_data, class_col):
  """Train and evaluate a Random Forest land-cover classifier.

  The data is split 80/20 (stratified, random_state=42) BEFORE any statistics
  are computed, so the imputation means and the scaler are learned from the
  training rows only and the held-out score is not contaminated by test rows.
  Cross-validation runs imputation, scaling and the forest as one pipeline on
  the raw features for the same reason.

  Args:
    training_data: Feature table that also holds ``class_col``; the 'ID' and
      'region' columns, if present, are not used as features.
    class_col: Name of the label column.

  Returns:
    Dict with keys 'model', 'scaler', 'label_encoder', 'feature_cols',
    'X_test', 'y_test', 'y_pred', 'feature_importance', 'accuracy' and
    'cv_scores'. 'X_test' is scaled; 'y_test' / 'y_pred' are encoded labels.

  Raises:
    ValueError: Propagated from scikit-learn, e.g. when a class is too small
      for the stratified split (see the train_test_split documentation).
  """
  #Only needed for the leakage-free cross-validation below
  from sklearn.impute import SimpleImputer
  from sklearn.pipeline import make_pipeline

  print("Training the classification model...")

  #Prepare features and labels
  feature_cols=[col for col in training_data.columns
                if col not in ['ID', class_col, 'region']]
  X=training_data[feature_cols]
  y=training_data[class_col]

  #Initialize components
  scaler=StandardScaler()
  label_encoder=LabelEncoder()

  #Encode labels
  y_encoded=label_encoder.fit_transform(y)

  #Split first so that nothing below is fitted on the held-out rows
  X_train_raw, X_test_raw, y_train, y_test=train_test_split(
      X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
  )

  #Handle missing values with training means; a column with no training
  #values at all has a NaN mean, so fall back to 0 for it
  train_means=X_train_raw.mean().fillna(0)
  X_train=scaler.fit_transform(X_train_raw.fillna(train_means))
  X_test=scaler.transform(X_test_raw.fillna(train_means))

  #Train Random Forest Model
  model=RandomForestClassifier(
      n_estimators=200,
      max_depth=15,
      min_samples_split=5,
      min_samples_leaf=2,
      random_state=42,
      n_jobs=-1
  )

  model.fit(X_train, y_train)

  #Evaluate model
  y_pred=model.predict(X_test)
  accuracy=accuracy_score(y_test, y_pred)

  print(f"Model Accuracy: {accuracy:.3f}")
  print("\nClassification Report:")
  #target_names must be strings; labels= keeps names and classes aligned even
  #if a class is missing from the test split
  target_names_list=[str(name) for name in label_encoder.classes_]
  class_ids=list(range(len(target_names_list)))
  print(classification_report(y_test, y_pred, labels=class_ids,
                             target_names=target_names_list, zero_division=0))

  #Feature Importance
  feature_importance=pd.DataFrame({
      'feature':feature_cols,
      'importance':model.feature_importances_
  }).sort_values('importance', ascending=False)

  #Cross-validation: cross_val_score clones the pipeline, so the fitted
  #`model` above is left untouched
  cv_pipeline=make_pipeline(SimpleImputer(strategy='mean'), StandardScaler(), model)
  cv_scores=cross_val_score(cv_pipeline, X, y_encoded, cv=5)
  print(f"\nCross-validation scores: {cv_scores}")
  print(f"Mean Cv score: {cv_scores.mean():.3f} (+/-{cv_scores.std()*2:.3f})")

  #Return all components
  return{
      'model':model,
      'scaler': scaler,
      'label_encoder': label_encoder,
      'feature_cols':feature_cols,
      'X_test': X_test,
      'y_test': y_test,
      'y_pred': y_pred,
      'feature_importance': feature_importance,
      'accuracy': accuracy,
      'cv_scores':cv_scores
  }

In [None]:
def predict_cropland(model_dict, new_features):
  """Predict land-cover labels for new feature rows.

  Missing values are filled with the per-feature means the scaler learned at
  training time (``scaler.mean_``), so a row is imputed the same way whatever
  batch it arrives in. If the scaler exposes no usable means, the batch's own
  column means are used as before. Anything still missing is set to 0.

  Args:
    model_dict: Dict returned by ``train_cropland_model`` (uses 'model',
      'scaler', 'label_encoder' and 'feature_cols').
    new_features: DataFrame containing every column in 'feature_cols'.

  Returns:
    ``(predicted_labels, probabilities)``: decoded labels and the
    ``predict_proba`` output of the model.

  Raises:
    KeyError: If ``new_features`` lacks one of the training feature columns.
  """
  model=model_dict['model']
  scaler=model_dict['scaler']
  label_encoder=model_dict['label_encoder']
  feature_cols=model_dict['feature_cols']

  #Prepare features
  X_new=new_features[feature_cols]

  train_means=getattr(scaler, 'mean_', None)
  if train_means is not None and len(train_means)==len(feature_cols):
    fill_values=pd.Series(train_means, index=feature_cols)
  else:
    fill_values=X_new.mean()
  #Second fillna covers features whose mean is itself NaN (no data at all)
  X_new=X_new.fillna(fill_values).fillna(0)

  #Scale features
  X_new_scaled=scaler.transform(X_new)

  #Make predictions
  predictions=model.predict(X_new_scaled)
  probabilities=model.predict_proba(X_new_scaled)

  #Decode predictions
  predicted_labels=label_encoder.inverse_transform(predictions)

  return predicted_labels, probabilities


In [None]:
def plot_model_results(model_dict):
  """Draw a 2x2 diagnostic figure for a trained model.

  Panels: confusion matrix, top-15 feature importances, class distribution
  of the test set, and per-class F1 score.

  Args:
    model_dict: Dict returned by ``train_cropland_model`` (uses 'y_test',
      'y_pred', 'feature_importance' and 'label_encoder').

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  y_test=model_dict['y_test']
  y_pred=model_dict['y_pred']
  feature_importance=model_dict['feature_importance']
  label_encoder=model_dict['label_encoder']

  #Encoded labels are 0..n-1 in the order of label_encoder.classes_
  class_names=[str(name) for name in label_encoder.classes_]
  class_ids=list(range(len(class_names)))

  fig, axes=plt.subplots(2, 2, figsize=(15, 12))

  #Confusion matrix; labels= keeps it aligned with the tick labels even when
  #a class is absent from the test split
  cm=confusion_matrix(y_test, y_pred, labels=class_ids)
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
              xticklabels=class_names,
              yticklabels=class_names,
              ax=axes[0, 0])
  axes[0, 0].set_title('Confusion Matrix')
  axes[0, 0].set_ylabel('True Label')
  axes[0, 0].set_xlabel('Predicted Label')

  #Feature Importance
  top_features=feature_importance.head(15)
  axes[0, 1].barh(top_features['feature'], top_features['importance'])
  axes[0, 1].invert_yaxis()  #most important feature at the top
  axes[0, 1].set_title('Top 15 Feature Importance')
  axes[0, 1].set_xlabel('Importance')

  #Class Distribution
  class_counts=pd.Series(y_test).value_counts()
  pie_labels=[class_names[i] for i in class_counts.index]
  axes[1, 0].pie(class_counts.values, labels=pie_labels, autopct='%1.1f%%')
  axes[1, 0].set_title('Test Set Class Distribution')

  #F1 by class; with labels= the report is keyed by str(class id)
  report=classification_report(y_test, y_pred, labels=class_ids,
                               output_dict=True, zero_division=0)
  f1_scores=[report[str(class_id)]['f1-score'] for class_id in class_ids]

  axes[1, 1].bar(class_names, f1_scores)
  axes[1, 1].set_title('F1-Score by Class')
  axes[1, 1].set_ylabel('F1-Score')
  axes[1, 1].tick_params(axis='x', rotation=45)

  plt.tight_layout()
  plt.show()

In [None]:
def visualize_satellite_data(s1_data, s2_data, processed_features):
  """Draw a 2x3 overview of the Sentinel-1 derived features.

  Note: a later cell in this notebook defines another function with the same
  name (a 3x3 version that also covers Sentinel-2). Once that cell has run,
  this definition is shadowed.

  Args:
    s1_data: Sentinel-1 data (not used by the plots).
    s2_data: Sentinel-2 data (not used by the plots).
    processed_features: Per-point feature table; each panel is drawn only
      when its columns are present.

  Returns:
    None. The figure is shown with ``plt.show()``.
  """
  fig, axes=plt.subplots(2, 3, figsize=(18, 12))

  #1.Observation count distribution
  if 'n_observations_s1' in processed_features.columns:
    processed_features['n_observations_s1'].hist(bins=30, ax=axes[0, 0])
    axes[0, 0].set_title("S1 Observation Count Distribution")
    axes[0, 0].set_xlabel('Number of Observations')
    axes[0, 0].set_ylabel('Frequency')

  #2.VH vs VV Scatter
  if 'VH_mean' in processed_features.columns and 'VV_mean' in processed_features.columns:
    axes[0, 1].scatter(processed_features['VH_mean'],
                       processed_features['VV_mean'], alpha=0.5)
    axes[0, 1].set_title('VH vs VV Mean Values')
    axes[0, 1].set_xlabel('VH Mean')
    axes[0, 1].set_ylabel('VV Mean')

  #3. Geographic distribution
  if 'lat' in processed_features.columns and 'lon' in processed_features.columns:
    axes[0, 2].scatter(processed_features['lon'],
                       processed_features['lat'], alpha=0.5)
    axes[0, 2].set_title('Geographic Distribution of Points')
    axes[0, 2].set_xlabel('Longitude')
    axes[0, 2].set_ylabel('Latitude')

  #4. Seasonal Patterns
  if 'first_obs_s1' in processed_features.columns:
    processed_features['first_obs_s1'].hist(bins=30, ax=axes[1, 0])
    axes[1, 0].set_title('First Observation Day of the Year (S1)')
    axes[1, 0].set_xlabel('Day Of Year')

  #5. Date range distribution
  if 'date_range_days_s1' in processed_features.columns:
    processed_features['date_range_days_s1'].hist(bins=30, ax=axes[1, 1])
    axes[1, 1].set_title('Observation Period Length (S1)')
    axes[1, 1].set_xlabel('Days')
    axes[1, 1].set_ylabel('Frequency')

  #6. VH standard deviation
  if 'VH_std' in processed_features.columns:
    processed_features['VH_std'].hist(bins=30, ax=axes[1, 2])
    axes[1, 2].set_title('VH Standard Deviation')
    axes[1, 2].set_xlabel('VH Std')
    axes[1, 2].set_ylabel('Frequency')

  plt.tight_layout()
  plt.show()


In [None]:
def visualize_satellite_data(s1_data, s2_data, processed_features):
    """
    Create visualizations of the satellite data

    Draws a 3x3 grid of histograms and scatter plots of the processed
    features; a panel whose columns are missing is hidden instead of being
    left as an empty frame. A second figure with the clear-observation ratio
    is drawn when that feature exists under either of its two names.

    Args:
        s1_data (DataFrame): Sentinel-1 data (not used by the plots)
        s2_data (DataFrame): Sentinel-2 data (not used by the plots)
        processed_features (DataFrame): Processed features
    """
    columns = processed_features.columns

    def plot_hist(ax, col, title, xlabel):
        """Histogram of `col` on `ax`; returns False when the column is absent."""
        if col not in columns:
            return False
        processed_features[col].hist(bins=30, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        return True

    def plot_scatter(ax, x_col, y_col, title, xlabel, ylabel):
        """Scatter of `y_col` against `x_col`; returns False when either is absent."""
        if x_col not in columns or y_col not in columns:
            return False
        ax.scatter(processed_features[x_col], processed_features[y_col], alpha=0.5)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        return True

    fig, axes = plt.subplots(3, 3, figsize=(20, 15))

    # One entry per panel, in row-major order to match axes.flat below
    drawn = [
        # 1. Observation count distribution
        plot_hist(axes[0,0], 'n_observations_s1',
                  'S1 Observation Count Distribution', 'Number of Observations'),
        # 2. VH vs VV scatter
        plot_scatter(axes[0,1], 'VH_mean', 'VV_mean',
                     'VH vs VV Mean Values', 'VH Mean', 'VV Mean'),
        # 3. Geographic distribution
        plot_scatter(axes[0,2], 'lon', 'lat',
                     'Geographic Distribution of Points', 'Longitude', 'Latitude'),
        # 4. NDVI distribution
        plot_hist(axes[1,0], 'NDVI_mean',
                  'NDVI Mean Distribution', 'NDVI Mean'),
        # 5. Cloud coverage distribution
        plot_hist(axes[1,1], 'cloud_pct_mean',
                  'Cloud Coverage Distribution', 'Mean Cloud Percentage'),
        # 6. NIR vs Red scatter (B8 vs B4)
        plot_scatter(axes[1,2], 'B4_mean', 'B8_mean',
                     'NIR (B8) vs Red (B4) Mean Values', 'Red (B4) Mean', 'NIR (B8) Mean'),
        # 7. Seasonal patterns (S2)
        plot_hist(axes[2,0], 'first_obs_s2',
                  'First Observation Day of Year (S2)', 'Day of Year'),
        # 8. EVI vs NDVI comparison
        plot_scatter(axes[2,1], 'NDVI_mean', 'EVI_mean',
                     'EVI vs NDVI Mean Values', 'NDVI Mean', 'EVI Mean'),
        # 9. SWIR band correlation (B11 vs B12)
        plot_scatter(axes[2,2], 'B11_mean', 'B12_mean',
                     'SWIR1 (B11) vs SWIR2 (B12)', 'B11 Mean', 'B12 Mean'),
    ]

    for ax, was_drawn in zip(axes.flat, drawn):
        if not was_drawn:
            ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Additional plot for cloud-free observation analysis. The feature builder
    # in this notebook stores the ratio as 'cloud_obs_ratio', so accept both.
    clear_col = next((name for name in ('clear_obs_ratio', 'cloud_obs_ratio')
                      if name in columns), None)
    if clear_col is not None:
        fig_clear, (ax_ratio, ax_cloud) = plt.subplots(1, 2, figsize=(10, 6))

        processed_features[clear_col].hist(bins=30, ax=ax_ratio)
        ax_ratio.set_title('Clear Observation Ratio Distribution')
        ax_ratio.set_xlabel('Ratio of Clear Observations (<20% cloud)')
        ax_ratio.set_ylabel('Frequency')

        # Correlation between cloud coverage and vegetation indices
        if not plot_scatter(ax_cloud, 'cloud_pct_mean', 'NDVI_mean',
                            'Cloud Coverage vs NDVI', 'Mean Cloud Percentage', 'NDVI Mean'):
            ax_cloud.set_visible(False)

        plt.tight_layout()
        plt.show()

In [None]:
# Drive folder that holds every input of this notebook; change this single
# value to point the notebook at another copy of the data.
DATA_DIR='/content/drive/MyDrive/Zindi Hackathons'

s1_path=f'{DATA_DIR}/Sentinel1.csv'   # Sentinel-1 CSV
s2_path=f'{DATA_DIR}/Sentinel2.csv'   # Sentinel-2 CSV
train_folder=f'{DATA_DIR}/Train'      # folder searched for *.shp training layers

In [None]:
# Echo the configured input locations so a wrong path is spotted before loading.
print(
    "File paths set:",
    f"Sentinel-1: {s1_path}",
    f"Sentinel-2: {s2_path}",
    f"Training folder: {train_folder}",
    sep="\n",
)


In [None]:
# Read both satellite tables, then every training shapefile in the folder.
s1_data, s2_data=load_sentinel_data(s1_path=s1_path, s2_path=s2_path)

training_shapes=load_training_shapefiles(train_folder_path=train_folder)

In [None]:
# Profile the raw tables, then preview the first rows of each input.
explore_satellite_data(s1_data=s1_data, s2_data=s2_data)

print("\nSentinel-1 sample:", s1_data.head(), sep="\n")

print("\nSentinel-2 sample:", s2_data.head(), sep="\n")

# Preview the training polygons of every region.
for region, gdf in training_shapes.items():
  print(f"\n{region} training data sample:", gdf.head(), sep="\n")

In [None]:
# Build the per-point feature table and show what came out of it.
processed_features=process_all_satellite_data(s1_data=s1_data, s2_data=s2_data)

print(
    f"Processed features shape: {processed_features.shape}",
    f"Feature Columns: {processed_features.columns.tolist()}",
    "\nFirst few row:",
    processed_features.head(),
    sep="\n",
)

In [None]:
from shapely.geometry import Point
import geopandas as gpd

# Stand-alone check of the point layer: one row per distinct (ID, lat, lon),
# built the same way spatial_join_training_data builds its points.
unique_points = s1_data.loc[:, ['ID', 'translated_lat', 'translated_lon']].drop_duplicates()
geometry = [
    Point(row.translated_lon, row.translated_lat)
    for row in unique_points.itertuples(index=False)
]
points_gdf = gpd.GeoDataFrame(unique_points, geometry=geometry, crs='EPSG:4326') # Explicitly set CRS
print("\nCRS of points_gdf:", points_gdf.crs, sep="\n")

In [None]:
# Perform spatial join with an even larger buffer distance
spatial_joined = spatial_join_training_data(s1_data, training_shapes, buffer_distance=0.01) # Increased buffer further

if spatial_joined is not None:
  print(f"Spatial join successful: {len(spatial_joined)} points matched")
  print(f"Columns: {spatial_joined.columns.tolist()}")
else:
  print("Spatial join failed - check if there is actual spatial overlap or try a larger buffer.")

In [None]:
# Only build the training set when the spatial join actually produced matches.
if spatial_joined is not None and len(spatial_joined)>0:
  training_data, class_col=prepare_training_dataset(processed_features, spatial_joined)
else:
  training_data, class_col=None, None

if training_data is not None and not training_data.empty:
  print("Training dataset ready!")
  print(f"Shape: {training_data.shape}")
  print(f"Class column: {class_col}")

  #Show class distribution by region
  print("\nClass Distribution by region:")
  print(training_data.groupby(['region', class_col]).size().unstack(fill_value=0))
else:
  # Normalise "no usable rows" to None so the training cell's guard holds.
  training_data=None
  print("Training dataset preparation failed")

In [None]:
if training_data is not None:
  model_dict=train_cropland_model(training_data, class_col)
  print("Model training completed")
else:
  # Always define model_dict so later cells can test it instead of failing
  # with a NameError.
  model_dict=None
  print("Cannot train model - no training data available!")

# Task
Save the trained model and generate a submission file named "SampleSubmission.csv" using the test data.

## Load test data

### Subtask:
Load the test data for prediction. Since a separate test file is not provided, we'll use the test set created during the training process.


**Reasoning**:
Extract the IDs corresponding to the test set samples from the original training data using the indices from the y_test numpy array.



**Reasoning**:
The previous code failed because `model_dict` was not defined due to the preceding cell failing. Re-run the training cell first to define `model_dict`, then extract the test set IDs.

