<a href="https://colab.research.google.com/github/ced-sys/.py/blob/main/Crop_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Global plotting appearance for the notebook: select the 'seaborn-v0_8'
# matplotlib style sheet, then set seaborn's active palette to "husl".
# NOTE(review): both calls likely touch the colour cycle, so keep this order — confirm.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Mount Google Drive at /content/drive so the paths used below resolve.
# force_remount=True is passed so an existing mount is re-established.
# NOTE(review): `google.colab` is presumably only importable inside Colab — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Root of the project workspace on the mounted Drive, and the sub-folders
# that are created beneath it.
base_path = '/content/drive/MyDrive/cropland-mapping'
folders = [
    'data',
    'notebooks',
    'submissions',
    'src',
]

In [None]:
# Create every project sub-folder under base_path. exist_ok=True makes the
# call a no-op for folders that already exist, so the cell can be re-run.
print("Setting up project structure...")
for folder_path in (os.path.join(base_path, name) for name in folders):
    os.makedirs(folder_path, exist_ok=True)
    print(f"Created: {folder_path}")

In [None]:
# Location of the competition files on Drive; later cells build paths from it.
zindi_path = '/content/drive/MyDrive/Zindi Hackathons'

# Report whether the folder exists and, if it does, list its direct children.
if not os.path.exists(zindi_path):
    print("Zindi Hackathons folder not found. Please check the path")
else:
    print(f"Found zindi hackathons folder at : {zindi_path}")
    print("Contents:")
    for entry in os.listdir(zindi_path):
        print(f" -{entry}")

In [None]:
# Column names that are checked, in this order, when looking for the target.
target_columns = ['target', 'Target', 'label', 'Label', 'class', 'Class']


def read_table(path):
    """Load a tabular file into a DataFrame based on its extension.

    Args:
        path: Path to a ``.csv`` or ``.parquet`` file (extension matched
            case-insensitively).

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when the extension is
        neither ``.csv`` nor ``.parquet`` (the file is not opened in that case).
    """
    lowered = path.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(path)
    if lowered.endswith('.parquet'):
        return pd.read_parquet(path)
    return None


def find_dataset_paths(root_dir):
    """Walk *root_dir* and locate candidate training and test files.

    A candidate is a ``.csv``/``.parquet`` file whose lower-cased name
    contains ``'train'`` (training) or otherwise ``'test'`` (test). A name
    containing both is treated as training data. Every file in every
    directory is examined; when several files match, the last one visited
    by ``os.walk`` wins.

    Args:
        root_dir: Directory to search recursively.

    Returns:
        A ``(train_path, test_path)`` tuple; either element is ``None`` when
        no matching file was found.
    """
    found_train = None
    found_test = None
    for root, _dirs, files in os.walk(root_dir):
        for file in files:
            name = file.lower()
            if not name.endswith(('.csv', '.parquet')):
                continue
            full_path = os.path.join(root, file)
            if 'train' in name:
                found_train = full_path
                print(f"Found potential training data: {full_path}")
            elif 'test' in name:
                found_test = full_path
                print(f"Found potential test data: {full_path}")
    return found_train, found_test


def find_target_column(frame, candidates=None):
    """Return the first name in *candidates* that is a column of *frame*.

    Args:
        frame: DataFrame whose columns are inspected.
        candidates: Column names to try in order; defaults to the
            module-level ``target_columns`` list.

    Returns:
        The matching column name, or ``None`` if none of the candidates exist.
    """
    if candidates is None:
        candidates = target_columns
    for col in candidates:
        if col in frame.columns:
            return col
    return None


train_data = None
test_data = None

# Only discovery and file reading are wrapped in the best-effort handler, so
# mistakes in the summary code below are no longer misreported as load errors.
try:
    train_path, test_path = find_dataset_paths(zindi_path)
    if train_path is not None:
        train_data = read_table(train_path)
    if test_path is not None:
        test_data = read_table(test_path)
except Exception as e:
    print(f"Error loading data: {e}")
    print("Check the file paths")

if train_data is not None:
    print(f"Training data loaded: {train_data.shape}")
    print(f"Columns:{list(train_data.columns)}")
    print("First few rows:")
    print(train_data.head())

    # Check for target column
    target_col = find_target_column(train_data)
    if target_col is not None:
        print(f"Target Column found: {target_col}")
        print("Target distribution:")
        print(train_data[target_col].value_counts())
    else:
        print("Target column could not be immediately identified")

if test_data is not None:
    print(f"Test data loaded: {test_data.shape}")
    print(f"Columns: {list(test_data.columns)}")



# Task
Extract the contents of the zipped file "Train.zip", which contains a folder of satellite training data, and prepare it for integration with the existing test data.

## Explore extracted data

### Subtask:
Examine the contents of the extracted folder to understand the file types and structure of the satellite data.


**Reasoning**:
List the contents of the directory where "Train.zip" was extracted to understand the file types and structure of the satellite data. This assumes "Train.zip" was extracted to a 'Train' folder within the `zindi_path`.



In [None]:
# Folder where 'Train.zip' is expected to have been extracted.
train_extracted_path = os.path.join(zindi_path, 'Train')

if not os.path.exists(train_extracted_path):
    print(f"Directory not found: {train_extracted_path}. Please ensure 'Train.zip' was extracted.")
else:
    print(f"Contents of {train_extracted_path}:")
    for item in os.listdir(train_extracted_path):
        item_path = os.path.join(train_extracted_path, item)
        is_directory = os.path.isdir(item_path)
        kind_label = 'Directory' if is_directory else 'File'
        print(f"- {item} ({kind_label})")

        # Plain files need no further inspection.
        if not is_directory:
            continue

        # For every sub-directory, preview at most its first five entries.
        print(f"  Contents of {item}:")
        sub_items = os.listdir(item_path)
        for sub_item in sub_items[:5]:
            print(f"  - {sub_item}")
        if len(sub_items) > 5:
            print("  ...")

## Identify data loading method

### Subtask:
Based on the file types, determine the appropriate method to load the satellite data (e.g., specific libraries for geospatial data).


## Load and preprocess data

### Subtask:
Load the satellite data and perform any necessary preprocessing steps (e.g., handling different bands, time series data).


**Reasoning**:
Import the geopandas library, construct the full file path to the shapefiles, and load them into GeoDataFrames.



In [None]:
import geopandas as gpd

# Construct paths to shapefiles
fergana_shp_path = os.path.join(zindi_path, 'Train', 'Fergana_training_samples.shp')
orenburg_shp_path = os.path.join(zindi_path, 'Train', 'Orenburg_training_samples.shp')


def load_training_shapefile(shp_path, region_name, target_column='Cropland'):
    """Read one regional training shapefile and print a short summary.

    Only the file read is guarded by the exception handler. The summary runs
    afterwards, so a missing column can no longer discard a GeoDataFrame that
    was loaded successfully.

    Args:
        shp_path: Path to the ``.shp`` file.
        region_name: Label used in the printed messages.
        target_column: Column whose value counts are shown when present.

    Returns:
        The loaded GeoDataFrame, or ``None`` if reading the file failed.
    """
    try:
        gdf = gpd.read_file(shp_path)
    except Exception as e:
        print(f"Error loading {region_name} shapefile: {e}")
        return None

    print(f"{region_name} training data loaded:")
    display(gdf.head())
    print(f"{region_name} columns:", gdf.columns.tolist())
    if target_column in gdf.columns:
        print(f"\n{region_name} target distribution:")
        display(gdf[target_column].value_counts())
    else:
        # Report instead of raising: the data itself is still usable.
        print(f"\nColumn '{target_column}' not found in {region_name} data; "
              "skipping target distribution.")
    return gdf


# Load shapefiles into GeoDataFrames
fergana_train_gdf = load_training_shapefile(fergana_shp_path, 'Fergana')
print()
orenburg_train_gdf = load_training_shapefile(orenburg_shp_path, 'Orenburg')


**Reasoning**:
The shapefiles were loaded successfully, but the target column is named 'Cropland' instead of 'crop_type'. I need to inspect the dataframes to confirm the target column and its values before proceeding.



**Reasoning**:
An error occurred because `fergana_train_gdf` is None. This is likely due to the error handling in the previous step that set the variable to None if loading failed. However, based on the previous output, the shapefiles *did* load successfully, just with a different column name than expected. I need to re-load the shapefiles without the error handling that sets the variable to None, and then proceed with inspecting the correct target column.



In [None]:
# Load shapefiles into GeoDataFrames again, without the error handling that sets to None
fergana_train_gdf = gpd.read_file(os.path.join(zindi_path, 'Train', 'Fergana_training_samples.shp'))
orenburg_train_gdf = gpd.read_file(os.path.join(zindi_path, 'Train', 'Orenburg_training_samples.shp'))

print("Fergana training data info:")
fergana_train_gdf.info()
print("\nOrenburg training data info:")
orenburg_train_gdf.info()

print("\nFergana 'Cropland' value counts:")
display(fergana_train_gdf['Cropland'].value_counts())
print("\nOrenburg 'Cropland' value counts:")
display(orenburg_train_gdf['Cropland'].value_counts())

## Integrate with test data

### Subtask:
Integrate the loaded training data with the existing test data.


**Reasoning**:
Concatenate the two training GeoDataFrames and prepare them for integration with the test data by ensuring a location column exists. Then, inspect both the combined training data and the test data to determine a common integration strategy, which appears to be based on location.



In [None]:
# Step 1: tag each regional GeoDataFrame with its region name in a new
# 'location' column, then stack the two into one frame with a fresh index.
regional_frames = {'Fergana': fergana_train_gdf, 'Orenburg': orenburg_train_gdf}
for region_name, frame in regional_frames.items():
    frame['location'] = region_name

combined_train_gdf = pd.concat(list(regional_frames.values()), ignore_index=True)

print("Combined training data loaded:")
display(combined_train_gdf.head())
combined_columns = combined_train_gdf.columns.tolist()
print("\nCombined training data columns:", combined_columns)
print("\nCombined training data info:")
combined_train_gdf.info()

# Steps 2 and 3: show the test data next to the combined training data so the
# two structures can be compared.
# NOTE(review): this assumes test_data was loaded by an earlier cell and
# carries its own 'location' column — confirm.
print("\nTest data structure:")
display(test_data.head())
test_columns = test_data.columns.tolist()
print("\nTest data columns:", test_columns)
print("\nTest data info:")
test_data.info()

# 'location' is the candidate shared key between the two datasets.

**Reasoning**:
Based on the previous output, the combined training data and the test data both have a 'location' column. However, the 'ID' columns are different (integer in training, string in test). A direct merge by 'ID' or 'location' alone is not sufficient for precise integration as there is no common unique identifier across both datasets. The training data has point geometries and the test data has translated lat/lon. The most appropriate integration strategy is likely spatial, by joining the test points to the training geometries based on location. This requires converting the test data into a GeoDataFrame first.



In [None]:
from shapely.geometry import Point
import geopandas as gpd

# Step 5: turn test_data into a GeoDataFrame with one Point per row, built
# from the 'translated_lon' (x) and 'translated_lat' (y) columns.
lons = test_data['translated_lon']
lats = test_data['translated_lat']
geometry = [Point(lon, lat) for lon, lat in zip(lons, lats)]

# The training CRS is assigned to the test points as-is (no reprojection).
# NOTE(review): this assumes the translated coordinates are expressed in the
# same CRS as the training geometries — confirm.
test_gdf = gpd.GeoDataFrame(test_data, geometry=geometry, crs=combined_train_gdf.crs)

print("Test data converted to GeoDataFrame:")
display(test_gdf.head())
test_gdf_columns = test_gdf.columns.tolist()
print("\nTest GeoDataFrame columns:", test_gdf_columns)
print("\nTest GeoDataFrame info:")
test_gdf.info()

# Left spatial join: every test point is kept and receives the attributes of
# any training geometry it intersects; points with no match get nulls in the
# training columns.
# NOTE(review): the join is purely geometric — no filtering by 'location' is
# applied before it.
integrated_data = gpd.sjoin(
    test_gdf,
    combined_train_gdf,
    how="left",
    predicate="intersects",
)

print("\nIntegrated data after spatial join:")
display(integrated_data.head())
integrated_columns = integrated_data.columns.tolist()
print("\nIntegrated data columns:", integrated_columns)
print("\nIntegrated data info:")
integrated_data.info()


## Integrate with test data retry 1

### Subtask:
Integrate the loaded training data with the existing test data. This is a retry attempt after the initial spatial join was unsuccessful in linking test points to training geometries.


**Reasoning**:
Since the spatial join based on intersection failed, I will focus on making the training data information available alongside the test data using the common 'location' column, as suggested in the instructions. This involves ensuring both the combined training data and the test data are loaded and accessible, potentially grouped by location for future analysis or modeling.



In [None]:
# Compare how rows are distributed over 'location' in each dataset, to check
# that the column can act as a shared key.
train_location_counts = combined_train_gdf['location'].value_counts()
print("Value counts for 'location' in combined training data:")
display(train_location_counts)

test_location_counts = test_data['location'].value_counts()
print("\nValue counts for 'location' in test data:")
display(test_location_counts)

# Nothing is reloaded here: combined_train_gdf and test_data come from earlier
# cells. A preview of each is shown for reference.
print("\nCombined training data head:")
train_preview = combined_train_gdf.head()
display(train_preview)

print("\nTest data head:")
test_preview = test_data.head()
display(test_preview)


## Summary:

### Data Analysis Key Findings

*   The extracted training data is in Esri shapefile format, consisting of `.shp`, `.shx`, and `.dbf` files, structured by location (`Fergana` and `Orenburg`).
*   The `geopandas` library was successfully used to load the shapefiles into GeoDataFrames.
*   The training data includes an `ID`, a `Cropland` column (the target variable with values 0 and 1), and geometry information.
*   The two training GeoDataFrames were concatenated and a `location` column was added, successfully combining the Fergana and Orenburg training data.
*   The test data was converted to a GeoDataFrame (`test_gdf`) using the provided latitude and longitude coordinates, and the Coordinate Reference System (CRS) was matched to the training data.
*   An initial spatial join attempt between the test points and training geometries using the 'intersects' predicate failed to link test points to training labels effectively, resulting in almost entirely null values for training data columns in the integrated dataset.
*   Both the combined training data and the test data contain a `location` column with the same unique values (`Fergana` and `Orenburg`), which can serve as a key for location-based analysis or grouping.

### Insights or Next Steps

*   Since a direct spatial join for transferring training labels to test points was unsuccessful, alternative methods for using the training data are needed, potentially involving training a model on the training data and then using it to predict labels for the test points based on their attributes or surrounding features.
*   Leverage the common `location` column to perform location-specific analysis or modeling, or to ensure that any integration or analysis respects the geographical origin of the data points.


## Integrate with test data

### Subtask:
Integrate the loaded training data with the existing test data.

**Reasoning**:
Concatenate the two training GeoDataFrames and prepare them for integration with the test data by ensuring a location column exists. Then, inspect both the combined training data and the test data to determine a common integration strategy, which appears to be based on location.

In [None]:
# Step 1: label each regional GeoDataFrame with its region in a 'location'
# column, then concatenate them into a single frame with a fresh index.
for region_label, region_gdf in (
    ('Fergana', fergana_train_gdf),
    ('Orenburg', orenburg_train_gdf),
):
    region_gdf['location'] = region_label

training_parts = [fergana_train_gdf, orenburg_train_gdf]
combined_train_gdf = pd.concat(training_parts, ignore_index=True)

print("Combined training data loaded:")
display(combined_train_gdf.head())
print("\nCombined training data columns:", list(combined_train_gdf.columns))
print("\nCombined training data info:")
combined_train_gdf.info()

# Steps 2 and 3: print the test data's structure for side-by-side comparison.
# NOTE(review): this relies on test_data having been loaded earlier and
# containing a 'location' column — confirm.
print("\nTest data structure:")
display(test_data.head())
print("\nTest data columns:", list(test_data.columns))
print("\nTest data info:")
test_data.info()

# 'location' is the candidate shared key between the two datasets.