In [3]:

# 1. Load and inspect the two datasets

import pandas as pd
import os
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder



# Load both CSVs
s1 = pd.read_csv("subnational_1_tree_cover_loss.csv")
s2 = pd.read_csv("subnational_2_tree_cover_loss.csv")

print(" Data Loaded Successfully!")
print(f"Subnational 1 Shape: {s1.shape}")
print(f"Subnational 2 Shape: {s2.shape}")

# Quick look at columns
print("\n--- Subnational 1 Columns ---")
print(s1.columns.tolist())
print("\n--- Subnational 2 Columns ---")
print(s2.columns.tolist())

# Combine both sheets into one DataFrame
combined = pd.concat([s1, s2], ignore_index=True)
print(f"\nCombined dataset shape: {combined.shape}")


 Data Loaded Successfully!
Subnational 1 Shape: (288, 30)
Subnational 2 Shape: (5328, 31)

--- Subnational 1 Columns ---
['country', 'subnational1', 'threshold', 'area_ha', 'extent_2000_ha', 'extent_2010_ha', 'gain_2000-2020_ha', 'tc_loss_ha_2001', 'tc_loss_ha_2002', 'tc_loss_ha_2003', 'tc_loss_ha_2004', 'tc_loss_ha_2005', 'tc_loss_ha_2006', 'tc_loss_ha_2007', 'tc_loss_ha_2008', 'tc_loss_ha_2009', 'tc_loss_ha_2010', 'tc_loss_ha_2011', 'tc_loss_ha_2012', 'tc_loss_ha_2013', 'tc_loss_ha_2014', 'tc_loss_ha_2015', 'tc_loss_ha_2016', 'tc_loss_ha_2017', 'tc_loss_ha_2018', 'tc_loss_ha_2019', 'tc_loss_ha_2020', 'tc_loss_ha_2021', 'tc_loss_ha_2022', 'tc_loss_ha_2023']

--- Subnational 2 Columns ---
['country', 'subnational1', 'subnational2', 'threshold', 'area_ha', 'extent_2000_ha', 'extent_2010_ha', 'gain_2000-2020_ha', 'tc_loss_ha_2001', 'tc_loss_ha_2002', 'tc_loss_ha_2003', 'tc_loss_ha_2004', 'tc_loss_ha_2005', 'tc_loss_ha_2006', 'tc_loss_ha_2007', 'tc_loss_ha_2008', 'tc_loss_ha_2009', 'tc_lo

In [2]:

# 2. Clean and standardize column names

combined.columns = (
    combined.columns.str.strip()
    .str.lower()
    .str.replace(" ", "_")
    .str.replace("__+", "_", regex=True)
)

# Drop duplicates and null-heavy columns
combined = combined.drop_duplicates()
combined = combined.dropna(axis=1, thresh=len(combined) * 0.5)

print(" Basic cleaning completed.")


 Basic cleaning completed.


In [5]:

# 3. Handle missing values

from sklearn.impute import KNNImputer
import numpy as np

numeric_cols = combined.select_dtypes(include=np.number).columns
imputer = KNNImputer(n_neighbors=3)

combined[numeric_cols] = imputer.fit_transform(combined[numeric_cols])
print(" Missing numeric values imputed using KNN.")


 Missing numeric values imputed using KNN.


In [6]:

# 4. Label Encode categorical variables

from sklearn.preprocessing import LabelEncoder

cat_cols = combined.select_dtypes(exclude=np.number).columns

for col in cat_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col].astype(str))

print(" Label encoding completed for categorical columns.")


 Label encoding completed for categorical columns.


In [8]:
print(combined.columns.tolist())


['country', 'subnational1', 'threshold', 'area_ha', 'extent_2000_ha', 'extent_2010_ha', 'gain_2000-2020_ha', 'tc_loss_ha_2001', 'tc_loss_ha_2002', 'tc_loss_ha_2003', 'tc_loss_ha_2004', 'tc_loss_ha_2005', 'tc_loss_ha_2006', 'tc_loss_ha_2007', 'tc_loss_ha_2008', 'tc_loss_ha_2009', 'tc_loss_ha_2010', 'tc_loss_ha_2011', 'tc_loss_ha_2012', 'tc_loss_ha_2013', 'tc_loss_ha_2014', 'tc_loss_ha_2015', 'tc_loss_ha_2016', 'tc_loss_ha_2017', 'tc_loss_ha_2018', 'tc_loss_ha_2019', 'tc_loss_ha_2020', 'tc_loss_ha_2021', 'tc_loss_ha_2022', 'tc_loss_ha_2023', 'subnational2']


In [9]:
# --- Domain-specific feature engineering ---

# Ensure key columns exist (adjusted for your dataset)
required = ["extent_2000_ha", "gain_2000-2020_ha"]
for col in required:
    if col not in combined.columns:
        print(f" Missing column: {col}")

# Calculate total tree cover loss from yearly columns
loss_cols = [c for c in combined.columns if "tc_loss_ha_" in c]
combined["tree_cover_loss_total"] = combined[loss_cols].sum(axis=1)

# Compute ratios and density metrics
combined["loss_gain_ratio"] = (combined["tree_cover_loss_total"] + 1) / (combined["gain_2000-2020_ha"] + 1)
combined["loss_extent_ratio"] = (combined["tree_cover_loss_total"] + 1) / (combined["extent_2000_ha"] + 1)
combined["gain_extent_ratio"] = (combined["gain_2000-2020_ha"] + 1) / (combined["extent_2000_ha"] + 1)

print(" Derived ratios and density metrics calculated.")


 Derived ratios and density metrics calculated.


In [10]:
# 6. Compute Green Deficit Index (GDI)
required = ["extent_2000_ha", "gain_2000-2020_ha", "tree_cover_loss_total"]

if all(col in combined.columns for col in required):
    combined['GDI'] = (
        (combined['tree_cover_loss_total'] - combined['gain_2000-2020_ha'])
        / (combined['extent_2000_ha'] + 1)
    )

    # Categorize based on defined thresholds
    def categorize_gdi(value):
        if value <= -5:
            return "Excellent (Net Gain)"
        elif -5 < value <= 0:
            return "Acceptable"
        elif 0 < value <= 10:
            return "Concerning"
        else:
            return "High-Risk"

    combined['GDI_Category'] = combined['GDI'].apply(categorize_gdi)
    print(" GDI calculated and categorized successfully.")
else:
    print(" Required columns missing for GDI computation.")



 GDI calculated and categorized successfully.


In [11]:
# 7. Generate correlation-based insights

# Compute numeric correlations only
corr_matrix = combined.select_dtypes(include=['float64', 'int64']).corr()

if "GDI" in corr_matrix.columns:
    strong_corr = corr_matrix["GDI"].sort_values(ascending=False).head(10)
    print("\n Top 10 features correlated with GDI:\n")
    print(strong_corr)
else:
    print(" 'GDI' not found in correlation matrix.")



 Top 10 features correlated with GDI:

GDI                      1.000000
loss_gain_ratio          0.043219
tc_loss_ha_2007          0.037334
tc_loss_ha_2006          0.037154
tc_loss_ha_2005          0.036179
tc_loss_ha_2008          0.035928
tc_loss_ha_2023          0.035795
tc_loss_ha_2011          0.035616
tc_loss_ha_2003          0.034932
tree_cover_loss_total    0.034897
Name: GDI, dtype: float64


In [12]:
# 9. Summary Insights
# ===========================================
if "GDI" in combined.columns:
    print("\n GREEN DEFICIT INDEX SUMMARY")
    print("-" * 45)
    print(f" Average GDI: {combined['GDI'].mean():.2f}")
    print(f" Minimum GDI: {combined['GDI'].min():.2f}")
    print(f" Maximum GDI: {combined['GDI'].max():.2f}")

    print("\n Category Distribution:")
    print(combined['GDI_Category'].value_counts())

    print("\n Interpretation:")
    print(" - Negative GDI → Net Green Gain (Good ecological balance).")
    print(" - Positive GDI → Net Green Deficit (Loss per hectare, needs reforestation).")
    print(" - Higher GDI → Higher ecological risk requiring urgent mitigation.")
else:
    print(" Could not compute summary — GDI missing.")



 GREEN DEFICIT INDEX SUMMARY
---------------------------------------------
 Average GDI: -40.57
 Minimum GDI: -6904.00
 Maximum GDI: 0.33

 Category Distribution:
Acceptable              3485
Excellent (Net Gain)    1104
Concerning              1027
Name: GDI_Category, dtype: int64

 Interpretation:
 - Negative GDI → Net Green Gain (Good ecological balance).
 - Positive GDI → Net Green Deficit (Loss per hectare, needs reforestation).
 - Higher GDI → Higher ecological risk requiring urgent mitigation.


In [14]:

# 8. Export clean + feature engineered dataset

combined.to_csv("feature_engineered_greenpulse.csv", index=False)
print(" Saved as feature_engineered_greenpulse.csv")


 Saved as feature_engineered_greenpulse.csv
