In [None]:
# Install required packages (if needed)
!pip install -q openpyxl scikit-learn matplotlib seaborn
!pip install scikeras


# Import core libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Import deep learning tools
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, BatchNormalization
from scikeras.wrappers import KerasRegressor

In [None]:
# STEP 2: Upload your dataset (Excel file) in Colab
# This cell only works inside Google Colab: `google.colab` is not available in a
# plain Jupyter kernel. `files.upload()` opens the browser file picker.
from google.colab import files
# `uploaded` is consumed by the next cell as a mapping from uploaded filename to
# the raw file content (it is indexed by filename and wrapped in a BytesIO there).
uploaded = files.upload()



In [None]:

# Step 2.1: Read the uploaded Excel file into a DataFrame.
# `io` was never imported in this notebook, so it is imported here; without it the
# BytesIO call below raises NameError on a fresh kernel.
import io

# An empty mapping means the upload was cancelled; fail with a clear message
# instead of an opaque IndexError.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run the upload cell and select the Excel file."
    )

# `uploaded` is keyed by filename; use the first (normally the only) uploaded file.
filename = next(iter(uploaded))

# Wrap the raw bytes in a file-like object so pandas can parse them in memory.
# Change `sheet_name` if the workbook uses a different sheet.
data = pd.read_excel(io.BytesIO(uploaded[filename]),
                     sheet_name='Agrofood_co2_emission_imputedKN')


In [None]:
# STEP 2: Preprocessing – drop identifier columns, then weakly correlated features
# Region / Area / Year are removed before any correlation is computed.
data_cleaned = data.drop(columns=["Region", "Area", "Year"])

# Absolute correlation of every remaining column with the target.
# NOTE(review): this is computed on the full dataset, i.e. before the train/test
# split performed in a later cell, so test rows also influence feature selection.
cor_matrix = data_cleaned.corr()
target_abs_corr = cor_matrix["total_emission"].abs()

# Columns with |correlation| below 0.2 are dropped; the target itself is always kept.
low_corr = [
    column
    for column in target_abs_corr.index[target_abs_corr < 0.2].tolist()
    if column != "total_emission"
]

print(f"Features dropped due to low correlation: {low_corr}")
data_cleaned = data_cleaned.drop(columns=low_corr)


In [None]:
#  STEP 3: Define Features and Target + Train/Test Split
# y is the target column; X is every other column of the cleaned frame.
y = data_cleaned["total_emission"]
X = data_cleaned.loc[:, data_cleaned.columns != "total_emission"]

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
#  STEP 4: Feature Scaling
# Fit the scaler on the training features only, then apply that same
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
#  STEP 5: Build and Train MLP Model
# Seed Python, NumPy and TensorFlow before building the model so that weight
# initialisation, dropout masks and batch shuffling repeat across runs
# (some GPU kernels may still introduce small nondeterminism).
from tensorflow.keras.utils import set_random_seed
set_random_seed(42)

# Number of input features after the correlation filter and scaling.
n_features = X_train_scaled.shape[1]

# Define the MLP model. An explicit Input layer replaces the deprecated
# `input_shape=` argument on the first Dense layer.
model = Sequential([
    Input(shape=(n_features,)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1)  # Linear output for regression
])

# Compile model: Adam optimizer, mean squared error as the training loss.
model.compile(optimizer='adam', loss='mse')

# Stop when validation loss has not improved for 10 epochs and roll back to the
# weights of the best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model; 20% of the training data is held out for validation.
history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


In [None]:
# STEP 6: Plot Training & Validation Loss
# One curve per entry of the Keras training history, drawn on a single axes.
fig, ax = plt.subplots(figsize=(8, 5))
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_xlabel('Epochs')
ax.set_ylabel('MSE Loss')
ax.set_title('MLP Training & Validation Loss')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# STEP 7: Evaluate Model Performance on Test Set
# The model output is flattened to 1-D so it lines up with y_test.
raw_predictions = model.predict(X_test_scaled)
y_pred = raw_predictions.flatten()

# Error metrics on the held-out test set; RMSE is derived from MSE.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)

print("\n🔍 Performance Metrics (MLP - Test Set):")
print(f"MSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R²: {r2:.4f}")


In [None]:
# STEP 8: Visualize Actual vs Predicted
# Pair each test target with its prediction (the frame keeps y_test's index).
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

# Overall value range across both columns; used as endpoints of the y = x reference line.
lowest = results.min().min()
highest = results.max().max()

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x="Actual", y="Predicted", data=results, alpha=0.6, ax=ax)
# Points on the dashed red diagonal are perfect predictions.
ax.plot([lowest, highest], [lowest, highest], linestyle="--", color="red")
ax.set_title("Actual vs Predicted CO₂ Emissions (MLP)")
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.grid(True)
plt.show()


In [None]:
# Step 9  Visualize Feature Importance with Permutation Importance
try:
    import eli5
except ImportError:
    !pip install eli5
    import eli5

from eli5.sklearn import PermutationImportance


perm = PermutationImportance(estimator, random_state=42).fit(X_test_scaled, y_test)
eli5.show_weights(perm, feature_names=X.columns.tolist())


In [None]:
# STEP 10: Visualize Residuals
# Residual = actual minus predicted, so positive values are under-predictions.
residuals = y_test - y_pred

fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(residuals, kde=True, color='orange', ax=ax)
# The dashed black line marks zero error.
ax.axvline(0, linestyle='--', color='black')
ax.set_title("Residuals Distribution")
ax.set_xlabel("Residuals (Actual - Predicted)")
ax.grid(True)
plt.show()
