In [None]:
pip install xgboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import json
import asyncio # Required for async operations if running outside a context that manages it
from google.colab import userdata # Import userdata to access secrets

# --- 1. Define Emission Factors ---
# Every value is an illustrative factor in kgCO2e (kilograms of CO2 equivalent)
# per one unit of the activity named by its key. The keys are looked up by name
# when the footprint components are calculated, so they must not be renamed.
EMISSION_FACTORS = {
    # Home energy
    'electricity_kwh': 0.8,       # per kWh (e.g., from your electricity provider's grid mix)
    'natural_gas_kwh': 0.19,      # per kWh
    # Road fuel
    'petrol_liter': 2.3,          # per liter of petrol
    'diesel_liter': 2.65,         # per liter of diesel
    # Public transport
    'bus_km': 0.1,                # per km (average bus travel, varies by capacity/fuel)
    'train_km': 0.096,            # per km (e.g., electric train, depends on grid energy source)
    # Air travel
    'flight_short_haul_km': 0.15, # per passenger-km (less than ~1000km)
    'flight_long_haul_km': 0.1,   # per passenger-km (over ~1000km, generally lower per km due to efficiency)
    # Diet
    'red_meat_meal': 10.0,        # per meal (e.g., beef, lamb - highly variable, can be much higher)
    'chicken_meal': 1.6,          # per meal (e.g., chicken)
    'veg_meal': 0.3,              # per meal (e.g., plant-based, minimal dairy/eggs)
    # Waste
    'waste_kg': 2.0,              # per kg of unrecycled general waste
}

# --- 2. Create a Simulated Dataset ---
def generate_simulated_data(num_samples=1000):
    """
    Build a synthetic table of personal lifestyle activities.

    Each of the ``num_samples`` rows describes one simulated person. The
    global NumPy random state is re-seeded with 42 on every call, so two calls
    with the same ``num_samples`` return identical data.

    Returns a DataFrame with the 16 sampled activity columns plus the derived
    ``ac_kwh_month`` column.
    """
    np.random.seed(42)  # fixed seed keeps the simulated data reproducible

    # Small factories so that each column is described by its distribution only.
    def int_range(low, high):
        return lambda: np.random.randint(low, high, num_samples)

    def float_range(low, high):
        return lambda: np.random.uniform(low, high, num_samples)

    def weighted_choice(options, weights):
        return lambda: np.random.choice(options, num_samples, p=weights)

    # NOTE: the columns are drawn in list order from one shared random stream,
    # so reordering the entries would change every generated value.
    column_samplers = [
        ('electricity_kwh_month', int_range(50, 500)),          # monthly electricity use
        ('natural_gas_kwh_month', int_range(0, 300)),           # monthly gas use (may be 0)
        ('car_km_month', int_range(0, 1500)),                   # monthly km by car
        ('car_fuel_type', weighted_choice(['petrol', 'diesel', 'electric'], [0.6, 0.3, 0.1])),
        ('car_avg_mileage_km_per_liter', float_range(10, 25)),  # mileage for petrol/diesel cars
        ('bus_km_month', int_range(0, 300)),                    # monthly km by bus
        ('train_km_month', int_range(0, 500)),                  # monthly km by train
        ('flights_short_haul_per_year', int_range(0, 5)),       # short-haul flights per year
        ('flights_long_haul_per_year', int_range(0, 3)),        # long-haul flights per year
        ('red_meat_meals_per_week', int_range(0, 10)),
        ('chicken_meals_per_week', int_range(0, 10)),
        ('veg_meals_per_week', int_range(5, 20)),
        ('waste_kg_month', float_range(5, 25)),                 # kg of general waste per month
        ('is_recycler', weighted_choice([0, 1], [0.2, 0.8])),   # 0=No, 1=Yes
        ('ac_hours_day_summer', float_range(0, 12)),            # AC hours per summer day
        ('num_people_household', int_range(1, 6)),
    ]
    simulated = pd.DataFrame({name: draw() for name, draw in column_samplers})

    # AC electricity averaged over the year: a 1.5 kW unit, 30 days per month,
    # with summer assumed to last 4 of the 12 months.
    simulated['ac_kwh_month'] = simulated['ac_hours_day_summer'] * 1.5 * 30 * (4/12)

    return simulated

# Generate the simulated dataset (2000 people) that the rest of the notebook uses.
df_raw = generate_simulated_data(num_samples=2000)
# Optional sanity checks - uncomment to inspect the generated data:
# print("--- Simulated Raw Data Head (First 5 Rows) ---")
# print(df_raw.head())
# print("\n--- Simulated Raw Data Info (Column Types and Non-Null Counts) ---")
# df_raw.info()

# --- 3. Feature Engineering: Calculate Carbon Footprint from Activities ---
# This function calculates the total carbon footprint and also breaks it down by category.
# The total footprint will be our target variable (y) for the ML model.
# The breakdown will be used for generating personalized suggestions.
def calculate_footprint_components(row, emission_factors):
    """
    Compute one person's monthly carbon footprint, split by category.

    Args:
        row: Record of activity data supporting ``row[key]`` and ``.copy()``
            (a pandas Series or a dict). It must contain the activity keys
            read below, including the derived ``ac_kwh_month``.
        emission_factors: Mapping from factor name (e.g. ``'petrol_liter'``)
            to kgCO2e per unit of that activity.

    Returns:
        A copy of ``row`` extended with ``cf_electricity``, ``cf_gas``,
        ``cf_ac``, ``cf_car``, ``cf_bus``, ``cf_train``, ``cf_flights_total``,
        ``cf_diet_total``, ``cf_waste`` and
        ``total_carbon_footprint_kgco2e_month``. The input record itself is
        left untouched.

    Raises:
        KeyError: if a required activity key or emission factor is missing.
    """
    # Illustrative assumptions used to turn counts into monthly quantities.
    short_haul_km_per_flight = 1000
    long_haul_km_per_flight = 5000
    weeks_per_month = 52 / 12          # approx. 4.33
    recycler_waste_multiplier = 0.2    # recyclers keep only 20% of waste emissions
    fuel_factor_names = {'petrol': 'petrol_liter', 'diesel': 'diesel_liter'}

    # Home energy (AC energy is drawn from the electricity grid as well).
    cf_electricity = row['electricity_kwh_month'] * emission_factors['electricity_kwh']
    cf_gas = row['natural_gas_kwh_month'] * emission_factors['natural_gas_kwh']
    cf_ac = row['ac_kwh_month'] * emission_factors['electricity_kwh']

    # Car: only combustion fuels burn liters. Any other fuel type (e.g.
    # 'electric') contributes 0 here; its charging is not modelled separately.
    cf_car = 0
    fuel_factor_name = fuel_factor_names.get(row['car_fuel_type'])
    if fuel_factor_name is not None:
        mileage = row['car_avg_mileage_km_per_liter']
        # A non-positive mileage would divide by zero, so it counts as no fuel.
        liters_consumed = row['car_km_month'] / mileage if mileage > 0 else 0
        cf_car = liters_consumed * emission_factors[fuel_factor_name]

    # Public transport.
    cf_bus = row['bus_km_month'] * emission_factors['bus_km']
    cf_train = row['train_km_month'] * emission_factors['train_km']

    # Flights: yearly counts times an assumed distance, spread over 12 months.
    cf_flights_short = (row['flights_short_haul_per_year'] * short_haul_km_per_flight
                        * emission_factors['flight_short_haul_km']) / 12
    cf_flights_long = (row['flights_long_haul_per_year'] * long_haul_km_per_flight
                       * emission_factors['flight_long_haul_km']) / 12

    # Diet: weekly meal counts converted to a monthly equivalent.
    cf_diet_red_meat = (row['red_meat_meals_per_week'] * emission_factors['red_meat_meal']) * weeks_per_month
    cf_diet_chicken = (row['chicken_meals_per_week'] * emission_factors['chicken_meal']) * weeks_per_month
    cf_diet_veg = (row['veg_meals_per_week'] * emission_factors['veg_meal']) * weeks_per_month

    # Waste, reduced for people who recycle.
    cf_waste = row['waste_kg_month'] * emission_factors['waste_kg']
    if row['is_recycler'] == 1:
        cf_waste *= recycler_waste_multiplier

    total_cf = (cf_electricity + cf_gas + cf_ac + cf_car + cf_bus + cf_train +
                cf_flights_short + cf_flights_long +
                cf_diet_red_meat + cf_diet_chicken + cf_diet_veg +
                cf_waste)

    # Write the results into a copy so the caller's record is never mutated.
    result = row.copy()
    result['cf_electricity'] = cf_electricity
    result['cf_gas'] = cf_gas
    result['cf_ac'] = cf_ac
    result['cf_car'] = cf_car
    result['cf_bus'] = cf_bus
    result['cf_train'] = cf_train
    result['cf_flights_total'] = cf_flights_short + cf_flights_long
    result['cf_diet_total'] = cf_diet_red_meat + cf_diet_chicken + cf_diet_veg
    result['cf_waste'] = cf_waste
    result['total_carbon_footprint_kgco2e_month'] = total_cf  # target 'y' for the model

    return result

# Run the footprint calculation on every row; EMISSION_FACTORS is forwarded to
# the function as its second positional argument.
df_processed = df_raw.apply(calculate_footprint_components, axis=1, args=(EMISSION_FACTORS,))

# print("\n--- Data with Calculated Carbon Footprint and Components Head ---")
# print(df_processed[['electricity_kwh_month', 'car_km_month', 'total_carbon_footprint_kgco2e_month', 'cf_car', 'cf_electricity', 'cf_diet_total']].head())

# --- 4. Prepare Data for ML Model Training ---
# Define features (X) and target (y).
# The calculated footprint components are excluded from X because they are
# parts of the target itself; 'ac_hours_day_summer' is excluded because the
# derived 'ac_kwh_month' column is used in its place.
features_for_model = [col for col in df_processed.columns if col not in {
    'total_carbon_footprint_kgco2e_month', # This is our target variable
    'cf_electricity', 'cf_gas', 'cf_ac', 'cf_car', 'cf_bus', 'cf_train',
    'cf_flights_total', 'cf_diet_total', 'cf_waste', # Calculated components of the target
    'ac_hours_day_summer', # Replaced by ac_kwh_month as a direct input for the model
}]

X = df_processed[features_for_model]
y = df_processed['total_carbon_footprint_kgco2e_month']

# One-hot encode the fuel type. 'drop_first=True' drops the first level in
# sorted order ('diesel' for this data), which becomes the implicit baseline:
# a diesel car is the row where every remaining car_fuel_type_* column is 0.
X = pd.get_dummies(X, columns=['car_fuel_type'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# print("\n--- Training and Testing Data Shapes ---")
# print(f"X_train shape: {X_train.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"X_test shape: {X_test.shape}")
# print(f"y_test shape: {y_test.shape}")

# The training columns are the single source of truth for the model's input
# layout. Encoding happens before the split, so both sets already share them;
# reindex() just makes that guarantee explicit (same columns, same order,
# 0 for anything absent) and returns a new frame instead of assigning into a
# slice of X.
train_cols = X_train.columns
X_test = X_test.reindex(columns=train_cols, fill_value=0)

# --- 5. Train the RandomForestRegressor Model ---
print("\n--- Training RandomForestRegressor Model ---")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("RandomForestRegressor Model training complete.")

# --- Train the XGBoost Model on the same data with the same settings ---
print("\n--- Training XGBoost Model ---")
model_xgb = XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model_xgb.fit(X_train, y_train)
print("XGBoost Model training complete.")

# --- 6. Evaluate both models on the held-out test set ---
y_pred_RF = model.predict(X_test)
y_pred_XGB = model_xgb.predict(X_test)

# MAE: average absolute difference between predicted and actual values.
# R2: share of the target's variance that the features explain.
mae_RF, r2_RF = mean_absolute_error(y_test, y_pred_RF), r2_score(y_test, y_pred_RF)
mae_XGB, r2_XGB = mean_absolute_error(y_test, y_pred_XGB), r2_score(y_test, y_pred_XGB)

print("\n--- RandomForestRegressor Model Evaluation on Test Set ---")
print(f"Mean Absolute Error (MAE): {mae_RF:.2f} kgCO2e/month")
print(f"R-squared (R2): {r2_RF:.2f}")

print("\n--- XGBoost Model Evaluation on Test Set ---")
print(f"Mean Absolute Error (MAE): {mae_XGB:.2f} kgCO2e/month")
print(f"R-squared (R2): {r2_XGB:.2f}")


# Feature importance: which inputs the XGBoost model relied on most
# (XGBoost is the model used for the prediction further below).
xgb_importances = pd.Series(model_xgb.feature_importances_, index=X.columns)
feature_importances = xgb_importances.sort_values(ascending=False)
print("\n--- Top 10 Feature Importances (What drives the prediction - from XGBoost) ---")
print(feature_importances.head(10))

# --- 7. Prediction for a New Person (Demonstration) ---
# This simulates getting new data from a single user.
new_person_data = {
    'electricity_kwh_month': [350],
    'natural_gas_kwh_month': [50],
    'car_km_month': [800],
    'car_fuel_type': ['petrol'],
    'car_avg_mileage_km_per_liter': [18],
    'bus_km_month': [50],
    'train_km_month': [0],
    'flights_short_haul_per_year': [1],
    'flights_long_haul_per_year': [0],
    'red_meat_meals_per_week': [4],
    'chicken_meals_per_week': [3],
    'veg_meals_per_week': [10],
    'waste_kg_month': [15],
    'is_recycler': [1],
    'ac_hours_day_summer': [6],
    'num_people_household': [2],
}
new_person_df_raw = pd.DataFrame(new_person_data)

# Derive the AC kWh exactly as the simulated data does
# (1.5 kW unit, 30 days per month, summer lasting 4 of 12 months).
new_person_df_raw['ac_kwh_month'] = new_person_df_raw['ac_hours_day_summer'] * 1.5 * 30 * (4/12)

# Prepare the new person's data for prediction.
# Do NOT pass drop_first=True here: a single row has only one fuel category, so
# drop_first would discard that only dummy column and the person would always
# be encoded as the training baseline (all car_fuel_type_* columns 0),
# whatever fuel they actually use. Instead, encode every category that is
# present and let reindex() project the result onto the training layout:
# training columns missing here become 0, and the baseline category's own
# column (never seen in training) is dropped.
new_person_X = (
    pd.get_dummies(new_person_df_raw[features_for_model], columns=['car_fuel_type'])
    .reindex(columns=X_train.columns, fill_value=0)
    .astype(X_train.dtypes.to_dict())  # match the training dtypes column by column
)

# Predict the carbon footprint for the new person using the XGBoost model (last trained)
predicted_cf_total = model_xgb.predict(new_person_X)[0]
print(f"\n--- Predicted Total Carbon Footprint for the New Person: {predicted_cf_total:.2f} kgCO2e/month ---")

# --- 8. Generate Carbon Footprint Breakdown for Suggestions ---
# Re-run the component calculation on the new person's raw data to obtain the
# per-category values behind their footprint.
new_person_processed_df = new_person_df_raw.apply(
    calculate_footprint_components, axis=1, args=(EMISSION_FACTORS,)
)

# Pull the component values of the (only) row once, then group them into the
# categories shown to the user.
component_columns = [
    'cf_electricity', 'cf_ac', 'cf_gas', 'cf_car', 'cf_bus', 'cf_train',
    'cf_flights_total', 'cf_diet_total', 'cf_waste',
]
new_person_components = new_person_processed_df[component_columns].iloc[0]

cf_breakdown = {
    'Electricity (incl. AC)': new_person_components['cf_electricity'] + new_person_components['cf_ac'],
    'Natural Gas': new_person_components['cf_gas'],
    'Car Travel': new_person_components['cf_car'],
    'Public Transport (Bus/Train)': new_person_components['cf_bus'] + new_person_components['cf_train'],
    'Flights': new_person_components['cf_flights_total'],
    'Diet': new_person_components['cf_diet_total'],
    'Waste': new_person_components['cf_waste'],
}

print("\n--- Carbon Footprint Breakdown for New Person ---")
# Largest contributors first, so the main reduction targets stand out.
sorted_breakdown = sorted(cf_breakdown.items(), key=lambda pair: pair[1], reverse=True)
for category, value in sorted_breakdown:
    print(f"- {category}: {value:.2f} kgCO2e/month")

# --- 9. Generate AI-Powered Suggestions (Not Rule-Based) ---
async def generate_ai_suggestions(cf_breakdown_data):
    """
    Ask the Gemini API for personalized carbon footprint reduction suggestions.

    Args:
        cf_breakdown_data: Mapping of category name to monthly footprint in
            kgCO2e. Each value is formatted with two decimals in the prompt.

    Returns:
        The suggestion text produced by the model. If the API key cannot be
        read, the request fails, or the response holds no text, a
        human-readable error message is returned instead; these failures are
        reported through the return value, not raised.

    Note:
        The HTTP request is made with the blocking ``requests`` library, so
        this coroutine does not yield to the event loop while it waits.
    """
    breakdown_lines = "".join(
        f"- {category}: {value:.2f} kgCO2e/month\n"
        for category, value in cf_breakdown_data.items()
    )
    prompt = (
        "Given the following monthly carbon footprint breakdown for a person (in kgCO2e), "
        "please provide 3-5 actionable and friendly suggestions to help them reduce their environmental impact. "
        "Focus on the areas with the highest contribution. Ensure the tone is encouraging and practical, avoiding jargon.\n\n"
        "Carbon Footprint Breakdown:\n"
        + breakdown_lines
        + "\nSuggestions:"  # concluding instruction for the model
    )
    chat_history = [{"role": "user", "parts": [{"text": prompt}]}]

    # Read the API key from Colab secrets rather than hard-coding it in the notebook.
    try:
        api_key = userdata.get('GOOGLE_API_KEY')
    except Exception as e:
        return f"Error fetching API key from Colab secrets: {e}. Please ensure you have added your GOOGLE_API_KEY to Colab secrets."

    if not api_key:
        return "API key not found in Colab secrets. Please add your GOOGLE_API_KEY to Colab secrets."

    # The key travels in a request header, not in the URL: requests includes
    # the URL in its error messages, and those messages are returned to (and
    # printed by) the caller.
    api_url = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
    request_headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    try:
        import requests # Use requests for standard HTTP calls in Python
        response = requests.post(
            api_url,
            headers=request_headers,
            json={"contents": chat_history},
            timeout=30,  # without a timeout a stalled connection would hang the cell forever
        )
        response.raise_for_status() # Raise an exception for bad status codes
        result = response.json()

        # Expected shape: candidates[0].content.parts[0].text
        candidates = result.get('candidates') or []
        content = (candidates[0].get('content') or {}) if candidates else {}
        parts = content.get('parts') or []
        if parts and 'text' in parts[0]:
            return parts[0]['text']

        # No usable text: surface the API's own error message when it sent one.
        error_message = result.get('error', {}).get('message', 'Unknown error from API.')
        return f"Could not generate AI suggestions. API returned: {error_message}"

    except Exception as e:
        # Top-level boundary for this demo: report the failure as text.
        return f"An error occurred while generating AI suggestions: {e}"

# Run the coroutine and show the suggestions.
# A bare top-level `await` is only valid where the environment already runs an
# event loop for the cell (e.g. an IPython/Colab notebook). In a plain Python
# script, use asyncio.run(generate_ai_suggestions(cf_breakdown)) instead.
print("\n--- Generating AI-Powered Suggestions ---")
ai_suggestions = await generate_ai_suggestions(cf_breakdown)
print(ai_suggestions)


print("\nRemember, consistency in small changes can lead to significant impact over time!")


--- Training RandomForestRegressor Model ---
RandomForestRegressor Model training complete.

--- Training XGBoost Model ---
XGBoost Model training complete.

--- RandomForestRegressor Model Evaluation on Test Set ---
Mean Absolute Error (MAE): 48.79 kgCO2e/month
R-squared (R2): 0.89

--- XGBoost Model Evaluation on Test Set ---
Mean Absolute Error (MAE): 33.56 kgCO2e/month
R-squared (R2): 0.95

--- Top 10 Feature Importances (What drives the prediction - from XGBoost) ---
red_meat_meals_per_week         0.459043
electricity_kwh_month           0.150414
car_fuel_type_electric          0.133028
flights_long_haul_per_year      0.070475
car_km_month                    0.063334
ac_kwh_month                    0.042010
car_avg_mileage_km_per_liter    0.018537
flights_short_haul_per_year     0.013088
chicken_meals_per_week          0.011103
is_recycler                     0.010361
dtype: float32

--- Predicted Total Carbon Footprint for the New Person: 772.49 kgCO2e/month ---

--- Carbon Foo