In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load datasets
crops_df = pd.read_csv("../data/raw/crops_conditions.csv")
domesticated_plants_df = pd.read_csv("../data/raw/domesticated_plants.csv")
locations_df = pd.read_csv("../data/raw/locations_climate.csv")

# ✅ Define df as a copy of locations_df
df = locations_df.copy()

# Step 1: Handle missing values
df.fillna(df.select_dtypes(include=[np.number]).median(), inplace=True)  # Fill numeric missing values with median
df.fillna("Unknown", inplace=True)  # Fill categorical missing values with 'Unknown'

# Step 2: Convert categorical features to numerical (One-Hot Encoding)
categorical_cols = [col for col in ["country", "region", "county", "topography", "air quality", "soil nutrients"] if col in df.columns]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Step 3: Encode ordinal features
ordinal_features = {
    "soil moisture": ["Low", "Medium", "High"],
    "wind conditions": ["Calm", "Breezy", "Windy", "Stormy"],
    "heat stress": ["None", "Mild", "Moderate", "Severe"],
    "cold stress": ["None", "Mild", "Moderate", "Severe"]
}

for feature, categories in ordinal_features.items():
    if feature in df.columns:
        df[feature] = df[feature].astype(pd.CategoricalDtype(categories=categories, ordered=True))
        df[feature] = df[feature].cat.codes  # Convert to numerical codes

# Step 4: Feature Engineering - Temperature Range Extraction
def extract_temperature_range(temp):
    try:
        if "-" in str(temp):
            min_temp, max_temp = map(float, str(temp).split('-'))
            return max_temp - min_temp
        return np.nan
    except:
        return np.nan

if "temperature range (°c)" in df.columns:
    df["temperature_range"] = df["temperature range (°c)"].apply(extract_temperature_range)

# Normalize day length (if column exists)
if "day length (hours)" in df.columns:
    df["day_length_normalized"] = df["day length (hours)"] / 24  # Normalize to 0-1 range

# Step 5: Scale numerical features
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()

# ✅ Ensure numerical_features is not empty before applying StandardScaler
if numerical_features:
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Step 6: Drop unnecessary columns
if "temperature range (°c)" in df.columns:
    df.drop(columns=["temperature range (°c)"], inplace=True)  # Drop original column

# ✅ Final processed DataFrame
print(df.head())




  Country   Region           County Climate Conditions Temperature Range (°C)  \
0   Kenya  Nairobi  Nairobi Central          Temperate                10 - 25   
1   Kenya  Nairobi         Kasarani          Temperate                10 - 25   
2   Kenya  Nairobi         Lang'ata          Temperate                10 - 25   
3   Kenya  Nairobi        Dagoretti          Temperate                10 - 25   
4   Kenya    Coast          Mombasa           Tropical                25 - 35   

  Light Intensity (hours/day) Relative Humidity (%) Annual Precipitation (mm)  \
0                       5 - 7               60 - 80                800 - 1000   
1                       5 - 7               60 - 80                800 - 1000   
2                       5 - 7               60 - 80                800 - 1000   
3                       5 - 7               60 - 80                800 - 1000   
4                       6 - 8               70 - 90               1000 - 1200   

    Wind Conditions Soil M