In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# ‚úÖ Load datasets
crops_df = pd.read_csv("../data/raw/crops_conditions.csv")
domesticated_plants_df = pd.read_csv("../data/raw/domesticated_plants.csv")
locations_df = pd.read_csv("../data/raw/locations_climate.csv")

# ‚úÖ Define df as a copy of locations_df
df = locations_df.copy()

# ‚úÖ Check available columns before setting the target
print("Available Columns in Dataset:", df.columns)

# ‚úÖ Define Target Variable (Modify as per dataset)
TARGET_COLUMN = "crop_suitability"  # Change if necessary

if TARGET_COLUMN not in df.columns:
    raise ValueError(f"‚ö†Ô∏è Target column '{TARGET_COLUMN}' not found in dataset!")

# ‚úÖ Handle Missing Values
df.fillna(df.median(numeric_only=True), inplace=True)  # Fill numeric columns with median
df.fillna("Unknown", inplace=True)  # Fill categorical columns with 'Unknown'

# ‚úÖ One-Hot Encode Categorical Features (Only Existing Columns)
categorical_cols = [col for col in ["country", "region", "county", "topography", "air quality", "soil nutrients"] if col in df.columns]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# ‚úÖ Encode Ordinal Features Safely
ordinal_features = {
    "soil moisture": ["Low", "Medium", "High"],
    "wind conditions": ["Calm", "Breezy", "Windy", "Stormy"],
    "heat stress": ["None", "Mild", "Moderate", "Severe"],
    "cold stress": ["None", "Mild", "Moderate", "Severe"]
}
for feature, categories in ordinal_features.items():
    if feature in df.columns:
        df[feature] = df[feature].astype(pd.CategoricalDtype(categories=categories, ordered=True)).cat.codes

# ‚úÖ Feature Engineering - Extract Temperature Range
def extract_temperature_range(temp):
    try:
        temp = str(temp).strip()
        if "-" in temp:
            min_temp, max_temp = map(float, temp.split('-'))
            return max_temp - min_temp
        elif temp.isdigit():  # If it's a single value
            return 0
        return np.nan
    except Exception as e:
        print(f"‚ö†Ô∏è Error processing temperature '{temp}': {e}")
        return np.nan

if "temperature range (¬∞c)" in df.columns:
    df["temperature_range"] = df["temperature range (¬∞c)"].apply(extract_temperature_range)

# ‚úÖ Normalize day length if exists
if "day length (hours)" in df.columns:
    df["day_length_normalized"] = df["day length (hours)"] / 24

# ‚úÖ Scale Numeric Features
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# ‚úÖ Drop Original Temperature Column (If Transformed)
if "temperature range (¬∞c)" in df.columns:
    df.drop(columns=["temperature range (¬∞c)"], inplace=True)

# ‚úÖ Convert Target Column to Numeric if Categorical
if df[TARGET_COLUMN].dtype == 'object':
    df[TARGET_COLUMN] = df[TARGET_COLUMN].astype('category').cat.codes

# ‚úÖ Final Check Before Training
print("Dataset Columns After Processing:", df.columns)
print("Unique Values in Target Column:", df[TARGET_COLUMN].unique())

# üìå Split Data into Features & Target
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# üìå Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# üìå Initialize Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# üìå Hyperparameter Tuning (Optional)
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

# üìå Best Model from Grid Search
best_rf_model = grid_search.best_estimator_

# üìå Make Predictions
y_pred = best_rf_model.predict(X_test)

# üìå Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print(f"‚úÖ Model Accuracy: {accuracy:.4f}")
print("üîπ Classification Report:")
print(classification_report(y_test, y_pred))

print(df.head())




Available Columns in Dataset: Index(['Country', 'Region', 'County', 'Climate Conditions',
       'Temperature Range (¬∞C)', 'Light Intensity (hours/day)',
       'Relative Humidity (%)', 'Annual Precipitation (mm)', 'Wind Conditions',
       'Soil Moisture', 'Soil Nutrients', 'Atmospheric Gases', 'Altitude (m)',
       'Topography', 'Frost', 'Heat Stress', 'Cold Stress',
       'Day Length (hours)', 'Air Quality', 'Other'],
      dtype='object')


ValueError: ‚ö†Ô∏è Target column 'crop_suitability' not found in dataset!

In [3]:
print(df.head())


  Country   Region           County Climate Conditions Temperature Range (¬∞C)  \
0   Kenya  Nairobi  Nairobi Central          Temperate                10 - 25   
1   Kenya  Nairobi         Kasarani          Temperate                10 - 25   
2   Kenya  Nairobi         Lang'ata          Temperate                10 - 25   
3   Kenya  Nairobi        Dagoretti          Temperate                10 - 25   
4   Kenya    Coast          Mombasa           Tropical                25 - 35   

  Light Intensity (hours/day) Relative Humidity (%) Annual Precipitation (mm)  \
0                       5 - 7               60 - 80                800 - 1000   
1                       5 - 7               60 - 80                800 - 1000   
2                       5 - 7               60 - 80                800 - 1000   
3                       5 - 7               60 - 80                800 - 1000   
4                       6 - 8               70 - 90               1000 - 1200   

    Wind Conditions Soil 