In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [2]:
DATA_PATH = "data/AgriQ_Final_Tulkarm_Data.csv"

df = pd.read_csv(DATA_PATH)
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'data/AgriQ_Final_Tulkarm_Data.csv'

In [None]:
label_encoder = LabelEncoder()
df["crop_encoded"] = label_encoder.fit_transform(df["label"])

In [None]:
FEATURES = [
    "N",
    "P",
    "K",
    "temperature",
    "humidity",
    "water_access",
    "market_demand",
    "farming_type"
]


In [None]:
X = df[FEATURES]
y = df["crop_encoded"]

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

model.fit(X_train, y_train)


0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [None]:
df = df.rename(columns={"label": "crop"})


In [None]:
crop_stats = df.groupby("crop").agg({
    "water_access": "mean",
    "production_cost": "mean"
}).reset_index()


In [None]:
max_cost = crop_stats["production_cost"].max()


In [None]:
def recommend_crops(farmer_input, top_k=3):
    """
    تُرجع أفضل المحاصيل المقترحة لمزارع واحد
    مع: score + water + profit
    """

    # 1️⃣ نحول input إلى DataFrame
    input_df = pd.DataFrame([farmer_input])

    # 2️⃣ نعمل scaling للمدخلات (مهم جدًا)
    input_scaled = scaler.transform(input_df)

    # 3️⃣ نحسب احتمالات كل المحاصيل
    probabilities = model.predict_proba(input_scaled)[0]

    # 4️⃣ نأخذ أعلى top_k محاصيل
    top_indices = np.argsort(probabilities)[::-1][:top_k]

    recommendations = []

    for idx in top_indices:
        crop_name = label_encoder.inverse_transform([idx])[0]
        score = float(probabilities[idx])

        # 5️⃣ نجيب بيانات الماء والتكلفة من crop_stats
        crop_row = crop_stats[crop_stats["crop"] == crop_name]

        if not crop_row.empty:
            water_access = float(crop_row["water_access"].values[0])
            water = round((1 - water_access) * 100, 1)  
            cost = float(crop_row["production_cost"].values[0])
        else:
            water = 0.0
            cost = 0.0

        # 6️⃣ تقدير الربح (Rule-based بسيط)
        profit = (score * 100) - (cost / max_cost) * 40
        profit = round(profit, 1)


        recommendations.append({
            "crop": crop_name,
            "score": round(score, 3),
            "water": round(water, 1),
            "profit": round(profit, 1)
        })

    # 7️⃣ فلترة المحاصيل غير المناسبة محليًا
    UNSUITABLE_CROPS = ["mango", "banana", "papaya"]

    recommendations = [
        r for r in recommendations
        if r["crop"] not in UNSUITABLE_CROPS
    ]

    return recommendations


In [None]:
example_farmer = {
    "N": 90,
    "P": 40,
    "K": 40,
    "temperature": 25,
    "humidity": 65,
    "water_access": 0.5,
    "market_demand": 0.8,
    "farming_type": 1  # بيت بلاستيكي
}

print(recommend_crops(example_farmer))


[{'crop': 'orange', 'score': 0.36, 'water': 61.6, 'profit': np.float64(-1.8)}, {'crop': 'pigeonpeas', 'score': 0.25, 'water': 62.0, 'profit': np.float64(-14.9)}]


