# Q-Learning Experiments for Irrigation Scheduling

Notebook for running and experimenting with tabular Q-learning.

## Setup

In [1]:
import sys
import os

# Add multiple paths for Jupyter compatibility
sys.path.insert(0, '.')
sys.path.insert(0, '..')

# Debug: show what's available
print("Current directory:", os.getcwd())
print("Python files available:", [f for f in os.listdir('.') if f.endswith('.py')])

import numpy as np
from irrigation_env import IrrigationEnv
from irr_Qtable import train_q_learning, discretize_state, get_state_space_size, N_ACTIONS, from_discrate_to_full_state

Current directory: c:\Users\User\Documents\שנה ג\py_AI\irrigation_agent
Python files available: ['irr_Qtable.py']


In [2]:
# Create environment instance
env = IrrigationEnv(
    max_et0=8.0,
    max_rain=50.0,
    et0_range=(2.0, 8.0),
    rain_range=(0.0, 40.0),
    episode_length=90,
)

print(f"Environment created")
print(f"Action space: {env.action_space}")
print(f"Observation space: {env.observation_space}")

Environment created
Action space: Discrete(3)
Observation space: Dict('crop_stage': Discrete(3), 'et0': Box(0.0, 1.0, (1,), float32), 'rain': Box(0.0, 1.0, (1,), float32), 'soil_moisture': Box(0.0, 1.0, (1,), float32))


## Training

In [3]:
# Training parameters
n_episodes = 1
alpha = 0.1
gamma = 0.99
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
n_soil_bins = 5

print(f"Training Q-learning agent for {n_episodes} episodes...")
print(f"State space size: {get_state_space_size(n_soil_bins)}")
print(f"Action space size: {N_ACTIONS}")

Training Q-learning agent for 1 episodes...
State space size: 60
Action space size: 3


In [4]:
# Train Q-learning
Q = train_q_learning(
    env=env,
    n_episodes=n_episodes,
    alpha=alpha,
    gamma=gamma,
    epsilon_start=epsilon_start,
    epsilon_end=epsilon_end,
    epsilon_decay=epsilon_decay,
    n_soil_bins=n_soil_bins,
    verbose= True
)

print("\nTraining complete!")
print(f"Q-table shape: {Q.shape}")
print(f"Non-zero entries: {np.count_nonzero(Q)}/{Q.size}")

Step 1: State 25, Action 0, Reward 2.0
Step 2: State 49, Action 2, Reward -5.8
Step 3: State 49, Action 1, Reward -0.8
Step 4: State 49, Action 2, Reward -1.8
Step 5: State 51, Action 0, Reward -0.30000000000000004
Step 6: State 51, Action 1, Reward -0.8
Step 7: State 49, Action 0, Reward -0.30000000000000004
Step 8: State 50, Action 1, Reward -0.8
Step 9: State 51, Action 2, Reward -1.8
Step 10: State 51, Action 1, Reward -0.8
Step 11: State 51, Action 0, Reward -0.30000000000000004
Step 12: State 49, Action 1, Reward -0.8
Step 13: State 51, Action 2, Reward -1.8
Step 14: State 51, Action 0, Reward -0.30000000000000004
Step 15: State 49, Action 0, Reward -0.30000000000000004
Step 16: State 49, Action 2, Reward -1.8
Step 17: State 51, Action 1, Reward -0.8
Step 18: State 50, Action 2, Reward -1.8
Step 19: State 49, Action 2, Reward -1.8
Step 20: State 51, Action 2, Reward -1.8
Step 21: State 49, Action 0, Reward -0.30000000000000004
Step 22: State 51, Action 2, Reward -1.8
Step 23: Sta

In [5]:
env.reset()
env.soil_moisture = 0.25
env.prev_soil_moisture = 0.25
env.current_et0 = 6.0
env.current_rain = 0.0
env.crop_stage = 1  # flowering

obs, _ = env._get_obs(), None

for a in [0, 1, 2]:
    env.prev_soil_moisture = env.soil_moisture
    env._update_state(a)
    r = env._calculate_reward(a)
    print(f"Action {a}: soil={env.soil_moisture:.2f}, reward={r:.2f}")


Action 0: soil=0.18, reward=-0.59
Action 1: soil=0.57, reward=5.50
Action 2: soil=0.86, reward=-5.66


In [6]:
import inspect
print(inspect.getsource(env._calculate_reward))
print("bottom/top:", env.threshold_bottom_soil_moisture, env.threshold_top_soil_moisture)
print("water_cost:", env.water_cost, "irrigation_amounts:", env.irrigation_amounts)

obs, _ = env.reset()
env.prev_soil_moisture = env.soil_moisture
obs, r, term, trunc, info = env.step(1)
print("One step reward:", r)


    def _calculate_reward(self, action: int) -> float:
        """
        Calculate transition-based reward signal for the current step.
        
        1. Transition bonus: +2.0 when ENTERING optimal range (prev not optimal, current is)
        2. Stay bonus: +0.5 when STAYING in optimal range
        3. Drought penalty: -2.0 per unit below optimal minimum
        4. Over-saturation penalty: -0.5 per unit above optimal maximum
        5. Irrigation cost penalty: -0.1 per mm irrigated
        
        Parameters
        ----------
        action : int
            Irrigation action {0, 1, 2}
        
        Returns
        -------
        reward : float
            Reward signal for this step (does not modify any state variables)
        """
        
        

        # Penalty for water stress (soil completely dry)
        # old code
        """
        stress_penalty = 0.0
        if self.soil_moisture <= self.threshold_bottom_soil_moisture:
            stress_penalty = -80 * (self

In [6]:
print(from_discrate_to_full_state(state_index=51))

(4, 0, 1, 1)


In [5]:
env.reset()
for a in [0, 1, 2]:
    env.prev_soil_moisture = 0.28
    env.soil_moisture = 0.28
    env._update_state(a)
    r = env._calculate_reward(a)
    print(a, env.soil_moisture, r)


0 0.3611389992719203 1.0
1 0.354290023093221 0.5
2 0.6112338395373782 -0.5


In [15]:
for m in [0.25, 0.29, 0.31, 0.35, 0.65, 0.71]:
    obs = {
        "soil_moisture": np.array([m]),
        "crop_stage": 1,
        "et0": np.array([0.5]),
        "rain": np.array([0.0]),
    }
    s = discretize_state(obs, n_soil_bins=10)
    print(m, s)


0.25 30
0.29 30
0.31 42
0.35 42
0.65 78
0.71 90


## Inspection

In [9]:
# Basic Q-table statistics
print("Q-table Statistics:")
print(f"Shape: {Q.shape}")
print(f"Min Q-value: {Q.min():.4f}")
print(f"Max Q-value: {Q.max():.4f}")
print(f"Mean Q-value: {Q.mean():.4f}")
print(f"Std Q-value: {Q.std():.4f}")
print(f"\nNon-zero entries: {np.count_nonzero(Q)}/{Q.size} ({100*np.count_nonzero(Q)/Q.size:.1f}%)")

Q-table Statistics:
Shape: (60, 3)
Min Q-value: -9.2630
Max Q-value: 0.0000
Mean Q-value: -2.1424
Std Q-value: 3.1739

Non-zero entries: 62/180 (34.4%)


In [10]:
# Examine action preferences across all states
best_actions = np.argmax(Q, axis=1)
action_counts = np.bincount(best_actions, minlength=N_ACTIONS)

print("Action preferences (greedy policy):")
for action_idx, count in enumerate(action_counts):
    print(f"  Action {action_idx}: {count} states ({100*count/Q.shape[0]:.1f}%)")

Action preferences (greedy policy):
  Action 0: 59 states (98.3%)
  Action 1: 1 states (1.7%)
  Action 2: 0 states (0.0%)


In [11]:
# Test learned policy on a single episode
obs, info = env.reset(seed=123)
total_reward = 0.0
done = False
step_count = 0

print("Testing learned policy:")
while not done and step_count < 10:
    state = discretize_state(obs, n_soil_bins)
    action = np.argmax(Q[state])
    obs, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    total_reward += reward
    
    if step_count < 5:
        print(f"  Step {step_count+1}: state={state}, action={action}, reward={reward:.3f}, SM={obs['soil_moisture'][0]:.3f}")
    
    step_count += 1

print(f"\nTotal reward (first {step_count} steps): {total_reward:.3f}")

Testing learned policy:
  Step 1: state=27, action=0, reward=-0.022, SM=0.745
  Step 2: state=37, action=0, reward=-0.051, SM=0.803
  Step 3: state=49, action=0, reward=-0.137, SM=0.974
  Step 4: state=51, action=0, reward=-0.150, SM=1.000
  Step 5: state=51, action=0, reward=-0.150, SM=1.000

Total reward (first 10 steps): -1.252


## Continued Training

In [12]:
# Reload module to pick up changes to train_q_learning
import importlib
import irr_Qtable
importlib.reload(irr_Qtable)
from irr_Qtable import train_q_learning, discretize_state, get_state_space_size, N_ACTIONS

print("Module reloaded")

Module reloaded


In [13]:
# Continue training from existing Q-table
n_additional_episodes = 500

print(f"Continuing training for {n_additional_episodes} additional episodes...")
print(f"Initial Q-table stats: min={Q.min():.4f}, max={Q.max():.4f}, mean={Q.mean():.4f}")

Q = train_q_learning(
    env=env,
    n_episodes=n_additional_episodes,
    alpha=alpha,
    gamma=gamma,
    epsilon_start=0.1,  # Lower exploration for continued training
    epsilon_end=0.01,
    epsilon_decay=0.99,
    n_soil_bins=n_soil_bins,
    Q_init=Q,
)

print("\nContinued training complete!")
print(f"Updated Q-table stats: min={Q.min():.4f}, max={Q.max():.4f}, mean={Q.mean():.4f}")

Continuing training for 500 additional episodes...
Initial Q-table stats: min=-9.2630, max=0.0000, mean=-2.1424

Training complete!
Q-table shape: (60, 3)
Non-zero entries: 63/180

Continued training complete!
Updated Q-table stats: min=-9.3588, max=0.0000, mean=-2.2131


In [14]:
# Compare action preferences after continued training
best_actions_updated = np.argmax(Q, axis=1)
action_counts_updated = np.bincount(best_actions_updated, minlength=N_ACTIONS)

print("Action preferences after continued training:")
for action_idx, count in enumerate(action_counts_updated):
    print(f"  Action {action_idx}: {count} states ({100*count/Q.shape[0]:.1f}%)")

Action preferences after continued training:
  Action 0: 57 states (95.0%)
  Action 1: 3 states (5.0%)
  Action 2: 0 states (0.0%)


In [11]:
print("State_index | Q(no)   Q(light)  Q(heavy) | Best")
print("------------------------------------------------")
for state in range(Q.shape[0]):
    best_action = np.argmax(Q[state])
    soil, stage,et0,rain  = from_discrate_to_full_state(state, n_soil_bins)
    print(
            f"{state:11d} | "
            f"{soil},{stage},{et0},{rain}|"
            f"{Q[state, 0]:7.3f}  {Q[state, 1]:7.3f}  {Q[state, 2]:7.3f} | "
            f"{best_action}"
        )
    



State_index | Q(no)   Q(light)  Q(heavy) | Best
------------------------------------------------
          0 | 0,0,0,0|  0.000    0.000    0.000 | 0
          1 | 0,0,0,1|  0.000    0.000    0.000 | 0
          2 | 0,0,1,0|  0.000    0.000    0.000 | 0
          3 | 0,0,1,1|  0.000    0.000    0.000 | 0
          4 | 0,1,0,0|  0.000    0.000    0.000 | 0
          5 | 0,1,0,1|  0.000    0.000    0.000 | 0
          6 | 0,1,1,0|  0.000    0.000    0.000 | 0
          7 | 0,1,1,1|  0.000    0.000    0.000 | 0
          8 | 0,2,0,0|  0.000    0.000    0.000 | 0
          9 | 0,2,0,1|  0.000    0.000    0.000 | 0
         10 | 0,2,1,0|  0.000    0.000    0.000 | 0
         11 | 0,2,1,1|  0.000    0.000    0.000 | 0
         12 | 1,0,0,0| -5.868   -6.042   -5.970 | 0
         13 | 1,0,0,1| -6.992   -7.704   -8.273 | 0
         14 | 1,0,1,0| -5.817   -5.957   -6.093 | 0
         15 | 1,0,1,1| -7.040   -7.764   -8.953 | 0
         16 | 1,1,0,0|  0.000    0.000    0.000 | 0
         17 | 1,1,0

In [12]:
visited_states = set()

for episode in range(200):
    obs, _ = env.reset()
    state = discretize_state(obs, n_soil_bins=3)
    visited_states.add(state)

    done = False
    while not done:
        action = np.random.randint(3)  # exploration מוחלט
        obs, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        state = discretize_state(obs, n_soil_bins=3)
        visited_states.add(state)

print("Visited states:", sorted(visited_states))
print("Number of visited states:", len(visited_states))


Visited states: [12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]
Number of visited states: 16


In [13]:
# הגדרת הפעולות
actions = [0, 1, 2]  # 0 = no irrigation, 1 = light, 2 = heavy irrigation

# ביצוע מעבר מצב עבור כל פעולה, לכל מצב
for soil_bin in range(3):  # 0, 1, 2 for soil moisture
    for crop_stage in range(3):  # 0, 1, 2 for crop stages (emergence, flowering, maturity)
        for et0_bin in range(2):  # 0, 1 for ET0 values (low/high)
            for rain_bin in range(2):  # 0, 1 for rain values (no/yes)
                
                # הגדרת המצב הנוכחי
                state = (soil_bin, crop_stage, et0_bin, rain_bin)
                
                # אתחול ה-env לפי המצב הנוכחי
                env.soil_moisture = state[0] * 0.5  # לדוגמה, אם soil_bin = 1 אז רטיבות 0.5
                env.crop_stage = state[1]
                env.current_step = 0  # אתחול הזמן למצב התחלה
                env.current_et0 = state[2] * 8.0  # לדוגמה, אם et0_bin = 1 אז et0 = 8
                env.current_rain = state[3] * 50.0  # אם rain_bin = 1 אז גשם = 50

                for action in actions:
                    # הפעלת פעולה על המצב הנוכחי
                    next_state, reward, terminated, truncated, _ = env.step(action)

                    # הדפסת המעבר (המצב הנוכחי, פעולה, תגמול, מצב הבא)
                    print(f"State {state} - Action {action} -> Reward {reward:.2f} -> Next State {next_state}")


State (0, 0, 0, 0) - Action 0 -> Reward -0.60 -> Next State {'soil_moisture': array([0.], dtype=float32), 'crop_stage': 0, 'rain': array([0.0983193], dtype=float32), 'et0': array([0.8345703], dtype=float32)}
State (0, 0, 0, 0) - Action 1 -> Reward -0.97 -> Next State {'soil_moisture': array([0.06577684], dtype=float32), 'crop_stage': 0, 'rain': array([0.11071205], dtype=float32), 'et0': array([0.35413218], dtype=float32)}
State (0, 0, 0, 0) - Action 2 -> Reward -1.59 -> Next State {'soil_moisture': array([0.25696757], dtype=float32), 'crop_stage': 0, 'rain': array([0.6736061], dtype=float32), 'et0': array([0.77843297], dtype=float32)}
State (0, 0, 0, 1) - Action 0 -> Reward 1.00 -> Next State {'soil_moisture': array([0.5], dtype=float32), 'crop_stage': 0, 'rain': array([0.12238463], dtype=float32), 'et0': array([0.6364257], dtype=float32)}
State (0, 0, 0, 1) - Action 1 -> Reward 0.50 -> Next State {'soil_moisture': array([0.58573526], dtype=float32), 'crop_stage': 0, 'rain': array([0.0