# 🚀 RL Football - Fast GPU Training

**Vectorized trainer with 32 parallel environments (set by `N_ENVS` in the configuration cell)**

1. Set Runtime → Change runtime type → GPU (T4)
2. Run all cells
3. Download `trained.json` when done

**Expected speed: 15-30 episodes/sec on T4**

In [None]:
# Setup & GPU check
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import json
import time

# Report the TensorFlow version and the GPUs TensorFlow can see, so a
# mis-configured (CPU-only) runtime is noticed before a long training run.
print(f'TensorFlow: {tf.__version__}')
gpus = tf.config.list_physical_devices('GPU')  # empty list when no GPU is visible
print(f'GPU: {gpus}')
if gpus:
    print('‚úÖ GPU detected - training will be fast!')
else:
    # Training still runs without a GPU; this only tells the user how to enable one.
    print('‚ö†Ô∏è No GPU - go to Runtime ‚Üí Change runtime type ‚Üí GPU')

In [None]:
# Vectorized Game - runs N parallel environments with NumPy
class VectorizedGame:
    """Lock-step NumPy simulation of ``n_envs`` independent 1-v-1 matches.

    ``blip`` (starts on the left), ``bloop`` (starts on the right) and
    ``ball`` are each an ``(n, 4)`` float array whose columns are
    ``[x, y, vx, vy]``.  ``scores`` is ``(n, 2)`` with column 0 = blip and
    column 1 = bloop.  Every environment shares the same clock, so all of
    them reach ``done`` on the same step.

    Action ids: 0-7 accelerate in one of eight directions (see ``MOVES``),
    8 is "kick" (a harder hit if the player touches the ball this step) and
    9 is idle.
    """

    # Direction per action id.  Rows 8 and 9 are zero so the table can be
    # indexed by any valid action id; built once instead of on every step.
    MOVES = np.array(
        [[0, -1], [0, 1], [-1, 0], [1, 0],
         [-1, -1], [1, -1], [-1, 1], [1, 1],
         [0, 0], [0, 0]],
        dtype=np.float32,
    )
    KICK_ACTION = 8

    ACCEL = 2                # velocity added per step by a move action
    MAX_SPEED = 4.0          # player speed cap
    PLAYER_FRICTION = 0.85   # per-step velocity decay for players
    BALL_FRICTION = 0.98     # per-step velocity decay for the ball
    BOUNCE = -0.8            # velocity multiplier when the ball hits a wall
    PLAYER_MARGIN = 25       # players are kept this far from every edge
    BALL_MARGIN = 12         # ball is kept this far from non-goal edges
    CONTACT_DIST = 37        # centre distance at which player and ball touch
    TOUCH_POWER = 6          # ball speed imparted by a plain touch
    KICK_POWER = 12          # ball speed imparted by a touch while kicking
    GOAL_Y_MIN, GOAL_Y_MAX = 150, 270   # vertical extent of both goal mouths
    MATCH_SECONDS = 30.0
    DT = 1 / 60              # simulated seconds per step
    DIST_NORM = 830          # divisor used to normalise the distance feature

    def __init__(self, n_envs=16):
        """Create ``n_envs`` environments on a 720x420 pitch and reset them."""
        self.n = n_envs
        self.W, self.H = 720, 420
        self.reset_all()

    def reset_all(self):
        """Start a fresh match in every environment (positions, scores, clock)."""
        self.blip = np.tile([120., 210., 0., 0.], (self.n, 1))
        self.bloop = np.tile([600., 210., 0., 0.], (self.n, 1))
        self.ball = np.tile([360., 210., 0., 0.], (self.n, 1))
        self.scores = np.zeros((self.n, 2), dtype=np.int32)
        self.time = np.full(self.n, self.MATCH_SECONDS)
        self.kick_flags = np.zeros((self.n, 2), dtype=np.int32)
        self.done = np.zeros(self.n, dtype=bool)

    def reset_positions(self, mask):
        """Put players and ball back on their kick-off spots where ``mask`` is set.

        The ball's velocity is zeroed as well.
        NOTE(review): the players' velocity columns are left untouched, so
        they keep their momentum after a goal - confirm this is intended.
        """
        self.blip[mask, :2] = [120., 210.]
        self.bloop[mask, :2] = [600., 210.]
        self.ball[mask] = [360., 210., 0., 0.]

    def step(self, actions_blip, actions_bloop):
        """Advance every environment by one tick.

        Args:
            actions_blip: length-``n`` integer action ids for blip.
            actions_bloop: length-``n`` integer action ids for bloop.

        Returns:
            ``(events, done)``: ``events`` is a length-``n`` object array
            holding ``'W'`` where blip scored, ``'L'`` where bloop scored and
            ``None`` elsewhere; ``done`` is a copy of the boolean "clock has
            run out" flags.
        """
        players = (self.blip, self.bloop)
        # Order matters: both players accelerate, then move, then the ball
        # moves and bounces, then contacts are resolved (blip first).
        for slot, actions in enumerate((actions_blip, actions_bloop)):
            self._apply_actions(players[slot], np.asarray(actions), slot)
        for player in players:
            self._advance_player(player)
        in_goal = self._advance_ball()
        for slot, player in enumerate(players):
            self._resolve_contact(player, slot)
        self.kick_flags[:] = 0  # a kick only counts for the step it was issued
        events = self._score_goals(in_goal)

        self.time -= self.DT
        self.done = self.time <= 0
        return events, self.done.copy()

    def _apply_actions(self, player, actions, slot):
        """Accelerate ``player`` per its actions, record kicks, cap its speed."""
        moving = actions < self.KICK_ACTION
        if np.any(moving):
            dirs = self.MOVES[actions[moving]]
            player[moving, 2] += dirs[:, 0] * self.ACCEL
            player[moving, 3] += dirs[:, 1] * self.ACCEL
        self.kick_flags[:, slot] = (actions == self.KICK_ACTION).astype(np.int32)
        speed = np.sqrt(player[:, 2]**2 + player[:, 3]**2)
        too_fast = speed > self.MAX_SPEED
        if np.any(too_fast):
            scale = self.MAX_SPEED / speed[too_fast]
            player[too_fast, 2] *= scale
            player[too_fast, 3] *= scale

    def _advance_player(self, player):
        """Integrate one player's position, apply friction, clamp to the pitch."""
        player[:, 0] += player[:, 2]
        player[:, 1] += player[:, 3]
        player[:, 2:4] *= self.PLAYER_FRICTION
        player[:, 0] = np.clip(player[:, 0], self.PLAYER_MARGIN, self.W - self.PLAYER_MARGIN)
        player[:, 1] = np.clip(player[:, 1], self.PLAYER_MARGIN, self.H - self.PLAYER_MARGIN)

    def _advance_ball(self):
        """Move the ball, bounce it off the walls and return the goal-mouth mask.

        The returned boolean array marks environments whose ball is vertically
        inside the goal mouth; the side walls do not bounce the ball there.
        """
        ball = self.ball
        ball[:, 0] += ball[:, 2]
        ball[:, 1] += ball[:, 3]
        ball[:, 2:4] *= self.BALL_FRICTION

        top = ball[:, 1] < self.BALL_MARGIN
        ball[top, 1] = self.BALL_MARGIN
        ball[top, 3] *= self.BOUNCE
        bottom = ball[:, 1] > self.H - self.BALL_MARGIN
        ball[bottom, 1] = self.H - self.BALL_MARGIN
        ball[bottom, 3] *= self.BOUNCE

        in_goal = (ball[:, 1] > self.GOAL_Y_MIN) & (ball[:, 1] < self.GOAL_Y_MAX)
        left = (ball[:, 0] < self.BALL_MARGIN) & ~in_goal
        ball[left, 0] = self.BALL_MARGIN
        ball[left, 2] *= self.BOUNCE
        right = (ball[:, 0] > self.W - self.BALL_MARGIN) & ~in_goal
        ball[right, 0] = self.W - self.BALL_MARGIN
        ball[right, 2] *= self.BOUNCE
        return in_goal

    def _resolve_contact(self, player, slot):
        """Push the ball out of ``player`` and launch it along the contact normal."""
        dx = self.ball[:, 0] - player[:, 0]
        dy = self.ball[:, 1] - player[:, 1]
        dist = np.sqrt(dx**2 + dy**2)
        # dist == 0 is excluded because the contact normal would be undefined.
        touching = (dist > 0) & (dist < self.CONTACT_DIST)
        if not np.any(touching):
            return
        nx = dx[touching] / dist[touching]
        ny = dy[touching] / dist[touching]
        self.ball[touching, 0] = player[touching, 0] + nx * self.CONTACT_DIST
        self.ball[touching, 1] = player[touching, 1] + ny * self.CONTACT_DIST
        power = np.where(self.kick_flags[touching, slot], self.KICK_POWER, self.TOUCH_POWER)
        self.ball[touching, 2] = nx * power + player[touching, 2] * 0.5
        self.ball[touching, 3] = ny * power + player[touching, 3] * 0.5

    def _score_goals(self, in_goal):
        """Credit goals, reset scored environments and return the event array."""
        events = np.full(self.n, None, dtype=object)
        bloop_goal = in_goal & (self.ball[:, 0] < 0)
        if np.any(bloop_goal):
            self.scores[bloop_goal, 1] += 1
            events[bloop_goal] = 'L'
            self.reset_positions(bloop_goal)
        # Evaluated after the reset above, so an environment cannot score twice.
        blip_goal = in_goal & (self.ball[:, 0] > self.W)
        if np.any(blip_goal):
            self.scores[blip_goal, 0] += 1
            events[blip_goal] = 'W'
            self.reset_positions(blip_goal)
        return events

    def get_states(self, team=0):
        """Return the ``(n, 12)`` float32 observation for one team.

        Columns: own x, own y, ball x, ball y (each divided by pitch size),
        ball vx, ball vy (divided by 15 and clipped to [-1, 1]), opponent x,
        opponent y, own distance to the ball divided by ``DIST_NORM``, then
        three columns that are always zero.  ``team == 0`` observes from
        blip's side; any other value observes from bloop's side.
        """
        me, other = (self.blip, self.bloop) if team == 0 else (self.bloop, self.blip)
        ball = self.ball
        dist = np.sqrt((me[:, 0] - ball[:, 0])**2 + (me[:, 1] - ball[:, 1])**2) / self.DIST_NORM
        return np.stack([
            me[:, 0]/self.W, me[:, 1]/self.H, ball[:, 0]/self.W, ball[:, 1]/self.H,
            np.clip(ball[:, 2]/15, -1, 1), np.clip(ball[:, 3]/15, -1, 1),
            other[:, 0]/self.W, other[:, 1]/self.H, dist,
            np.zeros(self.n), np.zeros(self.n), np.zeros(self.n)
        ], axis=1).astype(np.float32)

# Cell-completion marker so the notebook user can see this cell has run.
print('‚úÖ VectorizedGame ready')

In [None]:
# Simple AI opponent (vectorized)
# Action id for every (mx, my) step direction, indexed as [mx + 1, my + 1].
# The centre entry (no horizontal or vertical offset) maps to 9 = idle.
_AI_MOVE_TABLE = np.array(
    [[4, 2, 6],   # mx = -1: up-left, left, down-left
     [0, 9, 1],   # mx =  0: up, idle, down
     [5, 3, 7]],  # mx = +1: up-right, right, down-right
    dtype=np.int32,
)


def simple_ai_actions(states):
    """Scripted opponent: run towards the ball, kick when very close.

    Args:
        states: ``(n, 12)`` observation array; columns 0-1 are the player's
            normalised x/y, columns 2-3 the ball's, column 8 the normalised
            player-ball distance.

    Returns:
        ``(n,)`` int32 array of action ids: 8 (kick) where the distance is
        below 0.04; otherwise the move action that steps towards the ball on
        each axis whose offset exceeds 0.02, or 9 (idle) when neither does.
    """
    states = np.asarray(states)
    dist = states[:, 8]
    dx = states[:, 2] - states[:, 0]
    dy = states[:, 3] - states[:, 1]

    # Per-axis step in {-1, 0, +1}; offsets within the 0.02 dead-zone give 0.
    mx = (dx > 0.02).astype(np.intp) - (dx < -0.02).astype(np.intp)
    my = (dy > 0.02).astype(np.intp) - (dy < -0.02).astype(np.intp)
    chase_action = _AI_MOVE_TABLE[mx + 1, my + 1]

    # Kick when close; rows that are neither "close" nor "far" (NaN distance)
    # stay idle.
    actions = np.where(dist < 0.04, 8, 9)
    actions = np.where(dist >= 0.04, chase_action, actions)
    return actions.astype(np.int32)

# Cell-completion marker so the notebook user can see this cell has run.
print('‚úÖ SimpleAI ready')

In [None]:
# Dueling DQN model
def create_model(lr=0.0005):
    """Build and compile the dueling Q-network.

    A 12-feature input feeds a shared 256-256-128 ReLU trunk that splits into
    a scalar value stream and a 10-way advantage stream (each behind its own
    64-unit ReLU layer).  The output is ``value + (advantage - mean(advantage))``,
    one Q-value per action.  The model is compiled with Adam(``lr``) and MSE.
    """
    def relu_dense(units, tensor):
        return layers.Dense(units, activation='relu', kernel_initializer='he_normal')(tensor)

    def linear_dense(units, tensor):
        return layers.Dense(units, kernel_initializer='he_normal')(tensor)

    inputs = layers.Input(shape=(12,))

    # Layers are created in the same order as before so that
    # model.get_weights() keeps its ordering.
    trunk = inputs
    for width in (256, 256, 128):
        trunk = relu_dense(width, trunk)

    value = relu_dense(64, trunk)
    value = linear_dense(1, value)

    advantage = relu_dense(64, trunk)
    advantage = linear_dense(10, advantage)

    advantage_mean = layers.Lambda(lambda t: tf.reduce_mean(t, axis=1, keepdims=True))(advantage)
    q_values = layers.Add()([value, layers.Subtract()([advantage, advantage_mean])])

    network = keras.Model(inputs, q_values)
    network.compile(optimizer=keras.optimizers.Adam(lr), loss='mse')
    return network

# Compiled TensorFlow functions for speed
# reduce_retracing lets TensorFlow generalise the traced input shape rather
# than building a new graph for every distinct batch size; the training loop
# calls this with a different number of rows on most steps.
@tf.function(reduce_retracing=True)
def predict_batch(model, states):
    """Return ``model``'s Q-values for a batch of states (inference mode).

    Args:
        model: Keras model to evaluate.
        states: batch of observations accepted by ``model``.

    Returns:
        The tensor produced by ``model(states, training=False)``.
    """
    return model(states, training=False)

@tf.function
def train_step(model, optimizer, states, targets):
    """Run one gradient step on the mean squared error between Q-values and targets.

    Returns the scalar loss tensor computed before the weights were updated.
    """
    variables = model.trainable_variables
    with tf.GradientTape() as tape:
        predictions = model(states, training=True)
        mse = tf.reduce_mean(tf.square(predictions - targets))
    gradients = tape.gradient(mse, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return mse

# Cell-completion marker so the notebook user can see this cell has run.
print('‚úÖ DQN model ready')

In [None]:
# Efficient replay buffer
class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done).

    Storage is pre-allocated as NumPy arrays; once ``cap`` transitions have
    been written, new ones overwrite the oldest.
    """

    def __init__(self, cap=50000):
        self.cap = cap
        self.states = np.zeros((cap, 12), dtype=np.float32)
        self.actions = np.zeros(cap, dtype=np.int32)
        self.rewards = np.zeros(cap, dtype=np.float32)
        self.next_states = np.zeros((cap, 12), dtype=np.float32)
        self.dones = np.zeros(cap, dtype=bool)
        self.idx = 0    # slot the next transition will be written to
        self.size = 0   # number of valid transitions currently stored

    def add_batch(self, s, a, r, s2, d):
        """Append a batch of transitions in one vectorized write.

        Args:
            s, a, r, s2, d: equally long sequences of states, actions,
                rewards, next states and done flags.  An empty batch is a
                no-op.
        """
        count = len(s)
        if count == 0:
            return
        s, a, r, s2, d = (np.asarray(x) for x in (s, a, r, s2, d))

        if count > self.cap:
            # Only the last `cap` rows could survive a sequential write, so
            # skip the rest while still advancing the cursor past them.
            skipped = count - self.cap
            self.idx = (self.idx + skipped) % self.cap
            s, a, r, s2, d = s[skipped:], a[skipped:], r[skipped:], s2[skipped:], d[skipped:]
            count = self.cap

        slots = (self.idx + np.arange(count)) % self.cap  # wraps past the end
        self.states[slots] = s
        self.actions[slots] = a
        self.rewards[slots] = r
        self.next_states[slots] = s2
        self.dones[slots] = d
        self.idx = (self.idx + count) % self.cap
        self.size = min(self.size + count, self.cap)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)``.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (sampling is without replacement).
        """
        picks = np.random.choice(self.size, batch_size, replace=False)
        return (self.states[picks], self.actions[picks], self.rewards[picks],
                self.next_states[picks], self.dones[picks])

# Reward calculation
def calc_rewards(game, events, last_d):
    """Compute blip's shaped reward for the step that just ran.

    Args:
        game: object exposing ``blip``, ``ball``, ``bloop`` as ``(n, 4)``
            ``[x, y, vx, vy]`` arrays plus ``n``, ``W`` and ``H``.
        events: length-``n`` array with ``'W'`` where blip scored, ``'L'``
            where bloop scored.
        last_d: blip-ball distances returned by the previous call, or
            ``None`` on the first step of an episode.

    Returns:
        ``(rewards, d)``: float32 rewards of shape ``(n,)`` and the current
        blip-ball distances, to be passed back in as ``last_d`` next step.

    NOTE(review): the training loop calls this after ``game.step``, which has
    already reset positions in environments where a goal was scored, so ``d``
    and the progress bonus there reflect the kick-off layout - confirm this
    is acceptable.
    """
    p, b, o = game.blip, game.ball, game.bloop
    # Pitch-relative thresholds are derived from the game's own dimensions
    # rather than repeating 720 / 640 / 340.
    right_edge = game.W
    corner_x_max = game.W - 80
    corner_y_max = game.H - 80

    d = np.sqrt((p[:, 0] - b[:, 0])**2 + (p[:, 1] - b[:, 1])**2)
    r = np.zeros(game.n, dtype=np.float32)

    # Goal outcomes dominate every shaping term below.
    r[events == 'W'] += 500
    r[events == 'L'] -= 300

    # Proximity to the ball (830 is the same divisor get_states uses).
    r += (1 - d/830) * 5
    r[d < 40] += 10

    # Progress towards the ball since the previous step.
    if last_d is not None:
        delta = last_d - d
        r += delta * 0.5
        r[delta > 2] += 3

    # Discourage standing still away from the ball; reward moving.
    sp = np.sqrt(p[:, 2]**2 + p[:, 3]**2)
    r[(sp < 0.5) & (d > 50)] -= 8
    r[sp > 1] += 1

    # Ball travelling towards the right side while blip is near it, and ball
    # within 100 px of the right edge (the side blip scores on).
    r[(b[:, 2] > 2) & (d < 80)] += 8
    r[np.abs(b[:, 0] - right_edge) < 100] += 5

    # Penalise hiding in a corner, more so when far from the ball.
    corner = ((p[:, 0] < 80) | (p[:, 0] > corner_x_max)) & ((p[:, 1] < 80) | (p[:, 1] > corner_y_max))
    r[corner] -= 5
    r[corner & (d > 100)] -= 5

    # Tiered penalty for being far from the ball.
    r[d > 300] -= 5
    r[(d > 200) & (d <= 300)] -= 3
    r[(d > 150) & (d <= 200)] -= 1

    # Opponent is closer to the ball than blip.
    od = np.sqrt((o[:, 0] - b[:, 0])**2 + (o[:, 1] - b[:, 1])**2)
    r[(od < d) & (d > 60)] -= 2

    r -= 0.1  # small per-step cost
    return r, d

# Cell-completion marker so the notebook user can see this cell has run.
print('‚úÖ Buffer & rewards ready')

In [None]:
# Training configuration
# Module-level hyperparameters read by train().
EPISODES = 100000  # Change this for shorter/longer training
N_ENVS = 32        # Parallel environments (increase for faster training)
BATCH_SIZE = 64    # Transitions sampled from the replay buffer per gradient step
GAMMA = 0.995      # Discount factor applied to the bootstrapped next-state value
EPS_DECAY = 0.9999 # Multiplicative epsilon decay, applied once per finished episode
EPS_MIN = 0.02     # Floor for the exploration rate
TARGET_UPDATE = 500   # Gradient steps between copies of the online weights into the target net
SAVE_EVERY = 10000    # Approximate number of episodes between checkpoint files

print(f'Training {EPISODES} episodes with {N_ENVS} parallel environments')

In [None]:
# Main training loop
def train():
    """Train blip with Double DQN against the scripted opponent.

    Runs ``N_ENVS`` matches in lock-step until at least ``EPISODES`` matches
    have finished.  Each environment step adds the transitions to a replay
    buffer and, once the buffer holds 500 of them, performs one gradient
    step.  Checkpoints are written roughly every ``SAVE_EVERY`` episodes and
    the final weights go to ``trained.json``.

    Returns:
        ``(model, stats)``: the trained online network and a dict with the
        win/loss/draw counts (from blip's side) and the total goals scored.
    """
    print('='*60)
    print('üöÄ Starting GPU-accelerated training')
    print('='*60)

    game = VectorizedGame(N_ENVS)
    model = create_model()
    target = create_model()
    target.set_weights(model.get_weights())
    optimizer = keras.optimizers.Adam(0.0005)
    buffer = ReplayBuffer(50000)

    eps = 1.0
    step_count = 0
    stats = {'W': 0, 'L': 0, 'D': 0, 'goals': 0}
    t0 = time.time()
    total_ep = 0
    batch_rows = np.arange(BATCH_SIZE)  # row index for per-sample gathers/scatters

    while total_ep < EPISODES:
        game.reset_all()
        last_d = None

        while not np.all(game.done):
            active = ~game.done
            s1 = game.get_states(0)
            s_opp = game.get_states(1)

            # Epsilon-greedy action selection for blip.
            a1 = np.zeros(game.n, dtype=np.int32)
            rand = np.random.random(game.n) < eps
            a1[rand] = np.random.randint(0, 10, size=np.sum(rand))
            if np.any(~rand):
                q = predict_batch(model, s1[~rand]).numpy()
                a1[~rand] = np.argmax(q, axis=1)
            a2 = simple_ai_actions(s_opp)

            events, dones = game.step(a1, a2)
            rewards, last_d = calc_rewards(game, events, last_d)
            new_s = game.get_states(0)

            buffer.add_batch(s1[active], a1[active], rewards[active], new_s[active], dones[active])
            # `events` is an object array: `!= None` compares element-wise,
            # whereas `is not None` would test the array object itself.
            stats['goals'] += int(np.count_nonzero(events != None))  # noqa: E711

            # One Double-DQN update once enough experience is stored.
            if buffer.size >= 500:
                s, a, r, s_next, d = buffer.sample(BATCH_SIZE)
                q_main = predict_batch(model, s_next).numpy()
                q_target = predict_batch(target, s_next).numpy()
                # The online net picks the next action, the target net scores it.
                best = np.argmax(q_main, axis=1)
                next_q = q_target[batch_rows, best]

                # Start from the current predictions so only the taken
                # action's output contributes to the loss.
                targets = predict_batch(model, s).numpy()
                targets[batch_rows, a] = np.where(d, r, r + GAMMA * next_q)

                train_step(model, optimizer, tf.constant(s), tf.constant(targets))
                step_count += 1

                if step_count % TARGET_UPDATE == 0:
                    target.set_weights(model.get_weights())

        # Episode end: tally results from blip's point of view.
        blip_score, bloop_score = game.scores[:, 0], game.scores[:, 1]
        wins = int(np.count_nonzero(blip_score > bloop_score))
        losses = int(np.count_nonzero(bloop_score > blip_score))
        stats['W'] += wins
        stats['L'] += losses
        stats['D'] += game.n - wins - losses

        total_ep += N_ENVS
        # One decay per finished episode, applied in closed form.
        eps = max(EPS_MIN, eps * EPS_DECAY ** N_ENVS)

        # Progress (first batch, then about every 100 batches of episodes)
        if total_ep % (100 * N_ENVS) < N_ENVS or total_ep <= N_ENVS:
            el = time.time() - t0
            sp = total_ep / el if el > 0 else 0
            eta = (EPISODES - total_ep) / sp if sp > 0 else 0
            eta_s = f'{eta/3600:.1f}h' if eta > 3600 else f'{eta/60:.1f}m'
            print(f'Ep {total_ep}/{EPISODES} | Œµ:{eps:.4f} | W:{stats["W"]} L:{stats["L"]} D:{stats["D"]} | {sp:.1f}/s | ETA:{eta_s}')

        # Save checkpoint. total_ep advances in steps of N_ENVS, so test for
        # "just passed a multiple of SAVE_EVERY" rather than equality.
        if total_ep % SAVE_EVERY < N_ENVS:
            save_weights(model, eps, step_count, f'weights_{total_ep}.json')

    # Final save
    save_weights(model, eps, step_count, 'trained.json')
    print('='*60)
    print('‚úÖ Training complete!')
    print(f'Final: W:{stats["W"]} L:{stats["L"]} D:{stats["D"]} | Goals:{stats["goals"]}')
    print(f'Time: {(time.time()-t0)/3600:.2f} hours')
    print('='*60)
    return model, stats

def save_weights(model, eps, steps, filename):
    """Write the model's weights and training progress to a JSON file.

    Each weight array is stored as its shape plus a flat list of values.  The
    same agent record (weights, epsilon, train-step count) is written under
    four keys: ``blipAgent``, ``bloopAgent``, ``blip`` and ``bloop``.
    """
    weights = []
    for tensor in model.get_weights():
        weights.append({'shape': list(tensor.shape), 'data': tensor.flatten().tolist()})

    data = {'version': 2, 'aiType': 'dqn'}
    for agent_key in ('blipAgent', 'bloopAgent', 'blip', 'bloop'):
        data[agent_key] = {
            'weights': weights,
            'epsilon': float(eps),
            'trainStepCount': int(steps),
        }

    with open(filename, 'w') as out:
        out.write(json.dumps(data))
    print(f'üíæ Saved {filename}')

# Cell-completion marker so the notebook user can see this cell has run.
print('‚úÖ Training function ready')

In [None]:
# üöÄ START TRAINING
# Runs the full training loop; `model` is the trained network and `stats`
# the win/loss/draw/goal counters. train() also writes trained.json.
model, stats = train()

In [None]:
# Download trained weights
# google.colab is imported here rather than at the top of the notebook, so
# only this download cell depends on it.
from google.colab import files
files.download('trained.json')  # the file written by train()'s final save