# FPL Model Building
We'll build a model using Keras and set it up as a package so we can train both locally and on the cloud.

We need to create a model as a Python package, so we'll need an `__init__.py` to identify the directory as a package, a `model.py` to hold the model code, and a `task.py` to pass command line parameters to our model.We need to create a model as a Python package, so we'll need an `__init__.py` to identify the directory as a package, a `model.py` to hold the model code, and a `task.py` to pass command line parameters to our model.

In [1]:
!pip install google-cloud-storage
!pip install tensorflow

35.61s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




46.64s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




In [2]:
from google.cloud import storage

In [3]:
PROJECT = !gcloud config get-value project
PROJECT = PROJECT[0]
BUCKET = PROJECT
REGION = "europe-west1"

OUTDIR = f"gs://{BUCKET}/fpl/data"

%env PROJECT=$PROJECT
%env BUCKET=$BUCKET
%env REGION=$REGION
%env OUTDIR=$OUTDIR
%env TFVERSION=2.8

env: PROJECT=bf-fpl-pred-080723
env: BUCKET=bf-fpl-pred-080723
env: REGION=europe-west1
env: OUTDIR=gs://bf-fpl-pred-080723/fpl/data
env: TFVERSION=2.8


In [8]:
%%writefile ./trainer/model.py
"""Data prep, train and evaluate DNN model."""

import logging
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import callbacks, models
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Input,
)

logging.info(tf.version.VERSION)

# TODO: Parametrise the column list to use the same package for all positions.
CSV_COLUMNS = [
    "hash_id",
    "element_code",
    "season_name",
    "next_season_points",
    "minutes",
    "goals_scored",
    "assists",
    "clean_sheets",
    "penalties_missed",
    "bps",
    "yellow_threshold",
    "red_cards",
    "own_goals",
    "influence",
    "creativity",
    "threat",
    "start_cost",
    "end_cost"
]

LABEL_COLUMN = "next_season_points"
DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
UNWANTED_COLS = ["hash_id", "element_code", "season_name"]

INPUT_COLS = [
    c for c in CSV_COLUMNS if c != LABEL_COLUMN and c not in UNWANTED_COLS
]

def features_and_labels(row_data):
    for unwanted_col in UNWANTED_COLS:
        row_data.pop(unwanted_col)
    label = row_data.pop(LABEL_COLUMN)
    return row_data, label


def load_dataset(pattern, batch_size, num_repeat):
    dataset = tf.data.experimental.make_csv_dataset(
        file_pattern=pattern,
        batch_size=batch_size,
        column_names=CSV_COLUMNS,
        column_defaults=DEFAULTS,
        num_epochs=num_repeat,
        shuffle_buffer_size=1000000,
    )
    return dataset.map(features_and_labels)


def create_train_dataset(pattern, batch_size):
    dataset = load_dataset(pattern, batch_size, num_repeat=None)
    return dataset.prefetch(1)


def create_eval_dataset(pattern, batch_size):
    dataset = load_dataset(pattern, batch_size, num_repeat=1)
    return dataset.prefetch(1)


def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))


def build_dnn_model(nnsize, lr):
    inputs = {
        colname: Input(name=colname, shape=(1,), dtype="float32")
        for colname in INPUT_COLS
    }

    # Concatenate numeric inputs
    dnn_inputs = Concatenate()(list(inputs.values()))

    x = dnn_inputs
    for layer, nodes in enumerate(nnsize):
        x = Dense(nodes, activation="relu", name=f"h{layer}")(x)
    output = Dense(1, name="next_sason_points")(x)

    model = models.Model(inputs, output)

    lr_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=lr_optimizer, loss="mse", metrics=[rmse, "mse"])

    return model


def train_and_evaluate(hparams):
    # TODO 1b
    batch_size = hparams["batch_size"]
    lr = hparams["lr"]
    nnsize = [int(s) for s in hparams["nnsize"].split()]
    eval_data_path = hparams["eval_data_path"]
    num_evals = hparams["num_evals"]
    num_examples_to_train_on = hparams["num_examples_to_train_on"]
    output_dir = hparams["output_dir"]
    train_data_path = hparams["train_data_path"]

    model_export_path = os.path.join(output_dir, "savedmodel")
    checkpoint_path = os.path.join(output_dir, "checkpoints")
    tensorboard_path = os.path.join(output_dir, "tensorboard")

    if tf.io.gfile.exists(output_dir):
        tf.io.gfile.rmtree(output_dir)

    model = build_dnn_model(nnsize, lr)
    logging.info(model.summary())

    trainds = create_train_dataset(train_data_path, batch_size)
    evalds = create_eval_dataset(eval_data_path, batch_size)

    steps_per_epoch = num_examples_to_train_on // (batch_size * num_evals)

    checkpoint_cb = callbacks.ModelCheckpoint(
        checkpoint_path, save_weights_only=True, verbose=1
    )
    tensorboard_cb = callbacks.TensorBoard(tensorboard_path, histogram_freq=1)

    history = model.fit(
        trainds,
        validation_data=evalds,
        epochs=num_evals,
        steps_per_epoch=max(1, steps_per_epoch),
        verbose=2,  # 0=silent, 1=progress bar, 2=one line per epoch
        callbacks=[checkpoint_cb, tensorboard_cb],
    )

    # Exporting the model with default serving function.
    model.save(model_export_path)
    return history

Overwriting ./trainer/model.py


In [5]:
%%writefile ./trainer/task.py
"""Argument definitions for model training code in `trainer.model`."""
# TODO: Add CSV_COLUMNS.

import argparse

from trainer import model

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch_size",
        help="Batch size for training steps",
        type=int,
        default=32,
    )
    parser.add_argument(
        "--eval_data_path",
        help="GCS location pattern of eval files",
        required=True,
    )
    parser.add_argument(
        "--nnsize",
        help="Hidden layer sizes (provide space-separated sizes)",
        default="32 8",
    )
    parser.add_argument(
        "--lr", help="learning rate for optimizer", type=float, default=0.001
    )
    parser.add_argument(
        "--num_evals",
        help="Number of times to evaluate model on eval data training.",
        type=int,
        default=5,
    )
    parser.add_argument(
        "--num_examples_to_train_on",
        help="Number of examples to train on.",
        type=int,
        default=100,
    )
    parser.add_argument(
        "--output_dir",
        help="GCS location to write checkpoints and export models",
        required=True,
    )
    parser.add_argument(
        "--train_data_path",
        help="GCS location pattern of train files containing eval URLs",
        required=True,
    )
    args = parser.parse_args()
    hparams = args.__dict__

    model.train_and_evaluate(hparams)

Overwriting ./trainer/task.py


We'll move the files over to GCS for organisational purposes.

In [6]:
def copy_file_to_gcs(bucket_name, file_path, local_file_path):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    blob = bucket.blob(file_path)
    blob.upload_from_filename(local_file_path)

    print(f'File "{local_file_path}" has been copied to "{file_path}" in the bucket "{bucket_name}".')

copy_file_to_gcs(BUCKET, 'fpl/trainer/__init__.py', './trainer/__init__.py')
copy_file_to_gcs(BUCKET, 'fpl/trainer/model.py', './trainer/model.py')
copy_file_to_gcs(BUCKET, 'fpl/trainer/task.py', './trainer/task.py')

File "./trainer/__init__.py" has been copied to "fpl/trainer/__init__.py" in the bucket "bf-fpl-pred-080723".
File "./trainer/model.py" has been copied to "fpl/trainer/model.py" in the bucket "bf-fpl-pred-080723".
File "./trainer/task.py" has been copied to "fpl/trainer/task.py" in the bucket "bf-fpl-pred-080723".


In [7]:
%%bash

EVAL_DATA_PATH=gs://bf-fpl-pred-080723/fpl/data/mid-test*
TRAIN_DATA_PATH=gs://bf-fpl-pred-080723/fpl/data/mid-train*
OUTPUT_DIR=./fpl-model

test ${OUTPUT_DIR} && rm -rf ${OUTPUT_DIR}
export PYTHONPATH=${PYTHONPATH}:${PWD}/taxifare
    
# Run the trainer module package locally with 1 eval

python3 -m trainer.task \
--eval_data_path $EVAL_DATA_PATH \
--output_dir $OUTPUT_DIR \
--train_data_path $TRAIN_DATA_PATH \
--batch_size 5 \
--num_examples_to_train_on 100 \
--num_evals 1 \
--lr 0.001 \
--nnsize "32 8"

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 minutes (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 goals_scored (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 assists (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 clean_sheets (InputLayer)      [(None, 1)]          0           []                               
                                                                                              

2023-07-27 14:36:21.496603: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-27 14:36:21.701326: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-07-27 14:36:25.105350: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-07-27 14:36:25.105709: W tensorflow/strea

CalledProcessError: Command 'b'\nEVAL_DATA_PATH=gs://bf-fpl-pred-080723/fpl/data/mid-test*\nTRAIN_DATA_PATH=gs://bf-fpl-pred-080723/fpl/data/mid-train*\nOUTPUT_DIR=./fpl-model\n\ntest ${OUTPUT_DIR} && rm -rf ${OUTPUT_DIR}\nexport PYTHONPATH=${PYTHONPATH}:${PWD}/taxifare\n    \n# Run the trainer module package locally with 1 eval\n\npython3 -m trainer.task \\\n--eval_data_path $EVAL_DATA_PATH \\\n--output_dir $OUTPUT_DIR \\\n--train_data_path $TRAIN_DATA_PATH \\\n--batch_size 5 \\\n--num_examples_to_train_on 100 \\\n--num_evals 1 \\\n--lr 0.001 \\\n--nnsize "32 8"\n'' returned non-zero exit status 1.

In [1]:
CSV_COLUMNS = [
    "hash_id",
    "element_code",
    "season_name",
    "next_season_points",
    "minutes",
    "goals_scored",
    "assists",
    "clean_sheets",
    "penalties_missed",
    "bps",
    "yellow_threshold",
    "red_cards",
    "own_goals",
    "influence",
    "creativity",
    "threat",
    "start_cost",
    "end_cost"
]

LABEL_COLUMN = "next_season_points"
DEFAULTS = [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]
UNWANTED_COLS = ["hash_id", "element_code", "season_name"]

INPUT_COLS = [
    c for c in CSV_COLUMNS if c != LABEL_COLUMN and c not in UNWANTED_COLS
]
INPUT_COLS

['minutes',
 'goals_scored',
 'assists',
 'clean_sheets',
 'penalties_missed',
 'bps',
 'yellow_threshold',
 'red_cards',
 'own_goals',
 'influence',
 'creativity',
 'threat',
 'start_cost',
 'end_cost']

['minutes',
 'goals_scored',
 'assists',
 'clean_sheets',
 'penalties_missed',
 'bps',
 'yellow_threshold',
 'red_cards',
 'own_goals',
 'influence',
 'creativity',
 'threat',
 'start_cost',
 'end_cost']