<a href="https://colab.research.google.com/github/cappelchi/calcio_notebooks/blob/main/draft/football_word2vec_train_multiclass_220820.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Project config

In [None]:
!pip install neptune-client neptune-tensorflow-keras

In [None]:
import neptune.new as neptune
#from neptune.new.integrations.tensorflow_keras import NeptuneCallback
def get_credential(frmwork = 'neptune_team'):
    """Look up a login/secret pair for ``frmwork`` in ``cred_andrey.txt``.

    The file is scanned line by line. The first line that contains
    ``frmwork`` and has at least three whitespace-separated fields is used;
    its second and third fields are returned.

    Args:
        frmwork: Substring identifying the credentials line to use.

    Returns:
        Tuple ``(login, psw)`` taken from the matching line.

    Raises:
        FileNotFoundError: ``cred_andrey.txt`` is not in the working directory.
        LookupError: no line mentioning ``frmwork`` carries both fields.
    """
    with open('cred_andrey.txt', 'r') as container:
        for line in container:
            if frmwork not in line:
                continue
            # split() with no argument collapses runs of spaces/tabs and
            # strips the trailing newline, so no manual '\n' handling is needed.
            fields = line.split()
            if len(fields) >= 3:
                return fields[1], fields[2]
    # Fail loudly here instead of returning None, which would only surface
    # later as an unpacking TypeError at the call site.
    raise LookupError(
        f"no '<name> <login> <secret>' entry for {frmwork!r} in cred_andrey.txt"
    )

In [None]:
#@title Set API key for neptune.ai
set_api = True #@param {type:"boolean"}
if set_api:
    # Read the Neptune login/token pair from the local credentials file.
    username, api_key = get_credential()
else:
    # Keep both names defined so later cells do not fail with NameError.
    # Per the Neptune docs, api_token=None makes the client read the
    # NEPTUNE_API_TOKEN environment variable instead.
    username, api_key = None, None

### Installations

In [None]:
#!pip install --upgrade gensim

### Downloads

In [None]:
# Local file names for the artifacts pulled from the Neptune project.
dataset_name = './dataset_npz.npz'
validation_dataset_name = './prem_validation.csv'
# Neptune field that holds the dataset archive used for this run.
dataset_version = 'data/dataset_val_prod_0818'
project = neptune.init_project(
    name="scomesse/football",
    api_token = api_key
    )
try:
    #project['data/dataset4train_y19_y22'].download(dataset_name)
    project['data/validation_prem_220818'].download(validation_dataset_name)
    project[dataset_version].download(dataset_name)
    w2v_model_link = project['data/word2vec_220811_link'].fetch()
    word2vec_params = project['data/word2vec_params'].fetch()
finally:
    # Stop the project even when a download/fetch fails, so the background
    # connection is not left open until the kernel is shut down.
    project.stop()

https://app.neptune.ai/scomesse/football/
Remember to stop your project once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/project#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
All 0 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/metadata


### Imports

In [None]:
import pandas as pd
import numpy as np
# Widen the pandas display limits used when frames are rendered below.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
# Record the library versions this notebook was executed with.
for _lib in (pd, np):
    print(_lib.__version__)

#import dask.dataframe as dd
import subprocess
from glob import glob
from tqdm import tqdm

1.3.5
1.21.6


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve

In [None]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from itertools import cycle

In [None]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from tensorflow.keras.losses import MeanSquaredError, MeanAbsoluteError
from keras.layers import Embedding, SimpleRNN, Dense, Bidirectional, GRU, LSTM
from keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping, Callback
from tensorflow.data import Dataset
#from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from keras.models import Sequential

### Code

#### Functions

In [None]:
def run_bash(bashCommand:str, nameCommand = ''):
    """Run ``bashCommand`` through the shell and report any failure.

    Standard error is captured; standard output is left attached to the
    parent process. A message is printed when the command wrote to stderr
    or exited with a non-zero status.

    Args:
        bashCommand: Command line passed to the shell as-is (not escaped).
        nameCommand: Label used as the prefix of the printed error message.

    Returns:
        None.
    """
    # stderr must be piped explicitly: without it communicate() returns
    # (None, None) and the error branch below can never trigger.
    process = subprocess.Popen(
        bashCommand,
        shell=True,
        stderr=subprocess.PIPE,
        text=True,
    )
    _, error = process.communicate()
    if error or process.returncode != 0:
        # Fall back to the exit status when the command failed silently.
        print(f'{nameCommand} error:\n', error or f'exit status {process.returncode}')

In [None]:
def plot_training_history(history):
    """Plot accuracy and loss curves for training and validation.

    Args:
        history: Mapping of metric name to per-epoch values. Must contain
            ``'loss'`` and ``'val_loss'``, plus accuracy under either
            ``'acc'``/``'val_acc'`` or ``'accuracy'``/``'val_accuracy'``.

    Returns:
        The plotly figure (also displayed via ``fig.show()``): accuracy in
        the left subplot, loss in the right one.

    Raises:
        KeyError: if a required metric is missing from ``history``.
    """
    acc = history['acc'] if 'acc' in history else history['accuracy']
    val_acc = history['val_acc'] if 'val_acc' in history else history['val_accuracy']
    loss = history['loss']
    val_loss = history['val_loss']
    epochs = tuple(range(len(acc)))

    fig = make_subplots(rows=1, cols=2, subplot_titles = ['Training and validation accuracy', 'Training and validation loss'])

    # (values, legend name, subplot column); order fixes the legend order.
    curves = (
        (acc, 'train accuracy', 1),
        (val_acc, 'validation accuracy', 1),
        (loss, 'train loss', 2),
        (val_loss, 'validation loss', 2),
    )
    for values, label, column in curves:
        fig.add_trace(
            go.Scatter(x = epochs, y = values, mode = 'lines+markers', name = label),
            row = 1, col = column,
        )

    fig.show()
    return fig

#### Load tensors

In [None]:
# Train, test, validation, part of the production dataset + odds for
# production + embeddings.
# NOTE(review): arrays are unpacked purely by their position in the archive;
# the unused slots are discarded. Confirm the order against the code that
# wrote the .npz file.
with np.load(dataset_name) as npz_archive:
    # The context manager closes the underlying file once every array has
    # been read into memory by the unpacking below.
    (
        X_train, X_test, X_validation,
        _, _, _,
        X_production,
        _,
        y_class_train, y_class_test, y_class_validation, y_class_production,
        Line_production, embedding_matrix,
    ) = npz_archive.values()

In [None]:
# Shapes of the four feature splits and the container type of the train split.
(
    X_train.shape,
    X_test.shape,
    X_validation.shape,
    X_production.shape,
    type(X_train),
)

((1595638, 12), (456113, 12), (95894, 12), (7992, 12), numpy.ndarray)

In [None]:
# Shapes of the four target splits and the container type of the train targets.
(
    y_class_train.shape,
    y_class_test.shape,
    y_class_validation.shape,
    y_class_production.shape,
    type(y_class_train),
)

((1595638, 3), (456113, 3), (95894, 3), (7992, 3), numpy.ndarray)

In [None]:
# Shape and container type of the embedding matrix loaded from the archive.
(
    embedding_matrix.shape,
    type(embedding_matrix),
)

((318979, 256), numpy.ndarray)

#### RNN

In [None]:
# Check for available GPUs and pick a distribution strategy accordingly.
gpus = tf.config.experimental.list_logical_devices("GPU")
gpu_names = [gpu.name for gpu in gpus]
if len(gpu_names) > 1:
    strategy = tf.distribute.MirroredStrategy(gpu_names)
    print('Running on multiple GPUs ', gpu_names)
else:
    # The default strategy covers both the CPU-only and the single-GPU case.
    strategy = tf.distribute.get_strategy()
    if gpu_names:
        print('Running on single GPU ', gpu_names[0])
    else:
        print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on single GPU  /device:GPU:0
Number of accelerators:  1


In [None]:
# Run configuration, extended by later cells; starts with the dataset version.
RNN_params = {'dataset_version': dataset_version}

In [None]:
RNN_params.update({'batch_size':2048})
# Ceiling division: number of batches needed to cover every sample once.
# int(n / batch) + 1 yields one batch too many when n is an exact multiple
# of the batch size.
steps_per_epoch = -(-X_train.shape[0] // RNN_params['batch_size'])
validation_steps = -(-X_test.shape[0] // RNN_params['batch_size'])

In [None]:
# Hyper-parameters read when the network and its optimizer are created.
RNN_params.update(
    input_length=12,
    embed_dim=128,
    activation_dense='sigmoid',
    initial_learning_rate=1e-3,
)
class Net(Model):
    """Embedding -> SimpleRNN -> Dense -> 3-way softmax classifier.

    The embedding layer is initialised from the module-level
    ``embedding_matrix`` and is not trainable. Layer sizes and the hidden
    activation are read from ``RNN_params``; the embedding width comes from
    ``word2vec_params['vector_size']``.
    """

    def __init__(self):
        super().__init__()
        vocabulary_size = embedding_matrix.shape[0]
        self.embed = Embedding(
            vocabulary_size,
            word2vec_params['vector_size'],
            weights=[embedding_matrix],
            input_length=RNN_params['input_length'],
            trainable=False,
        )
        self.rnn = SimpleRNN(RNN_params['embed_dim'])
        self.dense1 = Dense(128, activation=RNN_params['activation_dense'])
        self.dense2 = Dense(3, activation='softmax')

    def call(self, inputs):
        """Run ``inputs`` through the layers and return the softmax output."""
        embedded = self.embed(inputs)
        encoded = self.rnn(embedded)
        hidden = self.dense1(encoded)
        return self.dense2(hidden)

initial_learning_rate = RNN_params['initial_learning_rate']  # Learning rate

# The network ends in a 3-unit softmax, i.e. a single-label multi-class head,
# so categorical cross-entropy is the matching loss; binary cross-entropy
# would score the three outputs as independent yes/no problems.
# NOTE(review): assumes the y_class_* targets are one-hot rows - confirm.
RNN_params.update(
    {'compile':{
    'optimizer':RMSprop(learning_rate = initial_learning_rate),
    'loss':CategoricalCrossentropy(),
    'weighted_metrics':["acc"]
            }})
model = Net()
# The same compile arguments are kept in RNN_params alongside the other settings.
model.compile(
        **RNN_params['compile']
                )

In [None]:
# Learning-rate schedule driven by the validation loss.
rop_settings = {
    'monitor': 'val_loss',
    'patience': 5,
    'factor': 0.1,
    'verbose': 1,
    'cooldown': 25,
    'min_lr': 1e-6,
}
RNN_params['ROP_params'] = rop_settings
ReduceLROnPlateau_callback = ReduceLROnPlateau(**rop_settings)

# Checkpointing: keep only the best model (by validation loss), saved in full.
checkpoint_settings = {
    'filepath': './models_weights',
    'monitor': 'val_loss',
    'verbose': 1,
    'save_best_only': True,
    'save_weights_only': False,
}
RNN_params['MCh_params'] = checkpoint_settings
ModelCheckpoint_callback = ModelCheckpoint(**checkpoint_settings)

In [None]:
# Record the settings for this training run, including the epoch count.
RNN_params.update(
    input_length=12,
    embed_dim=128,
    activation_dense='sigmoid',
    epochs=10,
)
training_callbacks = [ReduceLROnPlateau_callback, ModelCheckpoint_callback]
# The test split is passed as validation data during fitting.
history = model.fit(
    X_train,
    y_class_train,
    validation_data=(X_test, y_class_test),
    batch_size=RNN_params['batch_size'],
    epochs=RNN_params['epochs'],
    callbacks=training_callbacks,
)

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.62088, saving model to ./models_weights
Epoch 2/10
Epoch 2: val_loss improved from 0.62088 to 0.61152, saving model to ./models_weights
Epoch 3/10
Epoch 3: val_loss improved from 0.61152 to 0.61088, saving model to ./models_weights
Epoch 4/10
Epoch 4: val_loss did not improve from 0.61088
Epoch 5/10
Epoch 5: val_loss improved from 0.61088 to 0.60854, saving model to ./models_weights
Epoch 6/10
Epoch 6: val_loss improved from 0.60854 to 0.60469, saving model to ./models_weights
Epoch 7/10
Epoch 7: val_loss did not improve from 0.60469
Epoch 8/10
Epoch 8: val_loss did not improve from 0.60469
Epoch 9/10
Epoch 9: val_loss did not improve from 0.60469
Epoch 10/10
Epoch 10: val_loss did not improve from 0.60469


In [None]:
# Pack the checkpoint path into a single gzip-compressed tarball.
PATH_TO_MODEL = './model.tar.gz'
checkpoint_path = RNN_params['MCh_params']['filepath']
bashCommand = f"""
tar -czvf {PATH_TO_MODEL} {checkpoint_path}
"""
run_bash(bashCommand, 'tar_model')

In [None]:
# Create a new version of the FOOT-RNN model and attach the packed weights
# together with the run parameters.
model_version = neptune.init_model_version(
    model = 'FOOT-RNN',
    project = 'scomesse/football',
    api_token = api_key # your credentials
)
try:
    model_sys = model_version['sys'].fetch()
    # Connection arguments kept so a later cell can reopen this same version.
    model_version_params = dict(
        project = 'scomesse/football',
        model = model_sys['model_id'],
        api_token = api_key,
        version = model_sys['id']
    )
    model_version['model'].upload(PATH_TO_MODEL)
    model_version['parameters'] = RNN_params
finally:
    # Stop the handle even if the fetch or an upload raises.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-RNN/v/FOOT-RNN-9
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/model-version#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 22 operations to synchronize with Neptune. Do not kill this process.
All 22 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-RNN/v/FOOT-RNN-9/metadata


In [None]:
# Draw the accuracy/loss curves from the per-epoch metrics recorded by fit().
epoch_metrics = history.history
fig = plot_training_history(epoch_metrics)

In [None]:
# Reopen the model version and attach the training plot as an HTML file.
model_version = neptune.init_model_version(**model_version_params)
try:
    model_version['plot_training'].upload(neptune.types.File.as_html(fig))
finally:
    # Stop the handle even if the upload raises.
    model_version.stop()

https://app.neptune.ai/scomesse/football/m/FOOT-RNN/v/FOOT-RNN-9
Remember to stop your model_version once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/model-version#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.
Shutting down background jobs, please wait a moment...
Done!
Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.
All 1 operations synced, thanks for waiting!
Explore the metadata in the Neptune app:
https://app.neptune.ai/scomesse/football/m/FOOT-RNN/v/FOOT-RNN-9/metadata
