# Training on Upsampled Data

In [1]:
### Package Setups
import tensorflow as tf

# Enable on-demand GPU memory allocation on every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on all GPUs.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        # Memory growth can only be configured before the GPUs are initialized;
        # report the error instead of aborting the notebook.
        print(err)
from tensorflow import keras
import time

from helperFunctions import *


from tensorflow.keras.callbacks import Callback

import pandas as pd

import wandb
from wandb.keras import WandbCallback

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



1 Physical GPUs, 1 Logical GPUs


## Use Upsampled data

We train on the upsampled dataset, which contains a much higher share of positive samples (approx. 80%) than negative samples. The normal (non-upsampled) dataset is used for validation, where the goal is to achieve the maximum precision.

* For training 
    + Use all 130 files (~145K samples)
    + The shuffle buffer is filled with all of these ~145K samples (memory utilisation 25GB)
* For Validation 
    + 5 files ≈ 15K samples (~10% of the training sample count)











In [2]:
# Sanity check: load a sample of files from each split and report the array
# shape, memory footprint and label balance.
TRAIN_FILES_FOLDER = '../data/Train_Upsampled'
VAL_FILES_FOLDER = '../data/Validation'
TEST_FILES_FOLDER = '../data/Test'

from helperFunctions import *
import numpy as np  # explicit import; `np` was previously only in scope via the wildcard import above


def printSplitSummary(n_files, folder):
    """Load `n_files` files from `folder` and print a short summary of the split.

    Prints the shape of X, the string returned by `calcArrayMemorySize(X)`,
    the shape of Y, and the distinct label values with their absolute and
    relative frequencies.

    The loaded arrays are local to this function, so they are released as soon
    as it returns instead of staying bound to notebook-level names.

    Parameters
    ----------
    n_files : passed through to `readXYfromDisk` (presumably the number of
        files to read — confirm against helperFunctions).
    folder : passed through to `readXYfromDisk` (folder holding the split).
    """
    split_x, split_y = readXYfromDisk(n_files, folder)
    print('\n\nX Shape : ', split_x.shape, calcArrayMemorySize(split_x),'Y Shape: ',split_y.shape)
    values, counts = np.unique(split_y, axis=0, return_counts=True)
    print('Values, counts, Avg Performance : ', values,counts,counts / counts.sum())


# NOTE(review): only 100 training files are inspected here, while data_config
# in the configuration cell trains on 130 files.
for n_files, folder in [(100, TRAIN_FILES_FOLDER),
                        (10, TEST_FILES_FOLDER),
                        (10, VAL_FILES_FOLDER)]:
    printSplitSummary(n_files, folder)





X Shape :  (115632, 128, 128, 3) Memory size is : 5420.25 Mb Y Shape:  (115632,)
Values, counts, Avg Performance :  [0 1] [ 14323 101309] [0.1238671 0.8761329]


X Shape :  (30122, 128, 128, 3) Memory size is : 1411.96875 Mb Y Shape:  (30122,)
Values, counts, Avg Performance :  [0 1] [22748  7374] [0.75519554 0.24480446]


X Shape :  (25614, 128, 128, 3) Memory size is : 1200.65625 Mb Y Shape:  (25614,)
Values, counts, Avg Performance :  [0 1] [20349  5265] [0.79444835 0.20555165]


In [2]:
## Data Configuration
TRAIN_FILES_FOLDER = '../data/Train_Upsampled'
VAL_FILES_FOLDER = '../data/Validation'
TEST_FILES_FOLDER = '../data/Test'

# Steps per epoch = (number of batches in one pass over the data) * multiplier,
# so with a multiplier of 2 every Keras "epoch" covers roughly two passes.
TRAIN_STEPS_PER_EPOCH_MULTIPLIER = 2
VAL_STEPS_PER_EPOCH_MULTIPLIER = 2

data_config = dict(INPUT_SHAPE = (128,128,3)

                    ,TRAIN_FILES = 130
                    ,TRAIN_BATCH_SIZE = 512

                    ,VAL_FILES = 5
                    ,VAL_BATCH_SIZE = 512

                    ,PREFETCH = 5
)

# The shuffle buffer of each split is sized to hold every sample of that split.
data_config.update(TRAIN_SHUFFLE_BUFFER_SIZE = samplesCount(data_config['TRAIN_FILES'],TRAIN_FILES_FOLDER))
data_config.update(TRAIN_STEPS_PER_EPOCH = round(data_config['TRAIN_SHUFFLE_BUFFER_SIZE']/data_config['TRAIN_BATCH_SIZE'])*TRAIN_STEPS_PER_EPOCH_MULTIPLIER)

data_config.update(VAL_SHUFFLE_BUFFER_SIZE = samplesCount(data_config['VAL_FILES'],VAL_FILES_FOLDER))
data_config.update(VAL_STEPS_PER_EPOCH = round(data_config['VAL_SHUFFLE_BUFFER_SIZE']/data_config['VAL_BATCH_SIZE'])*VAL_STEPS_PER_EPOCH_MULTIPLIER)

# Echo the sample counts as cell output. The values are read back from
# data_config rather than calling samplesCount() a second time per split.
data_config['TRAIN_SHUFFLE_BUFFER_SIZE']
data_config['VAL_SHUFFLE_BUFFER_SIZE']

### Data Loading
# Build the raw datasets first, then attach the same
# shuffle -> repeat -> batch -> prefetch pipeline to each of them.
train = createIODataset(data_config['TRAIN_FILES'],TRAIN_FILES_FOLDER)
val = createIODataset(data_config['VAL_FILES'],VAL_FILES_FOLDER)

# Training stream: reshuffled on every pass, repeated indefinitely; the epoch
# length is set by steps_per_epoch in model.fit.
train = (
    train
    .shuffle(buffer_size=data_config['TRAIN_SHUFFLE_BUFFER_SIZE'], reshuffle_each_iteration=True)
    .repeat(-1)
    .batch(data_config['TRAIN_BATCH_SIZE'], drop_remainder=True)
    .prefetch(data_config['PREFETCH'])
)

# Validation stream: same pipeline with the validation sizes.
# NOTE(review): the validation data is shuffled and repeated as well — confirm
# this is intended, since it costs a second full-size shuffle buffer.
val = (
    val
    .shuffle(buffer_size=data_config['VAL_SHUFFLE_BUFFER_SIZE'], reshuffle_each_iteration=True)
    .repeat(-1)
    .batch(data_config['VAL_BATCH_SIZE'], drop_remainder=True)
    .prefetch(data_config['PREFETCH'])
)


### Model Configuration
# Hyper-parameters and metrics of the experiment; merged with data_config and
# logged to Weights & Biases when the run is started.
model_config = {
    'EXPERIMENT': 'FC Baseline - Upsampled',
    'METRICS': [
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ],
    'LR': 1e-4,
    'EPOCHS': 100,
    'VAL_FREQUENCY': 1,
}





146459

15401

In [3]:
# Fully connected baseline: flatten the image, scale pixels to [0, 1], then
# three ReLU layers (32 -> 16 -> 8 units) and a single sigmoid output unit.
model_config['EXPERIMENT'] = 'FC Baseline - Upsampled'

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=data_config['INPUT_SHAPE']),
    tf.keras.layers.experimental.preprocessing.Rescaling(1./255),
    # Hidden stack, He-normal initialised to match the ReLU activations.
    *[
        tf.keras.layers.Dense(width, activation='relu', kernel_initializer='he_normal')
        for width in (32, 16, 8)
    ],
    # Binary classifier head.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()





Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 49152)             0         
_________________________________________________________________
rescaling (Rescaling)        (None, 49152)             0         
_________________________________________________________________
dense (Dense)                (None, 32)                1572896   
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 1,573,569
Trainable params: 1,573,569
Non-trainable params: 0
______________________________________________

In [7]:
# Train the model and log the run to Weights & Biases.
# The run config records both the data and the model settings.
run = wandb.init(project="candlestick-CNN", name = model_config['EXPERIMENT'] ,reinit= True,dir = '../data/'
                    ,config = {**data_config,**model_config})

try:
    model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=model_config['LR'])
                            ,loss=tf.keras.losses.binary_crossentropy
                            ,metrics=model_config['METRICS'])

    # Both datasets repeat indefinitely, so the epoch and validation lengths
    # are set explicitly through steps_per_epoch / validation_steps.
    history = model.fit(train
                    ,epochs=model_config['EPOCHS']
                    ,steps_per_epoch=data_config['TRAIN_STEPS_PER_EPOCH']
                    ,verbose=1
                    ,validation_data=val
                    ,validation_freq = model_config['VAL_FREQUENCY']
                    ,validation_steps = data_config['VAL_STEPS_PER_EPOCH']
                    ,callbacks=[WandbCallback()]
                    )
finally:
    # Close the W&B run even when training is stopped early (e.g. with a
    # manual kernel interrupt) or fails; otherwise the run is left open.
    run.finish()

# model.save('../data/saved_models/' + model_config['EXPERIMENT'] )


Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mamitagni[0m (use `wandb login --relogin` to force relogin)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100

KeyboardInterrupt: 