In [1]:
import yaml
import tqdm
import gc
import bridgescaler
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.callbacks import Callback, ModelCheckpoint, CSVLogger 
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from callbacks import get_callbacks
from metrics import average_acc
from seed import seed_everything
from utils import read_config
from copy import deepcopy

ModuleNotFoundError: No module named 'bridgescaler'

In [27]:
class mpingData:
    """Split, scale and save the dataset described by a configuration mapping.

    The configuration supplies the parquet file to read (``mping_path``), the
    target columns (``ptypes``), the names of the variable groups that are
    scaled together (``scale_groups``; each name is itself a key of ``conf``
    listing that group's feature columns), the case-study dates held out as
    the test set (``case_studies``) and the split/save parameters.

    Attributes set by ``__init__``:
        varGroups:  object array of the feature groups (2-D when every group
                    has the same length, otherwise 1-D array of lists).
        groupLists: the same groups as plain Python lists (handed to the scaler).
        features:   flat list of every feature column, in group order.
        scaler:     the fitted scaler, available after ``_scale_data``.
    """

    def __init__(self, conf):
        # dataset
        self.dataset = conf['dataset']

        # variables and groupings
        self.ptypes = conf['ptypes']
        self.scaleGroups = np.array(conf['scale_groups'])

        # Keep the groups as plain lists: groups may have different lengths,
        # which a pre-allocated rectangular array cannot hold.
        group_lists = []
        for group in conf['scale_groups']:
            members = conf[group]
            if isinstance(members, str):
                # a single column name rather than a list of names
                members = [members]
            group_lists.append(list(members))

        self.groupLists = group_lists
        self.features = [name for members in group_lists for name in members]
        # Retained for backward compatibility with code reading `varGroups`.
        self.varGroups = np.array(group_lists, dtype=object)

        # data parameters
        self.seed = conf['seed']
        self.savePath = conf['save_loc']
        self.mpingPath = conf['mping_path']
        self.nSplits = conf['n_splits']
        self.trainSize = conf['train_size']

        self.xTrain = None
        self.yTrain = None
        self.xValid = None
        self.yValid = None
        self.xTest = None
        self.yTest = None

        self.scaler = None

        # case study parameters
        self.caseStudies = conf['case_studies']

    def _split_data(self):
        """Read the parquet file and populate the train/valid/test splits.

        Rows whose ``datetime`` matches a case-study date become the test set.
        The remaining rows are split into train/validation with
        ``GroupShuffleSplit`` grouped by calendar day, so that a single day
        never appears in both splits.
        """
        seed_everything(self.seed)
        all_data = pd.read_parquet(self.mpingPath)

        # Group key: the date part of the timestamp's string representation.
        all_data['day'] = all_data['datetime'].apply(lambda x: str(x).split(' ')[0])

        # Flatten every case study's dates into one list. Each entry is
        # flattened on its own so case studies of different lengths are fine.
        case_study_dates = []
        for case_study in (self.caseStudies or {}):
            dates = np.array(self.caseStudies[case_study], dtype=object)
            case_study_dates.extend(np.ravel(dates).tolist())

        # NOTE(review): membership is tested against the full `datetime`
        # column, not the derived `day` column -- confirm that the configured
        # case-study values use the same resolution as `datetime`.
        is_case_study = all_data['datetime'].isin(case_study_dates)
        case_study_data = all_data[is_case_study]
        data = all_data[~is_case_study]

        splitter = GroupShuffleSplit(n_splits=self.nSplits,
                                     train_size=self.trainSize,
                                     random_state=self.seed)
        # Only the first split is used, so do not materialise the others.
        train_idx, valid_idx = next(splitter.split(data, groups=data['day']))
        train_data, valid_data = data.iloc[train_idx], data.iloc[valid_idx]

        features = self.features
        self.xTrain = train_data[features]
        self.yTrain = train_data[self.ptypes]
        self.xValid = valid_data[features]
        self.yValid = valid_data[self.ptypes]
        self.xTest = case_study_data[features]
        self.yTest = case_study_data[self.ptypes]

    def _scale_data(self, scale_type):
        """Fit the scaler on the training features and apply it to all splits.

        Args:
            scale_type: key selecting the scaler; only 'GroupStandardScaler'
                is available.

        Raises:
            KeyError: if ``scale_type`` is not a known scaler.
            RuntimeError: if the splits have not been created yet.
        """
        scale_types = {'GroupStandardScaler': bridgescaler.group.GroupStandardScaler}
        if scale_type not in scale_types:
            raise KeyError(f"unknown scale_type {scale_type!r}; "
                           f"expected one of {sorted(scale_types)}")
        if self.xTrain is None or self.xValid is None or self.xTest is None:
            raise RuntimeError("no data splits available; run _split_data() first")

        scaler = scale_types[scale_type]()

        # Groups are passed as plain lists of column names. Passing numpy rows
        # appears to be what triggered "truth value of an array ... is
        # ambiguous" inside the scaler -- TODO confirm against bridgescaler.
        self.xTrain = scaler.fit_transform(x=self.xTrain, groups=self.groupLists)
        self.xValid = scaler.transform(x=self.xValid)
        # Scale the test features; the labels (yValid/yTest) are left untouched.
        self.xTest = scaler.transform(x=self.xTest)
        self.scaler = scaler

    def _save_splits(self):
        """Write each split to ``<savePath><dataset>_<split>.npy``.

        ``savePath`` is used as a plain prefix, so it needs its own trailing
        path separator.

        Raises:
            RuntimeError: if any split is still ``None``.
        """
        splits = {
            'xtrain': self.xTrain,
            'ytrain': self.yTrain,
            'xvalid': self.xValid,
            'yvalid': self.yValid,
            'xtest': self.xTest,
            'ytest': self.yTest,
        }

        missing = [name for name, values in splits.items() if values is None]
        if missing:
            raise RuntimeError(f"cannot save, splits not populated: {missing}")

        for name, values in splits.items():
            # Splits may be DataFrames or arrays depending on what the scaler returned.
            array = values.to_numpy() if hasattr(values, 'to_numpy') else np.asarray(values)
            np.save(f"{self.savePath}{self.dataset}_{name}.npy", array)

    def preprocess(self,
                   split=True,
                   scale=True,
                   scale_type='GroupStandardScaler',
                   save=True):
        """Run the selected preprocessing stages in order: split, scale, save.

        Args:
            split: read the data and create the train/valid/test splits.
            scale: fit the scaler on the training features and scale all splits.
            scale_type: scaler key forwarded to ``_scale_data``.
            save: write the six split arrays to disk.
        """
        if split:
            print("splitting data...")
            self._split_data()
            print("completed")

        if scale:
            print("scaling data...")
            self._scale_data(scale_type=scale_type)
            print("completed")

        if save:
            print("saving data...")
            self._save_splits()
            print("completed")

In [3]:
class multiLayerPerceptron:
    """Dense feed-forward classifier configured from a configuration mapping.

    Reads the metric, architecture, optimizer, loss and callback settings from
    ``conf`` and exposes ``train`` and ``predict``.

    Attributes of note:
        metric:     the callable registered under ``conf['metric']`` (a leading
                    ``val_`` is ignored for the lookup); the raw config value
                    when no callable is registered under that name.
        metricName: the raw ``conf['metric']`` value.
        inputSize / outputSize: layer widths derived from ``scale_groups`` and
                    ``ptypes`` when those keys are present in ``conf``,
                    otherwise ``None``.
    """

    def __init__(self, conf):
        # metric parameters

        metrics = {'average_acc': average_acc}

        metric_name = conf['metric']
        self.metricName = metric_name
        # NOTE(review): the config value may be the monitored name
        # (e.g. 'val_average_acc') -- strip the prefix for the lookup; confirm
        # against the config file.
        lookup = metric_name
        if isinstance(metric_name, str) and metric_name.startswith('val_'):
            lookup = metric_name[len('val_'):]
        self.metric = metrics.get(lookup, metric_name)
        self.direction = conf['direction']

        # data dimensions (fallback used when they cannot be read off the data)

        self.inputSize = None
        self.outputSize = None
        if 'scale_groups' in conf:
            self.inputSize = int(sum(np.atleast_1d(conf[group]).size
                                     for group in conf['scale_groups']))
        if 'ptypes' in conf:
            self.outputSize = int(np.atleast_1d(conf['ptypes']).size)

        # model parameters

        self.hiddenLayers = conf['model']['hidden_layers']
        self.hiddenNeurons = conf['model']['hidden_neurons']
        self.useDropout = conf['model']['use_dropout']

        if self.useDropout == 1:
            self.dropoutAlpha = conf['model']['dropout_alpha']

        self.batchSize = conf['model']['batch_size']
        self.epochs = conf['model']['epochs']
        self.learningRate = conf['model']['lr']
        self.activation = conf['model']['activation']
        self.outputActivation = conf['model']['output_activation']

        optimizers = {'adam': tf.keras.optimizers.Adam(self.learningRate)}
        losses = {'categorical_crossentropy': tf.keras.losses.CategoricalCrossentropy()}

        self.optimizer = optimizers[conf['model']['optimizer']]
        self.loss = losses[conf['model']['loss']]

        # callback parameters

        self.callbacks = get_callbacks(conf)

    @staticmethod
    def _n_columns(data):
        """Return the second dimension of 2-D ``data``, or None if unavailable."""
        shape = getattr(data, 'shape', None)
        if shape is None or len(shape) != 2 or shape[1] is None:
            return None
        return int(shape[1])

    def _build_mlp_model(self, input_size=None, output_size=None):
        """Assemble the (uncompiled) Sequential network.

        Layout: one dense layer as wide as the input, ``hiddenLayers`` dense
        layers of ``hiddenNeurons`` units, then the output layer. With the
        activation 'leaky' each dense layer is followed by a ``LeakyReLU``
        layer; any other value is passed to ``Dense(activation=...)``.
        Dropout follows each hidden layer only when dropout is enabled AND
        there is more than one hidden layer.

        Args:
            input_size: number of input features; defaults to ``inputSize``.
            output_size: number of output classes; defaults to ``outputSize``.

        Raises:
            ValueError: if a size is neither given nor derivable from the config.
        """
        if input_size is None:
            input_size = self.inputSize
        if output_size is None:
            output_size = self.outputSize
        if input_size is None or output_size is None:
            raise ValueError("input/output size unknown: pass them explicitly or "
                             "provide 'scale_groups' and 'ptypes' in the config")

        model = tf.keras.models.Sequential()

        leaky = self.activation == 'leaky'
        add_dropout = self.useDropout == 1 and self.hiddenLayers != 1

        def add_dense(units):
            # 'leaky' is not a Dense activation string, so it gets its own layer
            if leaky:
                model.add(tf.keras.layers.Dense(units))
                model.add(tf.keras.layers.LeakyReLU())
            else:
                model.add(tf.keras.layers.Dense(units, activation=self.activation))

        add_dense(input_size)
        for _ in range(self.hiddenLayers):
            add_dense(self.hiddenNeurons)
            if add_dropout:
                model.add(tf.keras.layers.Dropout(self.dropoutAlpha))

        model.add(tf.keras.layers.Dense(output_size, activation=self.outputActivation))
        # Leave the batch dimension free so the model accepts any batch size.
        model.build((None, input_size))
        model.summary()

        return model

    def train(self,
              x_train,
              y_train,
              x_valid,
              y_valid):
        """Build, compile and fit the network; return the fitted model.

        Layer widths are taken from the shapes of ``x_train`` / ``y_train``
        when they are 2-D, otherwise from the configuration.
        """
        # TODO: add preprocessing step with option to load previous splits
        # TODO: add option to load previous model weights

        model = self._build_mlp_model(input_size=self._n_columns(x_train),
                                      output_size=self._n_columns(y_train))

        model.compile(loss=self.loss,
                      optimizer=self.optimizer,
                      metrics=[self.metric])

        model.fit(x_train,
                  y_train,
                  validation_data=(x_valid, y_valid),
                  callbacks=self.callbacks,
                  batch_size=self.batchSize,
                  epochs=self.epochs)

        return model

    def predict(self, model, x_test, y_test):
        """Score ``model`` on the test set with the configured metric.

        Returns:
            ``self.metric(y_test, model.predict(x_test))``.

        Raises:
            TypeError: if the configured metric did not resolve to a callable.
        """
        if not callable(self.metric):
            raise TypeError(f"metric {self.metric!r} is not callable; "
                            "register it in the metrics lookup in __init__")

        predictions = model.predict(x_test)

        return self.metric(y_test, predictions)

In [33]:
# Drop the previous dataset object (if one exists in this kernel) and reclaim
# its memory before the cells below re-create it.
try:
    del mping
except NameError:
    # Nothing to delete, e.g. on a fresh kernel.
    print('pass')
else:
    gc.collect()

In [4]:
# Load the experiment configuration; `conf` is handed to both mpingData and
# multiLayerPerceptron in the cells below.
conf = read_config("config/ptype.yml")

In [34]:
# Create the dataset wrapper and run every preprocessing stage
# (split -> scale -> save), with each option spelled out explicitly.
mping = mpingData(conf=conf)
mping.preprocess(split=True,
                 scale=True,
                 scale_type='GroupStandardScaler',
                 save=True)

splitting data...
completed
scaling data...
    TEMP_C_0_m  TEMP_C_250_m  TEMP_C_500_m  TEMP_C_750_m  TEMP_C_1000_m  \
0     3.983856      1.960746     -0.382201      0.896608       3.748605   
1    -6.516144     -8.871194    -11.125119    -13.413846     -15.754203   
2     1.549530      0.697966     -1.193582     -2.905500      -4.680267   
3     4.674530      3.143140      1.326394     -0.012984      -0.898779   
4     4.549530      3.948643      1.976451      0.439435      -0.655007   
..         ...           ...           ...           ...            ...   
23   22.957916     20.063769     17.687475     15.273155      12.900696   
24   21.770416     18.354883     16.301403     14.161421      11.962797   
25   28.145416     23.845051     21.502045     19.029873      16.543373   
26   22.770416     22.575489     20.780939     18.774169      16.567488   
27   26.832916     25.638598     23.583168     21.714370      19.743269   

    TEMP_C_1250_m  TEMP_C_1500_m  TEMP_C_1750_m  TEMP_C

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [28]:
# Instantiate the MLP wrapper from the shared configuration.
mlp = multiLayerPerceptron(conf=conf)