In [1]:
!mkdir kfold_models
!mkdir fold_models
!mkdir models
!mkdir tar_models

A subdirectory or file kfold_models already exists.
A subdirectory or file fold_models already exists.
A subdirectory or file models already exists.
A subdirectory or file tar_models already exists.


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [3]:
!cp ../input/feedbackprizemodeldatasettarmodels/* tar_models/

'cp' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
class StreamToLogger(object):
    def __init__(self, queue):
        self.queue = queue

    def setold(self, old):
        self.old = old
        
    def write(self, buf):
        self.old.write(buf)
        for line in buf.rstrip().splitlines():
            self.queue.put(line.rstrip())
    def flush(self):
        self.old.flush()
    def getq(self):
        return self.queue

In [5]:
def callable_wrapper(stdoutl, stderrl, fun, *args, **kvargs):
    import sys
    stdoutl.setold(sys.stdout)
    stderrl.setold(sys.stderr)
    sys.stdout = stdoutl
    sys.stderr = stderrl
    def handle_exception(exc_type, exc_value, exc_traceback):
        stdoutl.getq().put("XXXDONEXXX")
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
    sys.excepthook = handle_exception
    fun(*args, **kvargs)
    stdoutl.getq().put("XXXDONEXXX")

In [6]:
## this is https://github.com/philtrade/mpify
## thanks!
## for spawning a single process for gpu cuda computation
## to help tear down gpu ram when finished.
import os, sys, re
from pathos.helpers import mp
from typing import Callable
from contextlib import AbstractContextManager, nullcontext
#  - globals() doesn't necessarily return the '__main__' global scope when inside package function,
#    thus we use sys.modules['__main__'].__dict__.

def import_star(modules:[str], ns:dict=None):
    """Import ``*`` from a list of module, into namespace ns (default to '__main__')
    Args:
        modules: list of modules or packages
        ns: destination namespace, optional. If not provided, will default to '__main__'
    """
    global_imports([f"from {m} import *" for m in modules], ns)

    
def global_imports(imports:[str], ns:dict=None):
    """
    Parse and execute multiple import statements, and import into target namespace 'ns'
    Args:
        imports: list of import statements, as in Python code.  Supported formats include:
            * import x, y, z as z_alias
            * from A import x
            * from A import z as z_alias
            * from A import x, y, z as z_alias
            Not supported: 'from A import (a, b)'
        ns: target namespace to import into.  Default to '__main__'
    """
    import os, sys, re, multiprocess as mp
    from typing import Callable
    from contextlib import AbstractContextManager, nullcontext

    if ns is None:
        import sys
        ns = sys.modules['__main__'].__dict__
    pat = re.compile(r'^\s*?(?:from\s+?(\S+?)\s+?)?import\s+?(.+)$')
    pat_as = re.compile(r'^\s*?(\S+?)(?:\s*?as\s+?(\S+?))?\s*$')
    for parsed in filter(lambda p:p,[pat.match(i) for i in imports]):
        (from_, imp_)  = parsed.groups()
        imps = imp_.split(',')

        # Parse "from X import ..."
        from_mod = __import__(from_, fromlist=['']) if from_ else None

        for name in imps: # each comma-separated item in import a, b, x as y
            (x, y) = pat_as.match(name).groups()
            if y is None: y=x
            if x == '*': # Handle starred import: 'from X import *'
                assert from_, SyntaxError(f"From what <module> are you trying to 'import *': {parsed.string}")
                importables = getattr(from_mod, "__all__", [n for n in dir(from_mod) if not n.startswith('_')])
                for o in importables: ns[o] = getattr(from_mod, o)
            else: # x is either a name in 1 module, OR a module itself
                ns[y] = getattr(from_mod, x) if from_ else __import__(x, fromlist=[''])

def _contextualize(i:int, nprocs:int, fn:Callable, cm:AbstractContextManager, l=None, env:dict={}, imports=""):
    "Return a function that will setup os.environ and execute a target function within a context manager."
    import os, sys, re, multiprocess as mp
    from typing import Callable
    from contextlib import AbstractContextManager, nullcontext

    def global_imports(imports:[str], ns:dict=None):
        """
        Parse and execute multiple import statements, and import into target namespace 'ns'
        Args:
            imports: list of import statements, as in Python code.  Supported formats include:
                * import x, y, z as z_alias
                * from A import x
                * from A import z as z_alias
                * from A import x, y, z as z_alias
                Not supported: 'from A import (a, b)'
            ns: target namespace to import into.  Default to '__main__'
        """

        if ns is None:
            import sys
            ns = sys.modules['__main__'].__dict__
        pat = re.compile(r'^\s*?(?:from\s+?(\S+?)\s+?)?import\s+?(.+)$')
        pat_as = re.compile(r'^\s*?(\S+?)(?:\s*?as\s+?(\S+?))?\s*$')
        for parsed in filter(lambda p:p,[pat.match(i) for i in imports]):
            (from_, imp_)  = parsed.groups()
            imps = imp_.split(',')

            # Parse "from X import ..."
            from_mod = __import__(from_, fromlist=['']) if from_ else None

            for name in imps: # each comma-separated item in import a, b, x as y
                (x, y) = pat_as.match(name).groups()
                if y is None: y=x
                if x == '*': # Handle starred import: 'from X import *'
                    assert from_, SyntaxError(f"From what <module> are you trying to 'import *': {parsed.string}")
                    importables = getattr(from_mod, "__all__", [n for n in dir(from_mod) if not n.startswith('_')])
                    for o in importables: ns[o] = getattr(from_mod, o)
                else: # x is either a name in 1 module, OR a module itself
                    ns[y] = getattr(from_mod, x) if from_ else __import__(x, fromlist=[''])

    
    if l: assert i < len(l), ValueError("Invalid index {i}, exceeds size of the result list: {len(l)}")
    def _cfn(*args, **kwargs):
        import os
        os.environ.update({"LOCAL_RANK":str(i), "LOCAL_WORLD_SIZE":str(nprocs)})
        try:
            import sys
            # import env into '__main__', which can be in a subprocess here.
            g = sys.modules['__main__'].__dict__
            global_imports(imports.split('\n'), g)
            g.update(env)
            with cm or nullcontext(): r = fn(*args, **kwargs)
            if l: l[i] = r
            return r
        finally: map(lambda k: os.environ.pop(k, None), ("LOCAL_RANK", "LOCAL_WORLD_SIZE"))               
    return _cfn

def ranch(nprocs:int, fn:Callable, *args, caller_rank:int=0, gather:bool=True, ctx:AbstractContextManager=None, need:str="", imports="", **kwargs):
    """ Execute `fn(\*args, \*\*kwargs)` distributedly in `nprocs` processes.  User can
    serialize over objects and functions, spell out import statements, manage execution
    context, gather results, and the parent process can participate as one of the workers.
    If `caller_rank` is `0 <= caller_rank < nprocs`, only `nprocs - 1` processes will be forked, and the caller process will be a worker to run its share of `fn(..)`.
    If `caller_rank` is ``None``, `nprocs` processes will be forked.
    Inside each worker process, its relative rank among all workers is set up in `os.environ['LOCAL_RANK']`, and the total
    number of workers is set up in `os.environ['LOCAL_WORLD_SIZE']`, both as strings.
    Then import statements in `imports`, followed by any objects/functions in `need`, are brought
    into the python global namespace.
    Then, context manager `ctx` is applied around the call `fn(\*args, \*\*kwargs)`.
    Return value of each worker can be gathered in a list (indexed by the process's rank)
    and returned to the caller of `ranch()`.
    Args:
        nprocs: Number of processes to fork.  Visible as a string in `os.environ['LOCAL_WORLD_SIZE']`
            in all worker processes.
        fn: Function to execute on the worker pool
        \*args: Positional arguments by values to `fn(\*args....)`
        \*\*kwargs: Named parameters to `fn(x=..., y=....)`
        caller_rank: Rank of the parent process.  ``0 <= caller_rank < nprocs`` to join, ``None`` to opt out. Default to ``0``.
            In distributed data parallel, 0 means the leading process.
        gather: if ``True``, `ranch` will return a list of return values from each worker, indexed by their ranks.
            If ``False``, and if 'caller_rank' is not None (meaning parent process is a worker),
            `ranch()` will return whatever the parent process' `fn(...)` returns.
        ctx: User defined context manager to be used in a 'with'-clause around the 'fn(...)' call in worker processes.
            Subclassed from AbstractContextManager, ctx needs to define '__enter__()' and '__exit__()' methods.
        need: Space-separated names of objects/functions to be serialized over to the subprocesses.
        imports: A multiline string of `import` statements to execute in the subprocesses
            before `fn()` execution.  Supported formats:
            * `import x, y, z as zoo`
            * `from A import x`
            * `from A import z as zoo`
            * `from A import x, y, z as zoo`
            * Not supported: `from A import (x, y)`
    Returns:
        ``None``, or list of results from worker processes, indexed by their `LOCAL_RANK`: ``[res_0, res_1, .... res_{nprocs-1}]``
    """

    assert nprocs > 0, ValueError("nprocs: # of processes to launch must be > 0")
    manager = mp.Manager()
    queue = manager.Queue()

    children_ranks = list(range(nprocs))
    if caller_rank is not None:
        assert 0 <= caller_rank < nprocs, ValueError(f"Invalid caller_rank {caller_rank}, must satisfy 0 <= caller_rank < {nprocs}")
        children_ranks.pop(caller_rank)
    multiproc_ctx, procs = mp.get_context("spawn"), []
    result_list = multiproc_ctx.Manager().list([None] * nprocs) if gather else None
    try:
        # pass globals in this process to subprocess via fn's wrapper, 'target_fn'
        env = {k : sys.modules['__main__'].__dict__[k] for k in need.split()}
        for rank in children_ranks:
            target_fn = _contextualize(rank, nprocs, fn, cm=ctx, l=result_list, env=env, imports=imports)
            sle = StreamToLogger(queue)
            slo = StreamToLogger(queue)
            p = multiproc_ctx.Process(target=callable_wrapper, args=[sle, slo, target_fn, *args], kwargs=kwargs)
            procs.append(p)
            p.start()
        p_res = (_contextualize(caller_rank, nprocs, fn, cm=ctx, l=result_list, env=env, imports=imports))(*args, **kwargs) if caller_rank is not None else None
        while True:
            mess = queue.get()
            if mess=="XXXDONEXXX":
                break
            print(mess)
        for p in procs: p.join()
        return result_list if gather else p_res
    finally:
        for p in procs: p.terminate(), p.join()

In [7]:
def kfold_model_one_inproc(model_fold_storage, model_type, i_fold, train_idx, valid_idx, X, y, Xtest):
    import os
    print("kfold {0}".format(str(os.getpid())))
    import tensorflow as tf
    from keras import backend as K
    import gc
    from tqdm import tqdm
    import numpy as np
    import pprint
    Xt = X.iloc[train_idx]
    yt = y[train_idx]
    Xv = X.iloc[valid_idx]
    yv = y[valid_idx]

    model, model_files = model_fold_storage.try_load_model(model_type, i_fold)
    if model is None:
        print("creating model {0}".format(model_type.name()))
        model = model_type.create()
        print("fitting fold {0}".format(i_fold))
        model_type.fit(model, Xt, yt, Xv, yv)
        model_files = model_fold_storage.save_model(model_type, i_fold, model)

    global oof_df_fold
    print("predicting oof fold {0}".format(i_fold))
    oof_df_fold = model_type.predict(model, Xv)
    global metrics
    print("evaluating fold {0}".format(i_fold))
    metrics = model_type.evaluate(model, Xv, yv)
    pretty_metrics = pprint.pformat(metrics)
    print("fold {0} metrics {1}".format(i_fold, pretty_metrics))
    global test_df_fold
    test_df_fold = model_type.predict(model, Xtest)
    model_fold_storage.save_fold(model_type, X, y, valid_idx, train_idx, i_fold, oof_df_fold, test_df_fold, metrics)
    K.clear_session()
    del model
    gc.collect()
    K.clear_session()
    gc.collect()
    K.clear_session()
    return oof_df_fold,test_df_fold,metrics, model_files
    
    

In [8]:
def kfold_model_inproc(model_fold_storage, problem_type, model_type, X, y, Xtest):
    import os
    print("kfold {0}".format(str(os.getpid())))
    import tensorflow as tf
    from keras import backend as K
    import gc
    from tqdm import tqdm
    import numpy as np
    split = 5
    global pred_dim
    global pred_dim_list
    pred_dim_list = [i if i>0 else X.shape[0] for i in problem_type.pred_dim()]
    pred_dim = tuple(i for i in pred_dim_list)
    global oof_pred
    oof_pred = np.zeros(pred_dim)
    global test_pred
    test_pred_dim_list = [i if i>0 else Xtest.shape[0] for i in problem_type.pred_dim()]
    test_pred_dim = tuple(i for i in test_pred_dim_list)
    test_pred = np.zeros((split,) + test_pred_dim)
    from sklearn.model_selection import StratifiedKFold
    kf = StratifiedKFold(n_splits=split, shuffle=True)
    i_fold=0
    all_train_idx = []
    all_valid_idx = []
    all_model_files = []
    global all_metrics
    all_metrics = {}
    global train_idx
    global valid_idx
    for train_idx, valid_idx in kf.split(X, y.argmax(1)):
        all_train_idx.append(train_idx)
        all_valid_idx.append(valid_idx)
    nfolds = len(all_valid_idx)
    for i_fold in tqdm(range(nfolds), mininterval=5):
        print("starting {0} fold".format(i_fold))
        train_idx = all_train_idx[i_fold]
        valid_idx = all_valid_idx[i_fold]
        
        oof_df_fold,test_df_fold,metrics, model_files = kfold_model_one(model_fold_storage, model_type, i_fold, train_idx, valid_idx, X, y, Xtest)
        
        for key in metrics:
            if key in all_metrics:
                all_metrics[key]=np.hstack((all_metrics[key], metrics[key]))
            else: 
                all_metrics[key]=metrics[key]
        oof_pred[valid_idx, :] = np.reshape(oof_df_fold, oof_df_fold.shape)
        test_pred[i_fold]=test_df_fold
        all_model_files.append(model_files)
        i_fold+=1
        
    model_fold_storage.save_kfold(model_type, X, y, all_valid_idx, all_train_idx, all_model_files, oof_pred, test_pred, all_metrics)
    return oof_pred, test_pred, all_metrics

In [9]:
!mkdir kfold_models
!mkdir fold_models
!mkdir models
!mkdir tar_models
class ModelFoldStorage:
    def save_fold(self, model_type, X, y, valid_idx, train_idx, i_fold, oof_df_fold, test_df_fold, metrics):
        import pickle
        foldData = {}
        foldData['X'], foldData['y'],foldData['valid_idx'],foldData['train_idx']=X,y,valid_idx,train_idx
        foldData['oof_df_fold'],foldData['i_fold'], foldData['metrics'], foldData['test_df_fold']=oof_df_fold,i_fold, metrics, test_df_fold
        with open(r"fold_models/"+model_type.name()+"_"+str(i_fold)+"_fold.bin", "wb") as output_file:
            pickle.dump(foldData, output_file)
    def try_load_kfold(self, model_type):
        import pickle
        with open(r"kfold_models/"+model_type.name()+"_kfold.bin", "rb") as input_file:
            modelTypeData = pickle.read(input_file)
            return modelTypeData['oof_pred'], modelTypeData['test_pred'], modelTypeData['all_metrics']
        return None
    def save_kfold(self, model_type, X, y, all_valid_idx, all_train_idx, all_model_files, oof_pred, test_pred, all_metrics):
        import pickle
        modelTypeData = {}
        modelTypeData['X'], modelTypeData['y'],modelTypeData['all_valid_idx'],modelTypeData['all_train_idx'], modelTypeData['all_model_files']=X,y,all_valid_idx,all_train_idx, all_model_files
        modelTypeData['model_type'], modelTypeData['oof_pred'],modelTypeData['all_metrics'], modelTypeData['test_pred']=model_type, oof_pred, all_metrics, test_pred
        with open(r"kfold_models/"+model_type.name()+"_kfold.bin", "wb") as output_file:
            pickle.dump(modelTypeData, output_file)
    def save_model(self, model_type, i_fold, model):
        file = r"models/"+model_type.name()+"_"+str(i_fold)+"_model"
        files = model_type.save(file, model)
        files = self.tar(file, files)
        print("saved model %s to %s (%s)" % (model_type.name(), file, ", ".join(files)))
        return files
    def try_load_model(self, model_type, i_fold):
        file = r"models/"+model_type.name()+"_"+str(i_fold)+"_model"
        file = self.try_untar(file)
        model, model_files = model_type.load(file)
        if model is None:
            print("could not load model %s from %s" % (model_type.name(), file))
        else:
            print("loaded model %s from %s" % (model_type.name(), file))
        return model, model_files
    def tar(self, file, files):
        import tarfile
        import os
        tar = "tar_"+file + ".tar.gz.xyz"
        with tarfile.open(tar, "w:gz") as tara:
            for f in files:
                tara.add(f, arcname=os.path.basename(f))
        files.append(tar)
        return files
    def try_untar(self, file):
        import pathlib
        import tensorflow as tf
        import os
        import tarfile
        file_gz = "tar_"+file + ".tar.gz.xyz"
        filepath = pathlib.PurePath(file)
        file_name = filepath.name
        if not os.path.exists(file) and os.path.exists(file_gz):
            import atexit, shutil, tempfile
            models_dir = tempfile.mkdtemp()
            atexit.register(shutil.rmtree, models_dir)
            if not models_dir.endswith('/'):
                models_dir = models_dir + '/'
            print("loading from", file_gz)
            target_model_name = models_dir + file_name
            with tarfile.open(file_gz) as my_tar:
                my_tar.extractall(models_dir) # specify which folder to extract to
                my_tar.close()
            file = target_model_name
            return file
        return file
    
model_fold_storage = ModelFoldStorage()

A subdirectory or file kfold_models already exists.
A subdirectory or file fold_models already exists.
A subdirectory or file models already exists.
A subdirectory or file tar_models already exists.


In [10]:
class MyProblem:
    def pred_dim(self):
        return [0]

In [11]:
def DatasetMapFunction(input_ids, attn_masks, labels):
        return {
            'input_ids': input_ids,
            'attention_mask': attn_masks
            }, labels
class MyBertModel:
    def __init__(self, debug=False):
        self.debug = debug
    def create(self):
        import tensorflow as tf
        import tensorflow_hub as hub
        import tensorflow_text as text
        from transformers import TFBertModel
        model = TFBertModel.from_pretrained("bert-base-uncased")
        input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
        attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

        bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
        bert_embds.trainable=False

        intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
        output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

        discourse_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
        print(discourse_model.summary())
        from tensorflow.keras.optimizers import Adam
        discourse_model.compile(optimizer=Adam(learning_rate=1e-5, decay=1e-6), 
                        loss='categorical_crossentropy', 
                        metrics=['accuracy'])
        from transformers import BertTokenizerFast
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        return (discourse_model, self, model, tokenizer)
    def load(self, file):
        import os
        import tensorflow as tf
        import pickle
        ttt1=file
        ttt2=file+"_bert"
        ttt3=file+"_python"
        ttt4=file+"_tokenizer"
        if not os.path.exists(ttt1) or not os.path.exists(ttt2) or not os.path.exists(ttt3) or not os.path.exists(ttt4):
            return None, None
        model_0 = tf.keras.models.load_model(ttt1)
        model_2 = tf.keras.models.load_model(ttt2)
        #with open(ttt3, "rb") as input_file:
        #    model1_dict = pickle.load(input_file)
        #self.__dict__.update(model1_dict)
        from transformers import BertTokenizerFast
        tokenizer_3 = BertTokenizerFast.from_pretrained(ttt4) 
        model_files = [ttt1, ttt2, ttt3, ttt4]
        return (model_0, self, model_2, tokenizer_3), model_files
    def save(self, file, model):
        import shutil
        import os
        import tensorflow as tf
        import pickle
        ttt1=file
        tf.keras.models.save_model(model[0], ttt1)
        ttt2=file+"_bert"
        tf.keras.models.save_model(model[2], ttt2)
        ttt3=file+"_python"
        with open(ttt3, "wb") as output_file:
            pickle.dump(model[1].__dict__, output_file)
        ttt4=file+"_tokenizer"
        model[3].save_pretrained(ttt4)
        return [ttt1, ttt2, ttt3, ttt4]
    def name(self):
        return "bert"
    def fit(self, model, Xt, yt, Xv, yv):
        X_input_ids, X_attn_masks, yt = self._transform(model, Xt, yt)
        import tensorflow as tf
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, yt))
        dataset = dataset.map(DatasetMapFunction)     # converting to required format for tensorflow dataset
        dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
        X_val_input_ids, X_val_attn_masks, yv = self._transform(model, Xv, yv)
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val_input_ids, X_val_attn_masks, yv))
        val_dataset = val_dataset.map(DatasetMapFunction)     # converting to required format for tensorflow dataset
        val_dataset = val_dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
        epochs = 5
        if self.debug:
            epochs=1
        model[1].history = model[0].fit(dataset,
            steps_per_epoch=200,
            validation_data=val_dataset,
            epochs=epochs, verbose=0)
    def predict(self, model, X):
        X_test_input_ids, X_test_attn_masks, _y = self._transform(model, X, None)
        labels = model[0].predict([X_test_input_ids, X_test_attn_masks], verbose=0)
        return labels
    def evaluate(self, model, x, y):
        import tensorflow as tf
        X_input_ids, X_attn_masks, y = self._transform(model, x,y)
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, y))
        dataset = dataset.map(DatasetMapFunction)
        dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)
        return model[0].evaluate(dataset, return_dict=True, verbose=0)
    def _encode_data(self, df, ids, masks, tokenizer):
        from tqdm.auto import tqdm
        for i, text in tqdm(enumerate(df['text']), mininterval=5):
            tokenized_text = tokenizer.encode_plus(
                text,
                max_length=256, 
                truncation=True, 
                padding='max_length', 
                add_special_tokens=True,
                return_tensors='tf'
            )
            ids[i, :] = tokenized_text.input_ids
            masks[i, :] = tokenized_text.attention_mask
        return ids, masks
    
    def _transform(self, model, X,y):
        tokenizer = model[3]
        X["text"]=X['discourse_type'] + tokenizer.sep_token+ X['text']
        if self.debug:
            X = X.head(1000)
            if y is not None:
                y = y[0:1000,:]
        
        import numpy as np
        X_input_ids = np.zeros((len(X), 256))
        X_attn_masks = np.zeros((len(X), 256))
        X_input_ids, X_attn_masks = self._encode_data(X, X_input_ids, X_attn_masks, tokenizer)
        return X_input_ids, X_attn_masks, y

In [12]:
class MyRobertaModel:
    def __init__(self, debug=False):
        self.debug = debug
    def create(self):
        import tensorflow as tf
        import tensorflow_hub as hub
        import tensorflow_text as text
        from transformers import TFRobertaModel
        model = TFRobertaModel.from_pretrained("roberta-base")
        input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
        attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

        bert_embds = model.roberta(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
        bert_embds.trainable=False

        intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
        output_layer = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

        discourse_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
        print(discourse_model.summary())
        from tensorflow.keras.optimizers import Adam
        discourse_model.compile(optimizer=Adam(learning_rate=1e-5, decay=1e-6), 
                        loss='categorical_crossentropy', 
                        metrics=['accuracy'])
        from transformers import RobertaTokenizerFast
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
        return (discourse_model, self, model, tokenizer)
    def load(self, file):
        import os
        import tensorflow as tf
        import pickle
        ttt1=file
        ttt2=file+"_bert"
        ttt3=file+"_python"
        ttt4=file+"_tokenizer"
        if not os.path.exists(ttt1) or not os.path.exists(ttt2) or not os.path.exists(ttt3) or not os.path.exists(ttt4):
            return None, None
        model_0 = tf.keras.models.load_model(ttt1)
        model_2 = tf.keras.models.load_model(ttt2)
        #with open(ttt3, "rb") as input_file:
        #    model1_dict = pickle.load(input_file)
        #self.__dict__.update(model1_dict)
        from transformers import RobertaTokenizerFast
        tokenizer_3 = RobertaTokenizerFast.from_pretrained(ttt4) 
        model_files = [ttt1, ttt2, ttt3, ttt4]
        return (model_0, self, model_2, tokenizer_3), model_files
    def save(self, file, model):
        import shutil
        import os
        import tensorflow as tf
        import pickle
        ttt1=file
        tf.keras.models.save_model(model[0], ttt1)
        ttt2=file+"_bert"
        tf.keras.models.save_model(model[2], ttt2)
        ttt3=file+"_python"
        with open(ttt3, "wb") as output_file:
            pickle.dump(model[1].__dict__, output_file)
        ttt4=file+"_tokenizer"
        model[3].save_pretrained(ttt4)
        return [ttt1, ttt2, ttt3, ttt4]
    def name(self):
        return "roberta"
    def fit(self, model, Xt, yt, Xv, yv):
        X_input_ids, X_attn_masks, yt = self._transform(model, Xt, yt)
        import tensorflow as tf
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, yt))
        dataset = dataset.map(DatasetMapFunction)     # converting to required format for tensorflow dataset
        dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
        X_val_input_ids, X_val_attn_masks, yv = self._transform(model, Xv, yv)
        val_dataset = tf.data.Dataset.from_tensor_slices((X_val_input_ids, X_val_attn_masks, yv))
        val_dataset = val_dataset.map(DatasetMapFunction)     # converting to required format for tensorflow dataset
        val_dataset = val_dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor
        epochs = 5
        if self.debug:
            epochs=1
        model[1].history = model[0].fit(dataset,
            steps_per_epoch=200,
            validation_data=val_dataset,
            epochs=epochs, verbose=0)
    def predict(self, model, X):
        X_test_input_ids, X_test_attn_masks, _y = self._transform(model, X, None)
        labels = model[0].predict([X_test_input_ids, X_test_attn_masks], verbose=0)
        return labels
    def evaluate(self, model, x, y):
        import tensorflow as tf
        X_input_ids, X_attn_masks, y = self._transform(model, x,y)
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, y))
        dataset = dataset.map(DatasetMapFunction)
        dataset = dataset.shuffle(10000).batch(16, drop_remainder=True)
        return model[0].evaluate(dataset, return_dict=True, verbose=0)
    def _encode_data(self, df, ids, masks, tokenizer):
        from tqdm.auto import tqdm
        for i, text in tqdm(enumerate(df['text']), mininterval=5):
            tokenized_text = tokenizer.encode_plus(
                text,
                max_length=256, 
                truncation=True, 
                padding='max_length', 
                add_special_tokens=True,
                return_tensors='tf'
            )
            ids[i, :] = tokenized_text.input_ids
            masks[i, :] = tokenized_text.attention_mask
        return ids, masks
    
    def _transform(self, model, X,y):
        tokenizer = model[3]
        X["text"]=X['discourse_type'] + tokenizer.sep_token+ X['text']
        if self.debug:
            X = X.head(1000)
            if y is not None:
                y = y[0:1000,:]
        
        import numpy as np
        X_input_ids = np.zeros((len(X), 256))
        X_attn_masks = np.zeros((len(X), 256))
        X_input_ids, X_attn_masks = self._encode_data(X, X_input_ids, X_attn_masks, tokenizer)
        return X_input_ids, X_attn_masks, y

In [13]:
class FeedbackProblem(MyProblem):
    def pred_dim(self):
        return [0,3]

In [14]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', 2550)
df_train = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
df_test = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
df_train["text"] = df_train["essay_id"].apply(lambda x: open(f'../input/feedback-prize-effectiveness/train/{x}.txt').read())
df_test["text"] = df_test["essay_id"].apply(lambda x: open(f'../input/feedback-prize-effectiveness/test/{x}.txt').read())
effectiveness_map = {"Ineffective":0, "Adequate":1,"Effective":2}
df_train["target"] = df_train["discourse_effectiveness"].map(effectiveness_map)
labels = np.zeros((len(df_train), 3))
labels[np.arange(len(df_train)), df_train['target'].values] = 1
labels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [15]:
problem_type = FeedbackProblem()
model_type = MyRobertaModel(debug=False)

In [16]:
#def get_all_functions():
import inspect
all_funcs = [tpl[0] for tpl in inspect.getmembers(sys.modules['__main__'], inspect.isfunction)]
all_classes = [tpl[0] for tpl in inspect.getmembers(sys.modules['__main__'], inspect.isclass)]
some_vars = ['model_fold_storage']
some_funcs=[]
some_classes=[]

In [17]:
all_names = []
all_names.extend(all_funcs)
all_names.extend(some_funcs)
all_names.extend(all_classes)
all_names.extend(some_classes)
all_names.extend(some_vars)
all_names=" ".join(all_names)                             
all_names

'DatasetMapFunction _contextualize callable_wrapper global_imports import_star kfold_model_inproc kfold_model_one_inproc ranch AbstractContextManager FeedbackProblem ModelFoldStorage MyBertModel MyProblem MyRobertaModel StreamToLogger nullcontext model_fold_storage'

In [18]:
all_imports = '''
'''

In [19]:
from pathos.helpers import mp
import dill as pickle

def kfold_model_mp(model_fold_storage, problem_type, model_type, X, y, Xtest):
    res = ranch(1, kfold_model_inproc,model_fold_storage, problem_type, model_type, X, y, Xtest, 
                caller_rank=None, need=all_names, imports = all_imports)
    return res[0]

In [20]:
from pathos.helpers import mp
import dill as pickle

def kfold_model_one(model_fold_storage, model_type, i_fold, train_idx, valid_idx, X, y, Xtest):
    res = ranch(1, kfold_model_one_inproc, model_fold_storage, model_type, i_fold, train_idx, valid_idx, X, y, Xtest, 
                caller_rank=None, need=all_names, imports = all_imports)
    return res[0]

In [21]:
#oof_pred, test_pred, all_metrics=kfold_model(env_lambda)

In [None]:
oof_pred, test_pred, all_metrics=kfold_model_inproc(model_fold_storage, problem_type, model_type, df_train, labels, df_test)

kfold 15944


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

starting 0 fold
kfold 14628
loaded model roberta from models/roberta_0_model
predicting oof fold 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

0it [00:00, ?it/s]

4006it [00:05, 801.13it/s]

7353it [00:09, 776.09it/s]


In [None]:
pred_labels = np.mean(test_pred, axis = 0)
pred_labels

In [None]:
sample_submission = pd.read_csv('../input/feedback-prize-effectiveness/sample_submission.csv')
sample_submission.head()

In [None]:
sample_submission['discourse_id'] = df_test['discourse_id']
sample_submission['Ineffective'] = pred_labels[:,0]
sample_submission['Adequate'] = pred_labels[:,1]
sample_submission['Effective'] = pred_labels[:,2]
sample_submission.to_csv("submission.csv", index=False)

In [None]:
import pickle
dat = []
with open("kfold_models/bert_kfold.bin", "rb") as f:
    dat = pickle.load(f)

In [None]:
dat['all_metrics']