In [1]:
import spacy
import glob
import os
import yaml
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from tqdm import tqdm
from pathlib import Path
import random
import numpy as np

from termcolor import colored
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, AdamW, TFBertModel
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import mean_squared_error


import torch
# from GPUtil import showUtilization as gpu_usage
from numba import cuda

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

# free_gpu_cache()                

if torch.cuda.is_available():        
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, please check.')
    device = torch.device("cpu")




There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [2]:
# !pip3 install pickle5

In [3]:
import logging
import numpy as np
import os
import pandas as pd
import pickle
import random
import re
import string
import sys
import time

from pathlib import Path
import yaml

from tqdm import tqdm
from functools import partial
from nltk.corpus import words, stopwords

import tensorflow_addons as tfa
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, save_model, load_model, model_from_yaml
from tensorflow.keras.layers import Conv1D, \
    Flatten, \
    GlobalMaxPooling1D, \
    TimeDistributed, \
    MaxPooling1D, \
    Dense, \
    Activation, \
    ReLU, \
    LSTM, \
    GRU, \
    SpatialDropout1D, \
    Dropout, \
    Bidirectional, \
    Embedding, \
    Add
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Layer, InputSpec
from datetime import datetime

import torch
from numba import cuda
import gc
import tensorflow as tf
from tensorflow.python.keras import backend as K


random.seed(0)
np.random.seed(0)

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()

    gc.collect()
    torch.cuda.empty_cache()
    # device = cuda.get_current_device()
    # device.reset()

    # current_device = cuda.get_current_device().id
    # cuda.select_device(current_device)
    # cuda.close()
    # cuda.select_device(current_device)

    print("GPU Usage after emptying the cache")
    gpu_usage()


def config_gpu_growth():
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.compat.v1.Session(config=config)
    K.set_session(sess)

    return sess


def close_sess_keras(sess):
    sess.close()
    tf.keras.backend.clear_session()
# free_gpu_cache()

punctuation = string.punctuation.replace("@", "").replace("+", "").replace("-", "").replace("_", "")
stop_words = set(stopwords.words('english'))


def timing_val(func):
    def wrapper(*arg, **kw):
        t1 = time.time()
        res = func(*arg, **kw)
        t2 = time.time()
        # print(f"\nFunc {func.__name__} took {(t2 - t1)}")
        return res

    return wrapper


class ResidualBlock1D(Layer):
    def __init__(self, channels_in, kernel, **kwargs):
        super(ResidualBlock1D, self).__init__(**kwargs)
        self.channels_in = channels_in
        self.kernel = kernel

        self.conv1 = Conv1D(self.channels_in,
                            self.kernel,
                            padding='same',
                            activation='relu')
        self.conv2 = Conv1D(self.channels_in,
                            self.kernel,
                            padding='same')
        self.activation = Activation('relu')

    def call(self, x):
        y = x
        x = self.conv1(x)
        x = self.conv2(x)
        x = Add()([x, y])
        x = self.activation(x)
        return x

    def get_config(self):
        config = super().get_config().copy()
        config.update({
            'channels_in': self.channels_in,
            'kernel': self.kernel,
        })
        return config


class DNN:
    def __init__(
            self,
            backbone='conv',  # New option TEST_CLIT
            charlevel=False,
            use_glove=True,
            preprocess_data=False,
            logger=None,
            batch_size=64,
            max_vocab=10000,
            max_len=8192,
            embedding_mat_columns=50,
            epochs=200
    ):

        self.backbone = backbone
        self.charlevel = charlevel
        self.use_glove = use_glove
        self.preprocess_data = preprocess_data

        self.OOV_TOK = '<OOV>'
        self.PADDING_TYPE = 'post'
        self.TRUNCATE_TYPE = 'pre'  # 'post'
        self.batch_size = batch_size
        self.max_vocab = max_vocab
        self.max_len = max_len
        self.embedding_mat_columns = embedding_mat_columns

        self.__model = None
        self.res = list()
        self.is_trained = False

        self.logger = logger
        self.epochs = epochs

        self.augment = False

        if self.charlevel and self.use_glove:
            if logger:
                logger.warning('charlevel and use_glove both set to true. use_glove will be ignored.')

    @property
    def model(self):
        return self.__model

    @model.setter
    def model(self, value):
        self.__model = value

    @staticmethod
    def clean_text(txt):
        return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower()).strip()

    def preprocess_texts(self, texts, tokens=[]):
        preprocessed_texts = []
        additional_tokens = []

        for text in texts:
            if self.preprocess_data:
                text = self.clean_text(text)

            preprocessed_texts.append(text)

        return np.array(preprocessed_texts), tokens, np.array(additional_tokens)

    def fit_tokenizer(self, texts):
        if isinstance(texts, str):
            texts = [texts]
        elif isinstance(texts, tuple):
            texts = list(texts)
        elif not isinstance(texts, (list, pd.core.series.Series, np.ndarray)):
            raise ValueError("The text must be a list of strings, a list of lists containing strings or a string")

        self.tokenizer.fit_on_texts(texts)

    def sequence_padding(self, sequences):
        seqs = pad_sequences(sequences, maxlen=self.max_len, padding=self.PADDING_TYPE, truncating=self.TRUNCATE_TYPE)
        return seqs

    def load_glove(self, additional_tokens=[]):
        embeddings_index = {}
        glove_path = f'../../data/glove.6B.{self.embedding_mat_columns}d.txt'  # Todo change it if running from MAIN
        f = open(glove_path)
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
        f.close()
        # print(len(embeddings_index))

        # embedding_matrix = np.zeros((len(embeddings_index) + 2 + len(additional_tokens), self.embedding_mat_columns))
        embedding_matrix = np.random.randn(len(embeddings_index) + 2 + len(additional_tokens),
                                           self.embedding_mat_columns)
        w2i = {}
        for i, (word, embs) in enumerate(embeddings_index.items()):
            embedding_matrix[i] = embs
            w2i[word] = i
        i += 1
        w2i['<OOV>'] = i

        if self.logger:
            self.logger.info(f'load_glove: {embedding_matrix.shape}, {len(w2i.keys())}')
        else:
            print('load_glove', embedding_matrix.shape, len(w2i.keys()))

        return embeddings_index, w2i, embedding_matrix

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_model(self, vocab_size, embedding_matrix=None):

        model = Sequential()
        model.add(Embedding(
            vocab_size,
            self.embedding_mat_columns,
            embeddings_initializer=tf.keras.initializers.Constant(
                embedding_matrix) if embedding_matrix is not None else None
        ))

        if self.backbone == 'LSTM':
            model.add(Bidirectional(LSTM(self.embedding_mat_columns)))
        elif self.backbone == 'GRU':
            model.add(Bidirectional(GRU(self.embedding_mat_columns)))
        elif self.backbone == 'CONV':
            model.add(Conv1D(512, 3, activation='relu'))
            model.add(ResidualBlock1D(512, 3))
            model.add(MaxPooling1D())
            model.add(Conv1D(256, 3, activation='relu'))
            model.add(ResidualBlock1D(256, 3))
            model.add(MaxPooling1D())
            model.add(Conv1D(128, 3, activation='relu'))
            model.add(ResidualBlock1D(128, 3))
            model.add(GlobalMaxPooling1D())
        elif self.backbone == 'DEMO':
            model.add(Conv1D(256, 3, activation='relu'))
            model.add(GlobalMaxPooling1D())
        elif self.backbone == 'TEST_CLIT':
            model.add(Conv1D(64, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(MaxPooling1D(2))
            model.add(Conv1D(128, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(MaxPooling1D(2))
            model.add(Conv1D(256, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(Conv1D(512, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(Conv1D(1024, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(Conv1D(2048, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(Conv1D(4098, 5, padding='valid', kernel_initializer='normal', activation='relu'))
            model.add(GlobalMaxPooling1D())
            model.add(Dense(120, kernel_initializer='normal', activation='relu'))
            model.add(Dense(240, kernel_initializer='normal', activation='relu'))
            model.add(Dense(480, kernel_initializer='normal', activation='relu'))
            model.add(Dense(980, kernel_initializer='normal', activation='relu'))
        else:
            raise NotImplementedError

        model.add(Dense(1))

        return model

    def fit(self, X_train, y_train, out_path):
        X, y = X_train, y_train

        if not Path(out_path).exists():
            os.makedirs(out_path)

        X, self.tokens, additional_tokens = self.preprocess_texts(X)

        embedding_matrix = None
        if self.charlevel:
            self.tokenizer = Tokenizer(oov_token=self.OOV_TOK, filters='', lower=False, char_level=True)
            self.fit_tokenizer(texts=X)
        else:
            self.tokenizer = Tokenizer(oov_token=self.OOV_TOK, lower=True,
                                       char_level=False)  # filters='<', lower=False,

            if self.use_glove:
                _, w2i, embedding_matrix = self.load_glove(additional_tokens=additional_tokens)
                self.tokenizer.word_index = w2i
            else:
                self.fit_tokenizer(texts=X)

        # sequences = np.array(self.tokenizer.texts_to_sequences(X))
        sequences = self.tokenizer.texts_to_sequences(X)

        X = self.sequence_padding(sequences)

        self.model = self.get_model(vocab_size=(len(self.tokenizer.word_index) + 1), embedding_matrix=embedding_matrix)

        opt = tf.keras.optimizers.Adam(learning_rate=0.001)

        self.model.compile(loss='mean_squared_error', optimizer=opt, metrics=[tf.metrics.MeanSquaredError()])

        self.model.summary()

        es = EarlyStopping(
            monitor='val_mean_squared_error',
            mode='min',
            patience=20,
            verbose=1
        )
        lr_sch = ReduceLROnPlateau(
            monitor='val_mean_squared_error',
            mode='min',
            factor=0.1,
            patience=10,
            verbose=1,
            min_delta=0.001,
            cooldown=0,
            min_lr=1e-6,
        )
        ckpt = ModelCheckpoint(
            os.path.join(out_path, 'model.h5'),
            monitor='val_mean_squared_error',
            mode='min',
            verbose=1,
            save_best_only=True,
            save_weights_only=False,
            save_freq='epoch'
        )

        try:
            # print(self.batch_size)
            self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, validation_split=0.2, verbose=1,
                           callbacks=[es, lr_sch, ckpt])
        except KeyboardInterrupt:
            print('Got KeyboardInterrupt. Stopping.')

        self.is_trained = True

    def predict_proba(self, X):
        assert self.is_trained, 'Model should be trained before inference.'
        if isinstance(X, str):
            X = [X]

        if self.preprocess_texts:
            X, _, _ = self.preprocess_texts(X)

        sequences = self.tokenizer.texts_to_sequences(X)
        padded = self.sequence_padding(sequences)
        preds = self.model.predict(padded)

        return preds

    def predict(self, X, return_proba=False):
        print("=====================")
        assert self.is_trained, 'Model should be trained before inference.'
        print(X)
        proba = self.predict_proba(X)
        print(proba)
        preds = np.argmax(proba, axis=1)

        if return_proba:
            return proba
        else:
            return preds

    def save(self, path):
        if self.is_trained:

            output_dir = Path(path)
            if not output_dir.exists():
                Path.mkdir(output_dir, parents=True, exist_ok=True)

            # serialize model to YAML
            model_yaml = self.model.to_yaml()
            with open(output_dir / 'nn_model_config.yaml', 'w') as file:
                file.write(model_yaml)
            # serialize weights to HDF5
            self.model.save_weights(output_dir / "model.h5")

            with open(output_dir / 'tokenizer3.pkl', 'wb') as file:
                pickle.dump(self.tokenizer, file, protocol=pickle.HIGHEST_PROTOCOL)

            # self.logger.info(f'Saved model to {output_dir}')
        else:
            pass
            # self.logger.warning('Cannot save the model. Train it first.')

    def load(self, path):
        print("Loading model...")
        output_dir = Path(path)
        
        print("Loading tokenizer")
        with open(output_dir / 'tokenizer3.pkl', 'rb') as file:
            self.tokenizer = pickle.load(file)
        
        with open(output_dir / 'nn_model_config.yaml') as file:
            print("Reading...")
            model_config = file.read()
        print("Model from yaml")
        self.model = model_from_yaml(model_config, custom_objects={'ResidualBlock1D': ResidualBlock1D})
        print("Loading weights")
        self.model.load_weights(output_dir / "model.h5")
        
        print("Done")

        self.is_trained = True
        


In [4]:

import pandas as pd
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")
dnn = DNN(backbone='conv', 
            charlevel=False,
            use_glove=True,
            preprocess_data=False,
            logger=None,
            batch_size=64,
            max_vocab=10000,
            max_len=8192,
            embedding_mat_columns=50,
            epochs=200)

dnn.load('../input/d/pilarpieiro/dnn-modelv1')
preds = dnn.predict(test["excerpt"], return_proba=True)
predictions = preds.squeeze(-1)
submission = pd.DataFrame({'id': test.id, 'target':predictions })
submission.to_csv('/kaggle/working/submission.csv', index=False)

Loading model...
Loading tokenizer
Reading...
Model from yaml
Loading weights
Done
0    My hope lay in Jack's promise that he would ke...
1    Dotty continued to go to Mrs. Gray's every nig...
2    It was a bright and cheerful scene that greete...
3    Cell division is the process by which a parent...
4    Debugging is the process of finding and resolv...
5    To explain transitivity, let us look first at ...
6    Milka and John are playing in the garden. Her ...
Name: excerpt, dtype: object
[[-0.94771826]
 [-0.779474  ]
 [-0.39480895]
 [-1.4428356 ]
 [-1.7663398 ]
 [-1.3449428 ]
 [-0.79534453]]
