In [0]:
# Install PyDrive into the environment of the running kernel (%pip targets the kernel's
# interpreter, whereas !pip runs whichever pip is first on the shell PATH).
# TODO: pin an exact version (PyDrive==<version>) so the notebook is reproducible.
%pip install -U -q PyDrive

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [3]:
 # Mount Google Drive inside the Colab runtime at /content/drive.
 # The call is interactive: it prints an authorization URL and waits for a code.
 from google.colab import drive
 drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
# List the Drive root to confirm the mount succeeded and the input files are present.
!ls ./drive/My\ Drive/

 01.model
 amidaTest2.py
 bert-master.zip
 bert_x.npy
 cabocha-0.69.tar.bz2
'Colab Notebooks'
 C言語課題.gsite
 export
 fasttext.bin
 Japanese_L-12_H-768_A-12_E-30_BPE_transformers.zip
 ja.zip
 LBa.csv
 LBa.gsheet
 modified.pdf
 nmf.pdf
 Saitaihou.pdf
 train-val_pre.tsv
 train-val-pre.tsv
 train-val-small.tsv
 train-val-wakati-juman2.tsv
 train-val-wakati-juman-nva.tsv
 train-val-wakati-juman.tsv
 train-val-wakati-nva.tsv
 train-val-wakati.tsv
 vector_data
 六義園.gdoc
 無題のプレゼンテーション.gslides
 試作


In [0]:
# Copy the inputs used below from Drive into the working directory:
# the training TSV read by load_x_train/load_y_train and the fasttext.bin embedding file.
%cp ./drive/My\ Drive/train-val_pre.tsv ./
%cp ./drive/My\ Drive/fasttext.bin ./

In [0]:
# Create the output directories for logs and tuning results.
# -p creates model/ and model/tuning in one go and does not fail when they already
# exist, so the cell can be re-run safely.
%mkdir -p model/tuning

In [7]:
import os,sys
sys.path.append('../')

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from typing import Callable, List, Optional, Tuple, Union
from sklearn.model_selection import learning_curve
from scipy import sparse
from scipy.sparse import load_npz
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import datetime
import logging

from sklearn.externals import joblib
from scipy.sparse import load_npz
from abc import ABCMeta, abstractmethod

from tqdm import tqdm
from keras.callbacks import EarlyStopping
from keras.layers import SpatialDropout1D, Bidirectional
from keras.layers.recurrent import LSTM
from keras.layers.advanced_activations import ReLU, PReLU
from keras.layers.core import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from gensim.models import KeyedVectors

class Util:
    """Small joblib-based helpers for persisting Python objects to disk."""

    @classmethod
    def dump(cls, value, path):
        """Serialize ``value`` to ``path`` with joblib (compressed).

        Missing parent directories are created first.

        :param value: object to persist
        :param path: destination file path; may be a bare file name
        """
        parent = os.path.dirname(path)
        # A bare file name has an empty directory part, and os.makedirs('')
        # raises FileNotFoundError, so only create directories when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        joblib.dump(value, path, compress=True)

    @classmethod
    def load(cls, path):
        """Load and return the object stored at ``path``.

        NOTE(review): joblib.load unpickles arbitrary objects; only use it on
        files that come from a trusted source.

        :param path: file previously written by :meth:`dump`
        :return: the deserialized object
        """
        return joblib.load(path)

class Logger:
    """Console and file logging for experiment runs.

    Two named loggers are used:

    * ``general`` - timestamped progress messages, also written to
      ``./model/general.log``
    * ``result``  - score records, also written to ``./model/result.log``

    Both loggers echo to the console through a shared stream handler.
    """

    def __init__(self):
        self.general_logger = logging.getLogger('general')
        self.result_logger = logging.getLogger('result')
        # logging.getLogger returns process-wide objects, so handlers are attached
        # only the first time. The handlers are created inside the guard as well:
        # a FileHandler opens its file on construction, so building them
        # unconditionally would leak an open file per extra Logger() call
        # (e.g. every time the notebook cell is re-run).
        if len(self.general_logger.handlers) == 0:
            # FileHandler does not create missing directories.
            os.makedirs('./model', exist_ok=True)
            stream_handler = logging.StreamHandler()
            file_general_handler = logging.FileHandler('./model/general.log')
            file_result_handler = logging.FileHandler('./model/result.log')
            self.general_logger.addHandler(stream_handler)
            self.general_logger.addHandler(file_general_handler)
            self.general_logger.setLevel(logging.INFO)
            self.result_logger.addHandler(stream_handler)
            self.result_logger.addHandler(file_result_handler)
            self.result_logger.setLevel(logging.INFO)

    def info(self, message):
        """Log ``message`` to the console and the general log, prefixed with a timestamp."""
        self.general_logger.info('[{}] - {}'.format(self.now_string(), message))

    def result(self, message):
        """Log ``message`` verbatim to the console and the result log."""
        self.result_logger.info(message)

    def result_ltsv(self, dic):
        """Log the mapping ``dic`` to the result log as one LTSV line."""
        self.result(self.to_ltsv(dic))

    def result_scores(self, run_name, scores):
        """Log the mean score and each individual score of a run as LTSV.

        :param run_name: name of the run, stored under the ``name`` label
        :param scores: sequence of scores; stored as ``score0``, ``score1``, ...
            with their mean under ``score``
        """
        dic = dict()
        dic['name'] = run_name
        dic['score'] = np.mean(scores)
        for i, score in enumerate(scores):
            dic[f'score{i}'] = score
        self.result_ltsv(dic)

    def now_string(self):
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        return str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def to_ltsv(self, dic):
        """Render ``dic`` as tab-separated ``key:value`` pairs in insertion order."""
        return '\t'.join(['{}:{}'.format(key, value) for key, value in dic.items()])

def load_x_train(features, sparse=False):
    """Load the training feature matrix for the named feature set.

    :param features: feature-set name. One of ``bow``, ``n-gram``, ``tf-idf``,
        ``n-gram-tf-idf`` (scipy sparse matrices cast to float64),
        ``word2vec_mean|max|concat|hier``, ``fasttext_mean|max|concat|hier``,
        ``doc2vec`` (numpy arrays; ``*_concat`` horizontally stacks the mean
        and max arrays) or ``raw_text`` (the ``text`` column of
        ``./train-val_pre.tsv`` as a string array).
    :param sparse: unused; kept so existing callers keep working.
    :return: the loaded matrix / array
    :raises ValueError: if ``features`` is not one of the names above
    """
    # Sparse matrices stored as .npz.
    sparse_sources = {
        "bow": '../vec/bow_train_x.npz',
        "n-gram": '../vec/n-gram_x.npz',
        "tf-idf": '../vec/tf-idf_x.npz',
        "n-gram-tf-idf": '../vec/n-gram-tf-idf_x.npz',
    }
    # Dense arrays stored as .npy; entries with several files are concatenated
    # column-wise in the listed order.
    dense_sources = {
        "word2vec_mean": ('../vec/word2vec_pre_x_mean.npy',),
        "word2vec_max": ('../vec/word2vec_pre_x_max.npy',),
        "word2vec_concat": ('../vec/word2vec_pre_x_mean.npy',
                            '../vec/word2vec_pre_x_max.npy'),
        "word2vec_hier": ('../vec/word2vec_pre_x_hier.npy',),
        "fasttext_mean": ('../vec/fasttext_x_mean.npy',),
        "fasttext_max": ('../vec/fasttext_x_max.npy',),
        "fasttext_concat": ('../vec/fasttext_x_mean.npy',
                            '../vec/fasttext_x_max.npy'),
        "fasttext_hier": ('../vec/fasttext_x_hier.npy',),
        "doc2vec": ('../vec/doc2vec_pre_x.npy',),
    }

    if features in sparse_sources:
        return load_npz(sparse_sources[features]).astype('float64')

    if features in dense_sources:
        # NOTE(review): allow_pickle=True executes pickled payloads on load;
        # only use with trusted files.
        parts = [np.load(path, allow_pickle=True) for path in dense_sources[features]]
        return parts[0] if len(parts) == 1 else np.hstack(parts)

    if features == "raw_text":
        df = pd.read_table("./train-val_pre.tsv", index_col=0)
        return np.array(df["text"], dtype=str)

    # Previously an unknown name fell through to `return matrix` and surfaced as
    # an UnboundLocalError; fail with an explicit message instead.
    known = sorted(list(sparse_sources) + list(dense_sources) + ["raw_text"])
    raise ValueError(f'unknown feature set {features!r}; expected one of {known}')

def load_y_train(features):
    """Load the training labels that belong to the named feature set.

    :param features: feature-set name. ``raw_text`` reads the ``label`` column
        of ``./train-val_pre.tsv`` as integers, ``bow_nva`` loads its own label
        file, and every other name uses the shared ``y_full.npy`` labels cast
        to float64.
    :return: numpy array of labels
    """
    if features == "raw_text":
        frame = pd.read_table("./train-val_pre.tsv", index_col=0)
        labels = np.array(frame["label"], dtype=int)
    elif features == "bow_nva":
        labels = np.load('../vec/bow_train_y_nva.npy', allow_pickle = True)
    else:
        shared = np.load('../vec/y_full.npy', allow_pickle=True)
        labels = shared.astype('float64')
    return labels

  


class Model(metaclass=ABCMeta):
    """Abstract interface shared by all models trained in this notebook."""

    def __init__(self, run_fold_name: str, params: dict) -> None:
        """Store the run identity and hyper-parameters.

        :param run_fold_name: name combining the run name and the fold number
        :param params: hyper-parameters
        """
        self.run_fold_name = run_fold_name
        self.params = params
        # Set by train() or load_model() in concrete subclasses.
        self.model = None

    @abstractmethod
    def train(self, tr_x: np.ndarray, tr_y: np.ndarray,
              va_x: Optional[np.ndarray] = None,
              va_y: Optional[np.ndarray] = None) -> None:
        """Train the model and keep the fitted model on the instance.

        :param tr_x: training features
        :param tr_y: training targets
        :param va_x: validation features (optional)
        :param va_y: validation targets (optional)
        """

    @abstractmethod
    def predict(self, te_x: np.ndarray) -> np.ndarray:
        """Return predictions of the trained model.

        :param te_x: features of validation or test data
        :return: predicted values
        """

    @abstractmethod
    def score(self, te_x: np.ndarray, te_y: np.ndarray) -> float:
        """Return the score of the trained model on the given data.

        :param te_x: features to evaluate on
        :param te_y: true targets for ``te_x``
        :return: score value
        """

    @abstractmethod
    def save_model(self, feature: str) -> None:
        """Persist the model.

        :param feature: feature-set name passed by the caller
        """

    @abstractmethod
    def load_model(self, feature: str) -> None:
        """Load a previously persisted model.

        :param feature: feature-set name passed by the caller
        """

# Reduce TensorFlow log noise: set the native (C++) log-level variable and limit
# the Python-side TF logger to errors.
# NOTE(review): TF_CPP_MIN_LOG_LEVEL is presumably read when TensorFlow is first
# imported, and keras is already imported earlier in this cell, so this assignment
# may come too late to have an effect - confirm, and move it above the keras imports if so.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

class ModelLSTM(Model):
    """Keras (Bi)LSTM binary classifier that works directly on raw text.

    Texts are converted to integer sequences with a Keras ``Tokenizer`` fitted
    in :meth:`train`, padded to ``max_len`` tokens, embedded (optionally with
    frozen pre-trained vectors), fed through an LSTM and a stack of dense
    layers, and scored by a single sigmoid unit.

    Expected ``params`` keys: ``embedding_dropout``, ``lstm_dropout``,
    ``recurrent_dropout``, ``hidden_layers``, ``hidden_units``,
    ``hidden_activation`` (``'prelu'`` or ``'relu'``), ``hidden_dropout``,
    ``batch_norm`` (``'before_act'`` enables batch normalisation),
    ``optimizer`` (``{'type': 'sgd'|'adam', 'lr': float}``), ``batch_size``,
    ``nb_epoch``, ``embedding_model`` (gensim vectors or ``None``) and
    ``Bidirectional`` (bool).
    """

    def __init__(self, run_fold_name, **params):
        super().__init__(run_fold_name, params)
        # Both are (re)set by train() and restored by load_model(); predict() needs them.
        self.token = None
        self.max_len = 100

    def _texts_to_padded(self, texts):
        """Convert texts to fixed-length integer sequences with the fitted tokenizer."""
        sequences = self.token.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_len)

    @staticmethod
    def _build_embedding_matrix(embedding_model, word_index):
        """Build the ``(vocab_size + 1, vector_size)`` weight matrix for the Embedding layer.

        Rows of words that the pre-trained model does not know stay all-zero.
        """
        # Full gensim models expose their vectors as `.wv`; a bare KeyedVectors
        # object is used as is (gensim 4 removed `KeyedVectors.wv`).
        vectors = getattr(embedding_model, 'wv', embedding_model)
        matrix = np.zeros((len(word_index) + 1, embedding_model.vector_size))
        for word, i in tqdm(word_index.items()):
            try:
                matrix[i] = vectors[word]
            except KeyError:
                # Out-of-vocabulary word: gensim raises instead of returning None.
                continue
        return matrix

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Fit the tokenizer and the network.

        :param tr_x: training texts, e.g. ``["I am happy", "hello"]``
        :param tr_y: training labels (binary)
        :param va_x: validation texts; when given, early stopping on
            ``val_loss`` is enabled
        :param va_y: validation labels
        :raises NotImplementedError: for an unsupported ``hidden_activation``
            or optimizer ``type``
        """
        validation = va_x is not None

        # Read every hyper-parameter up front so a missing key fails before any
        # expensive work starts.
        embedding_dropout = self.params['embedding_dropout']
        lstm_dropout = self.params['lstm_dropout']
        lstm_recurrent_dropout = self.params['recurrent_dropout']
        hidden_layers = int(self.params['hidden_layers'])
        hidden_units = int(self.params['hidden_units'])
        hidden_activation = self.params['hidden_activation']
        hidden_dropout = self.params['hidden_dropout']
        batch_norm = self.params['batch_norm']
        optimizer_type = self.params['optimizer']['type']
        optimizer_lr = self.params['optimizer']['lr']
        batch_size = int(self.params['batch_size'])
        nb_epoch = int(self.params['nb_epoch'])
        embedding_model = self.params['embedding_model']
        bidirectional = self.params['Bidirectional']

        # Tokenizer: the vocabulary is built from the training texts plus the
        # validation texts when those are supplied.
        self.token = Tokenizer(num_words=None)
        self.max_len = 100
        corpus = (list(tr_x) + list(va_x)) if validation else list(tr_x)
        self.token.fit_on_texts(corpus)

        tr_x = self._texts_to_padded(tr_x)
        if validation:
            va_x = self._texts_to_padded(va_x)

        word_index = self.token.word_index
        # +1 because Keras word indices start at 1; index 0 is the padding value.
        vocab_size = len(word_index) + 1

        self.model = Sequential()

        # Input layer: frozen pre-trained vectors, or a trainable 300-d embedding.
        if embedding_model is not None:
            embedding_matrix = self._build_embedding_matrix(embedding_model, word_index)
            self.model.add(Embedding(
                input_dim=vocab_size,
                output_dim=embedding_model.vector_size,
                input_length=self.max_len,
                weights=[embedding_matrix],
                trainable=False))
        else:
            self.model.add(Embedding(
                input_dim=vocab_size,
                output_dim=300,
                input_length=self.max_len))

        self.model.add(SpatialDropout1D(embedding_dropout))

        recurrent = LSTM(100, dropout=lstm_dropout, recurrent_dropout=lstm_recurrent_dropout)
        self.model.add(Bidirectional(recurrent) if bidirectional else recurrent)

        # Hidden layers: Dense -> (BatchNorm) -> activation -> Dropout.
        for _ in range(hidden_layers):
            self.model.add(Dense(hidden_units))
            if batch_norm == 'before_act':
                self.model.add(BatchNormalization())
            if hidden_activation == 'prelu':
                self.model.add(PReLU())
            elif hidden_activation == 'relu':
                self.model.add(ReLU())
            else:
                raise NotImplementedError
            self.model.add(Dropout(hidden_dropout))

        # Output layer: one sigmoid unit for the binary target.
        self.model.add(Dense(1, activation='sigmoid'))

        # Optimizer.
        if optimizer_type == 'sgd':
            optimizer = SGD(lr=optimizer_lr, decay=1e-6, momentum=0.9, nesterov=True)
        elif optimizer_type == 'adam':
            optimizer = Adam(lr=optimizer_lr, beta_1=0.9, beta_2=0.999, decay=0.)
        else:
            raise NotImplementedError

        # Loss and metrics.
        self.model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Early stopping. Beware of very large epoch counts: with a small
        # learning rate training may effectively never finish.
        patience = 12
        if validation:
            early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                           verbose=2, restore_best_weights=True)
            self.model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=batch_size, verbose=2,
                           validation_data=(va_x, va_y), callbacks=[early_stopping])
        else:
            # `epochs` (not the Keras 1 name `nb_epoch`), matching the branch above.
            self.model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=batch_size, verbose=2)

    def predict(self, te_x):
        """Return the network's sigmoid outputs for the given texts.

        :param te_x: texts to classify
        :return: array returned by the Keras model's ``predict``
        """
        return self.model.predict(self._texts_to_padded(te_x))

    def score(self, te_x, te_y):
        """Return the accuracy on ``te_x``/``te_y`` using a 0.5 decision threshold."""
        y_pred = self.predict(te_x)
        y_pred = np.where(y_pred > 0.5, 1, 0)
        return accuracy_score(te_y, y_pred)

    def _artifact_paths(self, feature):
        """Return ``(model_path, tokenizer_path)`` for this run under ``./model/model/<feature>``."""
        model_path = os.path.join(f'./model/model/{feature}', f'{self.run_fold_name}.h5')
        tokenizer_path = os.path.join(f'./model/model/{feature}',
                                      f'{self.run_fold_name}-tokenizer.pkl')
        return model_path, tokenizer_path

    def save_model(self, feature):
        """Save the Keras model (.h5) and the tokenizer state needed by :meth:`predict`.

        The previous implementation tried to dump ``self.scaler``, an attribute
        this class never sets, and therefore always failed.
        """
        model_path, tokenizer_path = self._artifact_paths(feature)
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        self.model.save(model_path)
        Util.dump({'token': self.token, 'max_len': self.max_len}, tokenizer_path)

    def load_model(self, feature):
        """Restore the Keras model and tokenizer state written by :meth:`save_model`."""
        # Local import under an alias: this method shadows the name `load_model`,
        # and the notebook's import cell does not import the Keras function.
        from keras.models import load_model as keras_load_model

        model_path, tokenizer_path = self._artifact_paths(feature)
        self.model = keras_load_model(model_path)
        state = Util.load(tokenizer_path)
        self.token = state['token']
        self.max_len = state['max_len']



Using TensorFlow backend.


In [0]:
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, Trials

from gensim.models import KeyedVectors, FastText

# Shared logger for this notebook: echoes to the console and writes to
# ./model/general.log and ./model/result.log.
logger = Logger()

def objective(params):
    """Hyperopt objective: train one LSTM and return its validation log-loss.

    Relies on module-level state set up by the driver code: ``param`` (base
    hyper-parameters), ``tr_x``/``tr_y``/``va_x``/``va_y`` (the fold's data) and
    ``history`` (list that collects ``(params, score)`` per trial).

    :param params: hyper-parameter values sampled by hyperopt for this trial
    :return: dict with the ``loss`` to minimise and hyperopt's ``status`` flag
    """
    # Overlay the sampled values on a copy of the base settings. The base dict
    # itself is left untouched so one trial cannot leak values into the next,
    # or into a later re-run of the cell.
    trial_params = {**param, **params}
    model = ModelLSTM("LSTM", **trial_params)
    print("TRAIN")
    model.train(tr_x, tr_y, va_x, va_y)
    print("PREDICT")
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f'params: {params}, logloss: {score:.4f}')
    # Keep a record of every trial for the final ranking.
    history.append((params, score))

    return {'loss': score, 'status': STATUS_OK}

if __name__ == '__main__':
    # Base hyper-parameters; values sampled by hyperopt are overlaid per trial.
    param = {
        'embedding_dropout' : 0.3,
        'lstm_dropout' : 0.3,
        'recurrent_dropout' : 0.3,
        'hidden_layers': 3,
        'hidden_units': 64,
        'hidden_activation': 'relu',
        'hidden_dropout': 0.8,
        'batch_norm': 'before_act',
        'optimizer': {'type': 'adam', 'lr': 0.001},
        'batch_size': 100,
        'nb_epoch' : 2,
        'embedding_model' : None,
        'Bidirectional' : False
    }
    # Pre-trained word vectors (word2vec binary format) copied from Drive earlier.
    param['embedding_model'] = KeyedVectors.load_word2vec_format('./fasttext.bin', binary=True)

    # Search space. 'nb_epoch', 'embedding_model' and 'Bidirectional' are not
    # searched; the base values above are used for them.
    param_space = {
        'embedding_dropout': hp.quniform('embedding_dropout', 0, 0.5, 0.05),
        'lstm_dropout' : hp.quniform('lstm_dropout', 0, 0.5, 0.05),
        'recurrent_dropout' : hp.quniform('recurrent_dropout', 0, 0.5, 0.05),
        'hidden_layers': hp.quniform('hidden_layers', 1, 3, 1),
        'hidden_units': hp.quniform('hidden_units', 32, 512, 32),
        'hidden_activation': hp.choice('hidden_activation', ['prelu', 'relu']),
        'hidden_dropout': hp.quniform('hidden_dropout', 0, 0.3, 0.05),
        'batch_norm': hp.choice('batch_norm', ['before_act', 'no']),
        'optimizer': hp.choice('optimizer',
                           [{'type': 'adam',
                             'lr': hp.loguniform('adam_lr', np.log(0.00001), np.log(0.01))},
                            {'type': 'sgd',
                             'lr': hp.loguniform('sgd_lr', np.log(0.00001), np.log(0.01))}]),
        'batch_size': hp.quniform('batch_size', 32, 128, 32),
    }

    # Feature sets to tune on; ModelLSTM consumes raw text only.
    features = [
       "raw_text"
    ]

    # Make sure the results directory exists even if the %mkdir cell was skipped.
    os.makedirs('./model/tuning', exist_ok=True)

    for i, name in enumerate(features):
        train_x = load_x_train(name)
        train_y = load_y_train(name)
        # Only the first of the six stratified folds is used for tuning.
        skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=71)
        tr_idx, va_idx = list(skf.split(train_x, train_y))[0]
        tr_x, va_x = train_x[tr_idx], train_x[va_idx]
        tr_y, va_y = train_y[tr_idx], train_y[va_idx]

        # Run the hyperopt search; objective() appends each trial to `history`.
        max_evals = 100
        trials = Trials()
        history = []
        fmin(objective, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

        # Trial with the lowest validation log-loss (first one on ties).
        best_params, best_score = min(history, key=lambda tpl: tpl[1])
        logger.info(f'{name} - best params:{best_params}, score:{best_score:.4f}')

        # Build one flat {label: value} record for the CSV. from_dict needs a
        # mapping - passing the (params, score) tuple directly raised
        # AttributeError - and nested dicts such as 'optimizer' are flattened to
        # 'optimizer_type' / 'optimizer_lr' so every cell holds a scalar.
        record = {}
        for key, value in best_params.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    record[f'{key}_{sub_key}'] = sub_value
            else:
                record[key] = value
        record['score'] = best_score

        res = pd.DataFrame.from_dict(
                record,
                orient='index',
                columns=[name]
            )
        # The first feature set creates the file; later ones append their rows.
        if i == 0:
            res.to_csv('./model/tuning/LSTM.csv')
        else:
            res.to_csv('./model/tuning/LSTM.csv', mode='a', header=False)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


TRAIN
  0%|          | 0/100 [00:00<?, ?it/s, best loss: ?]

  0%|          | 0/181555 [00:00<?, ?it/s]
[A

  9%|9         | 16426/181555 [00:00<00:01, 164255.85it/s]
[A
 18%|#7        | 32279/181555 [00:00<00:00, 162494.22it/s]
[A
 26%|##6       | 47766/181555 [00:00<00:00, 160128.15it/s]
[A
 36%|###5      | 64756/181555 [00:00<00:00, 162939.54it/s]
[A
 47%|####6     | 85238/181555 [00:00<00:00, 173587.76it/s]
[A
 58%|#####8    | 105927/181555 [00:00<00:00, 182393.46it/s]
[A
 70%|######9   | 126369/181555 [00:00<00:00, 188485.29it/s]
[A
 81%|########  | 147002/181555 [00:00<00:00, 193505.77it/s]
[A
 92%|#########2| 167141/181555 [00:00<00:00, 195802.37it/s]
[A
100%|##########| 181555/181555 [00:01<00:00, 181175.31it/s]
[A


Train on 41666 samples, validate on 8334 samples
Epoch 1/2
 - 224s - loss: 0.7234 - acc: 0.5438 - val_loss: 0.6566 - val_acc: 0.6122

Epoch 2/2
  0%|          | 0/100 [04:00<?, ?it/s, best loss: ?]

In [0]:
# List the working directory to inspect the files produced so far.
!ls