# NN model for title and description data
# Bi LSTM + GRU with embedding features

## Modules

In [2]:
import sys
import os
import gc
import re
from collections import Counter
import logging
import pandas as pd
import pickle
import h5py
import time
from contextlib import contextmanager
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import CuDNNLSTM, CuDNNGRU, Bidirectional
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.optimizers import Adam
from keras import backend as K
from sklearn.model_selection import KFold, StratifiedKFold

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Settings

In [3]:
# Pretrained word-vector files in text ``.vec`` format; each one is parsed by
# _get_weighted_matrix to build one embedding matrix.
# NOTE(review): the "NEOLOGD" constant points at a file named fastText_Avito.vec —
# presumably vectors trained on the competition corpus; confirm and consider renaming.
FTEXT_PRETRAINED_NEOLOGD_PATH = './fastText_Avito.vec'
FTEXT_PRETRAINED_WIKI_PATH = './wiki.ru.vec'
FTEXT_PRETRAINED_COMMONCRAWL_PATH = './cc.ru.300.vec'
# Pickled tokenizer: written by _get_train_feature, read back by _get_test_feature.
TOKENIZER_PATH = './tokenizer.pkl'
# Optional pickled list of out-of-vocabulary words, read by _get_test_wvector_coeff when present.
NEW_WORDS_LIST_PATH = './new_wordlist.pkl'
# Default history path; the cross-validation cell reassigns this name for every fold.
MODEL_HISTORY_PATH = './hist.pkl'
# NOTE(review): not referenced anywhere in the visible part of the notebook.
PRETRAINED_MODEL_PATH = './best_model_no_cross_val.h5'

# Number of K-fold splits used by the cross-validation cell.
fold_num = 4
# NOTE(review): this value is overwritten by the `for seed in seeds` loops further down.
seed = 7
# NOTE(review): not referenced anywhere in the visible part of the notebook.
VALID = False

# Token sequences are padded/truncated to this many tokens.
seq_maxlen = 300
# Dimensionality of the word vectors / embedding layers.
embed_size = 300

## Methods

In [4]:
def cleanName(text):
    """Normalise a piece of free text for tokenisation.

    The text is lower-cased, a fixed set of punctuation/symbol characters is
    removed, and runs of whitespace are collapsed to single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. ``None`` or a float NaN) are tolerated.

    Returns
    -------
    str
        The cleaned text, or the sentinel ``"name error"`` when ``text`` is
        not a string-like object.
    """
    try:
        textProc = text.lower()
        textProc = re.sub('[!@#$_“”¨«»®´·º½¾¿¡§£₤‘’]', '', textProc)
        return " ".join(textProc.split())
    except (AttributeError, TypeError):
        # Only non-string input is expected to fail here. A bare ``except``
        # would also swallow KeyboardInterrupt/SystemExit.
        return "name error"

In [5]:
def remove_stopwords(text, stopwords):
    """Return ``text`` with every whitespace-separated token found in ``stopwords`` removed.

    The surviving tokens are re-joined with single spaces, so the original
    spacing is not preserved.
    """
    return " ".join(token for token in text.split() if token not in stopwords)

In [6]:
def _get_train_feature(train_df, seq_maxlen=300, max_num_words=100000):
    """Turn the ``title_desc`` strings into padded token-id sequences plus labels.

    Steps: count the whitespace-separated vocabulary to choose ``num_words``
    (capped at ``max_num_words``), fit a Keras tokenizer on the texts, pickle
    the fitted tokenizer to ``TOKENIZER_PATH``, then tokenise and pad every
    text to ``seq_maxlen``.

    Returns
    -------
    (X_train_np, y_train_np, tokenizer)
        Padded token-id matrix, the ``deal_probability`` column as an array,
        and the fitted tokenizer.
    """
    print("学習データの語彙数を計算しています")
    vocab_counter = Counter()
    for tokens in train_df["title_desc"].str.split().values:
        vocab_counter.update(tokens)
    # Cap the tokenizer vocabulary at max_num_words.
    num_words = min(max_num_words, len(vocab_counter))

    print("tokenizerモデルを学習しています")
    corpus = train_df["title_desc"].values
    tokenizer = text.Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(list(corpus))

    print("学習済みtokenizerを保存しています")
    with open(TOKENIZER_PATH, 'wb') as out_file:
        pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

    print("コメントをtokenizeしています")
    token_ids = tokenizer.texts_to_sequences(corpus)
    X_train_np = sequence.pad_sequences(token_ids, maxlen=seq_maxlen)
    y_train_np = train_df["deal_probability"].values

    return X_train_np, y_train_np, tokenizer

In [7]:
def _get_train_wvector_coeff(train_df, tokenizer, embed_size=300, seq_maxlen=300):
    """Build the three pretrained embedding matrices and the unseen-word ratio feature.

    For each pretrained fastText file an embedding matrix is built for the
    tokenizer vocabulary, together with the list of vocabulary words the file
    does not contain. The union of those lists is then used to compute, per
    row of ``train_df``, the fraction of whitespace-separated words that are
    unseen (1 for an empty text).

    Side effect: adds a ``word_list`` column to ``train_df``.

    Returns
    -------
    (ftext_neo_wmatrix, ftext_wiki_wmatrix, ftext_cc_wmatrix, unique_rate)
        The three embedding matrices and a float16 column vector of shape (n, 1).
    """
    print("tokenの重みを取得しています")
    vector_files = (FTEXT_PRETRAINED_NEOLOGD_PATH,
                    FTEXT_PRETRAINED_WIKI_PATH,
                    FTEXT_PRETRAINED_COMMONCRAWL_PATH)
    weight_matrices = []
    unseen_words = []
    for vector_file in vector_files:
        weights, missing = _get_weighted_matrix(tokenizer,
                                                vector_file,
                                                embed_size=embed_size,
                                                seq_maxlen=seq_maxlen)
        weight_matrices.append(weights)
        unseen_words.extend(missing)
    # Membership tests run once per word of every document, so use a set:
    # `in` on a list is orders of magnitude slower.
    new_word_set = set(unseen_words)

    print("学習済みword vectorに含まれない新出語の比率を計算しています")
    train_df["word_list"] = train_df["title_desc"].str.split()

    def _unseen_ratio(tokens):
        # Share of tokens missing from at least one pretrained file; 1 for an empty text.
        if len(tokens) == 0:
            return 1
        return len([1 for token in tokens if token in new_word_set]) / len(tokens)

    unique_rate_np = train_df["word_list"].map(_unseen_ratio).astype("float16").values

    ftext_neo_wmatrix, ftext_wiki_wmatrix, ftext_cc_wmatrix = weight_matrices
    return ftext_neo_wmatrix, ftext_wiki_wmatrix, ftext_cc_wmatrix, unique_rate_np.reshape(-1, 1)

In [8]:

def uq_rate(word_list, new_words=None):
    """Return the fraction of ``word_list`` entries that appear in ``new_words``.

    Parameters
    ----------
    word_list : sequence of str
        Tokens of one document.
    new_words : container of str, optional
        Words regarded as unseen. Pass a ``set`` for fast look-ups. When
        omitted, the module-level name ``new_words_list`` is used, as in the
        original implementation (a NameError is raised if it is not defined).

    Returns
    -------
    float or int
        The ratio, or 1 for an empty ``word_list`` (the convention used by the
        other ratio computations in this notebook) instead of dividing by zero.
    """
    if len(word_list) == 0:
        return 1
    if new_words is None:
        new_words = new_words_list
    return sum(1 for word in word_list if word in new_words) / len(word_list)

In [9]:
def _get_weighted_matrix(tokenizer, pretrained_path, embed_size=300, seq_maxlen=300):
    """Load pretrained word vectors and build an embedding matrix for the tokenizer vocabulary.

    Parameters
    ----------
    tokenizer
        Fitted tokenizer exposing ``word_index`` (word -> id) and ``num_words``.
    pretrained_path : str
        Path of a UTF-8 text file with one ``word v1 v2 ...`` entry per line.
    embed_size : int
        Expected dimensionality of each word vector.
    seq_maxlen : int
        Unused. Kept only so existing callers that pass it keep working.

    Returns
    -------
    (embed_matrix, new_words_list)
        ``embed_matrix`` has shape ``(nb_words, embed_size)``. Rows without a
        usable pretrained vector keep their random initialisation, and the
        corresponding words are returned in ``new_words_list``.

    Raises
    ------
    ValueError
        If the file contains no vector of length ``embed_size``.
    """
    # Splitting on a single space is deliberate: strip().split() did not work
    # for the Russian vector files.
    embed_idx = {}
    with open(pretrained_path, encoding="utf-8") as vector_file:
        for word_and_vector in vector_file:
            word, vector = _get_coefs(*word_and_vector.rstrip().rsplit(' '))
            embed_idx[word] = vector

    # Some lines do not yield exactly embed_size numbers (e.g. the header line).
    # Use only well-formed vectors for the statistics. The previous
    # pad_sequences() call defaulted to dtype int32, which truncated the
    # float vectors and corrupted the mean/std.
    valid_vectors = [vec for vec in embed_idx.values() if vec.shape[0] == embed_size]
    if not valid_vectors:
        raise ValueError("no vector of length {:d} found in {}".format(embed_size, pretrained_path))
    all_embs = np.stack(valid_vectors)
    embed_mean, embed_std = all_embs.mean(), all_embs.std()
    del valid_vectors, all_embs  # release the stacked copy before allocating the matrix

    word_idx = tokenizer.word_index
    nb_words = min(tokenizer.num_words, len(word_idx))
    # Random initialisation matching the pretrained distribution. Rows for
    # known words are overwritten below.
    embed_matrix = np.random.normal(embed_mean, embed_std, (nb_words, embed_size))
    new_words_list = []
    for word, idx in word_idx.items():
        if idx >= nb_words:
            continue
        embed_vector = embed_idx.get(word)
        # A vector of the wrong length is treated as missing, so a length-1
        # vector is no longer silently broadcast across the whole row.
        if embed_vector is not None and embed_vector.shape[0] == embed_size:
            embed_matrix[idx] = embed_vector
        else:
            new_words_list.append(word)

    return embed_matrix, new_words_list

In [10]:
def _get_coefs(word, *arr):
    """
    入力された*arr(str)にワードベクトルの数値以外の不純物が含まれていることがあるので，
    try: 数値データであればarr_fixedにappendする
    except: 数値以外の不純物が混じっている場合（floatへキャスト時にValueErrorする場合）は含めない
    """
    arr_fixed = []
    for arr_val in arr:
        try:
            arr_fixed.append(float(arr_val))
        except ValueError:
            pass

    return word, np.asarray(arr_fixed, dtype='float32')

In [11]:
def _get_input_dict(input_np, seq_maxlen=300):
    """2種類の学習済み分散表現モデル（fastText_NEologd, fastText_Wikipedia, fastText_CommonCrawl）で重み付けできるようにdictを作成"""
    input_dict = {
                  'ftext_neo': input_np[:, :seq_maxlen],
                  'ftext_wiki': input_np[:, :seq_maxlen],
                  'ftext_cc': input_np[:, :seq_maxlen],
                  'uniq_rate': input_np[:, seq_maxlen]
                 }

    return input_dict

In [12]:
def _get_model(len_train, ftext_neo_weight, ftext_wiki_weight, ftext_cc_weight, num_words, seq_maxlen=300, 
               embed_size=300, batch_size=5000, epochs=2):
    """Build and compile the BiLSTM + BiGRU regression model.

    Three token-id inputs are embedded with the three pretrained weight
    matrices, concatenated, passed through spatial dropout, a bidirectional
    CuDNN LSTM and a bidirectional CuDNN GRU. The average- and max-pooled
    outputs are concatenated with the ``uniq_rate`` input and fed to a single
    linear unit.

    ``len_train``, ``batch_size`` and ``epochs`` are used only to derive the
    learning-rate decay passed to Adam.
    """
    branch_names = ('ftext_neo', 'ftext_wiki', 'ftext_cc')
    branch_weights = (ftext_neo_weight, ftext_wiki_weight, ftext_cc_weight)

    # All inputs are created first, then the embeddings, so layers are created
    # in the same order as before.
    token_inputs = [Input(shape=(seq_maxlen, ), name=branch) for branch in branch_names]
    inp_urate = Input(shape=[1], name='uniq_rate')
    embedded = [Embedding(num_words, embed_size, weights=[weight])(token_input)
                for token_input, weight in zip(token_inputs, branch_weights)]
    dout = SpatialDropout1D(0.5)(concatenate(embedded))

    lstmed = Bidirectional(CuDNNLSTM(40, return_sequences=True, go_backwards=True))(dout)
    grued = Bidirectional(CuDNNGRU(40, return_sequences=True, go_backwards=True))(lstmed)
    avg_pooled = GlobalAveragePooling1D()(grued)
    max_pooled = GlobalMaxPooling1D()(grued)
    conc_pool_urate = concatenate([avg_pooled, max_pooled, inp_urate])
    outs = Dense(1, activation="linear")(conc_pool_urate)

    model = Model(inputs=token_inputs + [inp_urate], outputs=outs)

    # Learning-rate decay derived from the total number of optimiser steps.
    steps = int(len_train / batch_size) * epochs
    lr_init, lr_fin = 0.002, 0.0002
    optimizer_adam = Adam(lr=lr_init, decay=_get_exp_decay(lr_init, lr_fin, steps))

    model.compile(loss="mean_squared_error",
                  optimizer=optimizer_adam,
                  metrics=[root_mean_squared_error])

    return model

In [13]:
def _get_exp_decay(init, fin, steps): 
    return (init / fin) ** (1 / (steps - 1)) - 1 

In [14]:
# NOTE: known issue — when the tokenizer is fitted on train and test together, val_loss diverges for an unknown reason.
def root_mean_squared_error(y_true, y_pred):
    """Keras metric: square root of the mean squared error, reduced over the last axis."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error, axis=-1))

In [15]:
def _get_test_feature(test_data, seq_maxlen=300):
    """Tokenise the ``title_desc`` column of ``test_data`` with the saved tokenizer.

    The tokenizer is unpickled from ``TOKENIZER_PATH``; the process exits via
    ``sys.exit`` when that file does not exist.

    Returns
    -------
    (X_test_np, tokenizer)
        Token-id matrix padded to ``seq_maxlen`` and the loaded tokenizer.
    """
    test_np = test_data["title_desc"].values
    if not os.path.exists(TOKENIZER_PATH):
        sys.exit("学習済みtokenizerが見つかりません")

    print("学習済みtokenizerをロードしています")
    # pickle executes arbitrary code on load; only use a tokenizer file you created yourself.
    with open(TOKENIZER_PATH, 'rb') as in_file:
        tokenizer = pickle.load(in_file)

    token_ids = tokenizer.texts_to_sequences(test_np)
    X_test_np = sequence.pad_sequences(token_ids, maxlen=seq_maxlen)

    return X_test_np, tokenizer

In [16]:
def _get_test_wvector_coeff(test_df, tokenizer, embed_size=300, seq_maxlen=300):
    """Compute the unseen-word ratio feature for the test set.

    The set of unseen words is loaded from ``NEW_WORDS_LIST_PATH`` when that
    pickle exists; otherwise it is rebuilt from the three pretrained vector
    files via ``_get_weighted_matrix``.

    Side effect: adds a ``word_list`` column to ``test_df``.

    Returns
    -------
    numpy.ndarray
        float16 column vector of shape (n, 1): per row, the fraction of
        whitespace-separated words that are unseen (1 for an empty text).
    """
    if os.path.exists(NEW_WORDS_LIST_PATH):
        # pickle executes arbitrary code on load; only use a file you created yourself.
        with open(NEW_WORDS_LIST_PATH, 'rb') as handle:
            # Fix: the loaded words were never used before, which left the
            # set empty and made every ratio 0.
            new_words_set = set(pickle.load(handle))
    else:
        print("new words listが存在しないので、作成します")
        unseen_words = []
        for vector_file in (FTEXT_PRETRAINED_NEOLOGD_PATH,
                            FTEXT_PRETRAINED_WIKI_PATH,
                            FTEXT_PRETRAINED_COMMONCRAWL_PATH):
            _, missing = _get_weighted_matrix(tokenizer,
                                              vector_file,
                                              embed_size=embed_size,
                                              seq_maxlen=seq_maxlen)
            unseen_words.extend(missing)
        new_words_set = set(unseen_words)

    test_df["word_list"] = test_df["title_desc"].str.split()

    def _unseen_ratio(word_list):
        # Share of words that are unseen; 1 for an empty text.
        if len(word_list) == 0:
            return 1
        return len([1 for word in word_list if word in new_words_set]) / len(word_list)

    unique_rate_np = test_df["word_list"].map(_unseen_ratio).astype("float16").values

    return unique_rate_np.reshape(-1, 1)

In [17]:
@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took, in whole seconds."""
    started = time.time()
    yield
    elapsed = time.time() - started
    print(f'[{name}] done in {elapsed:.0f} s')

## Class

In [18]:
import numpy as np
from keras.callbacks import Callback
#from keras.models = import Model 

class ModelSave(Callback):
    """Keras callback that saves the full model to an ``.h5`` file at the end of epochs.

    Parameters
    ----------
    interval : int
        The model is saved when the zero-based epoch index is a multiple of
        this value.
    fold_idx : int
        1-based fold number used in the file name. ``0`` (or less) selects the
        "no cross validation" file name.
    """

    def __init__(self, interval=1, fold_idx=0):
        # Fix: super(Callback, self) skipped Callback.__init__ entirely.
        super(ModelSave, self).__init__()
        self.interval = interval
        self.fold_idx = fold_idx

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused. The default is None rather than a shared mutable dict.
        if epoch % self.interval == 0:
            print("model save")
            if self.fold_idx > 0:
                self.model.save("best_model_{:d}f_ep{:d}.h5".format(self.fold_idx, epoch+1))
            else:
                self.model.save("best_model_no_cross_val_ep{:d}.h5".format(epoch+1))

## Process (Cross Validation)

In [20]:
# Load the merged train+test text table. Rows without deal_probability are the test rows.
train_test_df = pd.read_csv("text_train_test.csv",
                            dtype={"item_id":str, "title":str, "description":str, "deal_probability":float})

# Fill the missing parts BEFORE concatenating. Previously `title + " " + description`
# was NaN whenever the description was missing, and the later fillna replaced the
# whole text (title included) with the placeholder.
title_filled = train_test_df["title"].fillna("")
description_filled = train_test_df["description"].fillna("без описания")
# cleanName lower-cases, strips the unwanted symbols and collapses whitespace.
train_test_df["title_desc"] = (title_filled + " " + description_filled).astype(str).apply(cleanName)

In [21]:
# Update 2018-06-24
# Stop-word removal (currently disabled)
import nltk
from nltk.corpus import stopwords 

#russian_stop = set(stopwords.words('russian'))
#train_test_df["title_desc"] = train_test_df["title_desc"].apply(lambda text: remove_stopwords(text, russian_stop))
# Drop the special-case rows whose item_id is listed in elim_item_id.csv
elim_id_list = pd.read_csv("elim_item_id.csv")
import sys
elim_idx_list = []
for idx, item in enumerate(elim_id_list["item_id"]):
    # Index labels of the matching rows, taken from the boolean mask directly
    # (no intermediate filtered DataFrame).
    elim_idx = train_test_df.index[train_test_df["item_id"] == item]
    if len(elim_idx) != 0:
        # Only the first match is removed, as before.
        elim_idx_list.append(elim_idx[0])
    sys.stdout.write("\r {:d}".format(idx+1))
    sys.stdout.flush()
# elim_idx_list holds index *labels*, so drop by label. The previous
# `index[elim_idx_list]` treated them as positions, which is only correct
# for a default RangeIndex.
train_test_df = train_test_df.drop(elim_idx_list, axis=0).reset_index(drop=True)

 100

In [22]:
# Train-test split: rows with a deal_probability are training rows, the rest are test rows.
is_test_row = train_test_df["deal_probability"].isnull()
train_df = train_test_df[~is_test_row].reset_index()
test_df = train_test_df[is_test_row].reset_index()

len_train, len_test = len(train_df), len(test_df)

In [23]:
# Train-test remerge: stack train rows first, then test rows, so that the first
# len_train rows of the merged frame are exactly the training rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_test_df = pd.concat([train_df, test_df])
train_test_df = train_test_df.reset_index()

In [24]:
# Tokenise the merged train+test corpus and build the pretrained embedding matrices.
X_train_test_np, y_train_test_np, tokenizer = _get_train_feature(train_test_df, seq_maxlen=seq_maxlen)
(ftext_neo_wmatrix,
 ftext_wiki_wmatrix,
 ftext_cc_wmatrix,
 unique_rate_np) = _get_train_wvector_coeff(train_test_df,
                                            tokenizer,
                                            embed_size=embed_size,
                                            seq_maxlen=seq_maxlen)
# Append the unseen-word ratio as one extra trailing column; _get_input_dict
# reads it back from column index seq_maxlen.
X_train_test_np = np.concatenate([X_train_test_np, unique_rate_np], axis=1)
gc.collect()

# Training hyper-parameters.
val_ratio = 0.05
batch_size = 250
epochs = 2
num_words = min(tokenizer.num_words, len(tokenizer.word_index))

# The merged frame holds the train rows first, then the test rows.
X_train_np, X_test_np = X_train_test_np[:len_train], X_train_test_np[len_train:]
y_train_np = y_train_test_np[:len_train]

"""Update 2018-06-24 seed averager"""
# One full K-fold run per seed; the seed controls the fold assignment.
seeds = [7, 42, 10, 33]
# The test inputs are the same for every fold and seed, so build them once.
X_test_dict = _get_input_dict(X_test_np, seq_maxlen)

for seed in seeds:
    kfold = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
    # Out-of-fold predictions and targets, one chunk per fold (in fold order).
    oof_pred_parts = []
    oof_target_parts = []
    # Running sum of the per-fold test predictions.
    sum_output = None

    with timer("Cross validation elapsed time:"):
        for fold_idx, (train_idx, val_idx) in enumerate(kfold.split(X_train_np)):
            X_val_np_kf = X_train_np[val_idx]
            X_train_np_kf = X_train_np[train_idx]
            y_val_np_kf = y_train_np[val_idx]
            y_train_np_kf = y_train_np[train_idx]

            print("train size: ", len(X_train_np_kf))
            print("valid size: ", len(X_val_np_kf))
            print("test size : ", len(X_test_np))
            gc.collect()

            # Fix: keep the fold size under its own name. Reusing `len_train`
            # here overwrote the global train-set length, so re-running the
            # feature cell afterwards sliced train/test at the wrong position.
            len_train_kf = len(X_train_np_kf)

            X_train_dict = _get_input_dict(X_train_np_kf, seq_maxlen)
            X_val_dict = _get_input_dict(X_val_np_kf, seq_maxlen)
            del X_train_np_kf, X_val_np_kf
            gc.collect()

            model = _get_model(len_train_kf,
                               ftext_neo_wmatrix,
                               ftext_wiki_wmatrix,
                               ftext_cc_wmatrix,
                               num_words=num_words,
                               seq_maxlen=seq_maxlen,
                               embed_size=embed_size,
                               batch_size=batch_size,
                               epochs=epochs)

            # Saves one checkpoint per epoch; the best one is reloaded below.
            modelsaver = ModelSave(fold_idx=fold_idx+1)

            history = model.fit(X_train_dict,
                                y_train_np_kf,
                                batch_size=batch_size,
                                epochs=epochs,
                                validation_data=(X_val_dict, y_val_np_kf),
                                callbacks=[modelsaver],
                                shuffle=True,
                                verbose=1)

            MODEL_HISTORY_PATH = "history_{:d}f_seed{:d}.pkl".format(fold_idx+1, seed)
            print("学習が完了しました。ファイル名：{:s}として履歴を保存しています。".format(MODEL_HISTORY_PATH))
            with open(MODEL_HISTORY_PATH, 'wb') as handle:
                pickle.dump(history.history, handle)

            # Release the training graph before loading the checkpoint.
            K.clear_session()
            del model
            gc.collect()

            print("seed={:d}のoof_trainを取得します．".format(seed))
            # Checkpoint files are numbered from 1, hence the +1.
            best_epoch = np.argmin(history.history["val_loss"]) + 1
            best_model = load_model("best_model_{:d}f_ep{:d}.h5".format(fold_idx+1, best_epoch),
                                    custom_objects={'root_mean_squared_error': root_mean_squared_error})
            oof_pred_parts.append(best_model.predict(X_val_dict, batch_size=batch_size, verbose=1))
            oof_target_parts.append(y_val_np_kf)

            print("Predicting...")
            fold_test_pred = best_model.predict(X_test_dict, batch_size=batch_size, verbose=1)
            sum_output = fold_test_pred if sum_output is None else sum_output + fold_test_pred

            K.clear_session()
            del best_model
            gc.collect()

    oof_X_train = np.concatenate(oof_pred_parts, axis=0)
    oof_y_train = np.concatenate(oof_target_parts, axis=0)

    print('Save out-of-fold X, y train array')
    np.save('bi_lstm_gru_ftext_neo_wiki_cc_oof_X_train_{:d}f_seed{:d}.npy'.format(fold_num, seed),
            oof_X_train)
    np.save('bi_lstm_gru_ftext_neo_wiki_cc_oof_y_train_{:d}f_seed{:d}.npy'.format(fold_num, seed),
            oof_y_train)

    print("writing...")
    # Average of the fold models' test predictions.
    outputs = sum_output / fold_num
    # predict() returns shape (n, 1); flatten it before storing as a column.
    test_df["deal_probability"] = outputs.ravel()
    sub_df = pd.read_csv("sample_submission.csv")
    sub_df = pd.merge(sub_df[["item_id"]], test_df[["item_id", "deal_probability"]],
                      on="item_id", how='left')
    # The output unit is linear, so clamp predictions to the valid range [0, 1].
    sub_df["deal_probability"] = sub_df["deal_probability"].clip(0., 1.)
    sub_df[["item_id","deal_probability"]].to_csv(
        'bi_lstm_gru_ftext_neo_wiki_cc_{:d}f_seed{:d}.csv'.format(fold_num, seed), index=False)

学習データの語彙数を計算しています
tokenizerモデルを学習しています
学習済みtokenizerを保存しています
コメントをtokenizeしています
tokenの重みを取得しています
学習済みword vectorに含まれない新出語の比率を計算しています
train size:  1127493
valid size:  375831
test size :  508438
Instructions for updating:
Use the retry module or similar alternatives.
Train on 1127493 samples, validate on 375831 samples
Epoch 1/2
model save
Epoch 2/2
model save
学習が完了しました。ファイル名：history_1f_seed7.pklとして履歴を保存しています。
seed=7のoof_trainを取得します．
Predicting...
train size:  1127493
valid size:  375831
test size :  508438
Train on 1127493 samples, validate on 375831 samples
Epoch 1/2
model save
Epoch 2/2
model save
学習が完了しました。ファイル名：history_2f_seed7.pklとして履歴を保存しています。
seed=7のoof_trainを取得します．
Predicting...
train size:  1127493
valid size:  375831
test size :  508438
Train on 1127493 samples, validate on 375831 samples
Epoch 1/2
model save
Epoch 2/2
model save
学習が完了しました。ファイル名：history_3f_seed7.pklとして履歴を保存しています。
seed=7のoof_trainを取得します．
Predicting...
train size:  1127493
valid size:  375831
test size :  50843

### Seed averager

In [27]:
# Average the per-seed submission files. They are all built from the same
# sample_submission.csv, so they are combined row by row.
tmp_proba = None
for seed in seeds:
    sub_df = pd.read_csv('bi_lstm_gru_ftext_neo_wiki_cc_{:d}f_seed{:d}.csv'
                         .format(fold_num, seed))
    seed_proba = sub_df["deal_probability"].values
    if tmp_proba is None:
        # Take a float copy: accumulating in place into the array returned by
        # `.values` would write into the DataFrame's buffer, and fails when
        # that array is read-only.
        tmp_proba = seed_proba.astype("float64")
    else:
        tmp_proba = tmp_proba + seed_proba
outputs = tmp_proba / len(seeds)
# sub_df is the last seed's frame; only its prediction column is replaced.
sub_df["deal_probability"] = outputs

In [30]:
# Write the seed-averaged submission; the file name lists the seeds that were averaged.
sub_df.to_csv("bi_lstm_gru_seed_avg_7_42_10_33.csv", index=False)