In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.signal import resample
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from gensim.models import Word2Vec
import os
import warnings
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import backend as K
import time
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import models, optimizers, regularizers, initializers,constraints
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

In [None]:
# Read the three CSV inputs in one pass: train readings, test readings and
# the example submission file.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/sensor_train_final.csv',
        '../dataset/sensor_test_final.csv',
        '../dataset/submit_example.csv',
    )
)
# One target value per fragment: the minimum behavior_id found in it,
# indexed by fragment_id.
y = train['behavior_id'].groupby(train['fragment_id']).min()
# Train rows stacked on top of test rows; column order is left unsorted.
data = pd.concat([train, test], sort=False)

In [None]:
def get_feature(df, stats=('min', 'max', 'median', 'std', 'mean')):
    """Build one row of summary statistics per ``fragment_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fragment_id`` and the columns to summarise (every
        column whose name contains ``'acc'``). ``behavior_id`` is optional.
        The frame is cast to float before aggregating.
    stats : sequence of str, optional
        Aggregation names passed to ``GroupBy.agg``. Defaults to the five
        statistics the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        One row per fragment in ascending ``fragment_id`` order, holding
        ``fragment_id``, ``behavior_id`` (only when ``df`` has it) and one
        ``<column>_<stat>`` column per (column, stat) pair.
    """
    df = df.astype("float")

    # Pick the id columns explicitly. The previous bare ``except`` around the
    # column selection also swallowed unrelated errors.
    id_cols = ['fragment_id']
    if 'behavior_id' in df.columns:
        id_cols.append('behavior_id')

    # groupby() below yields groups in ascending key order, so the id rows
    # are sorted the same way. Without this, ids in first-appearance order
    # would be paired with statistics in sorted order on unsorted input.
    tmp = (
        df[id_cols]
        .drop_duplicates(subset=['fragment_id'])
        .sort_values('fragment_id')
        .reset_index(drop=True)
    )

    grouped = df.groupby('fragment_id')
    for f in tqdm([col for col in df.columns if 'acc' in col]):
        for stat in stats:
            tmp[f + '_' + stat] = grouped[f].agg(stat).values

    return tmp

In [None]:
# Per-fragment statistics for each split. The id (and, for train, label)
# columns are dropped so only the statistic columns remain.
feature_train = get_feature(train).drop(columns=["fragment_id", "behavior_id"])
feature_test = get_feature(test).drop(columns=["fragment_id"])

In [None]:
# Show (rows, columns) of the test and train feature frames.
(feature_test.shape, feature_train.shape)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sensor channels that get tokenised and embedded. trans_dat() concatenates
# the per-channel index blocks in exactly this order.
m_ids = [
    'acc_x', 'acc_y', 'acc_z',
    'acc_xg', 'acc_yg', 'acc_zg',
]

def get_data(train, m_id):
    """Return one token "sentence" per fragment for column ``m_id``.

    Every value of ``train[m_id]`` is converted with ``str`` and the
    resulting tokens are grouped by ``fragment_id``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``fragment_id`` column and the column ``m_id``.
        It is NOT modified.
    m_id : str
        Name of the column to tokenise.

    Returns
    -------
    list of list of str
        One list of tokens per fragment, fragments in ascending
        ``fragment_id`` order, tokens in row order.
    """
    # Stringify into a separate Series. Assigning the strings back into
    # ``train`` (as before) silently turned the caller's numeric column
    # into text for every later cell.
    tokens = train[m_id].map(str)

    # Group by the raw key values so no index alignment is involved
    # (the concatenated train+test frame carries duplicate index labels).
    per_fragment = tokens.groupby(train['fragment_id'].values).agg(
        lambda values: values.tolist()
    )
    return per_fragment.tolist()

def get_w2v_embedding(m_id, dats=None, maxlen=60):
    """Load the saved Word2Vec model for ``m_id`` and return either its
    embedding matrix or the token-id encoding of ``dats``.

    Parameters
    ----------
    m_id : str
        Channel name; selects ``./w2v/w2v_<m_id>_10d.model``.
    dats : pandas.DataFrame or None, optional
        When given and non-empty, its ``m_id`` column is tokenised per
        fragment via ``get_data`` and encoded as token ids. ``None`` or an
        empty frame selects the embedding-matrix branch instead.
    maxlen : int, optional
        Length every id sequence is padded/truncated to (default 60).

    Returns
    -------
    numpy.ndarray
        * ``dats`` given: int32 array with ``maxlen`` columns, one row per
          fragment, zero-padded at the end (``padding='post'``).
        * otherwise: array of shape ``(vocab_size + 1, vector_size)`` whose
          row 0 is all zeros and whose row ``i`` is the vector of the token
          with id ``i``.
    """
    path = "./w2v/w2v_{}_10d.model".format(m_id)
    w2v_model = Word2Vec.load(path)

    # Every token known to the model. ``wv.vocab`` is the gensim 3.x
    # vocabulary mapping, matching the ``size=``/``iter=`` keywords used
    # when the models are trained in this notebook.
    vocab_list = list(w2v_model.wv.vocab)

    # token -> integer id, used to tokenise the corpus. Ids start at 1;
    # id 0 is shared by padding and by tokens missing from the vocabulary.
    word_index = {" ": 0}
    word_index.update(
        (word, token_id) for token_id, word in enumerate(vocab_list, start=1)
    )

    if dats is not None and not dats.empty:
        sentences = get_data(dats, m_id)
        return pad_sequences(
            [[word_index.get(str(w), 0) for w in sen] for sen in sentences],
            dtype='int32',
            padding='post',
            maxlen=maxlen,
        )

    # Embedding-matrix branch: the matrix is only built when it is the
    # requested output (it used to be built and discarded on every call).
    embeddings = np.zeros((len(vocab_list) + 1, w2v_model.vector_size))
    for token_id, word in enumerate(vocab_list, start=1):
        embeddings[token_id] = w2v_model.wv[word]
    return embeddings
    
def trans_dat(x):
    """Encode a sensor frame as one wide matrix of token ids.

    Calls ``get_w2v_embedding(channel, dats=x)`` for every channel in
    ``m_ids`` and joins the resulting index matrices side by side
    (``axis=1``), in ``m_ids`` order.
    """
    per_channel = [get_w2v_embedding(channel, dats=x) for channel in m_ids]
    return np.concatenate(per_channel, axis=1)

# Train one skip-gram (sg=1) Word2Vec model with hierarchical softmax (hs=1)
# per sensor channel on the combined train + test readings: 20-dimensional
# vectors, context window of 8, 64 passes over the corpus.
# NOTE: the files keep a "_10d" suffix although size=20. The loader reads
# this exact name, so it must not be changed here alone.
# NOTE(review): seed=2020 alone may not give bit-identical models; the gensim
# docs also call for workers=1 and a fixed PYTHONHASHSEED - confirm whether
# exact reproducibility is required.

# save() fails if the target directory is missing, so create it up front.
os.makedirs("./w2v", exist_ok=True)

for m_id in m_ids:
    print("========",m_id,"============")
    sentences = get_data(data, m_id)
    w2v_model = Word2Vec(sentences, size=20, iter=64, window=8, seed=2020, sg=1, hs=1)
    w2v_model.save("./w2v/w2v_{}_10d.model".format(m_id))

In [None]:
# Embedding matrix of every channel, keyed by channel name.
w2v_emb = dict((channel, get_w2v_embedding(channel)) for channel in m_ids)

In [None]:
# Token-id matrices for both splits: x from train, t from test
# (train is encoded first, then test).
x, t = (trans_dat(frame) for frame in (train, test))

In [None]:
def bna(x_train, x_test):
    """Min-max scale every column of ``x_train`` and the same-named column
    of ``x_test``.

    For each column a fresh ``MinMaxScaler`` is fitted on the train and test
    values stacked together, then applied to both frames. The scaled values
    are written back into the input frames, which are also returned.
    """
    for name in x_train.columns:
        train_col = x_train[[name]].values
        test_col = x_test[[name]].values

        # The scaling range comes from both splits combined.
        scaler = MinMaxScaler()
        scaler.fit(np.vstack([train_col, test_col]))

        x_train[name] = scaler.transform(train_col).flatten()
        x_test[name] = scaler.transform(test_col).flatten()
    return x_train, x_test

In [None]:
def Net(w2v, seq_len=60, n_features=30, n_classes=20):
    """Build the (uncompiled) multi-input classifier.

    Each sensor channel's token-id sequence passes through its own frozen
    Embedding initialised from ``w2v`` and is averaged over the sequence
    axis. The pooled vectors are concatenated with the ``feature`` input and
    fed to three dense blocks followed by a softmax layer.

    Parameters
    ----------
    w2v : mapping
        Channel name -> 2-D embedding matrix of shape
        ``(vocab_size, vector_size)``. Must provide the keys ``acc_x``,
        ``acc_y``, ``acc_z``, ``acc_xg``, ``acc_yg`` and ``acc_zg``.
    seq_len : int, optional
        Length of every token-id input (default 60).
    n_features : int, optional
        Width of the ``feature`` input (default 30).
        NOTE(review): 30 presumably equals 6 channels x 5 statistics from
        get_feature() - confirm against the feature frame.
    n_classes : int, optional
        Size of the softmax output (default 20).

    Returns
    -------
    tensorflow.keras.Model
        Inputs are named after the channels plus ``"feature"``, in the order
        acc_x, acc_y, acc_z, acc_xg, acc_yg, acc_zg, feature.

    Raises
    ------
    KeyError
        If ``w2v`` lacks one of the channel keys.
    """
    channels = ("acc_x", "acc_y", "acc_z", "acc_xg", "acc_yg", "acc_zg")
    matrices = [w2v[name] for name in channels]

    # All Input layers are created first, in the same order as before the
    # branches were folded into a loop.
    seq_inputs = [Input(shape=(seq_len,), name=name) for name in channels]
    feature = Input(shape=(n_features,), name="feature")

    # One frozen embedding + average pooling per channel. (A variant that
    # concatenated average and max pooling was tried and is disabled.)
    pooled = []
    for seq_input, matrix in zip(seq_inputs, matrices):
        embedded = Embedding(input_dim=matrix.shape[0],
                             output_dim=matrix.shape[1],
                             input_length=seq_len,
                             weights=[matrix],
                             trainable=False)(seq_input)
        pooled.append(GlobalAveragePooling1D()(embedded))

    X = concatenate(pooled + [feature])
    X = BatchNormalization()(X)

    # Dense widths are multiples of the concatenated input width.
    X_len = X.shape[1]
    X = Dense(X_len*4, activation="relu")(X)
    X = BatchNormalization()(X)
    X = Dropout(0.15)(X)
    X = Dense(X_len*2, activation="relu")(X)
    X = BatchNormalization()(X)
    X = Dropout(0.15)(X)
    X = Dense(X_len*1, activation="relu")(X)
    # NOTE(review): unlike the two blocks above, Dropout comes before
    # BatchNormalization here. Kept as is - confirm it is intentional.
    X = Dropout(0.15)(X)
    X = BatchNormalization()(X)
    X = Dense(n_classes, activation="softmax")(X)

    return Model(seq_inputs + [feature], X)

In [None]:
# Stratified k-fold training. Every fold trains a fresh Net, fills the
# out-of-fold rows of ``train_pred`` and adds 1/k of its test prediction to
# ``proba_t``.
k = 20
N_CLASSES = 20   # number of target classes; matches the softmax width of Net
SEQ_LEN = 60     # token ids per channel inside the wide x / t matrices


def _fold_inputs(seq_idx, features, rows=None):
    """Assemble the named-input dict for fit()/predict().

    ``seq_idx`` is a wide token-id matrix whose channel blocks are laid out
    in ``m_ids`` order, ``SEQ_LEN`` columns each. ``features`` is the
    matching feature frame. ``rows`` selects a subset by position
    (``None`` keeps every row).
    """
    if rows is None:
        rows = slice(None)
    inputs = {
        name: seq_idx[rows, pos * SEQ_LEN:(pos + 1) * SEQ_LEN]
        for pos, name in enumerate(m_ids)
    }
    inputs["feature"] = features.iloc[rows, :]
    return inputs


kfold = StratifiedKFold(k, shuffle=True, random_state=256)

# Size the result arrays from the data instead of hard-coding row counts.
proba_t = np.zeros((t.shape[0], N_CLASSES))
train_pred = np.zeros((x.shape[0], N_CLASSES))

# Loop-invariant work, done once instead of once per fold.
y_ = to_categorical(y, num_classes=N_CLASSES)
test_inputs = _fold_inputs(t, feature_test)

for fold, (train_idx, val_idx) in enumerate(kfold.split(x, y)):
    print("fold-K: ",fold+1)

    # Drop the previous fold's graph/state so k model builds do not pile up.
    K.clear_session()

    model = Net(w2v_emb)
    model.compile(
                optimizer=Adam(),
                loss="categorical_crossentropy",
                metrics=['accuracy']
                )
    callbacks = [
                ReduceLROnPlateau(monitor = 'val_accuracy', factor = 0.5, patience = 20, verbose = 0, mode = 'max'),
                # restore_best_weights stands in for the disabled
                # ModelCheckpoint/load_weights pair: without it the
                # predictions below would come from the last-epoch weights,
                # `patience` epochs past the best validation accuracy.
                EarlyStopping(monitor = 'val_accuracy', patience = 50, mode = 'max', verbose = 0,
                              restore_best_weights = True),
                ]

    val_inputs = _fold_inputs(x, feature_train, val_idx)

    model.fit(_fold_inputs(x, feature_train, train_idx),
              y_[train_idx,:],
              epochs=400,
              batch_size=64,
              verbose=1,
              shuffle=True,
              validation_data=(val_inputs, y_[val_idx,:]),
              callbacks=callbacks,
             )

    # Out-of-fold prediction for the rows this fold did not train on.
    train_pred[val_idx] = model.predict(val_inputs, verbose=0, batch_size=64)

    # Each fold contributes 1/k of the averaged test prediction.
    proba_t += model.predict(test_inputs, verbose=0, batch_size=128) / k

In [None]:
# Write the accumulated test-set probability array to disk.
np.save(file='dnn_pred.npy', arr=proba_t)