## Библиотеки 

In [1]:
import numpy as np
import pandas as pd
import keras
from keras import layers as L
from keras.utils import to_categorical
import warnings
warnings.filterwarnings("ignore")

## Получение данных

In [2]:
df_train = pd.read_csv('df_train.csv', sep=';')
df_test = pd.read_csv('df_test.csv', sep=';')

In [3]:
df_train['Data'] = df_train.Data.apply(lambda s: list(map(int, s.split(','))))
df_train['Target'] = df_train.Target.apply(lambda s: list(map(int, s.split(','))))
df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

# df_test['Target'] = df_test.Data.apply(lambda row: row[-10:])
# df_test['Data'] = df_test.Data.apply(lambda row: row[:-10])
# all_data = pd.concat([df_train[['Data', 'Target']], df_test[['Data', 'Target']]], axis=0, ignore_index=True)

# df_test = pd.read_csv('df_test.csv', sep=';')
# df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

## Подготовка обучающего датасета

In [4]:
pad = 9999
all_codes = list(df_train.Data.explode().unique()) + [pad]
vocab = {code: i+1 for i, code in enumerate(all_codes)}
decode_vocab = {i: code for code, i in vocab.items()}

n_sequences = len(df_train)
n_timesteps = 500
n_outputs = 10
n_features = len(vocab) + 1

X1 = np.empty((n_sequences, n_timesteps))
X2 = np.empty((n_sequences, n_outputs))
y = np.empty((n_sequences, n_outputs))

for i in range(n_sequences):
    X_seq = df_train.Data[i][-n_timesteps:]
    pad_length = n_timesteps - len(X_seq)
    X1[i] = [vocab[x] for x in X_seq] + [vocab[pad]] * pad_length
    y[i] = [vocab[x] for x in df_train.Target[i]]
    X2[i] = [0] + list(y[i][:-1])

X1 = to_categorical(X1, num_classes=n_features)
X2 = to_categorical(X2, num_classes=n_features)
y = to_categorical(y, num_classes=n_features)

X1.shape, X2.shape, y.shape

((7033, 500, 186), (7033, 10, 186), (7033, 10, 186))

## Построение и обучение модели

In [5]:
# define training encoder
enc_inputs = L.Input(shape=(None, n_features))
encoder_inputs = L.Masking(mask_value=X1[0][-1])(enc_inputs)
encoder = L.LSTM(512, return_state=True, dropout=0.3)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

# define training decoder
dec_inputs = L.Input(shape=(None, n_features))
decoder_inputs = L.Masking(mask_value=X1[0][-1])(dec_inputs)
decoder_lstm = L.LSTM(512, return_sequences=True, return_state=True, dropout=0.3)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = L.Dense(n_features, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = keras.Model([enc_inputs, dec_inputs], decoder_outputs)

# define inference encoder
encoder_model = keras.Model(enc_inputs, encoder_states)

# define inference decoder
decoder_state_input_h = L.Input(shape=(512,))
decoder_state_input_c = L.Input(shape=(512,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model([dec_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states)

# define model
model.compile(optimizer='nadam', loss='categorical_crossentropy', metrics=['accuracy'])

# train model
model.fit([X1, X2], y, epochs=10, batch_size=32, verbose=1, workers=-1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1d1112a7a10>

## Функции

In [6]:
def predictioning(seq):
    seq = seq[-n_timesteps:]
    pad_length = n_timesteps - len(seq)
    seq = [vocab[x] for x in seq] + [vocab[pad]] * pad_length
    seq = np.array([to_categorical(seq, num_classes=n_features)])
    
    # encode
    state = encoder_model.predict(seq, verbose=0)
    
    # start of sequence input
    target_seq = np.zeros((1, 1, n_features))
    
    # collect predictions
    output = []
    
    for t in range(n_outputs):
        # predict next char
        yhat, h, c = decoder_model.predict([target_seq] + state, verbose=0)
        # store prediction
        output.append(yhat[0,0])
        # update state
        state = [h, c]
        # update target sequence
        target_seq = yhat
        
    return [decode_vocab[np.argmax(x)] for x in output]

In [7]:
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

## Оценка модели

In [8]:
df_train['Predicted'] = df_train['Data'].apply(predictioning)
print(mapk(df_train['Target'], df_train['Predicted']))

0.11034534473536323
