In [None]:
# --- Task 1 preprocessing ----------------------------------------------------
# Restrict the training sessions to the Task 1 locales.
locales_task1 = ['UK', 'DE', 'JP']
sessions_train_task1 = sessions_train[sessions_train['locale'].isin(locales_task1)]


# Attach the attributes of each session's target product (`next_item`).
# The left join keeps sessions whose target has no match in `products_train`;
# their attribute columns are NaN.
# NOTE(review): the join key is the product id only. If `products_train` can
# list the same id more than once (e.g. once per locale), sessions are
# duplicated by this merge -- confirm against the product table.
sessions_train_task1 = sessions_train_task1.merge(
    products_train, left_on='next_item', right_on='id', how='left'
)


# Whitespace-tokenize product titles and descriptions. str() is applied first,
# so a missing value becomes the single token 'nan' instead of raising.
sessions_train_task1['title_tokens'] = sessions_train_task1['title'].apply(lambda x: str(x).split())
sessions_train_task1['desc_tokens'] = sessions_train_task1['desc'].apply(lambda x: str(x).split())


# One-hot encode categorical features (missing values become the '' category).
categorical_features = ['brand', 'color', 'size', 'model', 'material', 'author']
# Use the encoder's default sparse output and densify explicitly: the keyword
# selecting dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so passing either
# keyword fails on some installed version.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(
    sessions_train_task1[categorical_features].fillna('')
).toarray()


# Wrap the encoded matrix in a DataFrame with one named column per category.
# Reusing the sessions index guarantees the row-wise alignment that the
# axis=1 concat below relies on.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_features),
    index=sessions_train_task1.index,
)


# Min-max scale the price. The scaler needs 2-D input (hence the double
# brackets) and returns an (n, 1) array, flattened here for the 1-D column.
scaler = MinMaxScaler()
sessions_train_task1['price_norm'] = scaler.fit_transform(sessions_train_task1[['price']]).ravel()


# Concatenate the one-hot columns onto the main DataFrame.
sessions_train_task1_preprocessed = pd.concat([sessions_train_task1, encoded_df], axis=1)


# --- Seq2seq model with attention ---------------------------------------------
# Model hyperparameters
vocab_size = 10000                          # Vocabulary size: embedding rows and softmax width
embedding_dim = 128                         # Dimension of the token embeddings
lstm_units = 128                            # Hidden size of the encoder and decoder LSTMs
num_encoded_features = encoded_df.shape[1]  # Number of one-hot encoded feature columns
max_seq_length = 20                         # Length every input/target sequence is padded or truncated to


# Encoder: embed the token ids and run an LSTM. The per-timestep outputs are
# kept for attention; the final (h, c) state seeds the decoder.
encoder_inputs = Input(shape=(max_seq_length, ), name='encoder_inputs')
encoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True, name='encoder_embedding')(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True, name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]


# Decoder: same structure, initialised from the encoder's final state.
decoder_inputs = Input(shape=(max_seq_length, ), name='decoder_inputs')
decoder_embedding = Embedding(vocab_size, embedding_dim, mask_zero=True, name='decoder_embedding')(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True, name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)


# Attention: Keras' Attention layer takes [query, value]. Each decoder timestep
# is the query and attends over the encoder outputs (value), so the result has
# one context vector per decoder timestep. With the arguments the other way
# round the encoder states would query the decoder, and the context would be
# indexed by encoder position instead.
attention_layer = Attention(name='attention_layer')
attention_output = attention_layer([decoder_outputs, encoder_outputs])


# Concatenate each decoder state with its attention context vector.
decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attention_output])


# Per-timestep softmax over the vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_concat_input)


# Define the model
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)


# Compile the model. The sparse loss expects integer class-id targets,
# not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')


# --- Sequence preparation -----------------------------------------------------
# Build the token vocabulary from the previous-item strings of each session.
# `num_words` bounds the indices the tokenizer emits; every other token maps
# to the '<OOV>' index.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(sessions_train_task1_preprocessed['prev_items'])
word_index = tokenizer.word_index


# Convert the previous-item strings to integer sequences, zero-padded at the
# end to `max_seq_length`.
input_sequences = tokenizer.texts_to_sequences(sessions_train_task1_preprocessed['prev_items'])
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')


# Prepare target sequences from the next item, using the same vocabulary.
# NOTE: the tokenizer was fitted on `prev_items` only, so a next item that
# never occurs there is encoded as '<OOV>'.
target_sequences = tokenizer.texts_to_sequences(sessions_train_task1_preprocessed['next_item'])
target_sequences = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')


# Keep the targets as integer class ids with a trailing axis of size 1.
# The model is compiled with sparse_categorical_crossentropy, which expects
# class ids; one-hot encoding them mismatches that loss and would also
# materialise a (samples, max_seq_length, vocab_size) array.
target_sequences = np.expand_dims(target_sequences, axis=2)


# Subsample the dataset: keep only the first 1/8 of the rows.
input_sequences = input_sequences[: len(input_sequences) // 8]
target_sequences = target_sequences[: len(target_sequences) // 8]


# Split the data into training and validation sets (80/20, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(input_sequences, target_sequences, test_size=0.2, random_state=42)


# --- Training -----------------------------------------------------------------
# Training hyperparameters
batch_size = 32
epochs = 10


# NOTE: tf.keras.backend.clear_session() is intentionally not called here.
# It resets Keras' global state and does not release memory held by the live
# `model`, so calling it between building the model and fitting it is at best
# a no-op. If a reset is wanted, do it before the model is constructed.


# Train the model, keeping only the weights with the lowest validation loss.
# NOTE(review): the decoder is fed the same sequences as the encoder
# ([X, X]). Teacher forcing would normally feed the target shifted by one
# step -- confirm this is intended.
checkpoint = ModelCheckpoint('seq2seq_attention_model.h5', save_best_only=True, monitor='val_loss', mode='min')
history = model.fit(
    [X_train, X_train],
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, X_val], y_val),
    callbacks=[checkpoint],
)