Here's a basic implementation of the LSTM model for the Task 1 using Keras and TensorFlow. Please note that this is just a starting point, and you may need to fine-tune the model and preprocess the data more thoroughly for better results.

GPU Code

In [54]:
# importing libraries
import os
import glob
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec

import sklearn
import tensorflow as tf
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
from tensorflow.keras.utils import to_categorical


# ignoring warnings
import warnings
warnings.filterwarnings('ignore')


# setting up the environment
task = 'task1'
train_path = './data/train/'
test_path = './data/test/sessions_test_' + task + '.csv'
PREDS_PER_SESSION = 100
slice_size = 1000

# slicing data for memory management
if slice_size != None:
    train_prod = pd.read_csv(train_path + '/products_train.csv', nrows=slice_size)
    train_sess = pd.read_csv(train_path + '/sessions_train.csv', nrows=slice_size)
    test_sess = pd.read_csv(test_path, nrows=slice_size)
else:
    train_prod = pd.read_csv(train_path + '/products_train.csv')
    train_sess = pd.read_csv(train_path + '/sessions_train.csv')
    test_sess = pd.read_csv(test_path)

train_prod: Index(['id', 'locale', 'title', 'price', 'brand', 'color', 'size', 'model',
       'material', 'author', 'desc'],
      dtype='object')
train_sess: Index(['prev_items', 'next_item', 'locale'], dtype='object')
test_sess: Index(['prev_items', 'locale'], dtype='object')


In [56]:
train_prod.head()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,RED DRAGON Amberjack 3 - Steel Tip 22 Gramm Wo...,30.95,RED DRAGON,,,RDD0089,,,Amberjacks Steel Dartpfeile sind verfügbar in ...
1,B08PRYN6LD,DE,Simply Keto Lower Carb* Schokodrops ohne Zucke...,17.9,Simply Keto,,750 g (1er Pack),,,,🌱 NATÜRLICHE SÜSSE DURCH ERYTHRIT - Wir stelle...
2,B09MBZJ48V,DE,"Sennheiser 508377 PC 5.2 Chat, Stilvolles Mult...",68.89,Sennheiser,Multi-Colour,One size,508377,Kunstleder,,3.5 MM BUCHSE - Kann problemlos an Geräte mit ...
3,B08ZN6F26S,DE,AmyBenton Auto ab 1 2 3 ahre - Baby Aufziehbar...,18.99,Amy & Benton,Animal Car,,2008B,aufziehauto 1 jahr,,【Auto aufziehbar】: Drücken Sie einfach leicht ...
4,B094DGRV7D,DE,PLAYMOBIL - 70522 - Cavaliere mit grauem Pony,7.17,PLAYMOBIL,Nicht Zutreffend.,OneSize,70522,Polypropylen,,Inhalt: 1 Stück


In [60]:
train_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        1000 non-null   object 
 1   locale    1000 non-null   object 
 2   title     1000 non-null   object 
 3   price     1000 non-null   float64
 4   brand     981 non-null    object 
 5   color     748 non-null    object 
 6   size      572 non-null    object 
 7   model     467 non-null    object 
 8   material  576 non-null    object 
 9   author    49 non-null     object 
 10  desc      931 non-null    object 
dtypes: float64(1), object(10)
memory usage: 86.1+ KB


In [57]:
train_sess.head()

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE


In [59]:
train_sess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   prev_items  1000 non-null   object
 1   next_item   1000 non-null   object
 2   locale      1000 non-null   object
dtypes: object(3)
memory usage: 23.6+ KB


In [58]:
test_sess.head()

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE


In [61]:
test_sess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   prev_items  1000 non-null   object
 1   locale      1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [55]:
# Check if GPU is available
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.test.gpu_device_name())

# Load data
train = train_sess
products = train_prod

# Merge DataFrames
merged_data = train_sess.merge(train_prod, left_on=['next_item', 'locale'], right_on=['id', 'locale'], how='left')

# Tokenize 'prev_items' column
tokenizer = Tokenizer(filters='', lower=False, split=',')
tokenizer.fit_on_texts(merged_data['prev_items'])
tokenized_prev_items = tokenizer.texts_to_sequences(merged_data['prev_items'])
merged_data['prev_items'] = tokenized_prev_items

# Encode categorical features
encoder = LabelEncoder()
categorical_columns = ['locale', 'brand', 'color', 'size', 'model', 'material', 'author']
for col in categorical_columns:
    merged_data[col] = encoder.fit_transform(merged_data[col].astype(str))

# Create sequences of fixed length
sequence_length = 10
sequences = pad_sequences(merged_data['prev_items'].values.tolist(), maxlen=sequence_length, padding='post', truncating='post')

# One-hot encode target labels separately for each categorical column
y = merged_data[categorical_columns].apply(lambda x: to_categorical(np.array(x)))
# y = merged_data[categorical_columns].apply(lambda x: print(x.shape) or to_categorical(x))
# y = merged_data[categorical_columns].apply(lambda x: to_categorical(x))

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, y, test_size=0.2, random_state=42)

# Create the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(products), output_dim=128, input_length=sequence_length))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(len(categorical_columns) * len(encoder.classes_), activation='softmax'))
model.add(Reshape((len(categorical_columns), len(encoder.classes_))))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

# Save the model
model.save('lstm_model.h5')

Num GPUs Available:  1
/device:GPU:0


ValueError: Per-column arrays must each be 1-dimensional

In [37]:
# Load data
train = train_sess
products = train_prod

# Merge DataFrames
merged_data = train_sess.merge(train_prod, left_on=['next_item', 'locale'], right_on=['id', 'locale'], how='left')

# Encode categorical features
encoder = LabelEncoder()
categorical_columns = ['locale', 'brand', 'color', 'size', 'model', 'material', 'author']
for col in categorical_columns:
    merged_data[col] = encoder.fit_transform(merged_data[col].astype(str))

# Normalize continuous features
scaler = MinMaxScaler()
merged_data['price'] = scaler.fit_transform(merged_data['price'].values.reshape(-1, 1))

from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize 'prev_items' column to split the strings into lists of items
tokenizer = Tokenizer(filters='', lower=False, split=',')
tokenizer.fit_on_texts(merged_data['prev_items'])
tokenized_prev_items = tokenizer.texts_to_sequences(merged_data['prev_items'])

# Create sequences of fixed length
sequence_length = 10
sequences = pad_sequences(tokenized_prev_items, maxlen=sequence_length, padding='post', truncating='post')

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, merged_data[categorical_columns], test_size=0.2, random_state=42)

# Convert the target variable to one-hot encoded vectors
y_train = to_categorical(y_train)
y_val = to_categorical(y_val)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(products), output_dim=128, input_length=sequence_length))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dropout(0.2))
model.add(Dense(len(products), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

# Save the model
model.save('lstm_model.h5')

Num GPUs Available:  1
/device:GPU:0
Epoch 1/10


ValueError: in user code:

    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\miniconda3\envs\tryfastai\lib\site-packages\keras\backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 7, 3) and (None, 1000) are incompatible
