In [21]:
%%capture
%pip install pandas
%pip install scikit-learn
%pip install numpy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [22]:
# Load the initial dataset and discard the 'Usuari' and 'Representat' columns
# in the same expression, so the cell yields a fresh frame on every run.
initial_df = (
    pd.read_csv('./datasets/accions.csv')
    .drop(columns=['Usuari', 'Representat'])
)

In [None]:
# Actions by session (deprecated)
# Distribution of session lengths: one row per exact size listed in
# `exact_sizes`, plus a single open-ended bucket for every longer session.
session_counts = initial_df.groupby('Sessio').size()
count_by_rows = session_counts.value_counts().sort_index()

exact_sizes = [1, 2, 3, 4]
# The open-ended bucket starts right after the last exact size. The previous
# version used `> 5`, so sessions with exactly 5 actions were counted nowhere
# and the table did not add up to the total number of sessions.
tail_start = exact_sizes[-1] + 1

result = pd.DataFrame({
    'row_count': exact_sizes + [f'>={tail_start}'],
    'num_sessions': [count_by_rows.get(size, 0) for size in exact_sizes]
    + [count_by_rows[count_by_rows.index >= tail_start].sum()],
})
result

In [None]:
# Removed single-action sessions (deprecated)
# Count the rows of each session, then keep only the rows that belong to a
# session with more than one action.
session_counts = (
    initial_df
    .groupby('Sessio')
    .size()
    .reset_index(name='count')
)
sessions_to_keep = session_counts.loc[session_counts['count'].gt(1)]
df = initial_df.loc[initial_df['Sessio'].isin(sessions_to_keep['Sessio'])]

In [23]:
# Remove sessions with less than n actions
# `n` is the minimum number of rows a session must have to be kept.
n = 4
session_counts = (
    initial_df
    .groupby('Sessio')
    .size()
    .reset_index(name='count')
)
# Sessions that reach the threshold, then every row belonging to one of them.
sessions_to_keep = session_counts.loc[session_counts['count'].ge(n)]
df = initial_df.loc[initial_df['Sessio'].isin(sessions_to_keep['Sessio'])]

In [24]:
# Encode action values
# Join 'Accio' and 'Tramit' into one string key per row. `assign` returns a
# new frame, so the filtered subset it came from is never written to.
df = df.assign(Accio_Tramit=df['Accio'] + '_' + df['Tramit'])

# Fit the encoder on the joined keys; `action_id` is the integer class of
# each row's key.
label_encoder = LabelEncoder()
df = (
    df
    .assign(action_id=label_encoder.fit_transform(df['Accio_Tramit']))
    .drop(columns=['Accio', 'Tramit', 'Accio_Tramit'])
)

In [25]:
# Store sequences in a dictionary in order
# Rows are ordered by session and then by 'Data', so each session's list of
# action ids follows that ordering.
# NOTE(review): 'Data' is loaded by a plain read_csv with no date parsing, so
# this sort uses whatever dtype pandas inferred - confirm it is chronological.
df_sorted = df.sort_values(by=['Sessio', 'Data'])
session_sequences = {
    session_key: session_rows['action_id'].tolist()
    for session_key, session_rows in df_sorted.groupby('Sessio')
}

In [26]:
# Build one fixed random embedding vector per encoded action id.
# The vectors come from a dedicated, seeded generator so that the map is
# identical after every kernel restart; drawing from the unseeded global RNG
# produced a different map (and different downstream data) on each run.
EMBEDDING_SEED = 42
num_actions = len(label_encoder.classes_)
embedding_dim = 50
embedding_rng = np.random.default_rng(EMBEDDING_SEED)

# action id -> float32 vector of length `embedding_dim`, standard normal.
embedding_map = {
    action_id: embedding_rng.standard_normal(embedding_dim, dtype=np.float32)
    for action_id in range(num_actions)
}

In [29]:
# Generate sequences embeddings
# Slide a window of four consecutive actions over every session. Each window
# becomes one tuple of embeddings: three inputs followed by the label.
sequence_data = []
for action_sequence in session_sequences.values():
    # Zipping the sequence with its 1-, 2- and 3-shifted copies yields the
    # windows; a session shorter than four actions yields none.
    windows = zip(
        action_sequence,
        action_sequence[1:],
        action_sequence[2:],
        action_sequence[3:],
    )
    for window in windows:
        sequence_data.append(tuple(embedding_map[action] for action in window))

In [None]:
# Generate sequences + label one-hot (deprecated)
# Width of the one-hot vectors: one slot per class known to the fitted encoder.
num_actions = label_encoder.classes_.shape[0]
def one_hot_encode(action_id, num_classes):
    """Return a one-hot vector for ``action_id``.

    Args:
        action_id: Index of the slot to set. It is used directly as a NumPy
            index into the result.
        num_classes: Length of the returned vector.

    Returns:
        A 1-D ``np.int8`` array of length ``num_classes`` that is zero
        everywhere except at ``action_id``, where it is 1.
    """
    encoded = np.zeros((num_classes,), dtype=np.int8)
    encoded[action_id] = 1
    return encoded
# Deprecated one-hot variant of the window builder.
# The result is stored under its own name: this cell sits after the
# embedding-based builder, so writing to `sequence_data` here would replace
# the embedding tuples whenever the notebook is run top to bottom.
one_hot_sequence_data = []

for session_id, action_sequence in session_sequences.items():
    # One window per start index that leaves room for three inputs + a label.
    for i in range(len(action_sequence) - 3):
        window = action_sequence[i:i + 4]
        # Tuple layout: (input1, input2, input3, label), all one-hot vectors.
        one_hot_sequence_data.append(
            tuple(one_hot_encode(action_id, num_actions) for action_id in window)
        )

In [30]:
# Sanity check: total number of windows, then the first window's contents.
total_windows = len(sequence_data)
print(total_windows)
first_window = sequence_data[0]
print(first_window)

1837252
(array([ 0.560104  , -1.2443599 ,  0.24320407, -0.55655557,  0.6899699 ,
        1.225055  , -1.2968394 ,  1.4089557 ,  0.9985551 , -0.49891   ,
        1.3149197 ,  0.28642792,  0.83909076, -1.6275573 ,  0.14795668,
       -0.54824567,  0.6784365 , -1.0792032 ,  0.04030007,  0.8015891 ,
       -1.9001613 ,  1.9727807 , -0.47027194,  0.96602154,  0.62383544,
       -1.5227431 ,  0.28896347,  0.03794384,  1.0415932 , -1.1073035 ,
       -0.9342644 , -0.0315561 , -0.19095355, -0.47510934, -0.8628383 ,
       -2.6398306 ,  0.04242746, -0.6909476 ,  1.8770038 , -0.95575356,
       -0.27185607,  0.891469  ,  0.44863716,  0.02669654, -0.42737186,
        0.04919973,  0.37200773, -0.02590366, -0.3990094 ,  2.0886755 ],
      dtype=float32), array([ 0.560104  , -1.2443599 ,  0.24320407, -0.55655557,  0.6899699 ,
        1.225055  , -1.2968394 ,  1.4089557 ,  0.9985551 , -0.49891   ,
        1.3149197 ,  0.28642792,  0.83909076, -1.6275573 ,  0.14795668,
       -0.54824567,  0.6784365 ,