In [2]:
%%capture
%pip install pandas
%pip install scikit-learn
%pip install numpy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [8]:
# Read the raw actions log and discard the 'Usuari' and 'Representat'
# columns, which the rest of the notebook does not use.
initial_df = (
    pd.read_csv('./datasets/accions.csv')
    .drop(columns=['Usuari', 'Representat'])
)

In [None]:
# Actions by session (deprecated)
# Histogram of session sizes: how many sessions contain exactly 1, 2, 3 or 4
# actions, plus one open-ended bucket for every larger session.
session_counts = initial_df.groupby('Sessio').size()
count_by_rows = session_counts.value_counts().sort_index()
result = pd.DataFrame({
    'row_count': [1, 2, 3, 4, '>=5'],
    'num_sessions': [
        count_by_rows.get(1, 0),
        count_by_rows.get(2, 0),
        count_by_rows.get(3, 0),
        count_by_rows.get(4, 0),
        # The open-ended bucket starts at 5 (not 6) so sessions with exactly
        # five actions are counted and the column sums to the session total.
        count_by_rows[count_by_rows.index >= 5].sum()
    ]
})
result

In [None]:
# Removed single-action sessions (deprecated)
# Count the rows of each session, then keep only the rows of sessions that
# have more than one action.
session_counts = (
    initial_df
    .groupby('Sessio')
    .size()
    .reset_index(name='count')
)
has_multiple_actions = session_counts['count'] > 1
sessions_to_keep = session_counts.loc[has_multiple_actions]
in_kept_session = initial_df['Sessio'].isin(sessions_to_keep['Sessio'])
df = initial_df.loc[in_kept_session]

In [9]:
# Remove sessions with less than n actions
# n is the minimum number of rows a session needs in order to be kept.
n = 4
session_counts = (
    initial_df
    .groupby('Sessio')
    .size()
    .reset_index(name='count')
)
has_enough_actions = session_counts['count'] >= n
sessions_to_keep = session_counts.loc[has_enough_actions]
in_kept_session = initial_df['Sessio'].isin(sessions_to_keep['Sessio'])
df = initial_df.loc[in_kept_session]

In [10]:
# Encode action values
# Each distinct 'Accio' + '_' + 'Tramit' combination is mapped to an integer
# id; the two source columns are then dropped in favour of 'action_id'.
label_encoder = LabelEncoder()
combined_labels = df['Accio'] + '_' + df['Tramit']
# assign() returns a new frame, so the filtered view held in df is never
# written to in place.
df = (
    df
    .assign(action_id=label_encoder.fit_transform(combined_labels))
    .drop(columns=['Accio', 'Tramit'])
)

In [11]:
# Store sequences in a dictionary in order
# Maps each session id to the list of its action ids, ordered by 'Data'
# within the session.
# NOTE(review): 'Data' is loaded by read_csv without date parsing in the
# visible cells, so rows are ordered by the raw column values -- confirm
# that this matches chronological order.
df_sorted = df.sort_values(by=['Sessio', 'Data'])
session_sequences = {
    session_id: session_rows['action_id'].tolist()
    for session_id, session_rows in df_sorted.groupby('Sessio')
}

In [12]:
# Generate sequences + label
# Length of every one-hot vector: one position per class held by the label encoder.
num_actions = len(label_encoder.classes_)
def one_hot_encode(action_id, num_classes):
    """Return a one-hot vector for ``action_id``.

    Args:
        action_id: Index of the position to set to 1.
        num_classes: Length of the returned vector.

    Returns:
        A 1-D ``numpy`` array of dtype ``int8`` and length ``num_classes``
        that is all zeros except for a 1 at index ``action_id``.
    """
    encoded = np.zeros(num_classes, dtype=np.int8)
    encoded[action_id] = 1
    return encoded
# Slide a 4-wide window over every session: the first three actions are the
# inputs and the fourth is the label, all one-hot encoded.
sequence_data = []

for session_id, action_sequence in session_sequences.items():
    for start in range(len(action_sequence) - 3):
        window = action_sequence[start:start + 4]
        encoded_window = tuple(
            one_hot_encode(action_id, num_actions) for action_id in window
        )
        sequence_data.append(encoded_window)

In [16]:
len(sequence_data)

1837252