# Data

In [1]:
%%capture
%pip install pandas
%pip install scikit-learn
%pip install numpy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Read the raw action log and discard the two columns that are not used afterwards
initial_df = pd.read_csv('./datasets/accions.csv').drop(columns=['Usuari', 'Representat'])

# Keep only the sessions that have at least n logged actions (rows)
n = 4
session_counts = (
    initial_df
    .groupby('Sessio')
    .size()
    .reset_index(name='count')
)
sessions_to_keep = session_counts.loc[session_counts['count'] >= n]
df = initial_df.loc[initial_df['Sessio'].isin(sessions_to_keep['Sessio'])]

# Map every distinct "<Accio>_<Tramit>" string to an integer id
df = df.copy()  # independent copy, so the column assignment below never writes into a slice of initial_df
label_encoder = LabelEncoder()
df['action_id'] = label_encoder.fit_transform(df['Accio'] + '_' + df['Tramit'])
# Only the encoded id is needed from here on
df = df.drop(columns=['Accio', 'Tramit'])

# Order the rows by session and then by the 'Data' column, and collect the
# action ids of each session into a list keyed by the session id.
# NOTE(review): presumably 'Data' sorts chronologically as stored — confirm its dtype/format.
df_sorted = df.sort_values(by=['Sessio', 'Data'])
session_sequences = {
    session_id: group['action_id'].tolist()
    for session_id, group in df_sorted.groupby('Sessio')
}

# Give every action id a fixed random float32 vector of length embedding_dim.
# These vectors are both the model inputs and the regression targets, and
# they are not learned or stored anywhere else, so they are drawn from a
# seeded generator: re-running the notebook now reproduces the same mapping
# instead of a different one on every kernel start.
num_actions = len(label_encoder.classes_)
embedding_dim = 10
embedding_rng = np.random.default_rng(42)  # same seed value as the train/test split
embedding_map = {
    action_id: embedding_rng.standard_normal(embedding_dim).astype(np.float32)
    for action_id in range(num_actions)
}

# Slide a window of four consecutive actions over every session:
# the embeddings of the first three are the inputs, the fourth is the label.
sequence_data = []
for action_sequence in session_sequences.values():
    embedded = [embedding_map[action] for action in action_sequence]
    # zip stops at the shortest slice, i.e. after len(action_sequence) - 3 windows
    sequence_data.extend(zip(embedded, embedded[1:], embedded[2:], embedded[3:]))

# Release the preprocessing intermediates; only df, the encoder, the
# embedding map and sequence_data are kept from this cell.
del initial_df, session_counts, sessions_to_keep
del df_sorted, session_sequences
del n

In [12]:
# Drop the preprocessing intermediates if they are still defined.
# The data cell already ends with a `del` of these same names, so a plain
# `del` here raises NameError when the notebook is run top to bottom;
# popping from the namespace makes this cell safe to run (and re-run).
for _stale_name in ('initial_df', 'df_sorted', 'session_sequences',
                    'session_counts', 'sessions_to_keep', 'n'):
    globals().pop(_stale_name, None)
del _stale_name

# RNN

## Imports

In [2]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Input, LSTM, GRU
from sklearn.model_selection import train_test_split
import numpy as np

2024-11-23 20:22:11.170736: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-23 20:22:11.275616: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-23 20:22:11.361545: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732389731.431407   10703 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732389731.453008   10703 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-23 20:22:11.642076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

## Data

In [9]:
# Turn the (input1, input2, input3, label) tuples into two arrays
aux = sequence_data
X = np.array([np.stack(sample[:3]) for sample in aux])  # shape: (num_samples, 3, embedding_dim)
y = np.array([sample[3] for sample in aux])             # shape: (num_samples, embedding_dim)

# Hold out 20% of the samples as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Model

### Simple RNN

In [10]:
# Simple RNN: reads the three embedded actions and regresses the embedding
# of the next action.
model = Sequential([
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per step
    # The input shape is declared once by Input above; repeating it as
    # `input_shape=` on the layer is redundant and makes Keras emit a warning.
    SimpleRNN(32, activation='tanh'),
    Dense(embedding_dim, activation='linear'),  # linear output: one value per embedding component
])

# Compile the model
# NOTE(review): the target is an embedding vector trained with MSE, so the
# 'accuracy' metric is not a next-action accuracy; it is kept because the
# evaluation cell reads history.history['accuracy'] — confirm what it should measure.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


  super().__init__(**kwargs)


### LSTM

In [None]:
# LSTM variant: same input/output contract as the other models in this
# section (three embedded actions in, one embedding vector out).
model = Sequential([
    # Declare the input with an explicit Input object, as the sibling models do,
    # instead of the `input_shape=` layer argument that Keras warns about.
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per step
    LSTM(64, activation='tanh', return_sequences=False),  # only the last timestep's output is used
    Dense(32, activation='relu'),
    Dense(embedding_dim, activation='linear'),  # embedding_dim regression outputs (not class scores)
])

# Compile the model
# NOTE(review): the target is an embedding vector trained with MSE, so the
# 'accuracy' metric is not a next-action accuracy; it is kept because the
# evaluation cell reads history.history['accuracy'] — confirm what it should measure.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

### GRU

In [None]:
# GRU variant: same input/output contract as the other models in this
# section (three embedded actions in, one embedding vector out).
model = Sequential([
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per step
    # The input shape is declared once by Input above; repeating it as
    # `input_shape=` on the layer is redundant and makes Keras emit a warning.
    GRU(64, activation='tanh', return_sequences=False),  # GRU in place of SimpleRNN
    Dense(32, activation='relu'),
    Dense(embedding_dim, activation='linear'),  # linear output: one value per embedding component
])
# Compile the model
# NOTE(review): the target is an embedding vector trained with MSE, so the
# 'accuracy' metric is not a next-action accuracy; it is kept because the
# evaluation cell reads history.history['accuracy'] — confirm what it should measure.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


## Training

In [11]:
# Train the model; the held-out test split is also passed as the validation data
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=4,
    validation_data=(X_test, y_test),
)


Epoch 1/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 698us/step - accuracy: 0.4350 - loss: 0.6440 - val_accuracy: 0.4436 - val_loss: 0.6191
Epoch 2/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 690us/step - accuracy: 0.4502 - loss: 0.6181 - val_accuracy: 0.4639 - val_loss: 0.6178
Epoch 3/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 693us/step - accuracy: 0.4491 - loss: 0.6160 - val_accuracy: 0.4429 - val_loss: 0.6166
Epoch 4/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m251s[0m 683us/step - accuracy: 0.4507 - loss: 0.6158 - val_accuracy: 0.4538 - val_loss: 0.6139
Epoch 5/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 686us/step - accuracy: 0.4513 - loss: 0.6153 - val_accuracy: 0.4501 - val_loss: 0.6152
Epoch 6/10
[1m367451/367451[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 685us/step - accuracy: 0.4522 - loss: 0.6157 - val_accu

In [12]:
# Write the trained model to an .h5 file under ./models/.
# NOTE(review): the file name says "simple", but `model` is whichever
# architecture cell was executed last — confirm it matches before saving.
# NOTE(review): the random embedding_map used for inputs/targets is not saved
# alongside the model — confirm whether it must be persisted too.
model.save('./models/model_simple_v1.h5')



## Testing

In [None]:
# Evaluate the model on the test set
# NOTE(review): X_test/y_test were also used as validation_data during fit,
# so this repeats the validation measurement rather than scoring unseen data.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")

# Plot the per-epoch training and validation accuracy recorded by fit()
# NOTE(review): `plt` is not imported in any cell visible in this notebook;
# presumably `import matplotlib.pyplot as plt` is required — confirm and add it to the imports cell.
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.show()