# Data

In [None]:
%%capture
%pip install pandas
%pip install scikit-learn
%pip install numpy
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np


# Read the action log and discard the 'Usuari' and 'Representat' columns.
initial_accions_df = pd.read_csv('./datasets/accions.csv').drop(
    columns=['Usuari', 'Representat']
)

# Read the tramits table; keep an Id -> Titol lookup (used later when
# printing predictions) before the title column is discarded.
tramits_with_titles = pd.read_csv('./datasets/tramits.csv')
tramits_map = tramits_with_titles.set_index('Id')['Titol'].to_dict()
initial_tramits_df = tramits_with_titles.drop(columns=['Titol'])

# Attach the tramits columns to every action (default inner join on
# Tramit == Id), then drop the now-redundant join key.
merged_df = pd.merge(
    initial_accions_df,
    initial_tramits_df,
    left_on='Tramit',
    right_on='Id',
).drop(columns=['Id'])

# Keep only the rows whose 'Vigent' flag is set, then discard the flag itself.
# NOTE(review): assumes 'Vigent' is parsed as a boolean column — confirm.
is_vigent = merged_df['Vigent']
initial_df = merged_df[is_vigent].drop(columns=['Vigent'])

# Encode every distinct '<Accio>_<Tramit>' combination as an integer id.
label_encoder = LabelEncoder()
action_keys = initial_df['Accio'] + '_' + initial_df['Tramit']

# Build the working frame without touching initial_df: swap the two source
# columns for the encoded id and order the rows by session, then by 'Data'.
# NOTE(review): 'Data' is sorted as read from the CSV (no date parsing is
# visible here) — confirm its format sorts chronologically.
df = (
    initial_df
    .drop(columns=['Accio', 'Tramit'])
    .assign(action_id=label_encoder.fit_transform(action_keys))
    .sort_values(by=['Sessio', 'Data'])
)

# Collapse runs of the same action inside a session: a row is kept only when
# its action differs from the previous row of the same session (the first row
# of each session compares against NaN and is therefore always kept).
previous_action = df.groupby('Sessio')['action_id'].shift()
df = df[df['action_id'].ne(previous_action)]

# Discard sessions that have fewer than n actions left.
n = 4
session_sizes = df.groupby('Sessio').size()
long_enough_sessions = session_sizes[session_sizes >= n].index
df = df[df['Sessio'].isin(long_enough_sessions)]

# Map each session id to its ordered list of action ids.
df_sorted = df.sort_values(by=['Sessio', 'Data'])
session_sequences = {
    session_id: session_rows['action_id'].tolist()
    for session_id, session_rows in df_sorted.groupby('Sessio')
}

# Generate sequence embeddings
# Every encoded action gets a fixed random float32 vector. The generator is
# seeded so that re-running the notebook reproduces the same vectors; otherwise
# a model trained against one run's embeddings cannot be used with the
# embeddings regenerated by the next run.
num_actions = len(label_encoder.classes_)
embedding_dim = 50
embedding_seed = 42
embedding_rng = np.random.default_rng(embedding_seed)
embedding_map = {
    action_id: embedding_rng.standard_normal(embedding_dim).astype(np.float32)
    for action_id in range(num_actions)
}

# Slide a window over each session: context_length consecutive actions are the
# model inputs and the action right after them is the target. Each sample is a
# tuple (input1, input2, input3, label) of embedding vectors.
context_length = 3
sequence_data = []
for action_sequence in session_sequences.values():
    vectors = [embedding_map[action_id] for action_id in action_sequence]
    for start in range(len(vectors) - context_length):
        sequence_data.append(tuple(vectors[start:start + context_length + 1]))

# Cleanup step
# Remove the intermediate objects created by this cell from the global
# namespace, keeping only the results that later cells use.
#
# Names that start with an underscore (module dunders and IPython's history /
# output caches such as _ih, _oh, _) and the IPython-managed public names are
# left untouched: the kernel itself reads some of them from this namespace, so
# deleting them can break later cells that display a value.
# Imported names (pd, np, LabelEncoder) are still removed; later cells
# re-import what they need.
allowed_variables = {'sequence_data', 'embedding_map', 'embedding_dim', 'num_actions', 'label_encoder', 'tramits_map'}
ipython_managed = {'In', 'Out', 'get_ipython', 'exit', 'quit'}

# Snapshot the namespace first so it is not mutated while being iterated.
removable = [
    name for name in tuple(globals())
    if name not in allowed_variables
    and name not in ipython_managed
    and not name.startswith('_')
]
for name in removable:
    del globals()[name]

In [124]:
import pickle
import os
# Persist the embedding table and the tramit id -> title lookup.
for pickle_path, payload in (
    ("./datasets/embeddings.pkl", embedding_map),
    ("./datasets/tramits_map.pkl", tramits_map),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

# Build both directions of the label <-> integer-code mapping from the
# encoder's class list (a class's position is its code).
label_to_encoded = {label: code for code, label in enumerate(label_encoder.classes_)}
encoded_to_label = dict(enumerate(label_encoder.classes_))

# Persist each direction to its own file.
for pickle_path, payload in (
    ('./datasets/label_to_encoded.pkl', label_to_encoded),
    ('./datasets/encoded_to_label.pkl', encoded_to_label),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [126]:
# Sanity check: reload the pickled code -> label mapping and print it.
# Despite the variable name, the file read here is encoded_to_label.pkl,
# not the embeddings file.
embeddings = {}  # stays empty when the file is not found
if os.path.exists("./datasets/encoded_to_label.pkl"):
    with open("./datasets/encoded_to_label.pkl", "rb") as f:
        embeddings = pickle.load(f)
print(embeddings)

{0: 'AFIT_+1JQhTbg4lgyTIo1IfnXsGHt0uVmFQui/PprXOhQCOQ=', 1: 'AFIT_+9hGyUgYGGqr/1tg7SbHuXM2SdoloC0VruPrL3jtm2A=', 2: 'AFIT_+NoTO0ItlnsY3BVIot97xdqd429P3E5EDa09XUEQHJk=', 3: 'AFIT_+P1T5jsqGWcF0eJ0g9ZX7HHKQdaxE8OTAm9uMUcdRJ8=', 4: 'AFIT_+RMaZY0iNBpUgCTfndSjpDAECoYX+P/GE72G6YD9CjA=', 5: 'AFIT_+Ti5iuCSbvCPrzcrOWtfasZM9ALnysMq1mdbcPGMklg=', 6: 'AFIT_+fMaLoK/BqzaE+L2DM8WjufzaXXcBZG5VcUUspZNkDI=', 7: 'AFIT_+gzoIX67Vc3BDtzzSrI8PH2ofTaYmCgjivWJT/o04e4=', 8: 'AFIT_+kKhgUgWOsYKFrnH+SXCmnqFK3A+DNBjs7OD4T3QuBw=', 9: 'AFIT_/3JQ89bxH5lmeJX6KK7AucRpkUy+FoNooW081i7o8TM=', 10: 'AFIT_/G7JgOWHzNkVUDM/pT0ZHAC6igSeC+EtcTenAR10EMQ=', 11: 'AFIT_/H580/Kb0mC56bGtqZlmQyePH1qgeLRqPBwuLZYZBIE=', 12: 'AFIT_/MTfZFU+61dycpUuENyS9WNWQ2lJzFJpeJimWfP6smY=', 13: 'AFIT_/ZHrTbJK0Er0nTsBomIpAvaB8Sq1Xt+Lb1Bi8lBtaII=', 14: 'AFIT_/bibhZIje8ttoNCixHlQeemRHnz4GjfsbvxDN24M/O4=', 15: 'AFIT_/dIZEULTf8w+C6Hxd609twuaS2HOnKX5iJy3O/KxuEE=', 16: 'AFIT_/o5VxteIj4P6dGS1H8PcQRV1u4NwtgIiJ6jpDu1D4hw=', 17: 'AFIT_/oN4oi0HZ8zVv6XnkX8lY7fCAHXzVx

In [None]:
# Placeholder: initialised to None and not read anywhere in the visible notebook.
test_embedding_map = None

# RNN

## Imports

In [56]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Input, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

## Data

In [None]:
# Draw the modelling subset by position rather than by value. Each sample is a
# tuple of NumPy arrays, so a membership test such as `item not in aux` falls
# back to element-wise `==` on arrays (ambiguous truth value -> ValueError) and
# would also be quadratic in the number of samples.
max_samples = 500_000
# Never request more samples than exist; random.sample raises otherwise.
sample_size = min(max_samples, len(sequence_data))
sampled_positions = random.sample(range(len(sequence_data)), sample_size)
sampled_lookup = set(sampled_positions)

aux = [sequence_data[pos] for pos in sampled_positions]
# Windows that were not drawn. They are disjoint from `aux` by position; a
# window with identical content may still appear on both sides.
test_aux = [item for pos, item in enumerate(sequence_data) if pos not in sampled_lookup]

# Convert the sampled windows to numpy arrays.
X = np.array([np.stack(seq[:3]) for seq in aux])  # Shape: (num_samples, 3, embedding_dim)
y = np.array([seq[3] for seq in aux])  # Shape: (num_samples, embedding_dim)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Model

### Simple RNN

In [10]:
# Simple RNN: reads three consecutive action embeddings and regresses the
# embedding of the action that follows them.
model = Sequential([
    # 3 timesteps, embedding_dim features per timestep. The shape is declared
    # once here; repeating it as `input_shape` on the recurrent layer is
    # redundant and makes Keras emit a warning.
    Input(shape=(3, embedding_dim)),
    SimpleRNN(32, activation='tanh'),
    # Linear head: the target is a real-valued embedding vector.
    Dense(embedding_dim, activation='linear'),
])

# Compile the model.
# NOTE(review): 'accuracy' is kept because the evaluation cell reads
# history.history['accuracy'], but with an MSE regression target it is
# unlikely to be a meaningful metric — consider a similarity-based one.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


  super().__init__(**kwargs)


### LSTM

In [18]:
# LSTM: reads three consecutive action embeddings and regresses the embedding
# of the action that follows them.
model = Sequential([
    # 3 timesteps, embedding_dim features per timestep. Declared with an Input
    # object (as in the other model cells) instead of an `input_shape` argument
    # on the first layer, which makes Keras emit a warning.
    Input(shape=(3, embedding_dim)),
    LSTM(64, activation='tanh', return_sequences=False),
    Dense(32, activation='relu'),
    # Linear head producing an embedding_dim-sized vector (not class scores).
    Dense(embedding_dim, activation='linear'),
])

# Compile the model.
# NOTE(review): 'accuracy' is kept because the evaluation cell reads
# history.history['accuracy'], but with an MSE regression target it is
# unlikely to be a meaningful metric — consider a similarity-based one.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

  super().__init__(**kwargs)


### GRU

In [None]:
# GRU: reads three consecutive action embeddings and regresses the embedding
# of the action that follows them.
model = Sequential([
    # 3 timesteps, embedding_dim features per timestep. The shape is declared
    # once here; repeating it as `input_shape` on the recurrent layer is
    # redundant and makes Keras emit a warning.
    Input(shape=(3, embedding_dim)),
    GRU(64, activation='tanh', return_sequences=False),
    Dense(32, activation='relu'),
    # Linear head: the target is a real-valued embedding vector.
    Dense(embedding_dim, activation='linear'),
])

# Compile the model.
# NOTE(review): 'accuracy' is kept because the evaluation cell reads
# history.history['accuracy'], but with an MSE regression target it is
# unlikely to be a meaningful metric — consider a similarity-based one.
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


## Training

In [20]:
# Train the model for 5 epochs with mini-batches of 16 samples.
# NOTE(review): validation_data is the same (X_test, y_test) split that the
# evaluation cell later passes to model.evaluate, so the reported test score
# is not measured on data unseen during model selection.
history = model.fit(X_train, y_train, epochs=5, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 965us/step - accuracy: 0.3425 - loss: 0.6147 - val_accuracy: 0.3469 - val_loss: 0.6067
Epoch 2/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 966us/step - accuracy: 0.3520 - loss: 0.6002 - val_accuracy: 0.3466 - val_loss: 0.5992
Epoch 3/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 953us/step - accuracy: 0.3537 - loss: 0.5951 - val_accuracy: 0.3629 - val_loss: 0.5961
Epoch 4/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 964us/step - accuracy: 0.3536 - loss: 0.5925 - val_accuracy: 0.3525 - val_loss: 0.5941
Epoch 5/5
[1m25000/25000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 966us/step - accuracy: 0.3535 - loss: 0.5904 - val_accuracy: 0.3510 - val_loss: 0.5925


In [21]:
# Persist the trained model to an .h5 file.
# NOTE(review): the file name says "lstm", but `model` is whichever
# architecture cell was executed last — confirm before overwriting.
model.save('./models/model_lstm_v1.h5')



In [127]:
# Pick one random window and run the model on its three input embeddings.
random_input = random.choice(sequence_data)
# Stack the three inputs into one (3, embedding_dim) array and prepend a
# batch axis of size 1, giving the (1, 3, embedding_dim) shape the model expects.
input_sequence = np.stack(random_input[:3])[np.newaxis, ...]
predicted_output = model.predict(input_sequence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step


In [128]:
# For each of the three input embeddings and for the model's prediction, find
# the known action embedding with the highest cosine similarity and describe it.
for output in [random_input[0], random_input[1], random_input[2], predicted_output]:
    output = output.flatten()

    # Score every known action embedding against the query vector; both sides
    # are passed to cosine_similarity as single-row 2D inputs.
    similarities = [
        (candidate_id, cosine_similarity([output], candidate.reshape(1, -1))[0][0])
        for candidate_id, candidate in embedding_map.items()
    ]

    # Rank best-first and keep the three nearest candidates.
    top_3_similar = sorted(similarities, key=lambda pair: pair[1], reverse=True)[:3]

    # Only the single best match is reported.
    for action_id, similarity in top_3_similar[:1]:
        # Decoded labels were built as '<Accio>_<Tramit>'; the split assumes
        # neither part contains another underscore.
        decoded_label = label_encoder.inverse_transform([action_id])[0]
        label_parts = decoded_label.split('_')
        print(f"Action ID: {action_id}, Similarity: {similarity}\nName: {tramits_map[label_parts[1]]}\nAcció: {label_parts[0]}")

Action ID: 462, Similarity: 1.0
Name: Certificats del padró d'habitants. Expedició automàtica
Acció: AFST
Action ID: 440, Similarity: 0.9999998807907104
Name: Sol·licitud genèrica
Acció: AFIT
Action ID: 445, Similarity: 0.9999998807907104
Name: Padró d'Habitants: gestions d'alta, canvi de domicili, sol·licitud de certificats i altres gestions relacionades
Acció: AFIT
Action ID: 223, Similarity: 0.9863439798355103
Name: Sol·licitar certificats d'empadronament
Acció: AFIT


## Testing

In [None]:
# `plt` is not imported anywhere else in the visible notebook; without this
# import the plotting lines below raise NameError.
import matplotlib.pyplot as plt

# Evaluate the model on the test split.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")

# Plot the per-epoch training and validation accuracy recorded during fit.
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.legend()
plt.show()