# Data

In [1]:
%%capture
%pip install pandas
%pip install scikit-learn
%pip install numpy
%pip install sentence-transformers
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sentence_transformers import SentenceTransformer



# Load initial dataset
accions_df = pd.read_csv('./datasets/accions.csv').drop(columns=['Usuari', 'Representat'])
tramits_df = pd.read_csv('./datasets/tramits.csv')
# Id -> Titol lookup, captured before the title column is left out of the merge.
tramits_map = tramits_df.set_index('Id')['Titol'].to_dict()
merged_df = accions_df.merge(
    tramits_df.drop(columns=['Titol']), left_on='Tramit', right_on='Id'
).drop(columns=['Id'])

# Remove non-vigent
vigent_df = merged_df.loc[merged_df['Vigent']].drop(columns=['Vigent'])

# Encode action values: each distinct '<Accio>_<Tramit>' string gets one integer id.
label_encoder = LabelEncoder()
encoded_actions = label_encoder.fit_transform(vigent_df['Accio'] + '_' + vigent_df['Tramit'])
df = (
    vigent_df
    .assign(action_id=encoded_actions)
    .drop(columns=['Accio', 'Tramit'])
    .sort_values(by=['Sessio', 'Data'])
)

# Remove repeated actions (rows whose action_id equals the previous row of the same session)
previous_action = df.groupby('Sessio')['action_id'].shift()
df = df.loc[df['action_id'] != previous_action]

# Remove sessions with less than min_session_length actions
min_session_length = 4
session_sizes = df.groupby('Sessio').size()
long_sessions = session_sizes.index[session_sizes >= min_session_length]
df = df[df['Sessio'].isin(long_sessions)]

# Store sequences in a dictionary in order: session id -> list of action ids sorted by 'Data'
session_sequences = {
    session_id: actions.tolist()
    for session_id, actions in df.sort_values(by=['Sessio', 'Data']).groupby('Sessio')['action_id']
}

# Generate sequences embeddings
embedding_map = {}
embedding_dim = 384
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Sentence prefix for each action code; the tramit title is appended to it below.
action_type_map = {
    'AFIT': 'Acces a la fitxa informativa de ',
    'AFST': 'Acces a la fitxa de solicitud de ',
    'PFST': 'Presentacio del formulari de solicitud de ',
}


def get_embedding(sentence):
    """Return the vector that ``embedding_model`` produces for ``sentence``."""
    return embedding_model.encode(sentence)


for indx, action in enumerate(label_encoder.classes_):
    # Classes were built as '<Accio>_<Tramit>'. Split on the first underscore only,
    # so a tramit id that itself contains '_' is kept whole for the title lookup.
    action_type, tramit_id = action.split('_', 1)
    action_text = action_type_map[action_type] + tramits_map[tramit_id]
    embedding_map[indx] = get_embedding(action_text)

# The network layers are sized with embedding_dim, so fail fast if the vectors disagree.
assert all(vector.shape == (embedding_dim,) for vector in embedding_map.values()), \
    "embedding_dim does not match the size of the vectors produced by embedding_model"
# Slide a window of four consecutive actions over every session:
# the first three embeddings are the inputs, the fourth is the label.
sequence_data = [
    tuple(embedding_map[action_id] for action_id in action_sequence[start:start + 4])
    for action_sequence in session_sequences.values()
    for start in range(len(action_sequence) - 3)
]

# Cleanup step
# Drop every global created so far except the objects listed below.
# NOTE(review): 'num_actions' is listed but is not assigned in this cell — confirm it is still needed.
allowed_variables = {'sequence_data', 'embedding_map', 'embedding_dim', 'num_actions', 'label_encoder', 'tramits_map'}
# IPython keeps its own bookkeeping in this same namespace (In, Out, _ih, _oh, ...).
# Those names, and anything else starting with an underscore, are left untouched.
ipython_variables = {'In', 'Out', 'get_ipython', 'exit', 'quit'}
# Decide what to delete before deleting anything, because the two sets above are
# themselves among the globals that get removed.
variables_to_delete = [
    variable for variable in list(globals())
    if variable not in allowed_variables
    and variable not in ipython_variables
    and not variable.startswith('_')
]
for variable in variables_to_delete:
    del globals()[variable]

2024-11-24 04:33:31.354162: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732419211.426438    6479 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732419211.448927    6479 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-24 04:33:31.632473: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pickle
import os
# Write the embedding lookup and the tramit-title lookup under ./streamlit/data.
for pickle_path, payload in (
    ("./streamlit/data/embeddings_v2.pkl", embedding_map),
    ("./streamlit/data/tramits_map_v2.pkl", tramits_map),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)

# Build both directions of the label <-> integer-id mapping from the fitted encoder.
label_to_encoded = {label: code for code, label in enumerate(label_encoder.classes_)}
encoded_to_label = {code: label for label, code in label_to_encoded.items()}

# Save each direction of the mapping to its own file.
for pickle_path, payload in (
    ('./streamlit/data/label_to_encoded_v2.pkl', label_to_encoded),
    ('./streamlit/data/encoded_to_label_v2.pkl', encoded_to_label),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [4]:
# Load the pickled mapping when it is on disk; otherwise keep an empty dictionary.
# NOTE(review): this reads ./datasets/encoded_to_label.pkl, whereas the previous cell writes
# ./streamlit/data/encoded_to_label_v2.pkl — confirm which file is meant to be checked.
label_path = "./datasets/encoded_to_label.pkl"
embeddings = {}
if os.path.exists(label_path):
    with open(label_path, "rb") as f:
        embeddings = pickle.load(f)
print(embeddings)

{}


In [None]:
# Placeholder set to None; no other cell visible in this notebook reads it.
test_embedding_map = None

# RNN

## Imports

In [3]:
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Input, LSTM, GRU
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

## Data

In [5]:
import numpy as np
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    """Serve batches of (input window, target) arrays from pre-built sequence tuples.

    Each element of ``sequence_data`` is read as ``seq[:3]`` (the three inputs,
    stacked along a new first axis) and ``seq[3]`` (the target).

    Args:
        sequence_data: Sliceable collection of 4-element sequences.
        batch_size: Maximum number of sequences per batch.
        **kwargs: Forwarded unchanged to the Keras ``Sequence`` base class.
    """

    def __init__(self, sequence_data, batch_size=16, **kwargs):
        # The base class must be initialised; skipping this is what triggers
        # Keras' "_warn_if_super_not_called" warning during fit().
        super().__init__(**kwargs)
        self.sequence_data = sequence_data
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return (len(self.sequence_data) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, idx):
        """Return ``(X, y)`` arrays for batch number ``idx``."""
        batch_data = self.sequence_data[idx * self.batch_size:(idx + 1) * self.batch_size]
        X = np.array([np.stack(seq[:3]) for seq in batch_data])
        y = np.array([seq[3] for seq in batch_data])
        return X, y

# Example usage: the first 80% of sequence_data feeds training, the remainder validation.
batch_size = 16
split_index = int(len(sequence_data) * 0.8)
train_generator = DataGenerator(sequence_data[:split_index], batch_size=batch_size)
test_generator = DataGenerator(sequence_data[split_index:], batch_size=batch_size)

# Train the model
# NOTE(review): `model` is only assigned in the architecture cells further down the notebook,
# so one of them has to be executed before this cell.
history = model.fit(train_generator, validation_data=test_generator, epochs=10)


  self._warn_if_super_not_called()


Epoch 1/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 2ms/step - accuracy: 0.4220 - loss: 6.5343e-04 - val_accuracy: 0.4265 - val_loss: 5.7338e-04
Epoch 2/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 2ms/step - accuracy: 0.4451 - loss: 5.9420e-04 - val_accuracy: 0.5167 - val_loss: 5.7012e-04
Epoch 3/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 2ms/step - accuracy: 0.4457 - loss: 5.8911e-04 - val_accuracy: 0.4108 - val_loss: 5.6588e-04
Epoch 4/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 2ms/step - accuracy: 0.4426 - loss: 5.8768e-04 - val_accuracy: 0.4131 - val_loss: 5.6461e-04
Epoch 5/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 2ms/step - accuracy: 0.4421 - loss: 5.8450e-04 - val_accuracy: 0.4749 - val_loss: 5.6450e-04
Epoch 6/10
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 2ms/step - accuracy: 0.4416 - loss: 5.8562e

## Model

### Simple RNN

In [10]:
# SimpleRNN regressor: three timesteps in, one embedding-sized vector out.
model = Sequential([
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per timestep
    # The input shape is declared once, on Input; repeating it on the layer as
    # `input_shape=` is deprecated and only produces a warning.
    SimpleRNN(32, activation='tanh'),
    Dense(embedding_dim, activation='linear'),  # linear output: one value per embedding dimension
])

# Compile the model
# 'accuracy' is kept because the evaluation cell plots history.history['accuracy'].
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


  super().__init__(**kwargs)


### LSTM

In [4]:
# LSTM regressor: three timesteps in, one embedding-sized vector out.
model = Sequential([
    # Declare the input shape with an Input object instead of the deprecated
    # `input_shape=` layer argument, which only produces a warning.
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per timestep
    LSTM(64, activation='tanh', return_sequences=False),
    Dense(32, activation='relu'),
    Dense(embedding_dim, activation='linear'),  # linear output: one value per embedding dimension
])

# Compile the model
# 'accuracy' is kept because the evaluation cell plots history.history['accuracy'].
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

2024-11-24 04:35:03.949332: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
  super().__init__(**kwargs)


### GRU

In [12]:
# GRU regressor: three timesteps in, one embedding-sized vector out.
model = Sequential([
    Input(shape=(3, embedding_dim)),  # 3 timesteps, embedding_dim features per timestep
    # The input shape is declared once, on Input; repeating it on the layer as
    # `input_shape=` is deprecated and only produces a warning.
    GRU(64, activation='tanh', return_sequences=False),
    Dense(32, activation='relu'),
    Dense(embedding_dim, activation='linear'),  # linear output: one value per embedding dimension
])

# Compile the model
# 'accuracy' is kept because the evaluation cell plots history.history['accuracy'].
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])


  super().__init__(**kwargs)


## Training

In [13]:
# Train the model
# NOTE(review): X_train, y_train, X_test and y_test are not assigned in any cell visible in
# this notebook — confirm where they are built before running this cell.
history = model.fit(X_train, y_train, epochs=10, batch_size=16, validation_data=(X_test, y_test))


Epoch 1/20


2024-11-24 03:23:21.313130: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 548346600 exceeds 10% of free system memory.


[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 1ms/step - accuracy: 0.3551 - loss: 0.6126 - val_accuracy: 0.3639 - val_loss: 0.5782
Epoch 2/20
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1ms/step - accuracy: 0.3811 - loss: 0.5772 - val_accuracy: 0.3815 - val_loss: 0.5743
Epoch 3/20
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 1ms/step - accuracy: 0.3824 - loss: 0.5730 - val_accuracy: 0.3851 - val_loss: 0.5722
Epoch 4/20
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 1000us/step - accuracy: 0.3827 - loss: 0.5701 - val_accuracy: 0.3923 - val_loss: 0.5695
Epoch 5/20
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 1ms/step - accuracy: 0.3816 - loss: 0.5683 - val_accuracy: 0.3777 - val_loss: 0.5700
Epoch 6/20
[1m57120/57120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 997us/step - accuracy: 0.3820 - loss: 0.5677 - val_accuracy: 0.3726 - val_loss: 0.5683
Epoc

In [10]:
# Print the installed Keras version.
import keras
print(keras.__version__)

3.6.0


In [9]:
# Write the current `model` to disk at the path below.
# NOTE(review): the file name says "lstm", but `model` is reassigned by each architecture cell,
# so what gets saved depends on which of them ran last — confirm before relying on this file.
model.save('./streamlit/models/model_lstm_v4.h5')



In [127]:
# Draw one (input1, input2, input3, label) tuple at random.
random_input = random.choice(sequence_data)
# Stack the three input embeddings and prepend a batch axis of size 1 so that
# model.predict receives a single-sample batch.
input_sequence = np.stack(random_input[:3])[np.newaxis, ...]
predicted_output = model.predict(input_sequence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step


In [128]:
# Build the (num_actions, dim) matrix of action embeddings once, so each query
# vector needs a single cosine_similarity call instead of one per action.
action_ids = list(embedding_map.keys())
embedding_matrix = np.vstack([embedding.reshape(1, -1) for embedding in embedding_map.values()])

for output in [random_input[0], random_input[1], random_input[2], predicted_output]:
    # predicted_output carries a leading batch axis; flatten everything to 1-D.
    output = output.flatten()

    # Cosine similarity between this vector and every action embedding.
    scores = cosine_similarity([output], embedding_matrix)[0]

    # Sort the similarities in descending order (most similar first)
    similarities = sorted(zip(action_ids, scores), key=lambda x: x[1], reverse=True)

    # Keep the top 3 most similar embeddings
    top_3_similar = similarities[:3]

    # Only the single best match is printed for each vector.
    for action_id, similarity in top_3_similar[:1]:
        # Labels were built as '<Accio>_<Tramit>'; split on the first underscore only
        # so a tramit id containing '_' is kept whole for the title lookup.
        action_type, tramit_id = label_encoder.inverse_transform([action_id])[0].split('_', 1)
        print(f"Action ID: {action_id}, Similarity: {similarity}\nName: {tramits_map[tramit_id]}\nAcció: {action_type}")

Action ID: 462, Similarity: 1.0
Name: Certificats del padró d'habitants. Expedició automàtica
Acció: AFST
Action ID: 440, Similarity: 0.9999998807907104
Name: Sol·licitud genèrica
Acció: AFIT
Action ID: 445, Similarity: 0.9999998807907104
Name: Padró d'Habitants: gestions d'alta, canvi de domicili, sol·licitud de certificats i altres gestions relacionades
Acció: AFIT
Action ID: 223, Similarity: 0.9863439798355103
Name: Sol·licitar certificats d'empadronament
Acció: AFIT


In [11]:
# Inspect the training inputs.
# NOTE(review): X_train is not assigned in any cell visible in this notebook — confirm its origin.
print(X_train)

[[[ 0.5936548  -0.47816718 -1.1702312  ... -0.41135162  0.0239204
   -0.8581901 ]
  [ 0.6404202  -1.9613961   0.9597868  ...  1.845338    0.41126075
   -1.3003867 ]
  [ 0.15095814 -1.2164788   0.34207943 ...  2.0424657  -1.0943674
   -0.34722838]]

 [[-1.1344854  -2.8561525   0.11432025 ...  1.3501385   0.19619687
    0.01653572]
  [-0.85601515 -0.86147314 -0.37468165 ...  0.7338877   0.5312311
   -0.9679297 ]
  [-0.9830078  -1.4014475   2.928438   ... -2.3828523  -0.23451519
   -1.3053267 ]]

 [[-0.9917925   0.93422335 -0.15956964 ... -1.0005369  -1.1052504
    1.0878924 ]
  [-0.538926    0.03241725 -1.7057962  ... -0.5711149   0.1057082
    0.6349573 ]
  [ 0.46988806  0.42448315  1.0847745  ... -0.48395592 -0.13236533
   -0.16239367]]

 ...

 [[-0.09672425  0.10088343 -0.46765578 ...  0.6535846  -0.26443586
    0.9512202 ]
  [-0.9917925   0.93422335 -0.15956964 ... -1.0005369  -1.1052504
    1.0878924 ]
  [-0.15341094 -0.49633235 -1.0531574  ... -1.0422709   0.14696114
   -1.1754737 

## Testing

In [None]:
# `plt` is not imported by any other cell visible in this notebook, so import it here.
import matplotlib.pyplot as plt

# Evaluate the model on the test set
# NOTE(review): X_test / y_test are not assigned in any cell visible in this notebook — confirm their origin.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")

# Plot the accuracy recorded per epoch during training
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()