### Setup

In [2]:
#from google.cloud import bigquery
import pandas as pd
from sklearn.svm import SVC
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Custom transformer to encode labels
class LabelEncoderTransformer(LabelEncoder, TransformerMixin):
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    
# Custom transformer to select columns from DataFrame
class DataFrameSelector(TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names]

### Clustering Stablishments

#### Step 1: Data Preprocessing

In [6]:
# Load the data
sql = """
    SELECT *
    FROM `finances-314506.refined.dim_stablishment`
    """
data = pd.read_gbq(sql, dialect="standard")
text_column = "nm_stablishment_bill"  # Replace "text_column" with the name of your text column
labels_column = "category"  # Replace "labels_column" with the name of your labels column

# Clean the text column if needed (e.g., remove punctuation, convert to lowercase)

data.head(10)

Unnamed: 0,nm_stablishment_bill,nm_stablishment,category
0,Toa Toa,Toa Toa,Bar
1,Pet da Milla,Pet da Milla,Casa
2,Petz Loja Virtual,Petz Loja Virtual,Casa
3,Eletronicos El Zein L,Eletronicos El Zein L,Casa
4,Claudio Lima,Claudio Lima,Casa
5,de Vita F,de Vita F,Casa
6,Mercpago *Pantys,Mercpago *Pantys,Casa
7,Pag*Gomesmachado,Pag*Gomesmachado,Casa
8,Dicico Be,Dicico Be,Casa
9,Leroy Mer,Leroy Mer,Casa


#### Step 2: Fuzzy Matching

In [7]:
# Function to perform fuzzy matching
def fuzzy_match(text, choices):
    return process.extractOne(text, choices)[0]

# Apply fuzzy matching to identify similar text strings
data["matched_text"] = data[text_column].apply(lambda x: fuzzy_match(x, data[text_column]))

data.head(10)

Unnamed: 0,nm_stablishment_bill,nm_stablishment,category,matched_text
0,Toa Toa,Toa Toa,Bar,Toa Toa
1,Pet da Milla,Pet da Milla,Casa,Pet da Milla
2,Petz Loja Virtual,Petz Loja Virtual,Casa,Petz Loja Virtual
3,Eletronicos El Zein L,Eletronicos El Zein L,Casa,Eletronicos El Zein L
4,Claudio Lima,Claudio Lima,Casa,Claudio Lima
5,de Vita F,de Vita F,Casa,de Vita F
6,Mercpago *Pantys,Mercpago *Pantys,Casa,Mercpago *Pantys
7,Pag*Gomesmachado,Pag*Gomesmachado,Casa,Pag*Gomesmachado
8,Dicico Be,Dicico Be,Casa,Dicico Be
9,Leroy Mer,Leroy Mer,Casa,Leroy Mer


#### Step 3: Vectorization and Encoding

In [9]:
# Vectorize text data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["matched_text"])

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[labels_column])

#### Step 4: Training and Testing

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Step 5: Training a model

In [11]:
# Train a Support Vector Machine (SVM) classifier
svm_model = SVC(kernel='linear', C=1.0, random_state=42)  
svm_model.fit(X_train, y_train)


#### Step 6: Evaluation

In [12]:
# Predict labels on the test set using the SVM classifier
y_pred_svm = svm_model.predict(X_test)

# Evaluate the performance of the SVM classifier
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted')
recall_svm = recall_score(y_test, y_pred_svm, average='weighted')
f1_svm = f1_score(y_test, y_pred_svm, average='weighted')

print("SVM Classifier Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)

SVM Classifier Metrics:
Accuracy: 0.43548387096774194
Precision: 0.405342583907949
Recall: 0.43548387096774194
F1-score: 0.35712486341922306


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Current Transactions

#### Data Ingestion

In [3]:
sql = """
    SELECT *
    FROM `finances-314506.trusted.tb_nordigen_transactions`
    """
df_transactions = pd.read_gbq(sql, dialect="standard")
df_transactions.head(10)

Unnamed: 0,transactionId,bookingDate,valueDate,remittanceInformationUnstructured,internalTransactionId,transactionAmount_amount,transactionAmount_currency,clientId,dtInsert
0,2023120552023-12-06-00.07.43.948610,2023-12-05,2023-12-05,IMPOSTO SELO ART 17.3.4,a512a166ecc16a763c66eaccdcd98649,-0.24,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
1,2023120532023-12-05-09.56.19.335229,2023-12-05,2023-12-05,TRF MB WAY DE LARISSA SOUZA,90ff9b6145b9a376c333fadd1ccc11ff,600.0,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
2,2023120512023-12-05-04.39.34.096925,2023-12-05,2023-12-05,COMPRA 0284 UBER EATS PENDING AMSTERDAM NL,478760620b9e2731bef9fcee803d2764,-27.79,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
3,2023120512023-12-05-04.39.34.096524,2023-12-05,2023-12-05,COMPRA 0284 UBER EATS PENDING AMSTERDAM NL,25dc2253214578213011fcb50e211c7f,-43.6,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
4,2023120512023-12-05-04.39.34.096054,2023-12-05,2023-12-05,COMPRA 0284 UBER EATS PENDING AMSTERDAM NL,e0117805a3bdc26aaaf08020d6be42fb,-17.29,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
5,2023120512023-12-05-04.39.34.095257,2023-12-05,2023-12-05,COMPRA 0284 UBER EATS PENDING AMSTERDAM NL,ba931231735b1f87f6485df12b8e0699,-21.2,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
6,2023120552023-12-06-00.07.43.948543,2023-12-05,2023-12-05,COM.MAN.CONTA PACOTE PROGRAMA PRESTIGE 112023,4c0527c4fb634b0e616ccf57af592820,-6.0,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
7,2023120512023-12-05-04.39.34.095735,2023-12-05,2023-12-05,COMPRA 0284 IGUARIAS IMPONEMTES LIS CONTACTLESS,30d5d68c6d95cd1816a68f5e122e128d,-1.62,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
8,2023120512023-12-05-04.39.34.097463,2023-12-05,2023-12-05,COMPRA 0284 MARQUES POMBAL 2 LISBOA CONTACTLESS,b705314de0825cbd2ede5dbf6b33f122,-4.95,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
9,2023120512023-12-05-04.39.34.097805,2023-12-05,2023-12-05,COMPRA 0284 TEMAS MEDIEVAIS LDA. LI CONTACTLESS,2935f3729b2ba71e6fe2514230749d79,-16.0,EUR,5984b6bd-b065-460a-a9aa-bb4ec3af99de,2024-02-11 14:31:07.662538
