In [5]:
#from google.cloud import bigquery
import pandas as pd
from fuzzywuzzy import process
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC


# Custom transformer to encode labels
class LabelEncoderTransformer(LabelEncoder, TransformerMixin):
    """LabelEncoder variant whose ``fit_transform`` accepts an ``(X, y)`` call.

    Only ``fit_transform`` is overridden; everything else is inherited from
    the parent classes.
    """

    def fit_transform(self, X, y=None):
        """Fit the encoder on ``X`` and return ``X`` encoded.

        Args:
            X: Values to fit on and then encode.
            y: Accepted for signature compatibility; never read.

        Returns:
            The result of calling ``transform(X)`` on the fitted encoder.
        """
        fitted = self.fit(X)
        return fitted.transform(X)
    
# Custom transformer to select columns from DataFrame
class DataFrameSelector(TransformerMixin, BaseEstimator):
    """Stateless transformer that picks ``attribute_names`` out of its input.

    Inherits from ``BaseEstimator`` as well as ``TransformerMixin`` so the
    object exposes ``get_params``/``set_params``; scikit-learn needs those to
    clone the transformer (e.g. during cross-validation or grid search).
    The mixin is listed first, as scikit-learn recommends.

    Args:
        attribute_names: Key(s) used to index the input in ``transform``.
            Stored unchanged under the same name so ``get_params`` can
            report it.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; ``X`` and ``y`` are not read."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names]``.

        Args:
            X: Object indexable by ``attribute_names`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            Whatever the indexing operation yields; a missing key propagates
            the underlying indexing error.
        """
        return X[self.attribute_names]

In [6]:
# Step 1: Data Preprocessing
# Name of the column used as the text input
text_column = "nm_stablishment_bill"
# Name of the column used as the label
labels_column = "category"

# Query every column of the table and load the result into a DataFrame
sql = """
    SELECT *
    FROM `finances-314506.refined.dim_stablishment`
    """
data = pd.read_gbq(sql, dialect="standard")

# No text cleaning is applied here (e.g. punctuation removal, lower-casing)

# Preview the first ten rows
data.head(10)

Unnamed: 0,nm_stablishment_bill,nm_stablishment,category
0,Toa Toa,Toa Toa,Bar
1,Pet da Milla,Pet da Milla,Casa
2,Petz Loja Virtual,Petz Loja Virtual,Casa
3,Eletronicos El Zein L,Eletronicos El Zein L,Casa
4,Claudio Lima,Claudio Lima,Casa
5,de Vita F,de Vita F,Casa
6,Mercpago *Pantys,Mercpago *Pantys,Casa
7,Pag*Gomesmachado,Pag*Gomesmachado,Casa
8,Dicico Be,Dicico Be,Casa
9,Leroy Mer,Leroy Mer,Casa


In [7]:
# Step 2: Fuzzy Matching
# Function to perform fuzzy matching
def fuzzy_match(text, choices):
    """Return the entry of ``choices`` that best matches ``text``.

    Args:
        text: Query string to look up.
        choices: Candidate strings, passed unchanged to
            ``fuzzywuzzy.process.extractOne``.

    Returns:
        The best-scoring candidate, or ``text`` itself when ``extractOne``
        finds no candidate (for example when ``choices`` is empty).
    """
    best = process.extractOne(text, choices)
    # extractOne returns None when nothing could be scored; indexing None
    # would raise TypeError, so fall back to the unmodified input instead.
    if best is None:
        return text
    return best[0]

# Apply fuzzy matching to identify similar text strings
# Build the candidate list once instead of handing the whole column to every
# call: duplicates are dropped (first-occurrence order is kept, so the chosen
# match is unchanged) and each row is scored against each distinct value once.
match_choices = data[text_column].drop_duplicates().tolist()
data["matched_text"] = data[text_column].apply(lambda x: fuzzy_match(x, match_choices))
# NOTE(review): the candidates come from the same column that is being
# matched, so each value's best match is presumably itself (the preview shows
# matched_text equal to the input) -- confirm whether a separate list of
# canonical names was intended here.

data.head(10)

Unnamed: 0,nm_stablishment_bill,nm_stablishment,category,matched_text
0,Toa Toa,Toa Toa,Bar,Toa Toa
1,Pet da Milla,Pet da Milla,Casa,Pet da Milla
2,Petz Loja Virtual,Petz Loja Virtual,Casa,Petz Loja Virtual
3,Eletronicos El Zein L,Eletronicos El Zein L,Casa,Eletronicos El Zein L
4,Claudio Lima,Claudio Lima,Casa,Claudio Lima
5,de Vita F,de Vita F,Casa,de Vita F
6,Mercpago *Pantys,Mercpago *Pantys,Casa,Mercpago *Pantys
7,Pag*Gomesmachado,Pag*Gomesmachado,Casa,Pag*Gomesmachado
8,Dicico Be,Dicico Be,Casa,Dicico Be
9,Leroy Mer,Leroy Mer,Casa,Leroy Mer


In [9]:
# Step 3: Vectorization and Encoding
# Turn the category labels into integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[labels_column])

# Build TF-IDF features from the matched text
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the next step, so its vocabulary and IDF weights also see the rows
# that end up in the test set -- confirm this is acceptable.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["matched_text"])

In [10]:
# Step 4: Training and Testing
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Step 5: Training a model
# Support Vector Machine (SVM) classifier with a linear kernel
svm_model = SVC(
    kernel='linear',
    C=1.0,
    random_state=42,
)
# Kept as the last expression so the cell still displays the fitted estimator
svm_model.fit(X_train, y_train)


In [12]:
# Step 6: Evaluation
# Predict labels on the test set using the SVM classifier
y_pred_svm = svm_model.predict(X_test)

# Evaluate the performance of the SVM classifier.
# zero_division=0 is passed explicitly: a class with no predicted (or no true)
# samples has an undefined precision/recall/F1, and the default setting
# ("warn") scores it as 0 while emitting an UndefinedMetricWarning. Setting 0
# yields the same numbers without the warning.
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
recall_svm = recall_score(y_test, y_pred_svm, average='weighted', zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)

print("SVM Classifier Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)

SVM Classifier Metrics:
Accuracy: 0.43548387096774194
Precision: 0.405342583907949
Recall: 0.43548387096774194
F1-score: 0.35712486341922306


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
