In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

In [None]:
# Small hand-written sample: one row per budget/entity pair, with the same
# four columns that the labeled data uses further down.
synthetic_data = pd.DataFrame(
    data=[
        (
            'Credit Card Payment',
            'A payment made using a credit card for a purchase.',
            'Expense via Credit Card',
            'A payment using a credit card for an expense.',
        ),
        (
            'Utility Bill',
            'A monthly utility bill payment for electricity and water.',
            'Monthly Services Payment',
            'A monthly payment for various services.',
        ),
        (
            'Salary',
            'Income received as salary for work done.',
            'Monthly Earnings',
            'Earnings received on a monthly basis.',
        ),
        (
            'Online Purchase',
            'A payment for an online purchase from a retail website.',
            'Online Shopping Expenses',
            'Expenses related to online shopping activities.',
        ),
        (
            'Grocery Expenses',
            'Expenses related to groceries and daily essentials.',
            'Grocery Store Costs',
            'Costs associated with a local grocery store.',
        ),
    ],
    columns=['bud_names', 'bud_desc', 'ent_names', 'ent_desc'],
)



In [None]:
# Workbook holding the labeled examples; resolved relative to the kernel's
# working directory.
path = 'category_data.xlsx'

# Only the 'labeled' sheet is read into the frame used by the cells below.
labeled_data = pd.read_excel(io=path, sheet_name='labeled')

In [None]:
# Cache for the stopword set so the NLTK corpus is loaded once per kernel
# session instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Lower-case ``text``, tokenize it with NLTK and drop English stopwords.

    Parameters
    ----------
    text : str, or a missing value (None / float NaN), or any other scalar
        Raw description. Missing values yield an empty string instead of
        raising ``AttributeError``; other non-string scalars are converted
        with ``str()`` before processing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' when nothing is left
        or the input was missing).
    """
    # Blank spreadsheet cells arrive as NaN (a float) rather than as strings.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())
    # Set membership keeps the filter linear in the number of tokens.
    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# Normalize both description columns with the same cleaning function.
# The columns are overwritten on `labeled_data` itself, so re-running this
# cell re-processes text that has already been cleaned.
for column in ('ent_desc', 'bud_desc'):
    labeled_data[column] = labeled_data[column].apply(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Hold out 20% of the rows for testing. Entity descriptions are the inputs
# and budget descriptions are the targets; the fixed seed makes the split
# repeatable.
splits = train_test_split(
    labeled_data['ent_desc'],
    labeled_data['bud_desc'],
    random_state=42,
    test_size=0.2,
)
X_train, X_test, y_train, y_test = splits

In [None]:
# TF-IDF features: the vocabulary is learned from the training texts only and
# then reused for the test texts.
vect = TfidfVectorizer()
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)
dimensions = X_train_vect.shape[-1]  # number of TF-IDF features (model input width)

# Integer-encode the target strings; the mapping is learned from y_train.
# NOTE(review): transform() on y_test presumably fails for any label that
# never occurs in y_train — confirm every test label is also in training.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)
classes = encoder.classes_.shape[0]  # number of distinct training labels (model output width)

In [5]:
from tensorflow import keras
from keras.layers import Input, Dense
from keras.models import Model

def create_model(hidden_layers=2, units=128, learning_rate=0.001):
    """Build and compile a dense softmax classifier.

    The input width and the number of output units come from the notebook
    globals ``dimensions`` and ``classes``, which must be defined before
    this function is called.

    Parameters
    ----------
    hidden_layers : int
        Number of ReLU ``Dense`` layers between the input and the output.
    units : int
        Width of each hidden layer.
    learning_rate : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A compiled ``keras.Sequential`` model using sparse categorical
    cross-entropy loss and reporting accuracy.
    """
    # Assemble the layer stack first, then hand it to Sequential in one go.
    stack = [Input(shape=(dimensions,))]
    stack.extend(Dense(units, activation='relu') for _ in range(hidden_layers))
    stack.append(Dense(classes, activation='softmax'))

    network = keras.Sequential(stack)
    network.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Search space for the grid search: each key matches a keyword argument of
# create_model, and each list holds the candidate values to try.
hyperparams = dict(
    hidden_layers=[1, 2, 3],
    units=[64, 128, 256],
    learning_rate=[0.001, 0.01, 0.1],
)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Wrap the Keras builder so scikit-learn can treat it as an estimator.
# NOTE(review): keras.wrappers.scikit_learn appears to be unavailable in newer
# TensorFlow releases — confirm the TensorFlow version this notebook targets.
model = keras.wrappers.scikit_learn.KerasClassifier(
    build_fn=create_model,
    epochs=10,
    batch_size=32,
    verbose=0,
)

# Exhaustive search over `hyperparams` (3 x 3 x 3 = 27 combinations), each
# scored by accuracy with 3-fold cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=hyperparams,
    scoring='accuracy',
    cv=3,
)
grid_search.fit(X_train_vect, y_train_encoded)

# Keep the underlying Keras model of the winning estimator and its settings.
best_model = grid_search.best_estimator_.model
best_params = grid_search.best_params_

In [None]:
# Score the selected model on the held-out split. The model was compiled with
# a single metric (accuracy), so evaluate() yields the loss followed by it.
evaluation = best_model.evaluate(X_test_vect, y_test_encoded)
test_loss, test_acc = evaluation
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}')