In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Normalization, Concatenate, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import json
import os

# ========== CONFIG ==========
# Input: CSV read by the data-loading step; it is indexed by the columns
# 'merchant', 'description', 'amount' and 'category'.
CSV_PATH = 'expense_dataset.csv'

# Outputs: artifacts written by this notebook.
LABEL_ENCODER_PATH = 'label_classes.json'   # JSON list of the category names
SCALER_PATH = 'amount_normalizer.npy'       # mean / variance of the amount normalizer
TFIDF_VOCAB_PATH = 'tfidf_vocab.json'       # TF-IDF term -> column index mapping
TFLITE_MODEL_PATH = 'expense_model.tflite'  # converted TFLite model bytes
# ============================

# Load data
df = pd.read_csv(CSV_PATH)

# Build one free-text field per row from merchant + description.
# Empty CSV cells are read as NaN, and astype(str) alone would turn them into
# the literal string "nan", which the vectorizer would then treat as a real
# word. Replace missing values with an empty string before casting.
merchant_text = df['merchant'].fillna('').astype(str)
description_text = df['description'].fillna('').astype(str)
df['text'] = merchant_text + ' ' + description_text

# Raw NumPy views of the three model ingredients: text, numeric amount, target.
texts = df['text'].values
amounts = df['amount'].values
labels = df['category'].values

# Encode labels
# Fit the encoder on the category column and convert it to integer ids in one
# pass; label_encoder.classes_ keeps the id -> category name lookup.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Number of distinct categories; used as the width of the softmax output layer.
num_classes = label_encoder.classes_.shape[0]

# Split data
# Split the raw texts *before* fitting TF-IDF so that the vocabulary and IDF
# weights are learned from the training rows only. Fitting the vectorizer on
# the full corpus leaks test-set statistics into the features and makes the
# held-out metrics optimistic. The partition itself is unchanged: it depends
# only on the number of rows, test_size and random_state.
texts_train, texts_test, X_amount_train, X_amount_test, y_train, y_test = train_test_split(
    texts, amounts, y_encoded, test_size=0.2, random_state=42
)

# TF-IDF for text
# fit_transform on the training texts; the test texts are only transformed
# with the already-fitted vocabulary (unseen terms are ignored).
tfidf = TfidfVectorizer(max_features=5000)
X_text_train = tfidf.fit_transform(texts_train).toarray()
X_text_test = tfidf.transform(texts_test).toarray()

# Save vocab
# int(v) converts NumPy integer indices to plain ints so json can serialise them.
# NOTE(review): only the term -> column index mapping is persisted; reproducing
# these features outside scikit-learn would presumably also need tfidf.idf_ --
# confirm with whatever consumes this file.
with open(TFIDF_VOCAB_PATH, 'w') as f:
    json.dump({k: int(v) for k, v in tfidf.vocabulary_.items()}, f)

# Normalize amount
# Learn mean/variance from the training amounts only. adapt() is given a
# single-feature column, hence the reshape to (n, 1). The adapted layer is
# applied to amount_input inside the model, so the model is fed raw amounts.
amount_normalizer = Normalization()
amount_normalizer.adapt(X_amount_train.reshape(-1, 1))

# Save scaler stats
# NOTE(review): np.save on a dict writes a pickled 0-d object array, so a
# reader has to use np.load(SCALER_PATH, allow_pickle=True).item(). The format
# is kept as-is here; confirm what the consumer expects before changing it.
scaler_stats = dict(
    mean=amount_normalizer.mean.numpy().tolist(),
    var=amount_normalizer.variance.numpy().tolist(),
)
np.save(SCALER_PATH, scaler_stats)

# Save label classes
# The list position of each name is the integer id produced by label_encoder.
with open(LABEL_ENCODER_PATH, 'w') as f:
    f.write(json.dumps(label_encoder.classes_.tolist()))

# Build model
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation, batch shuffling and dropout masks are repeatable
# across "Restart & Run All". 42 matches the random_state used for the split.
tf.keras.utils.set_random_seed(42)

# Two inputs: the dense TF-IDF vector and the raw (un-normalised) amount.
text_input = Input(shape=(X_text_train.shape[1],), name='text_input')
amount_input = Input(shape=(1,), dtype=tf.float32, name='amount_input')

# The adapted Normalization layer lives inside the graph, so the exported
# model standardises the amount itself.
x_amount = amount_normalizer(amount_input)

# Concatenate text features with the scaled amount, then two L2-regularised
# ReLU layers, each followed by dropout.
x = Concatenate()([text_input, x_amount])
x = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu', kernel_regularizer=l2(0.01))(x)
x = Dropout(0.3)(x)

# One softmax unit per encoded category.
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[text_input, amount_input], outputs=output)
# Sparse loss: y holds integer class ids, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train with validation and early stopping
# Stop once val_loss has not improved for 3 epochs and roll the model back to
# the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Inputs are keyed by the names given to the two Input layers.
# 10% of the training arrays are held out by Keras for validation.
history = model.fit(
    x={
        'text_input': X_text_train,
        'amount_input': X_amount_train,
    },
    y=y_train,
    validation_split=0.1,
    batch_size=32,
    epochs=30,
    callbacks=[early_stopping],
)

# Evaluate
print("Evaluating on test set...")
y_pred_probs = model.predict({'text_input': X_text_test, 'amount_input': X_amount_test})
# Predicted class id = index of the highest softmax probability per row.
y_pred = np.argmax(y_pred_probs, axis=1)

# Pin the label set to every encoded class. The split is not stratified, so a
# rare category can be absent from y_test / y_pred; without explicit labels
# classification_report raises because target_names no longer matches the
# number of observed classes, and the confusion matrix silently changes shape.
class_ids = np.arange(len(label_encoder.classes_))

print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # classes with no samples/predictions score 0 without warnings
))

print("\nConfusion Matrix:")
# Rows are true classes, columns are predicted classes, both in class_ids order.
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Convert to TFLite
# Convert the trained Keras model and write the resulting bytes to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open(TFLITE_MODEL_PATH, 'wb') as f:
    f.write(tflite_model)

# Report the paths from the CONFIG constants rather than hard-coded names so
# the summary stays correct when a path is changed.
print("All files saved:")
for artifact_path in (TFLITE_MODEL_PATH, SCALER_PATH, TFIDF_VOCAB_PATH, LABEL_ENCODER_PATH):
    print(f"- {artifact_path}")


Epoch 1/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.4348 - loss: 3.2615 - val_accuracy: 1.0000 - val_loss: 1.0365
Epoch 2/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9990 - loss: 0.9949 - val_accuracy: 1.0000 - val_loss: 0.7016
Epoch 3/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9994 - loss: 0.7544 - val_accuracy: 1.0000 - val_loss: 0.5926
Epoch 4/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9999 - loss: 0.6472 - val_accuracy: 1.0000 - val_loss: 0.5249
Epoch 5/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9993 - loss: 0.5759 - val_accuracy: 1.0000 - val_loss: 0.4739
Epoch 6/30
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9999 - loss: 0.5251 - val_accuracy: 1.0000 - val_loss: 0.4385
Epoch 7/30
[1m203/203[0m 

INFO:tensorflow:Assets written to: C:\Users\KIIT\AppData\Local\Temp\tmpjbqf29hw\assets


Saved artifact at 'C:\Users\KIIT\AppData\Local\Temp\tmpjbqf29hw'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 96), dtype=tf.float32, name='text_input'), TensorSpec(shape=(None, 1), dtype=tf.float32, name='amount_input')]
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  1452352498448: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  1452352497680: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  1452352496912: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1452352498256: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1452352499792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1452352496720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1452352500176: TensorSpec(shape=(), dtype=tf.resource, name=None)
  1452352501328: TensorSpec(shape=(), dtype=tf.resource, name=None)
All files saved:
- expense_model.tflite
- amount_normalizer.npy
- tfidf_vo