In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Normalization, Concatenate, Dense
from tensorflow.keras.models import Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import json
import os

# ========== CONFIG ==========
CSV_PATH = 'expense_dataset.csv'
TFLITE_MODEL_PATH = 'expense_model.tflite'
TFIDF_VOCAB_PATH = 'tfidf_vocab.json'
SCALER_PATH = 'amount_normalizer.npy'
LABEL_ENCODER_PATH = 'label_classes.json'
# ============================

# Load data
df = pd.read_csv(CSV_PATH)
df['text'] = df['merchant'].astype(str) + ' ' + df['description'].astype(str)
texts = df['text'].values
amounts = df['amount'].values
labels = df['category'].values

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# TF-IDF for text
tfidf = TfidfVectorizer(max_features=5000)
X_text = tfidf.fit_transform(texts).toarray()

# Save vocab
with open(TFIDF_VOCAB_PATH, 'w') as f:
    json.dump({k: int(v) for k, v in tfidf.vocabulary_.items()}, f)

# Split data
X_text_train, X_text_test, X_amount_train, X_amount_test, y_train, y_test = train_test_split(
    X_text, amounts, y_encoded, test_size=0.2, random_state=42
)

# Normalize amount
amount_normalizer = Normalization()
amount_normalizer.adapt(X_amount_train.reshape(-1, 1))

# Save scaler stats
np.save(SCALER_PATH, {
    'mean': amount_normalizer.mean.numpy().tolist(),
    'var': amount_normalizer.variance.numpy().tolist()
})

# Save label classes
with open(LABEL_ENCODER_PATH, 'w') as f:
    json.dump(label_encoder.classes_.tolist(), f)

# Build model
text_input = Input(shape=(X_text_train.shape[1],), name='text_input')
amount_input = Input(shape=(1,), dtype=tf.float32, name='amount_input')
x_amount = amount_normalizer(amount_input)
x = Concatenate()([text_input, x_amount])
x = Dense(128, activation='relu')(x)
x = Dense(64, activation='relu')(x)
output = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[text_input, amount_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train
model.fit({'text_input': X_text_train, 'amount_input': X_amount_train}, y_train,
          epochs=5, batch_size=32, validation_split=0.1)

# Convert to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open(TFLITE_MODEL_PATH, 'wb') as f:
    f.write(tflite_model)

print("✅ All files saved:")
print("- expense_model.tflite")
print("- amount_normalizer.npy")
print("- tfidf_vocab.json")
print("- label_classes.json")


Epoch 1/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.7154 - loss: 1.5533 - val_accuracy: 1.0000 - val_loss: 0.0183
Epoch 2/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0109 - val_accuracy: 1.0000 - val_loss: 0.0037
Epoch 3/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 1.0000 - loss: 0.0027 - val_accuracy: 1.0000 - val_loss: 0.0016
Epoch 4/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 8.8026e-04
Epoch 5/5
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 1.0000 - loss: 7.2031e-04 - val_accuracy: 1.0000 - val_loss: 5.5829e-04
INFO:tensorflow:Assets written to: C:\Users\KIIT\AppData\Local\Temp\tmps3r_rfsa\assets


INFO:tensorflow:Assets written to: C:\Users\KIIT\AppData\Local\Temp\tmps3r_rfsa\assets


Saved artifact at 'C:\Users\KIIT\AppData\Local\Temp\tmps3r_rfsa'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 96), dtype=tf.float32, name='text_input'), TensorSpec(shape=(None, 1), dtype=tf.float32, name='amount_input')]
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  2330600942736: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  2330600941968: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  2330600941200: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2330600942544: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2330600943888: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2330600941776: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2330600942352: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2330600944656: TensorSpec(shape=(), dtype=tf.resource, name=None)
✅ All files saved:
- expense_model.tflite
- amount_normalizer.npy
- tfidf_