In [7]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Normalization, Concatenate, Dense
from tensorflow.keras.models import Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import json
import os

# ========== CONFIG ==========
# Input CSV; the script reads its 'merchant', 'description', 'amount' and
# 'category' columns.
CSV_PATH = 'expense_dataset.csv'
# Output: serialized TFLite model bytes.
TFLITE_MODEL_PATH = 'expense_model.tflite'
# Output: JSON object mapping each TF-IDF term to its feature column index.
TFIDF_VOCAB_PATH = 'tfidf_vocab.json'
# Output: mean and variance learned by the amount Normalization layer.
SCALER_PATH = 'amount_normalizer.npy'
# Output: JSON list of category names in LabelEncoder class order.
LABEL_ENCODER_PATH = 'label_classes.json'
# ============================

# Load data
df = pd.read_csv(CSV_PATH)

# A missing merchant or description would make the concatenation NaN
# (str + NaN -> NaN), which TfidfVectorizer cannot tokenize. Treat missing
# text as an empty string so the other field still contributes.
merchant = df['merchant'].fillna('').astype(str)
description = df['description'].fillna('').astype(str)
df['text'] = merchant + ' ' + description

# Rows without an amount or a category are unusable for supervised training:
# a NaN amount would turn the Normalization mean/variance into NaN, and a NaN
# label cannot be encoded as a real class. Drop them before extracting arrays.
df = df.dropna(subset=['amount', 'category'])

texts = df['text'].values
amounts = df['amount'].values
labels = df['category'].values

# Encode labels
# Map category strings to integer ids 0..num_classes-1.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)

# Split data
# Split the raw texts *before* fitting TF-IDF so that the vocabulary and IDF
# weights are learned from training rows only; fitting on the full dataset
# leaks information from the held-out rows into the features.
# The row partition depends only on the sample count and random_state, so it
# is the same partition as splitting pre-computed features would give.
texts_train, texts_test, X_amount_train, X_amount_test, y_train, y_test = train_test_split(
    texts, amounts, y_encoded, test_size=0.2, random_state=42
)

# TF-IDF outside the model
# float32 matches the model's input dtype and avoids a float64 dense copy.
tfidf = TfidfVectorizer(max_features=5000, dtype=np.float32)
X_text_train = tfidf.fit_transform(texts_train).toarray()
# Held-out texts are only transformed with the statistics learned above.
X_text_test = tfidf.transform(texts_test).toarray()

# Save vocab
# vocabulary_ values are NumPy integers; cast to int so json can serialize them.
# NOTE(review): only the term -> column mapping is exported; the IDF weights
# (tfidf.idf_) are not, so a consumer cannot rebuild identical features from
# this file alone -- confirm whether the inference side needs them.
with open(TFIDF_VOCAB_PATH, 'w', encoding='utf-8') as f:
    json.dump({k: int(v) for k, v in tfidf.vocabulary_.items()}, f)

# Normalize amount
# Learn mean/variance from the training amounts only, presented as a single
# feature column of shape (n_samples, 1).
amount_normalizer = Normalization()
train_amount_column = X_amount_train.reshape(-1, 1)
amount_normalizer.adapt(train_amount_column)

# Save scaler stats
# NOTE(review): np.save stores a dict as a pickled object array, so reading it
# back requires np.load(..., allow_pickle=True).item(). Format left unchanged
# because existing consumers may depend on it.
scaler_stats = {
    'mean': amount_normalizer.mean.numpy().tolist(),
    'var': amount_normalizer.variance.numpy().tolist(),
}
np.save(SCALER_PATH, scaler_stats)

# Save label classes
# The list position of each class name is the integer id the model predicts.
class_names = label_encoder.classes_.tolist()
with open(LABEL_ENCODER_PATH, 'w') as classes_file:
    json.dump(class_names, classes_file)

# Build model
# Two inputs: the dense TF-IDF vector (one column per TF-IDF feature) and the
# raw scalar amount.
text_input = Input(shape=(X_text_train.shape[1],), name='text_input')
amount_input = Input(shape=(1,), dtype=tf.float32, name='amount_input')

# The adapted Normalization layer is part of the graph, so the model is fed the
# raw amount and the scaling happens inside it.
amount_scaled = amount_normalizer(amount_input)
merged = Concatenate()([text_input, amount_scaled])
hidden = Dense(128, activation='relu')(merged)
hidden = Dense(64, activation='relu')(hidden)
output = Dense(num_classes, activation='softmax')(hidden)

model = Model(inputs=[text_input, amount_input], outputs=output)
# Sparse loss because y_train holds integer class ids rather than one-hot rows.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train
# Inputs are keyed by the Input layer names; 10% of the training rows are held
# out by Keras for per-epoch validation.
train_inputs = {'text_input': X_text_train, 'amount_input': X_amount_train}
model.fit(train_inputs, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Convert to TFLite
# Convert the trained Keras model to a TFLite flatbuffer (returned as bytes).
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Write the flatbuffer in binary mode.
with open(TFLITE_MODEL_PATH, 'wb') as model_file:
    model_file.write(tflite_model)

print("✅ TFLite model and config files saved!")


Epoch 1/5
225/225 ━━━━━━━━━━━━━━━━━━━━ 2s 3ms/step - accuracy: 0.5263 - loss: 1.8463 - val_accuracy: 1.0000 - val_loss: 0.0379
Epoch 2/5
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 1.0000 - loss: 0.0218 - val_accuracy: 1.0000 - val_loss: 0.0056
Epoch 3/5
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 1.0000 - loss: 0.0044 - val_accuracy: 1.0000 - val_loss: 0.0023
Epoch 4/5
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 1.0000 - loss: 0.0019 - val_accuracy: 1.0000 - val_loss: 0.0012
Epoch 5/5
225/225 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 7.7534e-04
INFO:tensorflow:Assets written to: C:\Users\KIIT\AppData\Local\Temp\tmp1idu7bll\assets


INFO:tensorflow:Assets written to: C:\Users\KIIT\AppData\Local\Temp\tmp1idu7bll\assets


Saved artifact at 'C:\Users\KIIT\AppData\Local\Temp\tmp1idu7bll'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): List[TensorSpec(shape=(None, 70), dtype=tf.float32, name='text_input'), TensorSpec(shape=(None, 1), dtype=tf.float32, name='amount_input')]
Output Type:
  TensorSpec(shape=(None, 10), dtype=tf.float32, name=None)
Captures:
  3047669653264: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  3047669655568: TensorSpec(shape=(1, 1), dtype=tf.float32, name=None)
  3047669654032: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3047669657296: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3047669656720: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3047669658064: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3047669657488: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3047669658832: TensorSpec(shape=(), dtype=tf.resource, name=None)
✅ TFLite model and config files saved!
