In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from lightgbm import LGBMClassifier

In [3]:
df = pd.read_csv("./banking_recommendation_data.csv")

In [4]:
df

Unnamed: 0,Customer ID,Category,Total Spent,Quantity,Transaction Date,Payment Method,Location
0,CUST_0001,Medical,2615.90,9,2024-11-19,Digital Wallet,Online
1,CUST_0001,Travel,2520.21,6,2024-12-28,Credit Card,In-store
2,CUST_0001,Fitness,1918.49,9,2024-09-01,Digital Wallet,Mobile App
3,CUST_0001,Personal Hygiene,1080.66,9,2024-07-28,Cash,In-store
4,CUST_0001,Friend,760.48,11,2024-12-22,Credit Card,Mobile App
...,...,...,...,...,...,...,...
2537,CUST_0200,Groceries,296.88,30,2024-07-10,Credit Card,In-store
2538,CUST_0200,Hobbies,254.85,8,2024-05-20,Debit Card,In-store
2539,CUST_0200,Personal Hygiene,199.29,1,2023-11-22,Cash,In-store
2540,CUST_0200,Subscriptions,171.62,5,2025-01-10,Debit Card,In-store


In [5]:
def build_preprocessor(X):

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ])

    return preprocessor


In [11]:
import os

os.makedirs("model", exist_ok=True)


In [12]:
df = pd.read_csv("./banking_recommendation_data.csv")

target = "Category"
y_raw = df[target]

le = LabelEncoder()
y = le.fit_transform(y_raw)

X = df.drop(columns=[target, "Customer ID"])

# Build Preprocessor
preprocessor = build_preprocessor(X)

# Model
model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    random_state=42
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save Model
joblib.dump(pipeline, "model/model.pkl")
joblib.dump(le, "model/label_encoder.pkl")

print("Model Saved Successfully")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000331 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 303
[LightGBM] [Info] Number of data points in the train set: 2033, number of used features: 9
[LightGBM] [Info] Start training from score -2.567412
[LightGBM] [Info] Start training from score -2.561022
[LightGBM] [Info] Start training from score -2.561022
[LightGBM] [Info] Start training from score -2.586830
[LightGBM] [Info] Start training from score -2.561022
[LightGBM] [Info] Start training from score -2.580315
[LightGBM] [Info] Start training from score -2.567412
[LightGBM] [Info] Start training from score -2.554673
[LightGBM] [Info] Start training from score -2.561022
[LightGBM] [Info] Start training from score -2.548364
[LightGBM] [Info] Start training from score -2.573843
[LightGBM] [Info] Start training from score -2.554673
[LightGBM] [Info] Start training from score -2.567412




Accuracy: 0.33398821218074654
Model Saved Successfully


In [13]:
pipeline = joblib.load("model/model.pkl")
le = joblib.load("model/label_encoder.pkl")

def recommend_top3(new_data: pd.DataFrame):

    probs = pipeline.predict_proba(new_data)

    top3_idx = np.argsort(probs, axis=1)[:, -3:]

    results = []

    for i in range(len(new_data)):
        labels = le.inverse_transform(top3_idx[i])
        results.append(labels.tolist())

    return results

In [14]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.17      0.18      0.18        39
           1       0.25      0.25      0.25        40
           2       0.28      0.31      0.29        39
           3       0.17      0.18      0.18        38
           4       0.46      0.41      0.43        39
           5       0.26      0.26      0.26        38
           6       0.72      0.54      0.62        39
           7       0.26      0.25      0.25        40
           8       0.21      0.15      0.18        39
           9       0.81      0.65      0.72        40
          10       0.49      0.59      0.53        39
          11       0.24      0.30      0.27        40
          12       0.22      0.26      0.24        39

    accuracy                           0.33       509
   macro avg       0.35      0.33      0.34       509
weighted avg       0.35      0.33      0.34       509



In [15]:
import numpy as np

probs = pipeline.predict_proba(X_test)
top3 = np.argsort(probs, axis=1)[:, -3:]

correct = 0
for i in range(len(y_test)):
    if y_test[i] in top3[i]:
        correct += 1

top3_accuracy = correct / len(y_test)
print("Top-3 Accuracy:", top3_accuracy)




Top-3 Accuracy: 0.6915520628683693


In [16]:
df

Unnamed: 0,Customer ID,Category,Total Spent,Quantity,Transaction Date,Payment Method,Location
0,CUST_0001,Medical,2615.90,9,2024-11-19,Digital Wallet,Online
1,CUST_0001,Travel,2520.21,6,2024-12-28,Credit Card,In-store
2,CUST_0001,Fitness,1918.49,9,2024-09-01,Digital Wallet,Mobile App
3,CUST_0001,Personal Hygiene,1080.66,9,2024-07-28,Cash,In-store
4,CUST_0001,Friend,760.48,11,2024-12-22,Credit Card,Mobile App
...,...,...,...,...,...,...,...
2537,CUST_0200,Groceries,296.88,30,2024-07-10,Credit Card,In-store
2538,CUST_0200,Hobbies,254.85,8,2024-05-20,Debit Card,In-store
2539,CUST_0200,Personal Hygiene,199.29,1,2023-11-22,Cash,In-store
2540,CUST_0200,Subscriptions,171.62,5,2025-01-10,Debit Card,In-store


In [17]:
print(df["Category"].value_counts())


Category
Shopping            199
Transportation      198
Medical             198
Food                197
Friend              196
Personal Hygiene    196
Groceries           196
Travel              195
Fitness             195
Housing             195
Subscriptions       194
Hobbies             192
Gifts               191
Name: count, dtype: int64


In [18]:
print(df.columns)


Index(['Customer ID', 'Category', 'Total Spent', 'Quantity',
       'Transaction Date', 'Payment Method', 'Location'],
      dtype='object')


In [19]:
import streamlit as st

pipeline = joblib.load("model/model.pkl")
le = joblib.load("model/label_encoder.pkl")

st.title("Banking Transaction Category Predictor")

amount = st.number_input("Total Spent")
quantity = st.number_input("Quantity")
payment = st.selectbox("Payment Method", ["Credit Card", "Debit Card", "Cash"])
location = st.text_input("Location")

if st.button("Predict"):
    df = pd.DataFrame([{
        "Total Spent": amount,
        "Quantity": quantity,
        "Payment Method": payment,
        "Location": location
    }])

    pred = pipeline.predict(df)
    category = le.inverse_transform(pred)

    st.success(f"Predicted Category: {category[0]}")


2026-02-18 14:44:49.962 
  command:

    streamlit run C:\Users\anuve\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2026-02-18 14:44:49.979 Session state does not function when running a script without `streamlit run`


In [1]:
import sklearn
print(sklearn.__version__)


1.7.2


In [2]:
import pandas as pd
import numpy as np
import joblib
import sklearn
import lightgbm

print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Joblib version: {joblib.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"LightGBM version: {lightgbm.__version__}")

Pandas version: 2.3.3
NumPy version: 2.3.5
Joblib version: 1.5.2
Scikit-learn version: 1.7.2
LightGBM version: 4.6.0
