<a href="https://colab.research.google.com/github/ashishawasthi/colab/blob/master/Merchant_Categorization_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from sklearn.metrics import classification_report

### Load data

In [2]:
# df = pd.read_csv('transactions.csv')

In [3]:
data = {
    'transaction_description': [
        'Purchase at Merchant A id123', 
        'Purchase at Merchant B id124', 
        'Purchase at Merchant C id125', 
        'Refund at Merchant A id126', 
        'Purchase at Merchant D id127', 
        'Purchase at Merchant B id128', 
        'Refund at Merchant E id129', 
        'Purchase at Merchant F id130', 
        'Purchase at Merchant G id131', 
        'Refund at Merchant A id132'
    ],
    'MCC': ['5812', '5814', '5732', '5812', '5812', '5814', '5813', '5813', '5813', '5812'],
    'merchant_name': [
        'Merchant A', # 5812
        'Merchant B', # 5814
        'Merchant C', # 5732
        'Merchant A', # 5812
        'Merchant D', # 5812
        'Merchant B', # 5814
        'Merchant E', # 5813
        'Merchant E', # 5813
        'Merchant E', # 5813
        'Merchant A'  # 5812
    ],
    'transaction_category': ['Food', 'Food', 'Electronics', 'Food', 'Food', 'Food', 'Travel', 'Electronics', 'Food', 'Food']
}

df = pd.DataFrame(data)


### Prepare datasets

In [4]:
y = df['transaction_category']
le = LabelEncoder()
y = le.fit_transform(y)
X = df.drop('transaction_category', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Define feature engineering pipeline

In [5]:
# Tokenize and pad transaction_description
maxlen = 100 # Adjust for expected max length of transaction_description
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train['transaction_description'])
sequences = tokenizer.texts_to_sequences(X_train['transaction_description'])
word_index = tokenizer.word_index
X_train_text = pad_sequences(sequences, maxlen=maxlen)
sequences = tokenizer.texts_to_sequences(X_test['transaction_description'])
X_test_text = pad_sequences(sequences, maxlen=maxlen)

# One-hot encode categorical features
ohe = OneHotEncoder()
X_train_cat = ohe.fit_transform(X_train[['MCC', 'merchant_name']]).toarray()
X_test_cat = ohe.transform(X_test[['MCC', 'merchant_name']]).toarray()

# Concatenate text and categorical features
X_train = np.concatenate([X_train_text, X_train_cat], axis=1)
X_test = np.concatenate([X_test_text, X_test_cat], axis=1)

### Train

In [6]:
input = Input(shape=(X_train.shape[1],))
embedded = Embedding(len(word_index) + 1, 128)(input)
lstm_out = LSTM(64)(embedded)
output = Dense(np.unique(y).shape[0], activation='softmax')(lstm_out)
model = Model(input, output)

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6bc819a2f0>

### Report

In [7]:
y_pred = model.predict(X_test).argmax(axis=1)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

