# Installations

In [11]:
# Install the notebook's dependencies quietly (-q), upgrading any copy that is
# already present (-U). Only transformers is pinned to a version; setfit and
# kaleido resolve to whatever is latest when the cell runs.
!pip install -Uq setfit
!pip install -Uq transformers==4.49
!pip install -Uq kaleido

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h

# Imports

In [19]:
import os
os.environ["WANDB_DISABLED"] = "true"
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

import torch
from torch.utils.data import Dataset, DataLoader

from datasets import Dataset as hf_dataset

from setfit import SetFitModel,Trainer,TrainingArguments

from sentence_transformers.losses import CosineSimilarityLoss

import transformers

from sklearn.manifold import TSNE

import plotly.express as px

# Config

In [3]:
class config:
  """Static settings for the notebook: the base model id and the input CSV paths."""

  # Checkpoint name handed to SetFitModel.from_pretrained.
  model = 'sentence-transformers/paraphrase-mpnet-base-v2'

  # Transaction CSV files on the mounted Google Drive; judging by the file
  # names these are the raw export and its categorized counterpart.
  txn_cat_file = "/content/drive/MyDrive/Spend Categorization/my_transactions_categorized.csv"
  txn_file = "/content/drive/MyDrive/Spend Categorization/my_transactions.csv"

In [4]:
# Load the categorized transactions CSV named by config.txn_cat_file.
df_txn_cat = pd.read_csv(config.txn_cat_file)

In [6]:
# Preview the first four rows of the loaded frame.
df_txn_cat.head(4)

Unnamed: 0,Full Text,Transaction Type,Transaction ID,VPA,Amount,Summary,Category
0,UPIOUT/451368462558/blinkit.payu@hdfcbank/UP ...,UPIOUT,451368462558,blinkit.payu@hdfcbank,228.0,You have sent 228.00 to blinkit.payu@hdfcbank ...,Grocery
1,UPIOUT/415140428056/zeptonowcashfree@hdfcb ...,UPIOUT,415140428056,zeptonowcashfree@hdfcb,325.06,You have sent 325.06 to zeptonowcashfree@hdfcb...,Grocery
2,UPIOUT/429504581630/chaayos488077.rzp@rxair ...,UPIOUT,429504581630,chaayos488077.rzp@rxair,83.0,You have sent 83.00 to chaayos488077.rzp@rxair...,Food
3,UPIOUT/470940782926/Vyapar.170258239279@h ...,UPIOUT,470940782926,Vyapar.170258239279@h,702.0,You have sent 702.00 to Vyapar.170258239279@h ...,Ecommerce


In [5]:
# Number the distinct categories in order of first appearance, then invert that
# mapping so both lookup directions come from the same enumeration.
id2label = dict(enumerate(df_txn_cat['Category'].unique()))
label2id = {category: idx for idx, category in id2label.items()}

In [6]:
# Add an integer 'label' column by looking each row's Category up in label2id.
df_txn_cat['label'] = df_txn_cat['Category'].map(label2id)

In [9]:
# One-hot encode the integer labels. Selecting the column with a list keeps it
# two-dimensional (one row per transaction, a single feature), which is the
# input shape the encoder expects; the sparse result is made dense.
enc = OneHotEncoder()
label = enc.fit_transform(df_txn_cat[['label']].to_numpy()).toarray()

# Dataset class

In [10]:
class TransactionData(Dataset):
    """
    Map-style PyTorch dataset that pairs transaction texts with their labels.

    Each item is a dict ``{'text': ..., 'label': ...}`` built from the entries
    at the same position in the two sequences given to the constructor.
    """

    def __init__(self, texts, labels):
        """
        Args:
            texts (list): List of text strings.
            labels (list): List of corresponding labels (e.g., integers),
                one per entry in ``texts``.

        Raises:
            ValueError: If ``texts`` and ``labels`` have different lengths.
        """
        # Fail at construction time instead of with an IndexError (or a
        # silently ignored tail of labels) in the middle of iteration.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the pair at position ``idx`` as ``{'text': ..., 'label': ...}``."""
        return {'text': self.texts[idx], 'label': self.labels[idx]}


In [10]:
# Wrap the Summary texts and their integer labels in a TransactionData dataset.
train_ds = TransactionData(
    df_txn_cat['Summary'].tolist(),
    df_txn_cat['label'].tolist()
)

In [11]:
# Shuffled mini-batches of 8 items drawn from train_ds.
# NOTE(review): the SetFit Trainer cell passes hf_ds as its training data, so
# this loader does not appear to be consumed in the visible notebook — confirm.
train_dl = DataLoader(train_ds, batch_size = 8,shuffle = True)

In [11]:
# Hugging Face dataset with a 'text' column (the Summary strings) and a 'label'
# column holding the matching one-hot rows produced by the encoder.

hf_ds = hf_dataset.from_dict({
    'text' : df_txn_cat['Summary'].tolist(),
    'label' : label
})

# Model

In [13]:
# Load the base checkpoint named in config as a SetFit model, requesting the
# "one-vs-rest" multi-target strategy for its classification head.
model = SetFitModel.from_pretrained(config.model, multi_target_strategy="one-vs-rest")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


# Training

In [14]:
# SetFit training configuration: 5 epochs, batch size 32, mixed precision, and
# a checkpoint written under ./models at the end of every epoch.
args = TrainingArguments(
    output_dir="models",
    num_epochs=5,
    use_amp=True,
    batch_size=32,
    save_strategy='epoch',
    # The string "none" is what disables experiment-tracker reporting; passing
    # Python None is interpreted as "all installed integrations".
    report_to="none",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
# Bind the model, the training arguments and the Hugging Face dataset together.
# The column mapping names which dataset columns SetFit should read as the
# text and the label (here they already carry those names).
trainer = Trainer(
    model=model,
    args = args,
    train_dataset=hf_ds,
    column_mapping={"text": "text", "label": "label"},

)

Applying column mapping to the training dataset
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Map:   0%|          | 0/73 [00:00<?, ? examples/s]

In [16]:
# Run SetFit training using the trainer configured in the previous cell.
trainer.train()

***** Running training *****
  Num unique pairs = 4496
  Batch size = 32
  Num epochs = 5


Step,Training Loss
1,0.2461
50,0.1668
100,0.0682
150,0.0376
200,0.0227
250,0.0191
300,0.0147
350,0.0136
400,0.0174
450,0.0102


In [17]:
# Save the trained model into the FinetunedModel/ directory (relative to the
# current working directory).
model.save_pretrained('FinetunedModel/')

In [18]:
# Reload the saved model from local disk only (no Hub lookup). from_pretrained
# is a class-level constructor, so it is called on SetFitModel itself.
model1 = SetFitModel.from_pretrained('/content/FinetunedModel', local_files_only=True)

In [108]:
# Save a second copy of the in-memory model under mymodel/checkpoint.
# NOTE(review): this duplicates the FinetunedModel/ export — confirm it is still needed.
model.save_pretrained('mymodel/checkpoint')

In [22]:
# Recursively archive the saved model directory into a single zip file.
!zip -r /content/FinetunedModel.zip /content/FinetunedModel

  adding: content/FinetunedModel/ (stored 0%)
  adding: content/FinetunedModel/config_sentence_transformers.json (deflated 34%)
  adding: content/FinetunedModel/vocab.txt (deflated 53%)
  adding: content/FinetunedModel/special_tokens_map.json (deflated 85%)
  adding: content/FinetunedModel/1_Pooling/ (stored 0%)
  adding: content/FinetunedModel/1_Pooling/config.json (deflated 57%)
  adding: content/FinetunedModel/README.md (deflated 59%)
  adding: content/FinetunedModel/model.safetensors (deflated 8%)
  adding: content/FinetunedModel/model_head.pkl (deflated 9%)
  adding: content/FinetunedModel/sentence_bert_config.json (deflated 4%)
  adding: content/FinetunedModel/modules.json (deflated 53%)
  adding: content/FinetunedModel/tokenizer.json (deflated 71%)
  adding: content/FinetunedModel/tokenizer_config.json (deflated 75%)
  adding: content/FinetunedModel/config.json (deflated 47%)
  adding: content/FinetunedModel/config_setfit.json (deflated 6%)


# Visualizations

In [7]:
# Load a separate copy of the base checkpoint so its embeddings can be compared
# with those of the fine-tuned model.
model_untrained = SetFitModel.from_pretrained(config.model, multi_target_strategy="one-vs-rest")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [29]:
# Trained Embeddings
trained_embeddings = model.encode(df_txn_cat['Summary'].tolist())
tsne = TSNE(n_components=2, random_state=42)
trained_tsne = tsne.fit_transform(trained_embeddings)

In [8]:
#Untrained Embeddings
untrained_embeddings = model_untrained.encode(df_txn_cat['Summary'].tolist())
tsne = TSNE(n_components=2, random_state=42)
untrained_tsne = tsne.fit_transform(untrained_embeddings)

In [53]:
# Scatter the 2-D t-SNE projection of the fine-tuned embeddings, coloured and
# marked by category. The projection (trained_tsne) is plotted rather than the
# first two columns of the raw embedding matrix, which are just two arbitrary
# coordinates of the high-dimensional space.
fig = px.scatter(x=trained_tsne[:, 0],
                 y=trained_tsne[:, 1],
                 color=df_txn_cat['Category'].tolist(),
                 symbol=df_txn_cat['Category'].tolist(),
                 labels={'x': 't-SNE 1', 'y': 't-SNE 2'},
                 title='Finetuned Model Embeddings')
fig.show()

In [13]:
# Scatter the 2-D t-SNE projection of the base-model embeddings for comparison.
# The projection (untrained_tsne) is plotted rather than the first two columns
# of the raw embedding matrix.
px.scatter(x=untrained_tsne[:, 0],
           y=untrained_tsne[:, 1],
           color=df_txn_cat['Category'].tolist(),
           symbol=df_txn_cat['Category'].tolist(),
           labels={'x': 't-SNE 1', 'y': 't-SNE 2'},
           title='Untrained Model Embeddings')

In [None]:
# Export the fine-tuned embedding plot as a static PNG (static export relies on
# the kaleido package installed at the top of the notebook).
# NOTE(review): the original cell was an unfinished `px.scatter().to_`
# expression; image export is the presumed intent — confirm.
fig.write_image('finetuned_embeddings.png')

# Inference

In [None]:
# Extract the zipped model archive into the current working directory.
!unzip /content/FinetunedModel.zip

In [15]:
# Load the fine-tuned SetFit model from disk. This rebinds the global `model`,
# which is the object pred() calls to classify a summary.
model = SetFitModel.from_pretrained('/content/FinetunedModel')

In [16]:
def extraction(fullText: str) -> "tuple[str, str, str]":
  """Pull the transaction type, VPA and amount out of a raw narration string.

  The narration is read as '/'-separated fields in the order
  ``TYPE/TRANSACTION_ID/VPA/<reference and amount>``. The VPA is the first
  whitespace-separated token of the third field and the amount is the last
  whitespace-separated token of the fourth field.

  Args:
    fullText: Raw narration, e.g.
      ``'UPIOUT/509042621775/wellnessquotien833.rzp@i/7298    260.00'``.

  Returns:
    A ``(trans_type, vpa, amount)`` tuple of strings. ``vpa`` or ``amount`` is
    an empty string when its field holds no non-whitespace text.

  Raises:
    IndexError: If the narration has fewer than four '/'-separated fields.
  """
  fields = fullText.split('/')

  trans_type = fields[0]
  # fields[1] is the transaction id; it is not part of the returned tuple.

  # split() with no argument splits on any run of whitespace and ignores
  # leading/trailing whitespace, so tabs, newlines or trailing spaces in the
  # narration no longer produce an empty or polluted token.
  vpa_tokens = fields[2].split()
  amount_tokens = fields[3].split()

  vpa = vpa_tokens[0] if vpa_tokens else ''
  amount = amount_tokens[-1] if amount_tokens else ''

  return trans_type, vpa, amount

def generate_summary(transaction_type: str,
                     VPA : str,
                     amount : str
                     ) -> str:
  """Render a one-sentence description of a transaction.

  The transaction type is compared case-insensitively with 'UPIOUT', 'UPI IN'
  and 'CHRG'. For a recognised type, ``amount`` and ``VPA`` are stripped of
  surrounding whitespace and placed into the matching sentence; any other
  type yields the literal string 'Null'.

  Args:
    transaction_type: Type code taken from the narration.
    VPA: Counterparty address to mention in the sentence.
    amount: Transaction amount as text.

  Returns:
    The summary sentence, or 'Null' for an unrecognised transaction type.
  """
  kind = transaction_type.lower()

  if kind == 'UPIOUT'.lower():
    return f"You have sent {amount.strip()} to {VPA.strip()} via UPI"

  if kind == 'UPI IN'.lower():
    return f"You have received {amount.strip()} from {VPA.strip()} via UPI"

  if kind == 'CHRG'.lower():
    return f"You have been charged {amount.strip()} to {VPA.strip()}"

  # Unrecognised type: amount and VPA are deliberately left untouched here.
  return 'Null'

In [17]:
def pred(narration) -> str:
  """Predict the spend category for one raw transaction narration.

  The narration is parsed with extraction(), turned into a sentence with
  generate_summary() (which is also printed), and that sentence is scored by
  the global SetFit ``model``. The highest-scoring class index is translated
  to its category name through the global ``id2label`` mapping.

  Args:
    narration: Raw narration string in the format extraction() expects.

  Returns:
    The predicted category name.
  """
  transaction_type,vpa,amount = extraction(narration)
  summary = generate_summary(transaction_type,vpa,amount)
  print(f'{summary}')

  # Rank classes by probability instead of taking argmax over model(...)'s
  # thresholded 0/1 indicator row: when no class fires, that row is all zeros
  # and argmax would silently answer with class 0.
  # assumes predict_proba returns one row of per-class scores per input for
  # the one-vs-rest head used in this notebook — TODO confirm
  probs = model.predict_proba([summary], as_numpy=True)
  return id2label[int(np.argmax(probs[0]))]



In [20]:
# Example: classify a sample outgoing-UPI narration.
narration = 'UPIOUT/509042621775/wellnessquotien833.rzp@i/7298    260.00'
pred(narration)

You have sent 260.00 to wellnessquotien833.rzp@i via UPI


'Sports & Fitness'

In [25]:
# Example: a narration whose last field holds only padding before the amount.
narration = 'UPIOUT/409826621840/zomatoonlineorder.rzp@ic/                   75.00'
pred(narration)

You have sent 75.00 to zomatoonlineorder.rzp@ic via UPI


'Food'

In [26]:
# Example: a narration with an upper-case VPA and a short text token before the amount.
narration = 'UPIOUT/411144932833/BURGERKINGINDIA@ybl/Pa                  178.92'
pred(narration)

You have sent 178.92 to BURGERKINGINDIA@ybl via UPI


'Food'