# Real-time fraud detection with vector search for MemoryDB

## 1. Components
![Packages](images/AWS-OnAir_01-Architecture.png)

## 2. Install packages
![Packages](images/AWS-OnAir_02-Packages.jpeg)

In [2]:
# Install/upgrade pip and other packages in the current Jupyter kernel
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install -U valkey # Note the Valkey library



In [3]:
import pandas as pd
import numpy as np
import uuid
import datetime
import os
import time

from valkey.commands.search.field import VectorField, TextField
from valkey.commands.search.field import NumericField, TagField
from valkey.commands.search.query import Query
from valkey.commands.search.result import Result
from valkey.commands.search.indexDefinition import IndexDefinition, IndexType
from valkey.cluster import ValkeyCluster as MemoryDB

## 3. Connect to MemoryDB
![Packages](images/AWS-OnAir_03-Connection.jpeg)

In [12]:
# Connection settings for the MemoryDB cluster, exported as environment variables.
# The next cell reads them back through os.environ; replace the host with your own endpoint.
# (Keep comments on their own lines: text after the value would become part of the value.)
%env MEMORYDB_HOST=clustercfg.rag-vss.73c0e5.memorydb.us-east-1.amazonaws.com
%env MEMORYDB_PORT=6379

env: MEMORYDB_HOST=clustercfg.rag-vss.73c0e5.memorydb.us-east-1.amazonaws.com
env: MEMORYDB_PORT=6379


In [13]:
# Read the connection settings from the environment, falling back to a local default.
memorydb_host = os.environ.get("MEMORYDB_HOST", "localhost")
# Environment values are always strings while the fallback is an int; convert so the
# port has a single type regardless of where it came from.
memorydb_port = int(os.environ.get("MEMORYDB_PORT", 6379))

# decode_responses=False keeps replies as raw bytes (the stored vectors are binary blobs).
# NOTE(review): ssl_cert_reqs="none" appears to disable TLS certificate verification --
# acceptable for a demo, but confirm and tighten before using this outside a sandbox.
mdb = MemoryDB(host=memorydb_host, port=memorydb_port, ssl=True, decode_responses=False, ssl_cert_reqs="none")

print("Ping status of MemoryDB = " + str(mdb.ping()))

Ping status of MemoryDB = True


## 4. [Credit Card Fraud Detection Source](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)

This dataset presents transactions that occurred over two days, where we have __492__ _frauds_ out of __284,807__ _transactions_.  
The dataset is highly unbalanced: the positive class (_frauds_, `Class` = 1) accounts for __0.172%__ of all _transactions_.

It contains only numerical input variables:
    
- Feature __'Time'__ contains the seconds elapsed between each transaction and the first transaction in the dataset.
- Features __V1__, __V2__, … __V28__ are the 28 dimensions of vectorized (_embeddings created_) data representing transaction details such as time, location, and so on.
- Feature __'Amount'__ is the transaction amount; this feature can be used, for example, for example-dependent cost-sensitive learning.
- Feature __'Class'__ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

![Packages](images/AWS-OnAir_04-NeedleHaystack.jpeg)

In [5]:
# Load the credit-card transactions CSV into a DataFrame
df = pd.read_csv("data/creditcard.csv")

# Report the dataset dimensions
print(f"Number of rows in dataset: {len(df):,} Number of columns: {len(df.columns):,}\n")

# Last expression of the cell: renders a preview of the first rows
df.head()

Number of rows in dataset: 284,807 Number of columns: 31



Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
# Validate data
# Ensure the specified columns exist in the DataFrame

# The embedding features are named V1 .. V28
embedding_columns = ['V' + str(i) for i in range(1, 29)]

# Fail fast if any of the expected embedding columns is absent from the DataFrame
missing_columns = [name for name in embedding_columns if name not in df.columns]
if missing_columns:
    raise ValueError(f"The following embedding columns are missing from the DataFrame: {missing_columns}")

# Collapse the embedding columns into one list-valued 'Vector' column (one list per row)
df['Vector'] = df[embedding_columns].to_numpy().tolist()

print(f"Number of columns in dataset: {df.shape[1]:,}\n")
print(f"{df['Vector'].head().to_string()}\n")

# Keep only the fraudulent transactions (Class == 1) and renumber them from 0
df_filtered = df[df['Class'] == 1]
fraud_only_df = df_filtered.reset_index(drop=True)
print(f"Number of rows in fraud only dataset: {fraud_only_df.shape[0]:,}\n")

Number of columns in dataset: 32

0    [-1.359807134, -0.072781173, 2.536346738, 1.37...
1    [1.191857111, 0.266150712, 0.166480113, 0.4481...
2    [-1.358354062, -1.340163075, 1.773209343, 0.37...
3    [-0.966271712, -0.185226008, 1.79299334, -0.86...
4    [-1.158233093, 0.877736755, 1.548717847, 0.403...

Number of rows in fraud only dataset: 492



## 5. Create index in MemoryDB

![Packages](images/AWS-OnAir_05-Index.jpeg)

In [9]:
def generate_key(prefix=""):
    """Return a new unique key: ``prefix`` followed by a random UUID4 string."""
    unique_part = str(uuid.uuid4())
    return prefix + unique_part

def create_hnsw_index(mdb, index_name, vector_field_name, initial_size, 
                      vector_dimensions=len(embedding_columns), distance_metric='L2', M_EDGES=16, EF_CONSTRUCT=512, key_prefix=''):
    """Create an HNSW vector index in MemoryDB, tolerating an index that already exists.

    The index holds one FLOAT32 vector field plus the numeric fields ``amount`` and
    ``class``, and covers keys starting with ``key_prefix``.

    Args:
        mdb: Connected MemoryDB client.
        index_name: Name of the index to create.
        vector_field_name: Name of the hash field that stores the vector bytes.
        initial_size: Value passed as INITIAL_CAP (expected number of vectors).
        vector_dimensions: Number of dimensions of each vector (DIM).
        distance_metric: L2 -> Euclidean distance | IP -> dot product | COSINE -> angle between vectors.
        M_EDGES: HNSW ``M``. A larger value adds edges, giving a more connected graph that
            helps recall but consumes more memory.
        EF_CONSTRUCT: HNSW ``EF_CONSTRUCTION``. A larger value keeps a larger dynamic
            candidate list during construction: a more thorough build, but a slower one.
        key_prefix: Key prefix the index applies to.

    Raises:
        Exception: Any failure other than "index already exists" is re-raised, so that
            connection or argument errors are not mistaken for a pre-existing index.
    """
    # EF_RUNTIME (vectors examined per query: better recall, slower queries) is not set
    # here and is left at its default value of 10.

    # Create a new index
    try:
        mdb.ft(index_name).create_index([
            VectorField(vector_field_name, 
                        "HNSW", {
                            "TYPE": "FLOAT32",
                            "DIM": vector_dimensions,
                            "DISTANCE_METRIC": distance_metric,
                            "INITIAL_CAP": initial_size,
                            "M": M_EDGES,
                            "EF_CONSTRUCTION": EF_CONSTRUCT
                        }
                ),
            NumericField("amount"),
            NumericField("class")
            ],
            definition=IndexDefinition(prefix=[key_prefix])
        )
    except Exception as e:
        # Only an "already exists" reply means the index was created on an earlier run,
        # which lets this cell be re-run safely. Anything else is a real failure.
        if "already exists" not in str(e).lower():
            raise
        print(f"Index {index_name} created previously: {str(e)}")
    else:
        print(f"Index {index_name} created successfully.")

## Behind the scenes
```
FT.CREATE "ccfd_hnsw_index"
ON HASH
PREFIX "1" "tsx:"
SCHEMA "VEC" 
VECTOR "HNSW" "12" "TYPE" "FLOAT32" "DIM" "28" "DISTANCE_METRIC" "Cosine" 
INITIAL_CAP "274807" "M" "16" 
EF_CONSTRUCTION "512" "amount" "NUMERIC" "class" "NUMERIC"
```

![Packages](images/AWS-OnAir_08-KNNdistanceMetrics.png)

In [10]:
# Index configuration
KEY_PREFIX = "tsx:"
index_name = "ccfd_hnsw_index"
vector_field_name = "VEC"
vector_dimensions = len(embedding_columns)       # one dimension per embedding (V*) column
number_of_vectors = len(fraud_only_df) - 10      # hold back the last 10 fraudulent rows

print(f"Creating Vector Index {index_name} on Field {vector_field_name} Expecting {number_of_vectors:,} vectors")

# Start from a clean MemoryDB (this removes existing data)
mdb.flushall()

# Create an empty index in MemoryDB
create_hnsw_index(
    mdb,
    index_name,
    vector_field_name,
    number_of_vectors,
    vector_dimensions=vector_dimensions,
    distance_metric='Cosine',
    M_EDGES=16,
    EF_CONSTRUCT=512,
    key_prefix=KEY_PREFIX,
)

# Query the index metadata: once for the full dump, once for the vector count
search_index = mdb.ft(index_name)
print(f"\nVector Information: {search_index.info()}")
print(f"\nNumber of indexed vectors: {search_index.info()['num_indexed_vectors']}")

Creating Vector Index ccfd_hnsw_index on Field VEC Expecting 482 vectors
Index ccfd_hnsw_index created successfully.

Vector Information: {'index_name': 'ccfd_hnsw_index', 'creation_timestamp': 1731507851151127, 'key_type': 'HASH', 'key_prefixes': [b'tsx:'], 'fields': [[b'identifier', b'VEC', b'field_name', b'VEC', b'type', b'VECTOR', b'option', b'', b'vector_params', [b'algorithm', b'HNSW', b'data_type', b'FLOAT32', b'dimension', 28, b'distance_metric', b'COSINE', b'initial_capacity', 482, b'current_capacity', 482, b'maximum_edges', 16, b'ef_construction', 512, b'ef_runtime', 10, b'epsilon', b'0.01']], [b'identifier', b'amount', b'field_name', b'amount', b'type', b'NUMERIC', b'option', b''], [b'identifier', b'class', b'field_name', b'class', b'type', b'NUMERIC', b'option', b'']], 'space_usage': 629170, 'fulltext_space_usage': 0, 'vector_space_usage': 629170, 'num_docs': 0, 'num_indexed_vectors': 0, 'current_lag': 0, 'index_status': 'QUEUED', 'index_degradation_percentage': 0}

Number 

## 6. Load vector embeddings into MemoryDB

![Index](images/AWS-OnAir_06-Load.jpeg)

In [11]:
%%time
# Load data into MemoryDB
BATCH_SIZE = 100

# Import tqdm for jupyter notebook
from tqdm.notebook import tqdm
# Enable tqdm for Pandas
tqdm.pandas()

pipe = mdb.pipeline()

for index, row in tqdm(fraud_only_df.loc[:number_of_vectors-1].iterrows(), total=number_of_vectors-1):
    key = generate_key(prefix=KEY_PREFIX)
    vector = np.array(row['Vector'], dtype=np.float32).tobytes()
    
    pipe.hset(key, mapping={
        vector_field_name: vector,
        'amount': row['Amount'],
        'class': row['Class']
        })
    
    if index % BATCH_SIZE == 0:
        pipe.execute()
        pipe = mdb.pipeline()
    
pipe.execute()

print(f"\nData indexed successfully. Keys created: {index}\n")
print(f"Indexed info: {mdb.ft(index_name).info()}")
time.sleep(1)
print(f"\nNumber of indexed vectors: {mdb.ft(index_name).info()['num_indexed_vectors']}\n")

  0%|          | 0/481 [00:00<?, ?it/s]


Data indexed successfully. Keys created: 481

Indexed info: {'index_name': 'ccfd_hnsw_index', 'creation_timestamp': 1731507851151127, 'key_type': 'HASH', 'key_prefixes': [b'tsx:'], 'fields': [[b'identifier', b'VEC', b'field_name', b'VEC', b'type', b'VECTOR', b'option', b'', b'vector_params', [b'algorithm', b'HNSW', b'data_type', b'FLOAT32', b'dimension', 28, b'distance_metric', b'COSINE', b'initial_capacity', 482, b'current_capacity', 482, b'maximum_edges', 16, b'ef_construction', 512, b'ef_runtime', 10, b'epsilon', b'0.01']], [b'identifier', b'amount', b'field_name', b'amount', b'type', b'NUMERIC', b'option', b''], [b'identifier', b'class', b'field_name', b'class', b'type', b'NUMERIC', b'option', b'']], 'space_usage': 817805, 'fulltext_space_usage': 45133, 'vector_space_usage': 772672, 'num_docs': 401, 'num_indexed_vectors': 463, 'current_lag': 0, 'index_status': 'AVAILABLE', 'index_degradation_percentage': 0}

Number of indexed vectors: 482

CPU times: user 114 ms, sys: 4.57 ms, tot

## 7. Find fraudulent transactions

![Find-Tsx](images/AWS-OnAir_07-Find.jpeg)

In [12]:
def similarity_search(mdb, index_name, query_vector, top_n=5, vector_field_name="VEC"):
    """Run a KNN vector search against a MemoryDB index and return the matching documents.

    Args:
        mdb: Connected MemoryDB client.
        index_name: Name of the search index to query.
        query_vector: Sequence of floats; it is serialised as float32 bytes, so it must
            have the same number of dimensions as the indexed vectors.
        top_n: Number of nearest neighbours to return.
        vector_field_name: Name of the indexed vector field to search. Defaults to
            "VEC", the field name used when the index was created in this notebook.

    Returns:
        The ``docs`` list of the search result, sorted by ``score``. Each document
        carries the ``score``, ``amount`` and ``class`` fields requested below.
    """
    # Convert the query vector to bytes
    query_vector_bytes = np.array(query_vector, dtype=np.float32).tobytes()

    # Create the query: KNN over the vector field, exposing the KNN result as "score"
    query = Query(f"*=>[KNN {top_n} @{vector_field_name} $query_vec AS score ]") \
        .sort_by("score") \
        .return_fields("score", "amount", "class") \
        .paging(0, top_n) \
        .dialect(2)

    # NOTE(review): the query string only references $query_vec, so the EF_RUNTIME
    # parameter below does not look like it influences the search -- confirm whether
    # it should be written into the KNN clause instead.
    params = {
        "query_vec": query_vector_bytes,
        "EF_RUNTIME": 64
    }

    # Process the query
    result = mdb.ft(index_name).search(query, query_params=params).docs
    return result

In [16]:

# Amounts of the last 10 fraudulent transactions, shown with their original row labels
result = df.loc[df['Class'] == 1, 'Amount'].tail(10)
print(result.to_string())


274382      0.00
274475     39.90
275992    634.30
276071     19.95
276864    349.08
279863    390.00
280143      0.76
280149     77.89
281144    245.00
281674     42.53


In [19]:
%%time

query_vector = df.iloc[281143]['Vector']

results = similarity_search(mdb, index_name, query_vector, top_n=5)

# print(results)

for doc in results:
        score = round(1 - float(doc.score), 2)
        id = doc.id
        print(f"Vector {id} has a score {score}")
        # amount = doc.amount
        # print(f"Vector {id} has a score {score} for the amount {amount}")
print("\n")

Vector tsx:415558da-7c1f-4bff-91cd-4df0df1b1223 has a score 0.44
Vector tsx:a24e3397-2e34-4bbc-b4e7-bb0b92fd9da1 has a score 0.4
Vector tsx:f1f45421-18c3-4bbc-85fa-de173237fc6e has a score 0.39
Vector tsx:22602a0a-571d-4323-bd5b-0f5edf7e6108 has a score 0.39
Vector tsx:270f6b7c-89f5-4475-94c6-2a0bffd73937 has a score 0.32


CPU times: user 2.07 ms, sys: 0 ns, total: 2.07 ms
Wall time: 2.74 ms


### Query behind the scenes

```
FT.SEARCH "ccfd_hnsw_index" "*=>[KNN 5 @VEC $query_vec AS score]" 
RETURN "2" "amount" "class" 
SORTBY "score" "ASC" "DIALECT" "2" "LIMIT" "0" "5" 
"params" "2" "query_vec" "\x1e!2\xbf\x0b\xef\x14?\x184\x18@\xbd\xd5\x81=?\x82\xa8>\xa5T\xe6\xbe\...\xbf"
```