In [None]:
%env MEMORYDB_HOST=clustercfg.memorydb.XXXXXX.memorydb.us-east-1.amazonaws.com
%env MEMORYDB_PORT=6379

# Real-time fraud detection with vector search for MemoryDB

## 1. Architecture
![Architecture](img/AWS-OnAir_01-Architecture.png)

## 2. Install packages
![Packages](img/AWS-OnAir_02-Packages.jpeg)

In [2]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install redis



In [3]:
import pandas as pd
import numpy as np
import uuid
#from sentence_transformers import SentenceTransformer
import datetime
import os
import redis
import time
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.field import TagField
from redis.commands.search.query import Query
from redis.commands.search.result import Result
from redis.cluster import RedisCluster as MemoryDB
from redis.commands.search.field import VectorField, TextField, NumericField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType


## 3. Connect to MemoryDB

![Connection](img/AWS-OnAir_03-Connection.jpeg)

In [4]:
%%time
memorydb_host = os.environ.get("MEMORYDB_HOST", "localhost")
memorydb_port = os.environ.get("MEMORYDB_PORT", 6379)
# print(f"MemoryDB Url = {memorydb_host}:{memorydb_port}")
rc = MemoryDB(host=memorydb_host, port=memorydb_port, ssl=True, decode_responses=False, ssl_cert_reqs="none")
rc.ping()
# rc.flushall()

CPU times: user 75.9 ms, sys: 0 ns, total: 75.9 ms
Wall time: 92.8 ms


True

## 4. [Credit Card Fraud Detection Source](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud)

This dataset presents transactions that occurred in two days, where we have __492__ _frauds_ out of __284,807__ _transactions_.  
The dataset is highly unbalanced, the positive class (_frauds_) account for __0.172%__ of all _transactions_.

It contains only numerical input variables:
- Features __V1__, __V2__, … __V28__, 'Time' 'Amount' and 'Class'.
- Feature __'Time'__ contains the seconds elapsed between each transaction and the first transaction in the dataset.
- Feature __'Amount'__ is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning.
- Feature __'Class'__ is the response variable and it takes value 1 in case of fraud and 0 otherwise.

![CreditCardFraud](img/AWS-OnAir_04-NeedleHaystack.jpeg)

In [5]:
df = pd.read_csv("creditcard.csv")
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
num_rows = df.shape[0]
print(f"Number of rows in dataset {num_rows:,}")

Number of rows in dataset 284,807


In [7]:
slice = num_rows - 10_000
newDF = df.iloc[:slice]
# newDF=df
#df_dropped = newDF.drop(columns=['Time', 'Amount', 'Class'])
num_rows = newDF.shape[0]
print(f"Number of rows in dataset {num_rows:,}")

Number of rows in dataset 274,807


In [8]:
%%time
embedding_columns = [f'V{i}' for i in range(1, 29)]
# Ensure the specified columns exist in the DataFrame
missing_columns = [col for col in embedding_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"The following embedding columns are missing from the DataFrame: {missing_columns}")
# Convert the specified columns into a list of lists (each row is a list)
vectors = newDF[embedding_columns].values.tolist()

CPU times: user 545 ms, sys: 148 ms, total: 693 ms
Wall time: 695 ms


In [13]:
newDF['vector'] = vectors
newDF.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newDF['vector'] = vectors


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,vector
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0,"[-1.3598071336738, -0.0727811733098497, 2.5363..."
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0,"[1.19185711131486, 0.26615071205963, 0.1664801..."
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0,"[-1.35835406159823, -1.34016307473609, 1.77320..."
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0,"[-0.966271711572087, -0.185226008082898, 1.792..."
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0,"[-1.15823309349523, 0.877736754848451, 1.54871..."


In [14]:
subset_df = newDF[['vector', 'Amount', 'Class']]
subset_df.head()
num_rows = subset_df.shape[0]
print(f"Number of rows in dataset {num_rows:,}")

Number of rows in dataset 274,807


## 5. Create index in MemoryDB

![Create-Index](img/AWS-OnAir_05-Index.jpeg)

In [15]:
def generate_key(prefix = ""):
    return prefix + str(uuid.uuid4())

In [16]:
def create_hnsw_index(rc, index_name, vector_field_name, number_of_vectors, vector_dimensions=28, distance_metric='L2', M=16, EF=512, key_prefix=''):
    # Create the index
    try:
        rc.ft(index_name).create_index([
            VectorField(vector_field_name, "HNSW", {
                "TYPE": "FLOAT32",
                "DIM": vector_dimensions,
                "DISTANCE_METRIC": distance_metric,
                "INITIAL_CAP": number_of_vectors,
                "M": M,
                "EF_CONSTRUCTION": EF
            }),
            NumericField("amount"),
            NumericField("class")
        ], definition=IndexDefinition(prefix=[key_prefix]))
        print(f"Index {index_name} created successfully.")
    except ResponseError as e:
        print(f"Index {index_name} created previously: {str(e)}")

## Behind the scenes

![KNNdistanceMetrics](img/AWS-OnAir_08-KNNdistanceMetrics.png)

```
FT.CREATE "ccfd_hnsw_index" 
PREFIX "1" "tsx:" 
SCORE "1.0" 
SCHEMA "vector" 
VECTOR "HNSW" "12" "TYPE" "FLOAT32" "DIM" "28" "DISTANCE_METRIC" "Cosine" 
INITIAL_CAP "274807" "M" "16" 
EF_CONSTRUCTION "512" "amount" "NUMERIC" "class" "NUMERIC"
```

In [17]:
KEY_PREFIX = "tsx:"
index_name = "ccfd_hnsw_index"
vector_field_name = "vector"
number_of_vectors = len(subset_df)
print(f"Creating Index {index_name} on Field {vector_field_name} expecting {number_of_vectors:,} vectors")

Creating Index ccfd_hnsw_index on Field vector expecting 274,807 vectors


In [18]:
%%time
# Create index in MemoryDB
create_hnsw_index(rc, index_name, vector_field_name, number_of_vectors, 
                  vector_dimensions=28, distance_metric='Cosine', M=16, EF=512, key_prefix=KEY_PREFIX)

Index ccfd_hnsw_index created successfully.
CPU times: user 0 ns, sys: 1.7 ms, total: 1.7 ms
Wall time: 9.66 ms


## 

6. Load vector embeddings into MemoryDB

![Index](img/AWS-OnAir_06-Load.jpeg)

In [19]:
%%time
# Load data into MemoryDB
BATCH_SIZE = 100
pipe = rc.pipeline()
for index, row in subset_df.iterrows():
    key = generate_key(prefix=KEY_PREFIX)
    vector = np.array(row['vector'], dtype=np.float32).tobytes()
    pipe.hset(key, mapping={
        'vector': vector,
        'amount': row['Amount'],
        'class': row['Class']
    })
    if index % BATCH_SIZE == 0 or index == number_of_vectors - 1:
        pipe.execute()
        pipe = rc.pipeline()
print("Data indexed successfully.")

Data indexed successfully.
CPU times: user 23.4 s, sys: 498 ms, total: 23.9 s
Wall time: 1min 15s


In [20]:
%%time
# Add a python script to find a random key that stats with the prefix and fetch the value and show it
count = 0
while True:
    count += 1
    keyname = rc.randomkey()
    keyname = keyname.decode('utf-8')
    print(str(keyname))
    if keyname.startswith(KEY_PREFIX) == True:
        print(rc.hgetall(keyname))
        break
    elif count > 10:
        break


tsx:a9540106-5206-48fa-8615-4f6d67b8f390
{b'class': b'0', b'vector': b'\xcc^\xfd?\xe1\x10\xd8\xbe\xca7\xe3\xbe\x03\xbc\xc3>2]\xe9\xbe8`\x14\xbd\xc3\x80\'\xbfJ\xaeW=\x97R\xb7?\xefHA\xbe\x88c\x84\xbfoX\x08?\x91\xf5\xb9>\xd9\xd7\x82\xbeAl\r?I\xd6\x8e=|\x12\x06\xbf\x89\xd9#>k\xc2}\xbeV\xe41\xbeliB>\x07\xfaF?\xf4\x17\x10>|\x00"?\xca\xcc\x8f\xbdI\x06s\xbe\x0c1&=\x11}\xf6\xbc', b'amount': b'12.99'}
CPU times: user 1.65 ms, sys: 57 µs, total: 1.71 ms
Wall time: 5.47 ms


## 7. Find fraudulent transactions

![Find-Tsx](img/AWS-OnAir_07-Find.jpeg)

In [27]:
df_class_1 = subset_df.query('Class == 1').tail(10)
df_class_1


Unnamed: 0,vector,Amount,Class
262560,"[0.567539266873172, 3.30938527728796, -6.63126...",4.69,1
262826,"[-0.417339782403896, 4.70005527392636, -7.5217...",0.77,1
263080,"[2.13238602134104, 0.705607819703042, -3.53075...",1.0,1
263274,"[-0.644277679025173, 5.00235242592827, -8.2527...",0.77,1
263324,"[-0.848290216565114, 2.71988211593934, -6.1990...",127.14,1
263877,"[-3.38760102859385, 3.97788074139497, -6.97858...",0.38,1
268375,"[-5.23880773573245, 0.623013013517624, -5.7845...",39.98,1
272521,"[-7.50392623748137, -0.360628009949399, -3.830...",12.31,1
274382,"[-5.76687873469586, -8.40215367768915, 0.05654...",0.0,1
274475,"[-0.956390354534089, 2.36159360978361, -3.1711...",39.9,1


In [28]:
selected_row = subset_df.iloc[263324]
print(selected_row)
query_vector = selected_row['vector']

vector    [-0.848290216565114, 2.71988211593934, -6.1990...
Amount                                               127.14
Class                                                     1
Name: 263324, dtype: object


In [29]:
def similarity_search(redis_client, index_name, query_vector, top_n=5):
    # Convert the query vector to bytes
    query_vector_bytes = np.array(query_vector, dtype=np.float32).tobytes()
    # Create the query
    query = Query(f"*=>[KNN {top_n} @vector $query_vec AS score]") \
        .sort_by("score") \
        .return_fields( "amount", "class") \
        .paging(0, top_n) \
        .dialect(2)
    params = {
        "query_vec": query_vector_bytes
    }
    # Process the query
    result = redis_client.ft(index_name).search(query, query_params=params)
    return result

### Query behind the scenes

```
FT.SEARCH "ccfd_hnsw_index" "*=>[KNN 5 @vector $query_vec AS score]" 
RETURN "2" "amount" "class" 
SORTBY "score" "ASC" "DIALECT" "2" "LIMIT" "0" "5" 
"params" "2" "query_vec" "\x1e!2\xbf\x0b\xef\x14?\x184\x18@\xbd\xd5\x81=?\x82\xa8>\xa5T\xe6\xbe\...\xbf"
```

In [30]:
%%time
results = similarity_search(rc, index_name, query_vector, top_n=5)

CPU times: user 1.75 ms, sys: 0 ns, total: 1.75 ms
Wall time: 1.94 ms


In [31]:
%%time

for doc in results.docs:
    doc_id = doc.id
  #  score = doc.score
    amount = doc.amount
    class_value = doc['class']  # Accessing with square brackets
    print(f"ID: {doc_id}, Amount: {amount}, Class: {class_value}")

ID: tsx:ea16bcc3-fc61-4d36-8805-3376cecba7e8, Amount: 127.14, Class: 1
ID: tsx:176246c9-039c-4b3e-9901-e4b691bfea1d, Amount: 14.46, Class: 1
ID: tsx:18b968d9-06ad-4c6f-889e-2543d8cb6b85, Amount: 310.42, Class: 1
ID: tsx:4399be97-3e5e-436b-8697-85d5aa34ce14, Amount: 316.06, Class: 1
ID: tsx:8ac9b05e-6aab-44db-b4de-e7d38181e97f, Amount: 316.06, Class: 1
CPU times: user 319 µs, sys: 11 µs, total: 330 µs
Wall time: 293 µs
