### db struct

```
# found in brand_index.json field - brands
:Brand {
    code: "code",
    name: "name",
    elabDescription: "elabDescription" if any,
    brandUrl: "brandUrl",
    products: ["code" for each product in full_product_list.json],
}

# found in all .json under promotions
:Promotion {
    code: "code",
    title: "title",
    description: "description",
    longDescription: "longDescription",
    formattedRewardValue: "formattedRewardValue",
    startDate: "startDate",
    endDate: "endDate",
    promotionUrl: "promotionUrl",
    rewardUrl: "rewardUrl",
    tagCode: "tag.elabTagCode" if any,
    tagLabel: "tag.label" if any,
}

# found in BP_XXXXXX.json field - categories
:Category {
    code: "code",
    name: "name",
    url: "url"
}

# found in full_product_list.json field - products & BP_XXXXXX.json
:Product {
    code: "code",
    name: "name",
    defaultVariantCode: "defaultVariantCode",
    description: "description",
    elabCountryOfOrigin: "elabCountryOfOrigin",
    url: "url",
    stockLevelStatus: "stock.stockLevelStatus in full_product_list.json",
    categories: ["code" for each category in field categories in BP_XXXXXX.json],
    promotions: ["code" for each promotion found in field elabPromotions in $defaultVariantCode.json under promotions]
}
```

In [16]:
import os
import json
from tqdm import tqdm
from neo4j import GraphDatabase
from dotenv import load_dotenv

load_dotenv()
NEO4J_URI = os.getenv("NEO4J_URI", "")
NEO4J_USER = os.getenv("NEO4J_USER", "")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "")

class Neo4jHelper:

    def __init__(self, uri, user, password):
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        self._driver.close()

    def create_node_if_not_exists(self, label, code, properties):
        query = f"""
        MERGE (n:{label} {{code: $code}})
        SET
        {', '.join([f'n.{key} = ${key}' for key in properties.keys()])}
        RETURN n
        """
        parameters = {"code": code, **properties}
        
        with self._driver.session() as session:
            result = session.run(query, parameters)
            return result.single()  # Return the first result (the created or matched node)
    
    def run_query(self, query):
        with self._driver.session() as session:
            result = session.run(query)
            return result
    
    def execute_query(self, query, **kwargs):
        records, summary, keys = self._driver.execute_query(
            query,
            **kwargs
        )
        return [record.data() for record in records]

# Connect to Neo4j
neo4j_helper = Neo4jHelper(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

In [14]:
# insert brands
label = "Brand"
"""
:Brand {
    code: "code",
    name: "name",
    elabDescription: "elabDescription" if any,
    brandUrl: "brandUrl",
    products: ["code" for each product in full_product_list.json],
}
"""

with open("watsons_online_store/brand_index.json", "r") as j:
    brand_index = json.load(j)

brand_list = brand_index["brands"]
for brand in tqdm(brand_list, desc="brand injection"):
    code = brand.get("code", "")
    brandUrl = brand.get("brandUrl", "")
    name = brand.get("name", "")
    elabDescription = brand.get("elabDescription", "")
    with open(f"watsons_online_store/brands/{name}_{code}/catalog/full_product_list.json") as j:
        full_product_list = json.load(j)
        full_product_list = full_product_list.get("products", [])
    products = [p["code"] for p in full_product_list]
    properties = {
        "brandUrl": brandUrl,
        "name": name,
        "elabDescription": elabDescription,
        "products": products
    }
    node = neo4j_helper.create_node_if_not_exists(label, code, properties)
    # print("Created/Matched Node:", node)

brand injection: 100%|██████████| 893/893 [01:23<00:00, 10.72it/s]


In [19]:
# insert promotions
label = "Promotion"
"""
:Promotion {
    code: "code",
    title: "title",
    description: "description",
    longDescription: "longDescription",
    formattedRewardValue: "formattedRewardValue",
    startDate: "startDate",
    endDate: "endDate",
    promotionUrl: "promotionUrl",
    rewardUrl: "rewardUrl",
    tagCode: "tag.elabTagCode" if any,
    tagLabel: "tag.label" if any,
}
"""

import os

def merge_elab_promotions(folder_path: str = "watsons_online_store/promotions"):
    # Initialize an empty list to store the merged "elabPromotions"
    merged_elab_promotions = []

    # Iterate over all files in the folder
    for filename in os.listdir(folder_path):
        # Construct the full path to the file
        file_path = os.path.join(folder_path, filename)

        # Check if the file is a JSON file
        if filename.endswith(".json"):
            try:
                # Open and load the JSON file
                with open(file_path, 'r') as file:
                    data = json.load(file)

                    # If "elabPromotions" exists and is a list, extend the merged list
                    if "elabPromotions" in data and isinstance(data["elabPromotions"], list):
                        merged_elab_promotions.extend(data["elabPromotions"])
            except Exception as e:
                print(f"Error reading {file_path}: {e}")

    return merged_elab_promotions

all_promotions = merge_elab_promotions()
all_promotions_map = {p["code"]: p for p in all_promotions}

for code, promotion in tqdm(all_promotions_map.items(), desc="promotion injection"):
    properties = {
        "description" : promotion.get("description"),
        "title" : promotion.get("title"),
        "endDate" : promotion.get("endDate"),
        "formattedRewardValue" : promotion.get("formattedRewardValue"),
        "longDescription" : promotion.get("longDescription"),
        "promotionUrl" : promotion.get("promotionUrl"),
        "rewardUrl" : promotion.get("rewardUrl"),
        "startDate" : promotion.get("startDate"),
        "tagCode" : promotion.get("tag",{}).get("elabTagCode", ""),
        "tagLabel" : promotion.get("tag",{}).get("label", ""),
    }
    node = neo4j_helper.create_node_if_not_exists(label, code, properties)
    # print("Created/Matched Node:", node)

promotion injection: 100%|██████████| 710/710 [01:03<00:00, 11.10it/s]


In [None]:
# insert category & product
"""
:Category {
    code: "code",
    name: "name",
    url: "url"
}

:Product {
    code: "code",
    name: "name",
    defaultVariantCode: "defaultVariantCode",
    description: "description",
    elabCountryOfOrigin: "elabCountryOfOrigin",
    url: "url",
    stockLevelStatus: "stock.stockLevelStatus",
    categories: ["code" for each category],
    promotions: ["code" for each promotion]
}
"""

full_categories = {}
category_label = "Category"
product_label = "Product"
MAX_RETRIES = 5
for root, dirs, files in os.walk("watsons_online_store/brands"):
    # Check if the current directory is a 'catalog' folder
    if os.path.basename(root) == 'catalog':
        # Look for 'full_product_list.json' in the current 'catalog' folder
        if 'full_product_list.json' in files:
            # Extract the 'xxx' part of the path
            # root: brands/xxx/catalog
            # os.path.dirname(root): brands/xxx
            # os.path.basename(os.path.dirname(root)): xxx
            subfolder_name = os.path.basename(os.path.dirname(root))
            
            # Construct the full path to the JSON file
            json_file_path = os.path.join(root, 'full_product_list.json')
            
            with open(json_file_path, "r") as j:
                product_list_json = json.load(j)
                product_list = product_list_json.get("products", [])
                
            for product in tqdm(product_list, desc=f"product injection for {subfolder_name}"):
                product_code = product.get("code", "")
                
                with open(f"watsons_online_store/brands/{subfolder_name}/products/{product_code}.json", "r") as j:
                    product_details = json.load(j)
                    
                defaultVariantCode = product_details.get("defaultVariantCode", "")
                categories_code_list = []
                promotion_code_list = []
                categories = product_details.get("categories", [])
                for category in categories:
                    category_code = category.get("code", "")
                    categories_code_list.append(category_code)
                    category_properties = {
                        "name": category.get("name", ""),
                        "url": category.get("url", ""),
                    }
                    full_categories[category_code] = category_properties
                try:
                    with open(f"watsons_online_store/promotions/{defaultVariantCode}.json", "r") as j:
                        promotion_list = json.load(j)
                        promotion_code_list.extend([p["code"] for p in promotion_list.get("elabPromotions", [])])
                except Exception as e:
                    print(e)
                
                
                product_properties = {
                    "name": product_details.get("name", ""),
                    "defaultVariantCode": defaultVariantCode,
                    "description": product_details.get("description", ""),
                    "elabCountryOfOrigin": product_details.get("elabCountryOfOrigin", ""),
                    "url": product_details.get("url", ""),
                    "stockLevelStatus": product.get("stock", {}).get("stockLevelStatus", ""),
                    "categories": categories_code_list,
                    "promotions": promotion_code_list
                }
                try_count = 0
                while try_count < MAX_RETRIES:
                    try:
                        node = neo4j_helper.create_node_if_not_exists(product_label, product_code, product_properties)
                        break
                    except Exception as e:
                        try_count += 1
                        print(e)

In [25]:
# categories injection
for code, properties in tqdm(full_categories.items(), desc="category injection"):
    node = neo4j_helper.create_node_if_not_exists(category_label, code, properties)

category injection: 100%|██████████| 7971/7971 [12:30<00:00, 10.62it/s]


In [27]:
# relations creations

# Product PART_OF Category
query = """
MATCH (p:Product)
MATCH (c:Category)
WHERE c.code IN p.categories
MERGE (p)-[:PART_OF]->(c);
"""
result = neo4j_helper.run_query(query)
print(result)
# Product PROMOTED_BY Promotion
query = """
MATCH (p:Product)
MATCH (pr:Promotion)
WHERE pr.code IN p.promotions
MERGE (p)-[:PROMOTED_BY]->(pr);
"""
result = neo4j_helper.run_query(query)
print(result)
# Brand SELLS Product
query = """
MATCH (b:Brand)
MATCH (p:Product)
WHERE p.code IN b.products
MERGE (b)-[:SELLS]->(p);
"""
result = neo4j_helper.run_query(query)
print(result)


<neo4j._sync.work.result.Result object at 0x1174c0590>
<neo4j._sync.work.result.Result object at 0x117527090>
<neo4j._sync.work.result.Result object at 0x116ce1dd0>


In [67]:
import os
import boto3
import json
from dotenv import load_dotenv

load_dotenv()
model_name = "us.meta.llama3-2-3b-instruct-v1:0"

client = boto3.client(
    "bedrock-runtime",
    aws_access_key_id=os.getenv("AWS_BEDROCK_SA_AK",""),
    aws_secret_access_key=os.getenv("AWS_BEDROCK_SA_SK",""),
    region_name=os.getenv("AWS_BEDROCK_META_REGION","")
)

def generate_embedding(input_texts, type: str = "search_document"):
    """
    Generate embeddings using Amazon Bedrock.
    """
    model_id = "cohere.embed-multilingual-v3"
    try:
        # Prepare the payload
        payload = {
            "texts": input_texts,
            "input_type": type,
            "truncate": "START"
        }
        
        # Invoke the model
        response = client.invoke_model(
            modelId=model_id,  # The model ID as configured in Bedrock
            contentType="application/json",  # Content type for the payload
            accept="*/*",  # Accept type for the response
            body=json.dumps(payload)  # Serialize the payload into JSON
        )
        
        # Parse the response
        # print(response['body'].read())
        response_body = json.loads(response['body'].read())
        embedding = response_body.get("embeddings", None)
        
        if embedding is None:
            raise ValueError("No embedding returned in the response.")
        
        return embedding
    
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

embedded = generate_embedding(["Hi", "Bye"])

In [14]:
# Embed Brand

query = "MATCH (b:Brand) RETURN b.code AS code, b.name AS name, b.elabDescription AS description"
query_result = neo4j_helper.execute_query(query=query)
for node in tqdm(query_result):
    if node.get("description"):
        phrase = f"{node['name']}: {node['description']}"
    else: phrase = node['name']
    
    node["embedding"] = generate_embedding(phrase)


100%|██████████| 893/893 [04:01<00:00,  3.70it/s]


KeyError: 0

In [17]:
# Store the embeddings

for node in tqdm(query_result):
    query = "MATCH (b:Brand {code: $code}) SET b.embedding = $embedding"
    result = neo4j_helper.execute_query(
        query,
        code=node["code"],
        embedding=node["embedding"]
    )


100%|██████████| 893/893 [02:00<00:00,  7.40it/s]


In [20]:
# Embed Promotion

query = "MATCH (b:Promotion) RETURN b.code AS code, b.title AS title, b.description AS description"
query_result = neo4j_helper.execute_query(query=query)
for node in tqdm(query_result):
    if node.get("description"):
        phrase = f"{node['title']}: {node['description']}"
    else: phrase = node['title']
    
    node["embedding"] = generate_embedding(phrase)

100%|██████████| 710/710 [03:19<00:00,  3.56it/s]


In [21]:
# Store the embeddings

for node in tqdm(query_result):
    query = "MATCH (b:Promotion {code: $code}) SET b.embedding = $embedding"
    result = neo4j_helper.execute_query(
        query,
        code=node["code"],
        embedding=node["embedding"]
    )

100%|██████████| 710/710 [01:35<00:00,  7.46it/s]


In [22]:
# Embed Category

query = "MATCH (b:Category) RETURN b.code AS code, b.name AS name"
query_result = neo4j_helper.execute_query(query=query)
for node in tqdm(query_result):
    phrase = node['name']
    
    node["embedding"] = generate_embedding(phrase)

100%|██████████| 7971/7971 [45:35<00:00,  2.91it/s]  


In [23]:
# Store the embeddings

for node in tqdm(query_result):
    query = "MATCH (b:Category {code: $code}) SET b.embedding = $embedding"
    result = neo4j_helper.execute_query(
        query,
        code=node["code"],
        embedding=node["embedding"]
    )

  0%|          | 0/7971 [00:00<?, ?it/s][#C0B7]  _: <CONNECTION> error: Failed to read from defunct connection ResolvedIPv4Address(('34.124.169.171', 7687)) (ResolvedIPv4Address(('34.124.169.171', 7687))): OSError('No data')
Unable to retrieve routing information
Transaction failed and will be retried in 0.8828584676308153s (Unable to retrieve routing information)
100%|██████████| 7971/7971 [18:49<00:00,  7.06it/s]  


In [51]:
# Embed Product

query = "MATCH (b:Product) RETURN b.code AS code, b.name AS name, b.description AS description"
query_result = neo4j_helper.execute_query(query=query)
phrase_cache = []
batch_count = 0
for idx, node in enumerate(query_result):
    if node.get("description"):
        phrase = f"{node['name']}: {node['description']}"
    elif node.get("name"): phrase = node['name']
    else: phrase = ""
    phrase_cache.append(phrase[:2048])
    if (idx + 1) % 96 == 0 or (idx + 1) == len(query_result):
        embedding_batch = generate_embedding(phrase_cache)
        for eid, embedding in enumerate(embedding_batch):
            node_id = eid + (96 * batch_count)
            print(node_id)
            query_result[node_id]["embedding"] = embedding
        phrase_cache = []
        batch_count += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [54]:
print(query_result[0])
print(query_result[7526])

{'code': 'BP_463046', 'name': 'WayWay - Anti-bac Laundry Pods - Orange Blossom & Rose', 'description': 'Long-lasting Fragrance<br/>‧Anti-Bacterial 99.99%<br/>‧Effectively removes odors<br/>‧Super Concentrated, 8x Cleaning Power<br/>‧No Fluorescent Agent<br/>‧High Solubility, No Residue, Environmentally Friendly<br/>‧Softening<br/>‧Aromatic Fragrance<br/>1.Put laundry detergent pod into the bottom of the drum of the washing machine<br/>2.Put clothes into the washing machine and start the washing program. Do not choose the pre-wash function/ quick wash function.<br/>3.Keep the packaging tightly closed after opening.<br/><br/>Use 1pc laundry detergent pod can wash 6kg clothes (about 15 - 18 clothes). Recommended 1 pc per load. Use 2 pcs <br/>for large or heavily soiled clothes. Directly put pods with dry hand into wash machine without piercing it.<br/><br/>Suitable for fabric such as cotton, linen, blended fabrics and synthetic fibers., etc.<br/>', 'embedding': [0.04812622, 0.037475586, 0

In [55]:
# Store the embeddings

for node in query_result:
    if "embedding" not in node:
        print(node)

for node in tqdm(query_result):
    query = "MATCH (b:Product {code: $code}) SET b.embedding = $embedding"
    result = neo4j_helper.execute_query(
        query,
        code=node["code"],
        embedding=node["embedding"]
    )

100%|██████████| 7527/7527 [18:27<00:00,  6.79it/s]  


In [63]:
# vector index creation

q = """CREATE VECTOR INDEX categoryDescription
FOR (n:Category)
ON n.embedding
OPTIONS {
    indexConfig: {
        `vector.dimensions`: 1024,
        `vector.similarity_function`: "cosine"
    }
}"""

neo4j_helper.execute_query(
    q,
)

[]

In [68]:
# vector search

q = """MATCH (p:Product)

WITH p,
     vector.similarity.cosine(p.embedding, $embedding) AS score
WHERE score > 0.7

RETURN p.name AS name, score
ORDER BY score DESC LIMIT 10;"""

neo4j_helper.execute_query(
    q,
    embedding=generate_embedding(["stomach medecine"], type="search_query")[0]
)

[{'name': 'STOMACH ACHE POWDER', 'score': 0.8165736794471741},
 {'name': 'Gaviscon Extra Strength 500 Peppermint Tablets 24tablets',
  'score': 0.814276933670044},
 {'name': "Gaviscon Double Action Tablet 16's", 'score': 0.8015005588531494},
 {'name': 'Stomach Aid Symbiotics (30 Capsules)', 'score': 0.8007451295852661},
 {'name': 'PO SUM ON MEDICATED OIL', 'score': 0.7956420183181763},
 {'name': 'Po Sum On Medicated Oil (H) (18.6ml)', 'score': 0.7898169755935669},
 {'name': 'Gaviscon Liquid Peppermint 200ml', 'score': 0.7889425158500671},
 {'name': 'STOMACH EX PROTECT', 'score': 0.7876700162887573},
 {'name': 'Beauluck A Gastro-resistant Tablets 5mg',
  'score': 0.7832819223403931},
 {'name': 'BioGaia Reuteri Gastrus Probiotic Chewable Tablets',
  'score': 0.7824836373329163}]