In [1]:
import marqo
import json
import os
from pprint import pprint

from index_settings import pmec_index_settings

# Client for the Marqo server; assumes an instance is reachable locally on port 8882.
mq = marqo.Client(url='http://localhost:8882')
# Name of the index that the cells below drop, recreate, populate and query.
index_name = "wny-manufacturing-companies"
# NOTE(review): not referenced by any code visible in this notebook; read_json_documents
# discovers its input by listing the data folder instead. Confirm whether it is still needed.
local_json_filename = 'sample.json'

def read_json_documents(mode='single', preprocess=True, folder_prefix='data'):
    """
    Read JSON documents from a folder and return their content as a list of objects.

    Only regular files whose name ends in ``.json`` (case-insensitive) are considered,
    in sorted filename order, so stray files such as ``.DS_Store`` are never parsed
    and the result does not depend on the arbitrary order of ``os.listdir``.

    :param mode: Either 'single' or 'multiple' ('multi' is accepted as an alias).
            In 'single' mode, it is assumed that all data is in a single consolidated
            JSON file (a list of objects); the first JSON file in the folder is used.
            In 'multiple' mode, every JSON file in the folder is ingested and each
            file contributes one document.
    :param preprocess: Toggle preprocessing (str-concatenating lists, etc.) via
            ``flatten_json``.
    :param folder_prefix: A folder path relative to the working directory which
            contains the JSON data. By default, the folder is called 'data'.
    :return: List of the loaded (and optionally preprocessed) JSON objects.
    :raises ValueError: If ``mode`` is not one of the supported values.
    :raises FileNotFoundError: In 'single' mode, if the folder contains no JSON file.
    """
    def _load(path):
        # JSON is UTF-8 by specification; do not rely on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _prepare(json_object):
        return flatten_json(json_object=json_object) if preprocess else json_object

    print(f'Document fetch mode: {mode}. Preprocessing: {preprocess}')
    if mode not in ('single', 'multi', 'multiple'):
        raise ValueError(f"Invalid mode: '{mode}' ")

    candidate_paths = (
        os.path.join(folder_prefix, name)
        for name in sorted(os.listdir(folder_prefix))
        if name.lower().endswith('.json')
    )
    json_paths = [path for path in candidate_paths if os.path.isfile(path)]

    if mode == 'single':
        if not json_paths:
            raise FileNotFoundError(f"No JSON file found under '{folder_prefix}'.")
        consolidated_path = json_paths[0]
        print(f"Using consolidated file: '{consolidated_path}'")
        document_list = [_prepare(json_object) for json_object in _load(consolidated_path)]
    else:
        # Count the files in the data folder (not the working directory).
        print(f"Found {len(json_paths)} objects under {folder_prefix}/.")
        document_list = [_prepare(_load(path)) for path in json_paths]

    print(f"{len(document_list)} object(s) extracted.")
    return document_list

def flatten_json(json_object):
    """
    Flatten list-valued fields of a JSON object into comma-separated strings.

    Every top-level value that is a list is replaced by its items joined with
    ``', '``. Items are converted with ``str`` first, so lists of numbers or
    other non-string values no longer raise ``TypeError``. All other values
    are left as they are.

    :param json_object: Dict to flatten. It is modified in place.
    :return: The same dict object that was passed in.
    """
    # Only existing keys are reassigned, so the dict size is unchanged while iterating.
    for key, value in json_object.items():
        if isinstance(value, list):
            json_object[key] = ', '.join(str(item) for item in value)
    return json_object

def clear_indexes_if_exist(index_name=index_name):
    """
    Delete the index called ``index_name`` if the Marqo client reports it.

    Development helper: it walks the index listing returned by ``mq.get_indexes()``
    and deletes each entry whose ``indexName`` equals ``index_name``, printing a
    confirmation per deletion. Nothing happens when no entry matches.

    :param index_name: Name of the index to drop; defaults to the notebook-level
            ``index_name`` as it was when this function was defined.
    """
    for entry in mq.get_indexes()['results']:
        found_name = entry['indexName']
        if found_name != index_name:
            continue
        mq.index(index_name=found_name).delete()
        print(f"Index '{index_name}' cleared.")

In [2]:
# Drop and recreate index
# Remove any existing index with the configured name before creating it again.
clear_indexes_if_exist()
# Show the settings imported from index_settings for reference.
pprint(pmec_index_settings)
# Last expression of the cell: its return value is displayed as the cell output.
mq.create_index(index_name, settings_dict=pmec_index_settings)

Index 'wny-manufacturing-companies' cleared.
{'allFields': [{'features': ['lexical_search', 'filter'],
                'name': 'company_id',
                'type': 'text'},
               {'features': ['lexical_search'],
                'name': 'company_name',
                'type': 'text'},
               {'features': ['lexical_search'],
                'name': 'datapoint_type',
                'type': 'text'},
               {'features': ['lexical_search'],
                'name': 'datapoint_value',
                'type': 'text'},
               {'features': ['lexical_search'],
                'name': 'source_urls',
                'type': 'array<text>'},
               {'features': ['lexical_search'],
                'name': 'source_inferred',
                'type': 'array<text>'},
               {'features': ['lexical_search'],
                'name': 'source_excerpts',
                'type': 'array<text>'}],
 'tensorFields': ['company_name', 'datapoint_value', 'datapoint_type

{'acknowledged': True, 'index': 'wny-manufacturing-companies'}

In [8]:
# Add JSON data
# preprocess=False skips flatten_json, so list-valued fields are indexed as lists.
json_documents = read_json_documents(mode='single', preprocess=False)
result = mq.index(index_name).add_documents(json_documents)
# NOTE(review): the response carries an 'errors' flag and a per-document 'status';
# consider checking them instead of only printing the raw response.
print(result)

Document fetch mode: single. Preprocessing: False
Using consolidated file: 'data/sample.json'
7 object(s) extracted.
{'errors': False, 'processingTimeMs': 1089.7209590002603, 'index_name': 'wny-manufacturing-companies', 'items': [{'status': 200, '_id': 'b3ce19ed-9baa-4ea3-8f05-555ab64f0fd4'}, {'status': 200, '_id': '8361b4e3-6cad-4138-a52f-fd46f9249e60'}, {'status': 200, '_id': '8a0171a3-ad18-45f2-a0bc-23f59f03490b'}, {'status': 200, '_id': 'c36153e5-812a-48cd-b0a0-928af0b11a5b'}, {'status': 200, '_id': '1046612d-2e14-46b1-a1e6-fce9817b7f52'}, {'status': 200, '_id': '2c09dd7c-714e-414f-9a50-44c70ddaac0f'}, {'status': 200, '_id': 'eca16d1d-cbfa-46e9-aee8-d880d3af00ad'}]}


In [13]:
from llama_cpp import Llama

# Load the quantized Llama 3.1 8B Instruct model from a local GGUF file.
LLM = Llama(
    model_path="models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
    # NOTE(review): presumably this window covers prompt plus completion; the generation
    # cell asks for max_tokens=512 on top of a prompt of roughly 550 tokens, which does
    # not fit in 1024 — confirm whether the answer is being cut short.
    n_ctx=1024
)

llama_load_model_from_file: using device Metal (Apple M3 Pro) - 12282 MiB free
llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Meta-Llama-3.1
llama_model_loader: - kv   5:                         general.size_label str              = 8B
llama_model_loader: - kv   6:                     

In [14]:
query="What companies are located in Buffalo that service gas pipes and lines?"
print(f"\nThe query was: {query}")

# Vector Search Results
print("\nHits from Vector DB: \n==============\n")
results = mq.index(index_name).search(q=query)
hits = results['hits']
if hits:
    # Summarise only the best-scoring hit; the full response is printed below.
    top_hit = hits[0]
    print(f"ID: {top_hit['_id']}, Score: {top_hit['_score']:.3f}.")
    print("Company Name:", top_hit['company_name'])
    print("Relevant data:")
    pprint(top_hit['_highlights'])
else:
    # An empty index (e.g. the add-documents cell was not run) yields no hits;
    # report it instead of failing with IndexError on hits[0].
    print("No hits returned for this query.")
pprint(results)



The query was: What companies are located in Buffalo that service gas pipes and lines?

Hits from Vector DB: 

ID: 1046612d-2e14-46b1-a1e6-fce9817b7f52, Score: 0.841.
Company Name: Pii Holdings, Inc.
Relevant data:
[{'datapoint_value': 'gas pipe fixtures'}]
{'hits': [{'_highlights': [{'datapoint_value': 'gas pipe fixtures'}],
           '_id': '1046612d-2e14-46b1-a1e6-fce9817b7f52',
           '_score': 0.8411167170910742,
           'company_id': '999',
           'company_name': 'Pii Holdings, Inc.',
           'datapoint_type': 'product',
           'datapoint_value': 'gas pipe fixtures',
           'source_excerpts': ['we do gas pipes on a daily basis.'],
           'source_inferred': ['we do gas pipes on a daily basis.'],
           'source_urls': ['https://www.growertalks.com/Article/?articleid=21683']},
          {'_highlights': [{'datapoint_value': '3288 Main St. Buffalo NY'}],
           '_id': '2c09dd7c-714e-414f-9a50-44c70ddaac0f',
           '_score': 0.8306013006661755,
 

In [21]:
# Create Context for Prompt
likely_results = []
# Bookkeeping and source fields that should not be shown to the LLM.
ignored_fields = ["_highlights","_id", "company_id","_score","source_urls","source_inferred"]
# Build the context from filtered copies of the hits so that `results` itself is left
# untouched: the search response can still be inspected afterwards and this cell can
# be re-run without having altered its own input.
context_lines = []
for result in results['hits']:
    visible_fields = {key: value for key, value in result.items() if key not in ignored_fields}
    # flatten_json turns list-valued fields into comma-separated strings.
    for key, value in flatten_json(visible_fields).items():
        context_lines.append(f"{key}: {value}\n")
prompt_context_string = "".join(context_lines)

# Using LLama with context
print("\nHits from LLama 3.1-8B Model: \n==============\n")
prompt = f"""
Here is some information about various companies:
{prompt_context_string}
Use this information to answer questions.
Do NOT make up an answer if you do not know.
Your brief, short response should not be more than two sentences long.
Do not write code - return a plaintext answer to the following question:
Q: {query}
A:"""
print(prompt)

# Low temperature / top_p keep the answer close to deterministic; generation stops
# as soon as the model starts writing a new "Q:" line.
model_output = LLM(
    prompt, 
    max_tokens=512, 
    stop=["Q:"],
    top_p=0.1,
    temperature=0.1
)["choices"][0]["text"]

# Show only the first three '.'-separated fragments of the answer.
# NOTE(review): splitting on '.' also breaks abbreviations such as "Inc." apart.
for sentence in model_output.split('.')[:3]:
    print(sentence)


Hits from LLama 3.1-8B Model: 


Here is some information about various companies:
company_name: Pii Holdings, Inc.
datapoint_type: product
datapoint_value: gas pipe fixtures
source_excerpts: we do gas pipes on a daily basis.
company_name: XYZ Inc.
datapoint_type: location
datapoint_value: 3288 Main St. Buffalo NY
source_excerpts: visit our website.
company_name: Pii Holdings, Inc.
datapoint_type: location
datapoint_value: 2150 Elmwood Avenue Buffalo, NY
source_excerpts: they also have a division that designs and installs racks for solar panels.
company_name: Pii Holdings, Inc.
datapoint_type: location
datapoint_value: 2150 Elmwood Avenue Buffalo, NY
source_excerpts: they also have a division that designs and installs racks for solar panels.
company_name: XYZ Inc.
datapoint_type: product
datapoint_value: 3D printing
source_excerpts: our latest resin printers and sintering machines.
company_name: Rough Brothers Holding Co., Inc
datapoint_type: product
datapoint_value: greenhouses
sourc

Llama.generate: 550 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   12604.88 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   473 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   42167.26 ms /   474 tokens


 Pii Holdings, Inc
 is located in Buffalo and services gas pipes and lines
  Pii Holdings, Inc
