In [33]:
import os
import json
from uuid import uuid4
from dotenv import load_dotenv
from ipynb.fs.full.crawl import craw_web
from langchain_milvus import Milvus
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document

In [34]:
# Load variables from a local .env file into the process environment
# (python-dotenv) before any of the helpers below are used.
load_dotenv()

True

In [35]:
def load_data_from_local(filename: str, directory: str) -> tuple:
    """Load a JSON file from ``directory`` and derive a readable document name.

    Args:
        filename: Name of the JSON file inside ``directory``
            (e.g. ``'my_docs.json'``).
        directory: Directory that contains ``filename``.

    Returns:
        A ``(data, doc_name)`` tuple. ``data`` is the parsed JSON content and
        ``doc_name`` is ``filename`` with its last extension removed and
        underscores replaced by spaces (``'my_docs.json'`` -> ``'my docs'``).

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # The path is built from `filename`; joining with `file_path` itself would
    # read the local before it is assigned and raise UnboundLocalError.
    file_path = os.path.join(directory, filename)

    # Explicit UTF-8 so non-ASCII content decodes the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    print(f'Data loaded from {file_path}')

    return data, filename.rsplit('.', 1)[0].replace('_', ' ')

In [36]:
def seed_milvus(URI_link: str, collection_name: str, filename: str, directory: str, use_ollama: bool = False) -> Milvus:
    """Seed a Milvus collection with the documents stored in a local JSON file.

    The JSON file is expected to contain a list of records, each a dict with an
    optional ``'page_content'`` string and an optional ``'metadata'`` dict.
    Missing or empty metadata fields are replaced with defaults so every
    inserted document carries the same set of metadata keys.

    Args:
        URI_link: Milvus connection URI (e.g. ``'http://localhost:19530'``).
        collection_name: Name of the collection to (re)create. The store is
            opened with ``drop_old=True``, so an existing collection with this
            name is replaced.
        filename: JSON file to load; also used to derive the ``doc_name``
            metadata value (see ``load_data_from_local``).
        directory: Directory that contains ``filename``.
        use_ollama: Kept for backward compatibility. Ollama is the only
            embedding backend imported by this notebook, so it is used
            regardless of this flag.

    Returns:
        The ``Milvus`` vector store after the documents have been added.
    """
    # Always bind `embedding`. It used to be assigned only when `use_ollama`
    # was true, so the default call crashed with UnboundLocalError. The model
    # matches the one `connect_to_milvus` queries with, keeping vector
    # dimensions compatible between seeding and retrieval.
    embedding = OllamaEmbeddings(model="llama3")

    local_data, doc_name = load_data_from_local(filename, directory)

    documents = []
    for doc in local_data:
        # A record without a 'metadata' key (or with a null one) falls back to
        # the per-field defaults instead of raising KeyError/AttributeError.
        raw_meta = doc.get('metadata') or {}
        documents.append(
            Document(
                page_content=doc.get('page_content') or '',
                metadata={
                    'source': raw_meta.get('source') or '',
                    'content_type': raw_meta.get('content_type') or 'text/plain',
                    'title': raw_meta.get('title') or '',
                    'description': raw_meta.get('description') or '',
                    'language': raw_meta.get('language') or 'en',
                    'doc_name': doc_name,
                    'start_index': raw_meta.get('start_index') or 0,
                },
            )
        )

    # Report a count rather than dumping every document into the cell output.
    print(f'documents: {len(documents)} prepared from {filename}')

    # One random UUID per document, used as the primary key in Milvus.
    uuids = [str(uuid4()) for _ in documents]

    vectorstore = Milvus(
        embedding_function=embedding,
        connection_args={"uri": URI_link},
        collection_name=collection_name,
        drop_old=True
    )

    vectorstore.add_documents(documents=documents, ids=uuids)
    print('vector: ', vectorstore)
    return vectorstore
    

In [37]:
def seed_milvus_live(URL: str, URI_link: str, collection_name: str, doc_name: str, use_ollama: bool = False) -> Milvus:
    """Crawl ``URL`` and seed a Milvus collection with the resulting documents.

    Each crawled document's metadata is replaced by a normalized dict with a
    fixed set of keys; missing or empty fields get defaults.

    Args:
        URL: Page to crawl via ``craw_web``.
        URI_link: Milvus connection URI (e.g. ``'http://localhost:19530'``).
        collection_name: Name of the collection to (re)create. The store is
            opened with ``drop_old=True``, so an existing collection with this
            name is replaced.
        doc_name: Value stored under the ``'doc_name'`` metadata key of every
            document.
        use_ollama: Kept for backward compatibility. Ollama is the only
            embedding backend imported by this notebook, so it is used
            regardless of this flag.

    Returns:
        The ``Milvus`` vector store after the documents have been added.
    """
    # Always bind `embedding`. It used to be assigned only when `use_ollama`
    # was true, so the default call crashed with UnboundLocalError. The model
    # matches the one `connect_to_milvus` queries with.
    embedding = OllamaEmbeddings(model="llama3")

    # NOTE(review): assumes craw_web returns a sized collection of LangChain
    # Document-like objects exposing a `.metadata` dict -- confirm against
    # crawl.ipynb.
    documents = craw_web(URL)

    for doc in documents:
        # Read through the attribute: the loop assigns `doc.metadata` below,
        # and such objects are not subscriptable, so `doc['metadata']` could
        # never have worked together with that assignment.
        raw_meta = getattr(doc, 'metadata', None) or {}
        doc.metadata = {
            'source': raw_meta.get('source') or '',
            'content_type': raw_meta.get('content_type') or 'text/plain',
            'title': raw_meta.get('title') or '',
            'description': raw_meta.get('description') or '',
            'language': raw_meta.get('language') or 'en',
            'doc_name': doc_name,
            'start_index': raw_meta.get('start_index') or 0,
        }

    # One random UUID per document, used as the primary key in Milvus.
    uuids = [str(uuid4()) for _ in range(len(documents))]

    vectorstore = Milvus(
        embedding_function=embedding,
        connection_args={"uri": URI_link},
        collection_name=collection_name,
        drop_old=True
    )

    vectorstore.add_documents(documents=documents, ids=uuids)
    print('vector: ', vectorstore)
    return vectorstore

In [38]:
def connect_to_milvus(URI_link: str, collection_name: str) -> Milvus:
    """Open an existing Milvus collection as a LangChain vector store.

    Args:
        URI_link: Milvus connection URI.
        collection_name: Name of the collection to attach to.

    Returns:
        A ``Milvus`` vector store bound to ``collection_name`` that embeds
        with the Ollama ``llama3`` model.
    """
    return Milvus(
        embedding_function=OllamaEmbeddings(model="llama3"),
        connection_args=dict(uri=URI_link),
        collection_name=collection_name,
    )

In [39]:
#def main():
    #seed_milvus('http://localhost:19530', 'data_test', 'titech.json', 'data', use_ollama=False)
    # # seed_milvus_live('https://www.owc.titech.ac.jp', 'http://localhost:19530', 'data_test_live', 'titech-ai', use_ollama=False)


In [40]:
#if __name__ == '__main__':
    #main()