# In [None]:
from langchain_community.document_loaders import ConcurrentLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load the aggregated Confluence export from disk.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
data_path = "/home/leapfrog/llm/llm_files/phase_2/RAG_LANGCHAIN_LLAMA/api_langchain_cohere_pinecone_llama3/data/leapfrog/aggregated_content.txt"
loader = ConcurrentLoader.from_filesystem(data_path, glob="**/*.txt")
loaded_data = loader.load()
if not loaded_data:
    # Fail with an actionable message instead of a bare IndexError on the
    # `loaded_data[0]` lookup below.
    raise ValueError(f"No documents were loaded from {data_path!r}; check that the file exists and is readable.")

# Only the first loaded document is used as the corpus.
leapfrog_confluence_data = loaded_data[0].page_content

# Split the raw text into overlapping chunks (1000 characters each, 200 shared
# with the neighbouring chunk) wrapped as Document objects.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.create_documents([leapfrog_confluence_data])

# ========================================================================================
# ========================================================================================

from langchain_cohere import CohereEmbeddings
from langchain_pinecone import PineconeVectorStore

import os
from dotenv import load_dotenv
# Pull settings from a local .env file into the process environment.
load_dotenv()

# The Cohere and Pinecone clients in this script are constructed without
# explicit keys, so both variables must already be present in the environment.
# Check them up front: the previous `os.environ[k] = os.getenv(k)` round-trip
# was a no-op when the key existed and raised an opaque
# "str expected, not NoneType" TypeError when it did not.
for _required_key in ("COHERE_API_KEY", "PINECONE_API_KEY"):
    if not os.getenv(_required_key):
        raise RuntimeError(
            f"Environment variable {_required_key} is not set; "
            "define it in the environment or in the .env file."
        )

# The model is named explicitly rather than relying on a library default.
# "embed-english-v2.0" produces 4096-dimensional vectors, which is the
# dimension the Pinecone index in this script is created with.
embeddings = CohereEmbeddings(model="embed-english-v2.0")
index_name = "index-leapfrog-confluence"

# ========================================================================================
# ========================================================================================

# pip install pinecone-client
from pinecone import Pinecone, ServerlessSpec

# Pinecone client, constructed without arguments.
# NOTE(review): presumably it picks up PINECONE_API_KEY from the environment — confirm.
pc = Pinecone()

# Create the serverless index only if it does not exist yet.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=4096,  # must equal the length of the embedding vectors
        metric='cosine',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

index = pc.Index(index_name)

# Fetch the stats once and reuse them for both prints; each
# describe_index_stats() call is a separate request to Pinecone.
index_stats = index.describe_index_stats()
print(index_stats)
print(index_stats['total_vector_count'])

from langchain_community.vectorstores import Pinecone

# Attach to the index if it already holds vectors, otherwise ingest the chunks.
# Both branches go through PineconeVectorStore so that `vectorstore` has the
# same type either way; the legacy langchain_community `Pinecone` wrapper is
# no longer used here.
if index.describe_index_stats()['total_vector_count'] > 0:
    # Index is already populated: reuse it without re-embedding anything.
    vectorstore = PineconeVectorStore.from_existing_index(
        index_name,
        embeddings,
    )
    print("Vector index already exists, will use this.")
else:
    # Index is empty: embed the document chunks and upsert them.
    vectorstore = PineconeVectorStore.from_documents(
        splits,
        embeddings,
        index_name=index_name,
    )
    print("Vector index created successfully.")

# Show the index stats again after the branch above has run.
print(index.describe_index_stats())
