In [12]:
import os
from llama_index.core.llms import CustomLLM, CompletionResponse, CompletionResponseGen
from typing import Any, List, Optional
# from groq import Groq, GroqError

from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.llms.langchain import LangChainLLM
from llama_index.llms.groq import Groq

import requests
from pathlib import Path
from pprint import pprint
from dotenv import load_dotenv
import nest_asyncio

from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.core import VectorStoreIndex, Document

from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever, SummaryIndexRetriever, TransformRetriever

from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings


from dotenv import load_dotenv
import nest_asyncio

# Load variables from a .env file (if one is found) into the process environment;
# the API keys read in the next cell are expected to come from there.
load_dotenv()
# Patch asyncio so nested event loops are allowed.
# NOTE(review): presumably needed because the parsing/LLM clients run async code
# inside the notebook kernel's already-running loop — confirm before removing.
nest_asyncio.apply()

In [13]:
# Required credentials, read from the process environment. Plain indexing is
# used on purpose: a missing variable raises KeyError right here instead of
# surfacing later as an authentication failure.
_env = os.environ
GROQ_API_KEY = _env["GROQ_API_KEY"]
COHERE_API_KEY = _env["COHERE_API_KEY"]
LLAMA_CLOUD_API_KEY = _env["LLAMA_CLOUD_API_KEY"]

In [14]:
# Groq-hosted chat model used by the query engine for answer generation.
GROQ_MODEL_NAME = "llama3-groq-70b-8192-tool-use-preview"
llm = Groq(api_key=GROQ_API_KEY, model=GROQ_MODEL_NAME)

In [15]:
def load_documents(doc_path: Path) -> "List[Document]":
    """Parse one file with LlamaParse and return the resulting documents.

    The file is read through ``SimpleDirectoryReader`` with a ``LlamaParse``
    instance (``result_type="text"``) registered as the extractor for the
    file's own extension.

    Args:
        doc_path: Location of the file to parse. A plain string path is
            accepted as well and converted to ``Path``.

    Returns:
        Whatever ``SimpleDirectoryReader(...).load_data()`` yields for the file.
    """
    # Accept str as well as Path so callers do not crash on `.suffix`.
    doc_path = Path(doc_path)

    # SimpleDirectoryReader looks extractors up by the lower-cased suffix, so the
    # key must be lower-cased too; otherwise a file named "CV.PDF" would silently
    # bypass LlamaParse and fall back to the default reader.
    file_type = doc_path.suffix.lower()

    # No api_key is passed: LlamaParse falls back to its own credential lookup
    # (presumably the LLAMA_CLOUD_API_KEY environment variable — confirm).
    parser = LlamaParse(result_type="text")

    reader = SimpleDirectoryReader(
        input_files=[doc_path],
        file_extractor={file_type: parser},
    )
    return reader.load_data()


In [16]:
# File to index. The literal contains a space before ".pdf"; it is kept exactly
# as written so it keeps matching the file on disk.
doc_path = Path("../data/Academic-CV-V1 .pdf")
documents = load_documents(doc_path=doc_path)

Started parsing the file under job_id 01acc329-0f47-4b86-8f43-09a0ebfddaab


In [18]:
# The Groq `llm` is already created in an earlier cell with this exact model and
# API key, so it is not rebuilt here; this cell only configures the embeddings.

# NOTE(review): input_type="search_query" is set on the same model that embeds
# the documents at index time. Cohere v3 models distinguish "search_document"
# from "search_query"; confirm that the installed CohereEmbedding selects the
# right one per call, otherwise documents are embedded as queries.
embed_model = CohereEmbedding(
    api_key=COHERE_API_KEY,
    model_name="embed-english-v3.0",
    input_type="search_query",
)

In [19]:

# Register both models on the global Settings object (LLM first, then embeddings).
Settings.llm, Settings.embed_model = llm, embed_model
# Optional tuning knobs, currently left unset:
#   Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
#   Settings.num_output = 512
#   Settings.context_window = 3900

In [20]:
# Build the vector index from the parsed documents.
# The keyword VectorStoreIndex understands is `embed_model`; the previous
# `model=` keyword was not a recognised parameter and was silently swallowed by
# **kwargs, so the explicit argument had no effect.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embed_model,
)

In [28]:
# Number of most-similar nodes returned for each query.
SIMILARITY_TOP_K = 3

retriever = VectorIndexRetriever(similarity_top_k=SIMILARITY_TOP_K, index=index)
# Not enabled (earlier experiments):
#   vector_store_kwargs={"score_threshold": 0.7}
#   mmr_threshold=0.8

In [29]:
# Response synthesizer built with no arguments, i.e. whatever defaults
# get_response_synthesizer() applies.
response_synthesizer = get_response_synthesizer()

In [30]:
# Combine retrieval and answer synthesis into a single query engine.
# A SimilarityPostprocessor(similarity_cutoff=0.7) could be passed through
# `node_postprocessors`; it is not enabled here.
query_engine = RetrieverQueryEngine(
    response_synthesizer=response_synthesizer, retriever=retriever
)

In [31]:
# Question asked against the indexed CV.
question = "what are the skills"
response = query_engine.query(question)

In [32]:
# Show the generated answer text; pprint renders the string as wrapped,
# quoted fragments with explicit "\n" escapes.
answer_text = response.response
pprint(answer_text)

('The skills listed in the context include:\n'
 '\n'
 '1. Programming Languages: Python, C++\n'
 '2. Databases: MySQL, PostgreSQL, SQL Server\n'
 '3. Machine Learning: Scikit-learn, TensorFlow, PyTorch, NumPy, Pandas\n'
 '4. Web Development: FastAPI, Django, Streamlit\n'
 '5. Cloud & Deployment: AWS, Docker\n'
 '6. Data Collection: Selenium, BeautifulSoup, Scrapy\n'
 '7. NLP: NLTK')
