In [2]:
from llama_index.readers.google import GoogleDriveReader
loader = GoogleDriveReader()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
help(GoogleDriveReader)

Help on class GoogleDriveReader in module llama_index.readers.google.drive.base:

class GoogleDriveReader(llama_index.core.readers.base.BaseReader)
 |  GoogleDriveReader(credentials_path: str = 'credentials.json', token_path: str = 'token.json', pydrive_creds_path: str = 'creds.txt') -> None
 |  
 |  Google drive reader.
 |  
 |  Method resolution order:
 |      GoogleDriveReader
 |      llama_index.core.readers.base.BaseReader
 |      abc.ABC
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, credentials_path: str = 'credentials.json', token_path: str = 'token.json', pydrive_creds_path: str = 'creds.txt') -> None
 |      Initialize with parameters.
 |  
 |  load_data(self, folder_id: str = None, file_ids: List[str] = None, mime_types: List[str] = None) -> List[llama_index.core.schema.Document]
 |      Load data from the folder id and file ids.
 |      
 |      Args:
 |          folder_id: folder id of the folder in google drive.
 |          file_ids: file i

In [3]:
help(loader.load_data)

Help on method load_data in module llama_index.readers.google.drive.base:

load_data(folder_id: str = None, file_ids: List[str] = None, mime_types: List[str] = None) -> List[llama_index.core.schema.Document] method of llama_index.readers.google.drive.base.GoogleDriveReader instance
    Load data from the folder id and file ids.
    
    Args:
        folder_id: folder id of the folder in google drive.
        file_ids: file ids of the files in google drive.
        mime_types: the mimeTypes you want to allow e.g.: "application/vnd.google-apps.document"
    Returns:
        List[Document]: A list of documents.



In [11]:
def load_data(folder_id: str):
    docs = loader.load_data(folder_id=folder_id)
    for doc in docs:
        # print(doc.metadata)
        print(doc.id_)
    return docs


docs = load_data(folder_id="1RFhr3-KmOZCR5rtp4dlOMNl3LKe1kOA5")

1-ZGC-6UZC_lq_iImaW4nvgFct_jlWpJEZhHwKJHS6Mc
1Dte8R_SjzzQtoq5go8l0I9--U9P5fssdvusSGyiqAw4
1qSzKrTyj30SUy93zN03st2f8CFBGeScRUZ0ogGb-P3E
1ce7wdv5LO8nWHbx-TnjVn7N-b4w4IUmRmB6NuOZSqj4
1TGiETOFt86El-hI4FbE5lYjo65Cw_l5b6yjKlvl7TVg


## Google Drive API Experiments 

In [1]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError


In [4]:


# If modifying these scopes, delete the file token.json.
CLIENT_SECRET = 'client_secret_944885290760-eol8kggkm9kv1bgmlpvisr619v2mk4ul.apps.googleusercontent.com.json'

credential=None
SCOPES = ["https://www.googleapis.com/auth/drive.metadata.readonly"]
if os.path.exists("token.json"):
  credential = Credentials.from_authorized_user_file("token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not credential or not credential.valid:
  if credential and credential.expired and credential.refresh_token:
    credential.refresh(Request())
  else:
    flow = InstalledAppFlow.from_client_secrets_file(
        CLIENT_SECRET, SCOPES
    )
    credential = flow.run_local_server(port=0)
  # Save the credentials for the next run
  with open("token.json", "w") as token:
    token.write(credential.to_json())

try:
  service = build("drive", "v3", credentials=credential)

  # Call the Drive v3 API
  results = (
      service.files()
      .list(pageSize=10, fields="nextPageToken, files(id, name)")
      .execute()
  )
  items = results.get("files", [])

  if not items:
    print("No files found.")
  else:
    print("Files:")
    for item in items:
      print(f"{item['name']} ({item['id']})")
except HttpError as error:
  # TODO(developer) - Handle errors from drive API.
  print(f"An error occurred: {error}")




RefreshError: ('invalid_scope: Bad Request', {'error': 'invalid_scope', 'error_description': 'Bad Request'})

## Project Experiments 

In [1]:
### Constants 
# Here are some Global Constants variables 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.gemini import Gemini 
from pathlib import Path 

BASE_DIR = Path(r'C:\Users\Admin\AritraRanjanChowdhury\GEN_AI\Document_RAG_from_GDrive_with_Llama_Index\WebService').resolve()
## Please Change the Path according to your Google Credential Json
GOOGLE_CREDENTIALS_PATH = BASE_DIR.parent/'Google_Credentials'/'promactprojecttask4-46c0f330aee7.json'

VECTOR_STORE_PATH = BASE_DIR/'ChromaDB'

LLM = Gemini()
EMBED_MODEL =  HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
DRIVE_FOLDER_ID = '1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3';


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.schema import BaseNode
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core import StorageContext, VectorStoreIndex 
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.llms.llm import LLM
from typing import Union,Sequence,Optional
from pathlib import Path 


class ChromaVectorStoreIndex(object):
    '''This class is a simple control tool for ChromaDB 
    '''
    
    def __init__(self,persist_dir : Union[Path,str], 
                collection:str='default', 
                embed_model:Optional[BaseEmbedding] = None,
                llm: Optional[Union[str, LLM]] = None,
                 **kwargs):    

        _persist_dir:str = persist_dir if type(persist_dir)==str else persist_dir.resolve().__str__()
        self.chroma_client = chromadb.PersistentClient(path=_persist_dir)
        self.chroma_collection = self.chroma_client.get_or_create_collection(collection)
        self.chroma_vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)   
        self.llm = llm 
        if embed_model :
            self.embed_model = embed_model
        else :
            self.embed_model = EMBED_MODEL
        self.kwargs = kwargs
        

    def create_index(self,nodes:Sequence[BaseNode]):
        self.kwargs['nodes']=nodes
        self.kwargs['embed_model'] = self.embed_model
        self.kwargs['storage_context']=StorageContext.from_defaults(
                vector_store=self.chroma_vector_store) 
        self.__index = VectorStoreIndex(**self.kwargs)
        return self.__index
    
    def load_index(self, persist_dir: Optional[Union[Path,str]]=None, 
                            collection_name:Optional[str]=None,embed_model:Optional[BaseEmbedding]=None) :
        if persist_dir :
            _persist_dir:str = persist_dir if type(persist_dir)==str else persist_dir.resolve().__str__()  
            self.chroma_client = chromadb.PersistentClient(path=_persist_dir)
        if collection_name :
            self.chroma_collection = self.chroma_client.get_or_create_collection(collection_name)
        if persist_dir and collection_name:
            self.chroma_vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)            
        if  embed_model:
            self.embed_model = embed_model
            
        self.__index = VectorStoreIndex.from_vector_store(
            self.chroma_vector_store,
            embed_model=self.embed_model)
        return self.__index

        
    def get_retriever(self, llm: Optional[Union[str, LLM]] = None):
        if not llm: llm = self.llm
        return self.__index.as_retriever(llm = llm)
    
    def get_query_engine (self, llm: Optional[Union[str, LLM]] = None):
        if not llm: llm = self.llm
        return self.__index.as_query_engine(llm = llm)
        
    def get_chat_engine (self, llm: Optional[Union[str, LLM]] = None):
        if not llm: llm = self.llm
        return self.__index.as_chat_engine(llm = llm)
        
    
# lOADING TEXT AFTER SELECT 






In [3]:
from llama_index.readers.google import GoogleDriveReader
loader = GoogleDriveReader(credentials_path=GOOGLE_CREDENTIALS_PATH)

def load_data_from_drive_folder(folder_id: str):
    docs = loader.load_data(folder_id=folder_id)
    if not docs :
        raise Exception("Content Load Error","No content is loaded")
        
    return docs



In [12]:
from llama_index.core.node_parser import SentenceSplitter
    
documents = load_data_from_drive_folder(DRIVE_FOLDER_ID)    
node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

nodes = node_parser.get_nodes_from_documents(
    documents, show_progress=False
)
# print(nodes)
index = chroma_index.get_chroma_index()
# print("****************************************************",index)
index2 = chroma_index.load_vector_store(persist_dir=VECTOR_STORE_PATH, collection_name ='Set1')
print(index2)

C:\Users\Admin\AritraRanjanChowdhury\GEN_AI\Document_RAG_from_GDrive_with_Llama_Index\WebService\ChromaDB
Create create_index_from_documents call Chroma Vector Store
None


In [26]:
# chroma_index = ChromaVectorStoreIndex()
chroma_index = ChromaVectorStoreIndex(persist_dir=VECTOR_STORE_PATH, collection='Set1')

index = chroma_index.load_index(persist_dir=VECTOR_STORE_PATH, collection_name='Set1')

In [29]:
dir(index)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_add_nodes_to_index',
 '_aget_node_with_embedding',
 '_async_add_nodes_to_index',
 '_build_index_from_nodes',
 '_callback_manager',
 '_delete_node',
 '_docstore',
 '_embed_model',
 '_get_node_with_embedding',
 '_graph_store',
 '_index_struct',
 '_insert',
 '_insert_batch_size',
 '_is_protocol',
 '_object_map',
 '_service_context',
 '_show_progress',
 '_storage_context',
 '_store_nodes_override',
 '_transformations',
 '_use_async',
 '_vector_store',
 'as_chat_engine',
 'as_qu

In [31]:
index.storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000002090E5836D0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x0000020914157510>, vector_stores={'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={}), 'image': <llama_index.core.vector_stores.simple.SimpleVectorStore object at 0x0000020913F6E590>}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x000002091417E050>)

## Drive Automation 



In [33]:
import os.path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
SCOPES = ["https://www.googleapis.com/auth/drive.activity.readonly"]

In [34]:
creds = None

if os.path.exists("token.json"):
    creds = Credentials.from_authorized_user_file("token.json", SCOPES)
# If there are no (valid) credentials available, let the user log in.
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            r".\Google_Credentials\client_secret_944885290760-eol8kggkm9kv1bgmlpvisr619v2mk4ul.apps.googleusercontent.com.json", SCOPES
        )
    creds = flow.run_local_server(port=0)
    # Save the credentials for the next run
    with open("token.json", "w") as token:
        token.write(creds.to_json())

service = build("driveactivity", "v2", credentials=creds)

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=944885290760-eol8kggkm9kv1bgmlpvisr619v2mk4ul.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A59726%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.activity.readonly&state=ruEDIIWsx101xjbsXgPU3OYfGI57Yw&access_type=offline


In [35]:
results = service.activity().query(body={'ancestorName':"items/1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3",
                                        "filter":'time >= "2024-02-26T11:35:58+00:00"',
                                        # "pageSize": 12
                                        }).execute()
activities = results.get('activities', [])
activities[0]


{'primaryActionDetail': {'edit': {}},
 'actors': [{'user': {'knownUser': {'personName': 'people/109607179008447993810'}}}],
 'actions': [{'detail': {'edit': {}}}],
 'targets': [{'driveItem': {'name': 'items/1XRYcaHI_5nrg_Ht82hF9q42wuzVbtmqZUtUecNlsx14',
    'title': 'Example',
    'file': {},
    'mimeType': 'application/vnd.google-apps.document',
    'owner': {'user': {'knownUser': {'personName': 'people/101190240686752487263',
       'isCurrentUser': True}}},
    'driveFile': {}}}],
 'timestamp': '2024-02-26T11:59:06.685Z'}

In [30]:
from llama_index.readers.google import GoogleDriveReader
# from constants import GOOGLE_CREDENTIALS_PATH
loader = GoogleDriveReader(credentials_path=GOOGLE_CREDENTIALS_PATH)

def load_data_from_drive_folder(folder_id: str):
    docs = loader.load_data(folder_id=folder_id)
    if not docs :
        raise Exception("Content Load Error","No content is loaded")
        
    return docs




  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'GOOGLE_CREDENTIALS_PATH' is not defined

In [13]:
help(loader.load_data)

Help on method load_data in module llama_index.readers.google.drive.base:

load_data(folder_id: str = None, file_ids: List[str] = None, mime_types: List[str] = None) -> List[llama_index.core.schema.Document] method of llama_index.readers.google.drive.base.GoogleDriveReader instance
    Load data from the folder id and file ids.
    
    Args:
        folder_id: folder id of the folder in google drive.
        file_ids: file ids of the files in google drive.
        mime_types: the mimeTypes you want to allow e.g.: "application/vnd.google-apps.document"
    Returns:
        List[Document]: A list of documents.



In [24]:
help(service.activity().query)

Help on method method in module googleapiclient.discovery:

method(**kwargs) method of googleapiclient.discovery.Resource instance
    Query past activity in Google Drive.
    
    Args:
      body: object, The request body.
        The object takes the form of:
    
    { # The request message for querying Drive activity.
      "ancestorName": "A String", # Return activities for this Drive folder, plus all children and descendants. The format is `items/ITEM_ID`.
      "consolidationStrategy": { # How the individual activities are consolidated. If a set of activities is related they can be consolidated into one combined activity, such as one actor performing the same action on multiple targets, or multiple actors performing the same action on a single target. The strategy defines the rules for which activities are related. # Details on how to consolidate related actions that make up the activity. If not set, then related actions aren't consolidated.
        "legacy": { # A strategy tha

In [30]:
import datetime 
import time 
obj = None
def callbacks(obj:dict,container:object):
        container = obj
def watch_drive(folder_id : str, callbacks:callable):
    previous_time = datetime.datetime.now() - datetime.timedelta(seconds=10)
    while True:
        current_time = datetime.datetime.now()
        current_time_formate=current_time.astimezone(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")+'+00:00'
        previous_time_formate=previous_time.astimezone(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")+'+00:00'
        
        results = service.activity().query(body={
                "filter":f'time > "{previous_time_formate}" AND time < "{current_time_formate}"',
                ## `time >= "2016-01-10T01:02:03-05:00"`
                'ancestorName':"items/1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3",
                "pageSize": 2}).execute()
        
        activities = results.get('activities', [])
        previous_time = current_time
        # function(activities,obj)
        time.sleep(10)
        print(load_data_from_drive_folder('1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3'))
        print(activities)
        
watch_drive('1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3',callbacks)





KeyboardInterrupt: 

In [83]:
import datetime

# Get the current datetime
current_datetime = datetime.datetime.now()
current_time_utc = current_datetime.astimezone(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")+'+00:00'

print(current_time_utc)

2024-02-26T11:39:28+00:00


In [17]:
dir(z)


['__aiter__',
 '__anext__',
 '__class__',
 '__class_getitem__',
 '__del__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'aclose',
 'ag_await',
 'ag_code',
 'ag_frame',
 'ag_running',
 'asend',
 'athrow']

In [3]:
a={}
for i in range(45):
    a[f'{i}'] = i**2

In [12]:
c = iter(a)


'3'

## LlamaIndex Experiment 

In [1]:
# Here are some Global Constants variables 
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.gemini import Gemini 
from pathlib import Path 

BASE_DIR = Path(r'C:\Users\Admin\AritraRanjanChowdhury\GEN_AI\Document_RAG_from_GDrive_with_Llama_Index\WebService\constants.py').resolve().parent
## Please Change the Path according to your Google Credential Json
GOOGLE_CREDENTIALS_PATH = BASE_DIR.parent/'Google_Credentials'/'promactprojecttask4-46c0f330aee7.json'
GOOGLE_CLIENT_SECRET = BASE_DIR.parent/'Google_Credentials'/'client_secret_944885290760-eol8kggkm9kv1bgmlpvisr619v2mk4ul.apps.googleusercontent.com.json'

VECTOR_STORE_PATH = BASE_DIR/'ChromaDB'

LLM = Gemini()
EMBED_MODEL =  HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
DRIVE_FOLDER_ID = '1sLmLETXRAUA1NAoJDf6TxnJ21SJiXzQ3';
DRIVE_API_SCOPES = ["https://www.googleapis.com/auth/drive.activity.readonly"]

MONITORING_TIME_DELAY  = 10 
BASE_DIR

  from .autonotebook import tqdm as notebook_tqdm


WindowsPath('C:/Users/Admin/AritraRanjanChowdhury/GEN_AI/Document_RAG_from_GDrive_with_Llama_Index/WebService')

In [2]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.schema import BaseNode
from llama_index.core.embeddings import BaseEmbedding
from llama_index.core import StorageContext, VectorStoreIndex 
from llama_index.vector_stores.chroma import ChromaVectorStore
# from constants import EMBED_MODEL
from llama_index.core.llms.llm import LLM
from typing import Union,Sequence,Optional
from pathlib import Path 


class ChromaVectorStoreIndex(object):
    '''This class is a simple control tool for ChromaDB 
    '''
    
    def __init__(self,persist_dir : Union[Path,str], 
                collection:str='default', 
                embed_model:Optional[BaseEmbedding] = None,
                llm: Optional[Union[str, LLM]] = None,
                 **kwargs):    

        _persist_dir:str = persist_dir if type(persist_dir)==str else persist_dir.resolve().__str__()
        self.chroma_client = chromadb.PersistentClient(path=_persist_dir)
        self.chroma_collection = self.chroma_client.get_or_create_collection(collection)
        self.chroma_vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)   
        self.llm = llm 
        if embed_model :
            self.embed_model = embed_model
        else :
            self.embed_model = EMBED_MODEL
        self.kwargs = kwargs
        

    def create_index(self,nodes:Sequence[BaseNode]):
        self.kwargs['nodes']=nodes
        self.kwargs['embed_model'] = self.embed_model
        self.kwargs['storage_context']=StorageContext.from_defaults(
                vector_store=self.chroma_vector_store) 
        self.__index = VectorStoreIndex(**self.kwargs)
        return self.__index
    
    def load_index(self, persist_dir: Optional[Union[Path,str]]=None, 
                            collection_name:Optional[str]=None,embed_model:Optional[BaseEmbedding]=None) :
        if persist_dir :
            _persist_dir:str = persist_dir if type(persist_dir)==str else persist_dir.resolve().__str__()  
            self.chroma_client = chromadb.PersistentClient(path=_persist_dir)
        if collection_name :
            self.chroma_collection = self.chroma_client.get_or_create_collection(collection_name)
        if persist_dir and collection_name:
            self.chroma_vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)            
        if  embed_model:
            self.embed_model = embed_model
            
        self.__index = VectorStoreIndex.from_vector_store(
            self.chroma_vector_store,
            embed_model=self.embed_model)
        return self.__index

        
    def get_retriever(self, llm: Optional[Union[str, LLM]] = None,**kwargs):
        if not llm: llm = self.llm
        return self.__index.as_retriever(llm = llm)
    
    def get_query_engine (self, llm: Optional[Union[str, LLM]] = None,**kwargs):
        if not llm: llm = self.llm
        return self.__index.as_query_engine(llm = llm)
        
    def get_chat_engine (self, llm: Optional[Union[str, LLM]] = None,**kwargs):
        if not llm: llm = self.llm
        return self.__index.as_chat_engine(llm = llm)
        

In [3]:
from llama_index.readers.google import GoogleDriveReader
from llama_index.core.node_parser import SentenceSplitter
drive_loader = GoogleDriveReader(credentials_path=GOOGLE_CREDENTIALS_PATH,
                token_path='WebService/llama_index_drive_loader/token.json',
                pydrive_creds_path='WebService/llama_index_drive_loader/creds.txt'
                )
docs = drive_loader.load_data(file_ids = ['1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ'])
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
# nodes = node_parser.get_nodes_from_documents(
#                 docs, show_progress=False
#             )

In [4]:
import nest_asyncio

nest_asyncio.apply()

In [5]:
from llama_index.core.extractors import (
    TitleExtractor,
    # QuestionsAnsweredExtractor,
)
from llama_index.core.node_parser import TokenTextSplitter
text_splitter = TokenTextSplitter(
    separator=" ", chunk_size=512, chunk_overlap=128
)
title_extractor = TitleExtractor(nodes=5,llm=Gemini())
from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
    transformations=[text_splitter, title_extractor]
)

nodes = pipeline.run(
    documents=docs,
    in_place=True,
    show_progress=True,
)

Parsing nodes: 100%|██████████| 15/15 [00:00<00:00, 59.78it/s]
100%|██████████| 5/5 [00:11<00:00,  2.22s/it]


In [6]:
chroma_index = ChromaVectorStoreIndex(persist_dir=VECTOR_STORE_PATH, collection='Set1')
index  = chroma_index.create_index(nodes=nodes)


Add of existing embedding ID: c05959d3-2673-4e4a-9e20-bb6219b0b764
Add of existing embedding ID: 7c853a31-42d2-4a5f-8949-ad3db0c6d5d9
Add of existing embedding ID: 2d67e423-5625-4836-80ba-830c92fdf370
Add of existing embedding ID: ba2d4539-a4b9-4f26-a0e3-9aa9702b5c61
Add of existing embedding ID: 893fe48f-b9c8-4b06-a2b1-428cae4de106
Add of existing embedding ID: 66415da4-205b-421d-a573-c782a2685752
Add of existing embedding ID: 104f9485-ec69-4baa-8a65-14f7ff92a2ec
Add of existing embedding ID: bdf4938a-6f0f-4071-b38c-b79f8c611698
Add of existing embedding ID: eaf6fa78-8846-405c-96db-33a0b9724094
Add of existing embedding ID: 0f66144f-404b-4380-bbea-0869ef6bacde
Add of existing embedding ID: 3a79f895-6062-43aa-a932-d4c6f13b52f6
Add of existing embedding ID: 21a4d5ff-86d6-4070-a045-4ef47ac823ba
Add of existing embedding ID: 77d20cd6-883d-4c91-9ae4-788e0fa32fb3
Add of existing embedding ID: 44754c7a-668f-4bc9-bffa-59c49142d5f1
Add of existing embedding ID: 489a8507-beb5-4b4a-a92f-82858c72

In [9]:
from llama_index.llms.gemini import Gemini
query_engine = index.as_query_engine(llm=Gemini())
response = query_engine.query("What is Self Attention mechanism ")
response.response 

'Self-Attention is a mechanism that allows a neural network to attend to different parts of its own input sequence. This is done by calculating a weighted sum of the values in the input sequence, where the weights are determined by a query vector. The query vector is typically a learned linear projection of the input sequence. The Self-Attention mechanism can be used to model long-range dependencies in the input sequence, and it has been shown to be effective for a variety of sequence transduction tasks, such as machine translation and text summarization.'

In [10]:
response.metadata

{'66415da4-205b-421d-a573-c782a2685752': {'page_label': '15',
  'file_name': 'C:\\Users\\Admin\\AppData\\Local\\Temp\\tmpitebs3dk\\AttentionIsAllYouNeed.pdf',
  'file id': '1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ',
  'author': 'Aritra Ranjan Chowdhury',
  'file name': 'AttentionIsAllYouNeed.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-27T08:42:41.800Z',
  'modified at': '2024-02-22T04:55:21.000Z'},
 '159c0bcb-5ff9-4483-b3d6-f6edded98122': {'page_label': '15',
  'file_name': 'C:\\Users\\Admin\\AppData\\Local\\Temp\\tmp8cc9dlva\\AttentionIsAllYouNeed.pdf',
  'file id': '1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ',
  'author': 'Aritra Ranjan Chowdhury',
  'file name': 'AttentionIsAllYouNeed.pdf',
  'mime type': 'application/pdf',
  'created at': '2024-02-27T08:42:41.800Z',
  'modified at': '2024-02-22T04:55:21.000Z',
  'document_title': 'The Transformer: A Novel Neural Network Architecture for Machine Translation'}}

### Meta Data Extraction 

In [6]:
from llama_index.readers.google import GoogleDriveReader

drive_loader = GoogleDriveReader(credentials_path=GOOGLE_CREDENTIALS_PATH,
                token_path='WebService/llama_index_drive_loader/token.json',
                pydrive_creds_path='WebService/llama_index_drive_loader/creds.txt'
                )
docs = drive_loader.load_data(file_ids = ['1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ'])

In [5]:
import nest_asyncio

nest_asyncio.apply()

## More Experiments With Extractor 

In [None]:
docs

[Document(id_='1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ', embedding=None, metadata={'page_label': '1', 'file_name': 'C:\\Users\\Admin\\AppData\\Local\\Temp\\tmpe1_d8rz1\\AttentionIsAllYouNeed.pdf', 'file id': '1Q5rLWK85eSYMS6mSh9gA9WZegIJa3EMJ', 'author': 'Aritra Ranjan Chowdhury', 'file name': 'AttentionIsAllYouNeed.pdf', 'mime type': 'application/pdf', 'created at': '2024-02-27T08:42:41.800Z', 'modified at': '2024-02-22T04:55:21.000Z'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.comNoam Shazeer∗\nGoogle Brai