In [1]:
from bs4 import BeautifulSoup
from crawler_llama_index.ingest import Parser, ParseRecord

class thisParser(Parser):
    """Parser for pages that mark their body with ``<main property="mainContentOfPage">``."""

    def parse(self, url, html) -> ParseRecord:
        """Build a ParseRecord from one page of HTML.

        Args:
            url: Address of the page; passed through to the record unchanged.
            html: HTML markup of the page.

        Returns:
            A ParseRecord whose ``content`` is the text of the
            ``<main property="mainContentOfPage">`` element, or ``None`` when the
            page has no such element, and whose ``title`` is the ``<title>``
            string, or ``""`` when the title is missing or has no single string.
        """
        soup = BeautifulSoup(html, 'html.parser')

        main_tag = soup.find('main', {'property': 'mainContentOfPage'})
        # Pages without the main-content element produce a record with no content.
        content = main_tag.get_text(separator=' ') if main_tag else None

        # Tag.string is None for an empty <title> or one with several children;
        # fall back to "" so the title is a string in every case, matching the
        # missing-<title> branch.
        title_tag = soup.title
        title = (title_tag.string or "") if title_tag else ""

        return ParseRecord(url=url, title=title, content=content)

In [2]:
# Identifier passed as `name=` to both the Crawler and the Indexer in later cells.
name = "IRCC"

In [None]:
from crawler_llama_index.crawler import Crawler

# Shared list: handed to the crawler here and to the CrawlReader in a later cell.
crawl_queue = list()

crawler = Crawler(
    name=name,
    seed="https://www.canada.ca/en/immigration-refugees-citizenship/services/immigrate-canada/express-entry/works.html",
    wait=2,
    # URL filter: accept only addresses under the IRCC section of canada.ca.
    to_be_crawled=lambda u: u.startswith("https://www.canada.ca/en/immigration-refugees-citizenship/"),
    crawl_queue=crawl_queue,
)

In [None]:
from crawler_llama_index.ingest import Ingestor, CrawlReader

# Connect the shared crawl queue and the page parser through a CrawlReader.
crawl_reader = CrawlReader(
    parser=thisParser(),
    crawl_queue=crawl_queue,
)

In [None]:
# Wrap the crawl reader in an Ingestor; a later cell passes this object to
# Indexer as its data_loader.
ingestor = Ingestor(reader=crawl_reader)

In [None]:
import logging

# Crawler logs at INFO; the ingestor and the crawl reader log at DEBUG.
for _component, _level in (
    (crawler, logging.INFO),
    (ingestor, logging.DEBUG),
    (crawl_reader, logging.DEBUG),
):
    _component.logger.setLevel(_level)

In [None]:
# `time` is used by the polling loop below and is not imported by any earlier
# visible cell, so import it here to keep the cell runnable on a fresh kernel.
import time

crawler.start()

try:
    # Poll the queue size rather than calling crawler.join(), so the crawl can
    # be cut off once 2000 entries have been queued.
    while len(crawl_queue) < 2000:
        time.sleep(0.1)
finally:
    # Always ask the crawler to stop, even when the wait is interrupted
    # (for example by a kernel interrupt).
    crawler.stop()

In [None]:
from crawler_llama_index.index import Indexer
# Indexer wired to the ingestor as its data source.
indexer = Indexer(
    data_loader=ingestor,
    name=name,
)
indexer.logger.setLevel(logging.INFO)

In [3]:
from crawler_llama_index.index import Indexer
import logging

# Rebinds `indexer`, this time without a data_loader. The log output recorded
# under this cell shows the Indexer loading a saved index from disk.
indexer = Indexer(name=name)
# INFO level makes the Indexer's own progress messages visible.
indexer.logger.setLevel(logging.INFO)


  from .autonotebook import tqdm as notebook_tqdm

09:40:19.851 [INFO    ] sentence_transformers.SentenceTransformer - Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5
09:40:19.853 [DEBUG   ]    urllib3.connectionpool - Starting new HTTPS connection (1): huggingface.co:443
09:40:19.981 [DEBUG   ]    urllib3.connectionpool - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/modules.json HTTP/1.1" 200 0
09:40:20.027 [DEBUG   ]    urllib3.connectionpool - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
09:40:20.078 [DEBUG   ]    urllib3.connectionpool - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/README.md HTTP/1.1" 200 0
09:40:20.119 [DEBUG   ]    urllib3.connectionpool - https://huggingface.co:443 "HEAD /BAAI/bge-large-en-v1.5/resolve/main/modules.json HTTP/1.1" 200 0
09:40:20.161 [DEBUG   ]    urllib3.connectionpool - https://huggingface.co:443 "HEAD /BAAI/bg

LLM is explicitly disabled. Using MockLLM.


09:40:22.662 [INFO    ]                   Indexer - try to load index
09:40:22.662 [DEBUG   ] llama_index.storage.kvstore.simple_kvstore - Loading llama_index.storage.kvstore.simple_kvstore from /home/behnam/workspace/omniscient/data/index/IRCC/docstore.json.
09:40:22.663 [DEBUG   ]              fsspec.local - open file: /home/behnam/workspace/omniscient/data/index/IRCC/docstore.json


/home/behnam/workspace/omniscient/data/index/IRCC/docstore.json


09:40:25.651 [DEBUG   ] llama_index.storage.kvstore.simple_kvstore - Loading llama_index.storage.kvstore.simple_kvstore from /home/behnam/workspace/omniscient/data/index/IRCC/index_store.json.
09:40:25.652 [DEBUG   ]              fsspec.local - open file: /home/behnam/workspace/omniscient/data/index/IRCC/index_store.json
09:40:25.673 [DEBUG   ] llama_index.graph_stores.simple - Loading llama_index.graph_stores.simple from /home/behnam/workspace/omniscient/data/index/IRCC/graph_store.json.
09:40:25.674 [DEBUG   ]              fsspec.local - open file: /home/behnam/workspace/omniscient/data/index/IRCC/graph_store.json
09:40:25.676 [DEBUG   ] llama_index.vector_stores.simple - Loading llama_index.vector_stores.simple from /home/behnam/workspace/omniscient/data/index/IRCC/image__vector_store.json.
09:40:25.676 [DEBUG   ]              fsspec.local - open file: /home/behnam/workspace/omniscient/data/index/IRCC/image__vector_store.json
09:40:25.677 [DEBUG   ] llama_index.vector_stores.simple 

/home/behnam/workspace/omniscient/data/index/IRCC/index_store.json


09:45:57.814 [INFO    ] llama_index.indices.loading - Loading all indices.
09:45:59.536 [INFO    ]                   Indexer - number of docs = 95234
09:45:59.536 [INFO    ]                   Indexer - finish loading index


In [4]:
# Sample question run against the loaded index; the call stays the cell's last
# expression so its result is still displayed.
question = "What is express entry?"
indexer.query(question)


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.91it/s]
09:46:09.544 [DEBUG   ] llama_index.indices.utils - > Top 2 nodes:
> [Node 4e22c360-c730-48e8-be7f-fe8cbc408e26] [Similarity score:             0.800698] Express Entry is an application management system for the Federal Skilled Workers Program, the Fe...
> [Node a85f42f1-0fab-4f34-8c7a-c44155eae834] [Similarity score:             0.792029] 
 
 
How Express Entry works 
 
 
 Express Entry is an online system that we use to manage immigr...
09:46:09.546 [INFO    ]                   Indexer - response = Context information is below.
---------------------
title: Express Entry Reports and Publications - Canada.ca

Express Entry is an application management system for the Federal Skilled Workers Program, the Federal Skilled Trades Program, the Canadian Experience Class and a portion of the Provincial Nominee Program.

title: How Express Entry works - Canada.ca


 
 
How Express Entry works 
 
 
 Express Entry is an online system that we 