# Scrape the data first and load the documents

In [None]:
%pip install nest_asyncio

In [18]:
import nest_asyncio
from langchain_community.document_loaders.sitemap import SitemapLoader
import os
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because SitemapLoader fetches pages
# asynchronously inside the notebook's own event loop — confirm.
nest_asyncio.apply()

In [7]:
# Point the loader at the site's sitemap and load the pages it lists into `docs`.
site = "https://jobs.excelcult.com/wp-sitemap.xml"
sitemap_loader = SitemapLoader(web_path=site)
docs = sitemap_loader.load()

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  4.83it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.30it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  4.54it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  5.15it/s]
Fetching pages: 100%|##########| 1/1 [00:00<00:00,  4.86it/s]
Fetching pages: 100%|##########| 242/242 [00:24<00:00,  9.90it/s]


## chunking the data 

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [16]:
# Split the loaded pages into chunks: chunk_size=1000 and chunk_overlap=20,
# with lengths measured by the built-in `len`.
text_split = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
)
data = text_split.split_documents(docs)
len(data)

881

## embedding the data 

In [19]:
from getpass import getpass

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

# Never store access tokens in the notebook: take the Hugging Face token from
# the environment and only prompt for it (without echo) when it is not set.
# NOTE(review): any token that was ever committed in this cell must be treated
# as leaked and revoked in the Hugging Face account settings.
if not os.environ.get('HUGGINGFACEHUB_API_KEY'):
    os.environ['HUGGINGFACEHUB_API_KEY'] = getpass('Hugging Face API token: ')

In [117]:
# Embedding model shared by the vector stores below (documents and queries).
embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-V2")

In [105]:
# Size of the vectors the embedding model produces; reused later as the
# `dimension` of the Pinecone index.
dim = embedding.client.get_sentence_embedding_dimension()
dim

384

## vector store pinecone

In [110]:
from getpass import getpass

from langchain.vectorstores import Pinecone
from pinecone import Pinecone as pc, PodSpec

# Read the Pinecone API key from the environment, prompting (without echo) only
# when it is missing, instead of hardcoding it in the notebook.
# NOTE(review): any key that was ever committed in this cell must be treated
# as leaked and rotated in the Pinecone console.
# `key` stays a notebook-level variable because later cells call pc(api_key=key).
key = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')
os.environ['PINECONE_API_KEY'] = key
index_name = 'web-chat'

pc_config = pc(api_key=key)

# Recreate only the index this notebook owns. Deleting every index returned by
# list_indexes() would wipe unrelated data in the same Pinecone project, and a
# blanket `except` would hide real failures (bad key, network errors).
# NOTE(review): if the plan allows a single index and another one exists under
# a different name, create_index below will fail — remove that index deliberately.
if index_name in pc_config.list_indexes().names():
    pc_config.delete_index(index_name)

pc_config.create_index(
    index_name,
    dimension=dim,  # must match the embedding model's vector size
    metric='dotproduct',
    spec=PodSpec(environment='gcp-starter'),
)

# Embed the chunks and upload them to the freshly created index.
index = Pinecone.from_documents(data, embedding, index_name=index_name)

In [111]:
# Retrieval, variant 1: open the existing index with the Pinecone client and
# wrap it in the LangChain vector store, using `embed_query` to embed queries.
from langchain.vectorstores import Pinecone
from pinecone import Pinecone as pc

pc_config = pc(api_key=key)
index = pc_config.Index(name=index_name)
vc = Pinecone(index, embedding.embed_query, text_key="text")
vc.similarity_search("tcs", 5)



[Document(page_content='TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply || Share\n\n\n\n\nBecome RPA Developer\nBecome RPA Business Analyst\nPractice RPA\nRPA Interview Preparation\n \n\n\n\n\n\n\n\n\n\n\n\n\n\nTCS Hiring RPA Blueprism Developers\n\n\n\n\n\n\n\n\n\nHome Blueprism TCS Hiring RPA Blueprism Developers', metadata={'loc': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'source': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/'}),
 Document(page_content='TCS Hiring for RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                       

In [116]:
# Retrieval, variant 2: let the LangChain wrapper attach to the existing index
# by name instead of building the client objects by hand.
index_2 = Pinecone.from_existing_index(index_name, embedding)
index_2.similarity_search("tcs", 6)

[Document(page_content='TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply || Share\n\n\n\n\nBecome RPA Developer\nBecome RPA Business Analyst\nPractice RPA\nRPA Interview Preparation\n \n\n\n\n\n\n\n\n\n\n\n\n\n\nTCS Hiring RPA Blueprism Developers\n\n\n\n\n\n\n\n\n\nHome Blueprism TCS Hiring RPA Blueprism Developers', metadata={'loc': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'source': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/'}),
 Document(page_content='TCS Hiring for RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                       

In [59]:
from langchain.vectorstores.faiss import FAISS

# Build a FAISS store from the same chunks and run the same query for comparison.
db = FAISS.from_documents(data, embedding)
db.similarity_search("tcs", k=5)

[Document(page_content='TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply || Share\n\n\n\n\nBecome RPA Developer\nBecome RPA Business Analyst\nPractice RPA\nRPA Interview Preparation\n \n\n\n\n\n\n\n\n\n\n\n\n\n\nTCS Hiring RPA Blueprism Developers\n\n\n\n\n\n\n\n\n\nHome Blueprism TCS Hiring RPA Blueprism Developers', metadata={'source': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'loc': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'text': 'TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply |

In [60]:
from langchain.vectorstores.chroma import Chroma

# Build a Chroma store from the same chunks and query it through the
# retriever interface, asking for 5 results.
db = Chroma.from_documents(data, embedding)
retriever = db.as_retriever(search_kwargs={"k": 5})
retriever.get_relevant_documents("tcs")

[Document(page_content='TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply || Share\n\n\n\n\nBecome RPA Developer\nBecome RPA Business Analyst\nPractice RPA\nRPA Interview Preparation\n \n\n\n\n\n\n\n\n\n\n\n\n\n\nTCS Hiring RPA Blueprism Developers\n\n\n\n\n\n\n\n\n\nHome Blueprism TCS Hiring RPA Blueprism Developers', metadata={'loc': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'source': 'https://jobs.excelcult.com/tcs-hiring-rpa-blueprism-developers/', 'text': 'TCS Hiring RPA Blueprism Developers – ExcelCult\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n                                            ExcelCult                                            Learn || Apply |

## Retrieving the documents

In [52]:
def get_similar_docs(index, query, k):
    """Return the result of `index.similarity_search(query, k)`.

    Args:
        index: Any object exposing a `similarity_search(query, k)` method.
        query: Value passed through as the search query.
        k: Value passed through as the second positional argument
           (the number of results requested in this notebook).

    Returns:
        Whatever `index.similarity_search` returns, unchanged.
    """
    return index.similarity_search(query, k)

In [115]:

# Retrieval demo: fetch the top-k chunks for a query and print, for each hit,
# its source URL, the first line of its text, and the full document.
from langchain.vectorstores import Pinecone
from pinecone import Pinecone as pc

query = "wipro"
k = 3
pc_config = pc(api_key=key)
index = pc_config.Index(name=index_name)
vc = Pinecone(index, embedding.embed_query, text_key="text")
# NOTE(review): this rebinds `docs`, which earlier in the notebook holds the
# pages loaded from the sitemap; re-running the chunking cell afterwards would
# split these search hits instead.
docs = get_similar_docs(vc, query, k)


for doc in docs:
    print(doc.metadata["loc"])
    first_line = doc.page_content.split("\n")[0]
    print(first_line)
    print(doc)



https://jobs.excelcult.com/wipro-hiring-for-rpa-blueprism-developers/
Apply At:
page_content='Apply At:\nsunetra.gumaste@wipro.com' metadata={'loc': 'https://jobs.excelcult.com/wipro-hiring-for-rpa-blueprism-developers/', 'source': 'https://jobs.excelcult.com/wipro-hiring-for-rpa-blueprism-developers/'}
https://jobs.excelcult.com/wipro-hiring-rpa-uipath-senior-developer-lead/
2 Replies to “Wipro hiring RPA UiPath Senior Developer & Lead”		
page_content='2 Replies to “Wipro hiring RPA UiPath Senior Developer & Lead”\t\t\n\n\n\n\n\n Govardhan Reddy says: \n\nJanuary 10, 2021 at 5:11 am' metadata={'loc': 'https://jobs.excelcult.com/wipro-hiring-rpa-uipath-senior-developer-lead/', 'source': 'https://jobs.excelcult.com/wipro-hiring-rpa-uipath-senior-developer-lead/'}
https://jobs.excelcult.com/wipro-hiring-for-rpa-uipath-technical-architect/
Apply At:
page_content='Apply At:\nmoumita.pandey@wipro.com' metadata={'loc': 'https://jobs.excelcult.com/wipro-hiring-for-rpa-uipath-technical-archite

https://jobs.excelcult.com/wipro-hiring-for-rpa-blueprism-developers/
Apply At:
https://jobs.excelcult.com/wipro-hiring-rpa-uipath-senior-developer-lead/
2 Replies to “Wipro hiring RPA UiPath Senior Developer & Lead”		
https://jobs.excelcult.com/wipro-hiring-for-rpa-uipath-technical-architect/
Apply At:
