In [5]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

import numpy as np

In [2]:
# Load the PDF files found under ./docs into LangChain documents.
loader = PyPDFDirectoryLoader("./docs")
documents = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1000 with chunk_overlap=200 shared between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
final_documents = text_splitter.split_documents(documents)

In [3]:
# Number of chunks produced by the splitter.
len(final_documents)

326

In [4]:
# Embedding model used for both indexing and querying: BGE-small (English),
# requested on CPU, with normalize_embeddings=True forwarded to the encoder.
hgface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Sanity check: show the raw text of one chunk (index 10).
final_documents[10].page_content

'easily? Those customers might be internal or external – the key is to think beyond simply offering data \nsources, and expecting users to adapt or compromise the way they work to use it.\nUnfortunately, there’s no silver bullet here. It takes time to understand your customers and their goals, \nand involves real-world testing and constant refinement. And once you’ve solved that for one group of \ncustomers, how do you scale and expand this? Can you make those products reusable, satisfying the \nneeds of a broader range of customers?\nAt Thoughtworks, we have adapted the Double-Diamond design process model to make sure that we \nbuild the right thing and build it right . This starts with identifying what a customer needs. We use a \nstructured discovery and inception process to uncover these requirements for any new data product. \nWe then apply a set of well-understood practices and tools that are known to deliver high-quality \nsoftware and data.'

In [13]:
# Embed the sample chunk and display the length of the resulting vector.
sample_text = final_documents[10].page_content
sample_embedding = np.array(hgface_embeddings.embed_query(sample_text))
len(sample_embedding)

384

In [14]:
# Build a FAISS index from the first 120 chunks only.
# NOTE(review): the recorded run produced 326 chunks, so chunks past index 119
# are never indexed and cannot be retrieved -- confirm this cap is intentional.
vector_store = FAISS.from_documents(
    documents=final_documents[:120],
    embedding=hgface_embeddings,
)

In [15]:
# Question used to probe the vector store.
query = 'what are the skills for data engineering?'

In [16]:
# Fetch the chunks most similar to the query. No `k` is passed, so the
# library default applies (the recorded run returned 4 results).
relevant_document = vector_store.similarity_search(query)

In [24]:
# How many chunks the similarity search returned.
len(relevant_document)

4

In [25]:
# Print the text of every retrieved chunk, in the order the search returned them.
for retrieved_doc in relevant_document:
    print(retrieved_doc.page_content)

Contents
I Introduction 10
1 How To Use This Cookbook 11
2 Data Engineer vs Data Scientists 12
2.1 Data Scientist . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 12
2.2 Data Engineer . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 13
2.3 Who Companies Need . . . . . . . . . . . . . . . . . . . . . . . . . . . . 14
II Basic Data Engineering Skills 16
3 Learn To Code 17
4 Get Familiar With Git 18
5 Agile Development – available 19
5.1 Why is agile so important? . . . . . . . . . . . . . . . . . . . . . . . . . . 19
5.2 Agile rules I learned over the years – available . . . . . . . . . . . . . . . 20
5.2.1 Is the method making a diﬀerence? . . . . . . . . . . . . . . . . . 20
5.2.2 The problem with outsourcing . . . . . . . . . . . . . . . . . . . . 20
5.2.3 Knowledge is king: A lesson from Elon Musk . . . . . . . . . . . . 21
5.2.4 How you really can be agile . . . . . . . . . . . . . . . . . . . . . 21
Modern data 
engineering 
playbook
Modern data en

In [26]:
# Wrap the index as a retriever that uses similarity search with k=3.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [32]:
import os
# Read the Hugging Face API token from the environment rather than hardcoding it.
# Raises KeyError if HUGGING_FACE_API_KEY is not set.
hf_api_key = os.environ['HUGGING_FACE_API_KEY']

In [None]:
from langchain_community.llms import HuggingFaceHub

# LLM accessed through the Hugging Face Hub, authenticated with the token
# read into hf_api_key; generation settings are passed via model_kwargs.
hf = HuggingFaceHub(
    repo_id="meta-llama/Meta-Llama-3-8B",
    model_kwargs=dict(temperature=0.1, max_length=500),
    huggingfacehub_api_token=hf_api_key,
)

In [None]:
# Send the question straight to the LLM; the retriever is not involved in this call.
# Note: this rebinds `query`, replacing the earlier search question.
query = 'what skills are needed to a data engineer?'
hf.invoke(query)

In [None]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

# Alternative LLM loaded through a transformers pipeline instead of the Hub client.
# Note: this rebinds `hf`, so later cells use this model.
# `task` is a required argument of from_model_id; calling without it raises
# TypeError before any model is loaded. Mistral-7B is a causal language model,
# so the matching pipeline task is 'text-generation'.
hf = HuggingFacePipeline.from_model_id(
    model_id='mistralai/Mistral-7B-v0.1',
    task='text-generation',
)