In [1]:
!pip install langchain langchain-community faiss-cpu sentence-transformers pypdf newspaper3k beautifulsoup4

Collecting faiss-cpu
  Using cached faiss_cpu-1.10.0-cp310-cp310-win_amd64.whl.metadata (4.5 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting pypdf
  Using cached pypdf-5.4.0-py3-none-any.whl.metadata (7.3 kB)
Collecting newspaper3k
  Using cached newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.7.0-cp310-cp310-win_amd64.whl.metadata (29 kB)
Collecting scikit-learn (from sentence-transformers)
  Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.15.2-cp310-cp310-win_amd64.whl.metadata (60 kB)
Collecting cssselect>=0.9.2 (from newspaper3k)
  Using cached cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Col

In [3]:
import os

# Edge user-agent string
os.environ["USER_AGENT"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31"
)

In [4]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_community.llms import Ollama

In [5]:
!pip list

Package                                  Version
---------------------------------------- --------------
aiohappyeyeballs                         2.6.1
aiohttp                                  3.11.16
aiosignal                                1.3.2
annotated-types                          0.7.0
anyio                                    4.9.0
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
arrow                                    1.3.0
asgiref                                  3.8.1
asttokens                                3.0.0
async-lru                                2.0.5
async-timeout                            4.0.3
attrs                                    25.3.0
babel                                    2.17.0
backoff                                  2.2.1
bcrypt                                   4.3.0
beautifulsoup4                           4.13.3
bleach                                   6.2.0
build                                    1

In [9]:
#from langchain_community.document_loaders import WebBaseLoader
# Step 1: Load Web Page Content

# Set URL
url = "https://medium.com/techartifact-technology-learning/understanding-genai-in-depth-c820289ddeb3"  # Replace with any URL 

# Load web page content
loader = WebBaseLoader(url)
documents = loader.load()

# Check the loaded documents
print(f"Number of documents loaded: {len(documents)}")
print(f"First document content: {documents[0].page_content[:500]}")  # Print first 500 chars of the first document


Number of documents loaded: 1
First document content: Understanding GenAI Models in depth | by vinay kumar | Techartifact-Technology learning | MediumOpen in appSign upSign inWriteSign upSign inTechartifact-Technology learning·An experience of the journey of development →architecture → transformation leadershipUnderstanding GenAI Models in depthvinay kumarFollow6 min read·Dec 5, 2024--ListenShareGenerative AI is a subset of artificial intelligence that focuses on creating models capable of generating new content, such as text, images, music, etc. I


In [7]:
doc = documents[0]
print("Document metadata:", doc.metadata)
print("Content preview:", doc.page_content[:300])


Document metadata: {'source': 'https://medium.com/techartifact-technology-learning/understanding-genai-in-depth-c820289ddeb3', 'title': 'Understanding GenAI Models in depth | by vinay kumar | Techartifact-Technology learning | Medium', 'description': 'Generative AI is a subset of artificial intelligence that focuses on creating models capable of generating new content, such as text, images, music, etc. It refers to a class of artificial…', 'language': 'en'}
Content preview: Understanding GenAI Models in depth | by vinay kumar | Techartifact-Technology learning | MediumOpen in appSign upSign inWriteSign upSign inTechartifact-Technology learning·An experience of the journey of development →architecture → transformation leadershipUnderstanding GenAI Models in depthvinay k


In [16]:
# from langchain_text_splitters import CharacterTextSplitter
# Step 2: Split Text Into Chunks

# Split documents into chunks of size 1000 with an overlap of 30 characters
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=30, separator="\n")
docs = text_splitter.split_documents(documents=documents)

# Print out the number of chunks and the first chunk
print(f"Number of chunks: {len(docs)}")
print(f"First chunk content: {docs[0].page_content[:500]}")  # Print first 500 chars of the first chunk


Number of chunks: 1
First chunk content: Understanding GenAI Models in depth | by vinay kumar | Techartifact-Technology learning | MediumOpen in appSign upSign inWriteSign upSign inTechartifact-Technology learning·An experience of the journey of development →architecture → transformation leadershipUnderstanding GenAI Models in depthvinay kumarFollow6 min read·Dec 5, 2024--ListenShareGenerative AI is a subset of artificial intelligence that focuses on creating models capable of generating new content, such as text, images, music, etc. I


In [18]:
# from langchain_community.embeddings import OllamaEmbeddings
# create embeddings

# Use Ollama embeddings model
embeddings = OllamaEmbeddings(model="llama3.2")  # Use the model you prefer

# Create embeddings for the documents
embedded_docs = embeddings.embed_documents([doc.page_content for doc in docs])

# Print the first embedding (a numerical vector)
print(f"First document embedding (vector): {embedded_docs[0][:10]}...")  # Print first 10 values of the first embedding vector


First document embedding (vector): [-0.32377153635025024, 0.4590296149253845, 6.410435676574707, -2.1356775760650635, -0.22590740025043488, -0.08978856354951859, 0.1484341323375702, 1.1623140573501587, 1.0394806861877441, -1.6612409353256226]...


In [19]:
# from langchain_community.vectorstores import FAISS
#  Step 4: Store Embeddings in FAISS Vector Store

# Create a FAISS vector store from the embeddings
vectorstore = FAISS.from_documents(docs, embeddings)

# Save the vector store locally
vectorstore.save_local("faiss_index_react")

# Check if FAISS index has been saved
print("FAISS index saved.")


FAISS index saved.


In [20]:
# Step 5: Load FAISS Vector Store

# Load the saved FAISS index
new_vectorstore = FAISS.load_local("faiss_index_react", embeddings, allow_dangerous_deserialization=True)

# Confirm it's loaded
print("FAISS index loaded.")


FAISS index loaded.


In [23]:
# from langchain.chains.retrieval_qa.base import RetrievalQA
# from langchain_community.llms import Ollama
# Step 6: Set Up Retrieval and QA Chain

# Set up the Ollama model
llm = Ollama(model="llama3.2")  # Use your preferred Ollama model

# Create a Retrieval QA chain
retrieval_qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=new_vectorstore.as_retriever())

# Ask a sample question
question = "What do you mean by transformer-based models?"
response = retrieval_qa.run(question)

# Print the response
print(f"Answer to '{question}': {response}")


Answer to 'What do you mean by transformer-based models?': Transformer-based models are a type of generative AI model that revolutionize language processing, enabling advanced tasks such as translation, summarization, and natural language understanding. They primarily use an attention mechanism to process input sequences, focusing on relevant parts of the input regardless of their position, and processes data in parallel, making it more efficient than traditional sequential models (e.g., RNNs).
