# **LangChain Decoded**

## Getting Started

In [None]:
# Install the LangChain package
!pip install langchain

In [None]:
# Install the OpenAI package
!pip install openai

In [None]:
# Configure the API key: take it from the OPENAI_API_KEY environment
# variable, falling back to the 'sk-XXX' placeholder when it is not set.
import os

openai_api_key = os.getenv("OPENAI_API_KEY", "sk-XXX")

## Part 4: Indexes

### Document Loaders

In [None]:
!pip install unstructured tabulate pdf2image pytesseract

In [None]:
# URL Loader
# Load the pages listed in `urls` and print whatever the loader returns.
from langchain.document_loaders import UnstructuredURLLoader

urls = [
    "https://alphasec.io/summarize-text-with-langchain-and-openai",
]

loader = UnstructuredURLLoader(urls=urls)
data = loader.load()

print(data)

In [None]:
!pip install pypdf

In [None]:
# PDF Loader
# Load a PDF from the local data folder, split it, and display the first
# element of the result as the cell output.
from langchain.document_loaders import PyPDFLoader

pdf_path = "./data/attention-is-all-you-need.pdf"

loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()

pages[0]

In [None]:
# File Directory Loader
# Load every file under `data` matching the recursive *.csv glob and
# display how many documents came back.
from langchain.document_loaders import DirectoryLoader

csv_glob = "**/*.csv"

loader = DirectoryLoader("data", glob=csv_glob)
docs = loader.load()

len(docs)

In [None]:
!pip install pytube youtube-transcript-api

In [None]:
# YouTube Transcripts Loader
# Build a loader from a video URL (with video info requested) and print
# what it loads.
from langchain.document_loaders import YoutubeLoader

video_url = "https://www.youtube.com/watch?v=yEgHrxvLsz0"

loader = YoutubeLoader.from_youtube_url(video_url, add_video_info=True)
data = loader.load()

print(data)

In [None]:
!pip install google-cloud-storage

In [None]:
# Google Cloud Storage File Loader
# Load a single blob from a bucket and print what the loader returns.
from langchain.document_loaders import GCSFileLoader

loader = GCSFileLoader(
    project_name="langchain-gcs",
    bucket="langchain-gcs",
    blob="lorem-ipsum.txt",
)
data = loader.load()

print(data)

### Text Splitters

In [None]:
# Character Text Splitter
# Upload a UTF-8 text file through the Colab file picker, split it on blank
# lines, and preview the first few resulting chunks.
from langchain.text_splitter import CharacterTextSplitter
from google.colab import files

# files.upload() returns a mapping of filename -> raw bytes; take the first
# uploaded file.
uploaded = files.upload()
filename = next(iter(uploaded))
text = uploaded[filename].decode("utf-8")

text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,       # measured with length_function, i.e. characters
    chunk_overlap=200,
    length_function=len,
)

texts = text_splitter.create_documents([text])

# Preview at most the first three chunks. A short upload can yield fewer
# than three, and indexing texts[2] directly would raise IndexError.
for chunk in texts[:3]:
    print(chunk)

In [None]:
# Recursive Character Text Splitter
# Upload a UTF-8 text file through the Colab file picker, split it with the
# recursive splitter, and preview the first few resulting chunks.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from google.colab import files

# files.upload() returns a mapping of filename -> raw bytes; take the first
# uploaded file.
uploaded = files.upload()
filename = next(iter(uploaded))
text = uploaded[filename].decode("utf-8")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,        # measured with length_function, i.e. characters
    chunk_overlap=20,
    length_function=len,
)

texts = text_splitter.create_documents([text])

# Preview at most the first three chunks. A short upload can yield fewer
# than three, and indexing texts[2] directly would raise IndexError.
for chunk in texts[:3]:
    print(chunk)

### Vector Stores

In [None]:
!pip install chromadb tiktoken

In [None]:
# Chroma Vector Store
# Upload a text file, split it into chunks, embed the chunks with OpenAI,
# index them in Chroma, and run a similarity search against the index.
import os
# tiktoken is not referenced below.
# NOTE(review): presumably imported so a missing dependency fails early - confirm.
import tiktoken

from google.colab import files
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

OPENAI_API_KEY = '' # @param {type:"string"}
# Only override the environment when the form field is filled in. Writing
# the empty default would clobber a key that is already configured.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# files.upload() returns a mapping keyed by filename; the first key is passed
# to TextLoader as the path of the file to load.
uploaded = files.upload()
filename = next(iter(uploaded))

loader = TextLoader(filename)
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(docs, embeddings)

# `docs` is rebound here from the split chunks to the search results.
query = "What comes after 'Vestibulum congue convallis finibus'?"
docs = db.similarity_search(query)

print(docs[0].page_content)

### Retrievers

In [None]:
!pip install arxiv pymupdf

In [None]:
# Arxiv Retriever
# Query the retriever (configured with load_max_docs=2) and display the
# metadata of the first document returned.
from langchain.retrievers import ArxivRetriever

arxiv_query = "2203.15556"

retriever = ArxivRetriever(load_max_docs=2)
docs = retriever.get_relevant_documents(query=arxiv_query)

docs[0].metadata

In [None]:
!pip install wikipedia

In [None]:
# Wikipedia Retriever
# Query the retriever with a topic and display the metadata of the first
# document returned.
from langchain.retrievers import WikipediaRetriever

topic = "large language models"

retriever = WikipediaRetriever()
docs = retriever.get_relevant_documents(query=topic)

docs[0].metadata

In [None]:
!pip install chromadb tiktoken

In [None]:
# Chroma Vector Store Retriever
# Upload a text file, split it into chunks, embed the chunks with OpenAI,
# index them in Chroma, and query the index through its retriever interface.
import os
# tiktoken is not referenced below.
# NOTE(review): presumably imported so a missing dependency fails early - confirm.
import tiktoken

from google.colab import files
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

OPENAI_API_KEY = '' # @param {type:"string"}
# Only override the environment when the form field is filled in. Writing
# the empty default would clobber a key that is already configured.
if OPENAI_API_KEY:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# files.upload() returns a mapping keyed by filename; the first key is passed
# to TextLoader as the path of the file to load.
uploaded = files.upload()
filename = next(iter(uploaded))

loader = TextLoader(filename)
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)

embeddings = OpenAIEmbeddings()
db = Chroma.from_documents(docs, embeddings)

# `docs` is rebound here from the split chunks to the retrieved documents.
retriever = db.as_retriever()
query = "What comes after 'Vestibulum congue convallis finibus'?"
docs = retriever.get_relevant_documents(query)

print(docs[0].page_content)