# LangChain

In [None]:
import os
import openai
import sys

In [None]:
from dotenv import load_dotenv, find_dotenv

# Look up the local .env file with find_dotenv() and read it before any
# credentials are fetched from the environment.
load_dotenv(find_dotenv())

# The API key is mandatory: indexing os.environ raises KeyError when the
# variable is missing. The organisation is optional: os.getenv returns None
# when it is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]
openai.organization = os.getenv("OPENAI_ORGANIZATION")

In [None]:
from langchain.vectorstores import SKLearnVectorStore
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

import tempfile

In [None]:
# Build LangChain documents from the files matching *.txt in data/movies/.
loader = DirectoryLoader(
    "data/movies/",
    glob="*.txt",
    show_progress=True,
)
docs = loader.load()

# Trailing expression: the notebook echoes how many documents were loaded.
len(docs)

In [None]:
# Splitter configuration: chunk_size=300 with chunk_overlap=40. Lengths are
# measured with len, i.e. in characters. The chunk size is deliberately tiny,
# purely for demonstration.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    length_function=len,
    chunk_overlap=40,
    chunk_size=300,
)

In [None]:
# Embedding model for the vector index: the "all-MiniLM-L6-v2" model,
# accessed through LangChain's HuggingFaceEmbeddings wrapper.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [None]:
# File path handed to the vector store as its persistence location, placed in
# the system temp directory.
# NOTE(review): the name "union.parquet" does not obviously relate to the
# movie corpus loaded from data/movies/ — confirm it is intended and that no
# stale file of that name from an earlier run can be picked up.
persist_path = os.path.join(
    tempfile.gettempdir(),
    "union.parquet",
)

# scikit-learn backed vector store using the embeddings defined earlier and
# the parquet serializer.
vector_store = SKLearnVectorStore(
    serializer="parquet",
    persist_path=persist_path,
    embedding=embeddings,
)

In [None]:
# Wire the pieces together: the vector store, an in-memory document store,
# and the text splitter in the child_splitter role. No parent_splitter is
# passed.
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    child_splitter=text_splitter,
    docstore=store,
    vectorstore=vector_store,
)

In [None]:
# Feed the loaded documents to the retriever; no explicit ids are supplied.
retriever.add_documents(docs, ids=None)

In [None]:
# Example query call (uncomment and replace "<query>" with the search text):
# retrieved_docs = retriever.get_relevant_documents("<query>")

## TO-DO: Create a document search system by combining the above elements

Then, run a search for each of the following queries:

1. space
2. what movie was based on magicians?
3. i want to see a historical world war 2 movie


Finally, summarise the movie returned for the second query using the `gpt-3.5-turbo` model. Use LangChain to create a summarisation pipeline; see the ["Stuff" chain here](https://python.langchain.com/docs/use_cases/summarization).