<a href="https://colab.research.google.com/github/barbaroja2000/llm/blob/main/Langchain_OpenAi_QA_%26_Summarize_Webpages%7CYoutube%7Cpdfs%7Ctxt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LangChain OpenAI - QA & Summarize Webpages|YouTube|PDFs|txt.ipynb

This Colab notebook provides a utility with the following functionality:

* Summarize Text
* Q&A on text

Text can be loaded from:

* Uploaded .pdf
* Transcribed YouTube video
* Web Page

Uses LangChain, FAISS (vector store), OpenAI & PromptTemplates

https://python.langchain.com/en/latest/index.html

https://python.langchain.com/en/latest/modules/models/llms/integrations/openai.html

https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/faiss.html

https://python.langchain.com/en/latest/modules/prompts/prompt_templates.html

To run this notebook, you will need an OpenAI API key available as an environment variable:

```
import os
os.environ.get("OPENAI_API_KEY")
```


In [None]:
#@title Load Keys
#@markdown Utility to load keys from fs, replace with environ vars if not using

import os

# Alternative when the keys are already exported as environment variables:
#os.environ.get("OPENAI_API_KEY")
#os.environ.get("HUGGINGFACE_API_KEY")

# python-dotenv provides the `dotenv` module imported below.
!python -m pip install python-dotenv
from google.colab import drive
# Mount Google Drive so the keys file stored there is readable from this runtime.
drive.mount('/content/drive/', force_remount=True)
import dotenv
# Load the variables defined in the keys file into os.environ.
dotenv.load_dotenv('/content/drive/MyDrive/keys/keys.env')

In [None]:
# Install the third-party packages used by this notebook; -q plus the redirect
# to /dev/null keeps pip's standard output out of the cell.
!pip install -q langchain openai pypdf tiktoken faiss-cpu unstructured youtube_transcript_api beautifulsoup4 > /dev/null

In [None]:
from langchain.document_loaders import PyPDFLoader, TextLoader

from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import os , requests
from typing import List, Dict
import glob
from langchain.chains.summarize import load_summarize_chain
from bs4 import BeautifulSoup

In [None]:
# @title  Download .pdf
# Fetch a sample PDF into the current directory; the commented lines are
# alternative documents that can be downloaded instead.
#!wget https://d1.awsstatic.com/whitepapers/building-a-cloud-operating-model.pdf
#!wget https://d1.awsstatic.com/whitepapers/architecture/AWS_Well-Architected_Framework.pdf
!wget https://docs.aws.amazon.com/pdfs/wellarchitected/latest/sustainability-pillar/wellarchitected-sustainability-pillar.pdf

In [None]:
# @title  SummarizeNQA Class 

class SummarizeNQA:
    """Summarize and answer questions over ``.pdf`` / ``.txt`` files.

    Typical flow: optionally fetch text with :meth:`request` (web page) or
    :meth:`youtube` (video transcript), call :meth:`load` to split the files
    found in ``dir`` and index them in a FAISS vector store, then call
    :meth:`summarize` and/or :meth:`qa`.
    """

    def __init__(self, key: str, dir: str) -> None:
        """Store the API key and the directory whose files will be indexed.

        Args:
            key: OpenAI API key handed to the embedding and LLM clients.
            dir: Existing directory scanned by :meth:`load`.

        Raises:
            ValueError: If ``key`` is empty or ``dir`` is empty or not an
                existing directory.
        """
        if not key:
            raise ValueError("API key must be provided.")
        if not dir or not os.path.isdir(dir):
            raise ValueError("Directory must be provided.")
        self.key = key
        self.dir = dir
        self.db = None  # FAISS index; populated by load()
        self.docs = None  # split documents; populated by load()

    @staticmethod
    def _file_name_from_url(url: str) -> str:
        """Return the last non-empty ``/``-separated segment of ``url``.

        Trailing slashes are ignored so that ``https://host/page/`` yields
        ``page`` rather than an empty name.
        """
        return url.rstrip("/").split("/")[-1]

    @staticmethod
    def request(url: str) -> None:
        """Download a web page and save its visible text as a ``.txt`` file.

        The file is named after the last path segment of ``url`` and is
        written relative to the current working directory, so :meth:`load`
        only picks it up when the instance's ``dir`` is that directory.

        Args:
            url: Address of the page to fetch.

        Raises:
            ValueError: If the response status code is not 200.
        """
        page = requests.get(url)
        if page.status_code != 200:
            raise ValueError("Must pass valid url")

        soup = BeautifulSoup(page.content, "html.parser")

        # Drop elements that carry no readable body text before extracting it.
        for element in soup(['style', 'script', '[document]', 'head', 'title']):
            element.extract()
        text_formatted = soup.getText()

        file_name = SummarizeNQA._file_name_from_url(url)
        with open(f'{file_name}.txt', 'w') as f:
            f.write(text_formatted)

    @staticmethod
    def _check_video_url(video_id: str) -> bool:
        """Return True when the watch page for ``video_id`` answers HTTP 200."""
        checker_url = f"https://www.youtube.com/watch?v={video_id}"
        response = requests.get(checker_url)
        return response.status_code == 200

    def youtube(self, id: str) -> None:
        """Save the transcript of a YouTube video as ``<id>.txt``.

        The file is written relative to the current working directory.

        Args:
            id: YouTube video identifier (the ``v=`` value of a watch URL).

        Raises:
            ValueError: If ``id`` is empty or its watch page does not answer
                HTTP 200.
        """
        if not id or not self._check_video_url(id):
            raise ValueError("Must pass valid youtube ID")

        transcript = YouTubeTranscriptApi.get_transcript(id)
        formatter = TextFormatter()
        text_formatted = formatter.format_transcript(transcript)
        with open(f'{id}.txt', 'w') as f:
            f.write(text_formatted)

    def load(self, chunk_size: int = 2000, chunk_overlap: int = 200) -> None:
        """Load, split and index every ``.pdf`` and ``.txt`` file in ``dir``.

        Populates ``self.docs`` with the split documents and ``self.db`` with
        a FAISS index built from them.

        Args:
            chunk_size: Chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.

        Raises:
            ValueError: If the directory holds no files, or none of them is a
                ``.pdf`` or ``.txt`` file.
        """
        # os.path.join keeps the patterns correct whether or not ``dir`` ends
        # with a path separator.
        if not glob.glob(os.path.join(self.dir, "*.*")):
            raise ValueError("Directory must contain at least one file.")

        has_pdf = bool(glob.glob(os.path.join(self.dir, "*.pdf")))
        has_txt = bool(glob.glob(os.path.join(self.dir, "*.txt")))
        if not (has_pdf or has_txt):
            # Fail early instead of handing an empty document list to FAISS.
            raise ValueError(
                "Directory must contain at least one .pdf or .txt file."
            )

        documents = []
        # Root the loader at ``dir`` and use a bare pattern; a directory-
        # prefixed pattern breaks for absolute directories.
        if has_pdf:
            loader = DirectoryLoader(
                self.dir, glob="*.pdf", loader_cls=PyPDFLoader
            )
            documents = [*loader.load(), *documents]

        if has_txt:
            loader = DirectoryLoader(
                self.dir, glob="*.txt", loader_cls=TextLoader
            )
            documents = [*loader.load(), *documents]

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        self.docs = text_splitter.split_documents(documents)
        embeddings = OpenAIEmbeddings(openai_api_key=self.key)
        self.db = FAISS.from_documents(self.docs, embeddings)

    def summarize(self, max_tokens: int = 1000, chain_type: str = 'map_reduce'):
        """Return a bullet-point summary of the loaded documents.

        Args:
            max_tokens: Completion token limit given to the OpenAI LLM.
            chain_type: Summarize chain to build, ``"map_reduce"`` by default.

        Returns:
            The output of running the summarize chain over ``self.docs``.

        Raises:
            ValueError: If :meth:`load` has not been called yet.
        """
        if self.db is None:
            raise ValueError("Load first")

        # The whitespace inside both templates is part of the prompt text.
        map_prompt = """
                Write a  summary of the following:
                "{text}"
                 SUMMARY:
                """
        map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

        combine_prompt = """
      Write a  summary of the following text delimited by triple backquotes.
      Return your response in bullet points which covers the key points of the text.
      ```{text}```
      BULLET POINT  SUMMARY:
      """
        combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

        llm = OpenAI(temperature=0, openai_api_key=self.key, max_tokens=max_tokens)

        # Only the map_reduce chain takes map/combine prompts and the
        # intermediate-steps flag; passing them to other chain types fails.
        # NOTE(review): keyword names follow langchain's load_summarize_chain
        # loaders -- confirm against the installed langchain version.
        if chain_type == 'map_reduce':
            chain_kwargs = {
                "map_prompt": map_prompt_template,
                "combine_prompt": combine_prompt_template,
                "return_intermediate_steps": False,
            }
        elif chain_type == 'stuff':
            chain_kwargs = {"prompt": combine_prompt_template}
        else:
            # Other chain types (e.g. refine) use the library's default prompts.
            chain_kwargs = {}

        summary_chain = load_summarize_chain(
            llm=llm,
            chain_type=chain_type,
            verbose=False,
            **chain_kwargs,
        )

        return summary_chain.run(self.docs)

    def qa(
        self,
        query: str,
        temperature: float = 0,
        count: int = 4,
        chain_type: str = "stuff", #map_reduce, #refine
        return_only_outputs: bool = True,
        return_intermediate_steps: bool = False,
    ) -> Dict:
        """Answer ``query`` using the most similar indexed chunks.

        Args:
            query: Question to answer.
            temperature: Sampling temperature for the OpenAI LLM.
            count: Number of similar chunks retrieved from the index.
            chain_type: QA-with-sources chain type (``"stuff"``,
                ``"map_reduce"`` or ``"refine"``).
            return_only_outputs: Forwarded to the chain call.
            return_intermediate_steps: Forwarded to the chain loader for every
                chain type except ``"stuff"``.

        Returns:
            The mapping returned by the chain call.

        Raises:
            ValueError: If :meth:`load` has not been called yet.
        """
        if self.db is None:
            raise ValueError("Load first")

        docs = self.db.similarity_search(query, k=count)
        llm = OpenAI(temperature=temperature, openai_api_key=self.key)

        if chain_type == "stuff":
            chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
        else:
            # The result must be bound to ``chain``; otherwise the call below
            # fails with an unbound local for non-"stuff" chain types.
            chain = load_qa_with_sources_chain(
                llm,
                chain_type=chain_type,
                return_intermediate_steps=return_intermediate_steps,
            )
        return chain(
            {"input_documents": docs, "question": query},
            return_only_outputs=return_only_outputs,
        )

In [None]:
# @title  Set Up
# Build the helper from the OPENAI_API_KEY environment variable, using the
# current directory as the folder whose files get indexed.
summarize_and_qa = SummarizeNQA(os.environ.get("OPENAI_API_KEY"),"./")

In [None]:
# @title Transcribe a youtube video
#https://www.youtube.com/watch?v=PdE-waSx-d8
# Writes the transcript to PdE-waSx-d8.txt in the working directory.
summarize_and_qa.youtube("PdE-waSx-d8")

In [None]:
# @title Load text from a url
# Saves the page's text as green-energy.txt (named after the last URL segment).
summarize_and_qa.request("https://energysavingtrust.org.uk/case-study/green-energy")

In [None]:
# @title Load the texts into documents and index
# Uses the default chunk_size=2000 and chunk_overlap=200.
summarize_and_qa.load()

In [None]:
# @title Summarize
# @markdown ```Depending on the length of the text, you may have to reduce the max_token parameter```
# max_tokens is passed to the OpenAI LLM that writes the summary.
summary = summarize_and_qa.summarize(max_tokens=1000)
# Bare expression so the notebook displays the returned value.
summary

In [None]:
# Print the summary as plain text rather than as the cell's displayed value.
print(summary)

In [None]:
# @title QA
# Ask a question against the indexed documents with the default settings
# ("stuff" chain, 4 retrieved chunks).
response = summarize_and_qa.qa("Explain the ideas in computational irreducibility")
# The answer text is stored under the "output_text" key of the chain result.
response["output_text"]