In [116]:
import sys
import os

from Files.csvLoader import loadCSV
from Files.docsLoader import loadDOCS
from Files.htmlLoader import loadHTML
from Files.jsonLoader import loadJSON
from Files.mdLoader import loadMD
from Files.pdfLoader import loadPDF
from Files.txtLoader import loadTXT

from Websites.urlLoader import loadURL
from Websites.seleniumLoader import loadSELENIUM
from Websites.recursiveLoader import loadRECURSIVE

from Videos.youtubeLoader import loadYOUTUBE

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [117]:
# SECURITY: never hard-code API keys in a notebook. The key that used to be
# assigned on this line was stored in plain text and must be treated as
# compromised (revoke/rotate it). Supply the key through the environment
# instead, e.g. `export OPENAI_API_KEY=...` before starting Jupyter.
if not os.environ.get("OPENAI_API_KEY"):
    print("OPENAI_API_KEY is not set; provide it via the environment before running the OpenAI cells.")

# Accumulates everything returned by the file, website and video loaders below.
documents = []

In [118]:
def load_document(file_path: str):
    """Load one file with the loader that matches its extension.

    The extension is matched case-insensitively, so ``REPORT.PDF`` is handled
    the same way as ``report.pdf``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader returns, or ``None`` when the extension
        is not one of the supported ones or the loader raised an exception.
    """
    # (suffix, loader) pairs, checked in order; the first matching suffix wins.
    loaders = (
        (".pdf", loadPDF),
        (".docx", loadDOCS),
        (".txt", loadTXT),
        (".csv", loadCSV),
        (".md", loadMD),
        (".html", loadHTML),
        (".json", loadJSON),
    )

    lowered = file_path.lower()
    for suffix, loader in loaders:
        if lowered.endswith(suffix):
            try:
                # Pass the original path: only the comparison is case-folded.
                return loader(file_path)
            except Exception as e:
                # Best effort: report the failure and let the caller skip the file.
                print(f"Error while loading {file_path}: {e}")
                return None

    # Unsupported extension.
    return None

def docData(dir: str = "file/"):
    """Load every supported file found directly inside a directory.

    Args:
        dir: Directory to scan (entries are not visited recursively). The name
            shadows the ``dir`` builtin inside this function; it is kept so
            that existing keyword callers keep working.

    Returns:
        A flat list of everything ``load_document`` returned for the entries
        of ``dir``. List results are flattened into the output, any other
        truthy result is appended as a single item, and falsy results
        (unsupported or failed files) are skipped.
    """
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting makes the order
    # of the returned documents reproducible between runs and machines.
    for file in sorted(os.listdir(dir)):
        file_path = os.path.join(dir, file)
        data = load_document(file_path)
        if data:
            documents.extend(data if isinstance(data, list) else [data])

    # NOTE: printed unconditionally, even when some files were skipped above.
    print("Files loaded successfully\n")
    return documents

In [119]:
# Add everything loaded from the local "file" directory to the shared list.
documents += docData("file")

Files loaded successfully



In [120]:
def load_website(urls: list):
    """Load the given URLs with ``loadURL``.

    Args:
        urls: URLs to load.

    Returns:
        The result of ``loadURL(urls)``, or an empty list if it raised. An
        empty list (rather than an error string) keeps
        ``documents.extend(load_website(...))`` from appending the individual
        characters of an error message to the document list.
    """
    try:
        return loadURL(urls)
    except Exception as e:
        # Best effort: report the failure and contribute no documents.
        print(f"Error in URL Loader: {e}\n")
        return []

In [121]:
# Web pages to add to the document list.
urls = [
    "https://en.wikipedia.org/wiki/Lockheed_F-117_Nighthawk",
]
documents += load_website(urls)

In [122]:
# Sanity check: `documents` should still be a list after the extend calls above.
print(type(documents))

<class 'list'>


In [123]:
def load_videos(links: list):
    """Load the given video links with ``loadYOUTUBE``.

    Args:
        links: Video URLs to load.

    Returns:
        The result of ``loadYOUTUBE(links)``, or an empty list if it raised.
        An empty list (rather than an error string) keeps
        ``documents.extend(load_videos(...))`` from appending the individual
        characters of an error message to the document list.
    """
    try:
        return loadYOUTUBE(links)
    except Exception as e:
        # Best effort: report the failure and contribute no documents.
        print(f"Error in Youtube Loader: {e}\n")
        return []

In [124]:
# Videos to add to the document list.
links = [
    "https://www.youtube.com/watch?v=lK8gYGg0dkE",
]
documents += load_videos(links)

Using the following model:  openai/whisper-large
[youtube] Extracting URL: https://www.youtube.com/watch?v=lK8gYGg0dkE
[youtube] lK8gYGg0dkE: Downloading webpage
[youtube] lK8gYGg0dkE: Downloading ios player API JSON
[youtube] lK8gYGg0dkE: Downloading android player API JSON
[youtube] lK8gYGg0dkE: Downloading m3u8 information
[info] lK8gYGg0dkE: Downloading 1 format(s): 140
[download] /root/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a has already been downloaded
[download] 100% of    4.44MiB
[ExtractAudio] Not converting audio /root/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a; file is already in target format m4a
Transcribing part /root/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a!


In [125]:
# Spot-check one loaded document. NOTE(review): the index is hard-coded, and the
# rendered output contains personal names and ID numbers — clear the output
# before sharing the notebook.
documents[8]

Document(page_content="Page 4 of 4 \n Selected Executive Members for the Batch 2023 – 2024:  \nS. No. Name  ID Number  School  Position  \n1. AYUSH SINGH RATHORE  AP21110010570  SEAS  CONVENER  \n2. M. AHMAD RAZA KHAN  AP22110010453  SEAS  CO-CONVENER  \n3. TANUJ SARKAR  AP22110011056  SEAS  AMBASSADOR  \n4. M SREE PRAJWAL  AP22311130024  PSB AMBASSADOR  \n5. MAHI PAMNANI  AP21211210002  SLASS  AMBASSADOR  \n \nConclusion:  \nIn establishing the Student Academic Committee, the aspiration is  to create a platform where students \nactively contribute to the development of the academic programs  of the University . This initiative aligns \nwith the university's commitment to excellence in education and ensures a holistic and inclusive \napproach to academic governance.  \nWe seek your kind approval for the formation of the committee, and we are confident that this \ncollaborative effort will significantly contribute to enhancing the overall academic infrastructure of our \nuniversity, the

In [126]:
# Concatenate the string form of every loaded item into a single text blob.
pieces = []
for docx in documents:
    pieces.append(str(docx))
text = "".join(pieces)

In [127]:
# Total number of characters in the concatenated text.
print(len(text))

78843


In [128]:
# Preview the first 2000 characters of the concatenated text.
print(text[:2000])

page_content='Name: Ahmad\nCGPA: 9.8' metadata={'source': 'file/cgpa.csv', 'row': 0}page_content='Name: Sumit\nCGPA: 9.35' metadata={'source': 'file/cgpa.csv', 'row': 1}page_content='Name: Anandi\nCGPA: 8.23' metadata={'source': 'file/cgpa.csv', 'row': 2}page_content="Interactive Session with President of SRM University AP\n\nTimings: 3:20pm to 5:00pm\n\nAttendees: Director of SA, Director of CLM, Director of ITKM, Asst. Director of Academics, Director of Communications, Registrar, CFAO, ~500 students and other teaching/non teaching faculties.\n\n\nPoints Raised:\n\n1. Nirukhtha - mentioned that the laboratories for Eshwari School of Libreal Arts are not completely equipped with the required things.(Pres. assured for providing the solution and noted the point)\n2. Ahmad - Mentioned the lack of industry connect and requested for attracting core industry working professionals for on-campus workshops and huest lectures to promote student-industry interaction and expand connections. (Pres.

In [129]:
def get_text_chunks(text, chunk_size=1500, chunk_overlap=200, separator="\\n"):
    """Split ``text`` into chunks with ``CharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
        separator: Forwarded as the splitter's ``separator``. The default is
            the two-character sequence backslash + "n", NOT a newline
            character. NOTE(review): presumably chosen because the notebook
            builds ``text`` from ``str(document)``, where line breaks appear
            escaped — confirm before changing it.

    Returns:
        The chunks returned by the splitter's ``split_text``.
    """
    splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # sizes are measured in characters
    )
    return splitter.split_text(text)

In [130]:
# Split the concatenated text into chunks.
chunks = get_text_chunks(text)

In [131]:
# Number of chunks produced by the splitter.
print(len(chunks))

59


In [134]:
# Length of the first chunk.
print(len(chunks[0]))

1338


In [135]:
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from text chunks using ``OpenAIEmbeddings``.

    Args:
        text_chunks: Passed as ``texts`` to ``FAISS.from_texts``.

    Returns:
        The object returned by ``FAISS.from_texts``.
    """
    # Alternative embedding model left by the author for reference:
    # HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    embedder = OpenAIEmbeddings()
    return FAISS.from_texts(texts=text_chunks, embedding=embedder)

In [136]:
# Build the vector store from the chunks.
vectordb = get_vectorstore(chunks)

In [141]:
# Build a retrieval question-answering chain that uses the vector store as its
# retriever and a ChatOpenAI model (temperature=0) as the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

# Run one question through the chain and print the reply.
query = "Who is the current Ambassador_SEAS of the SAC?"
answer = qa_chain.run(query)
print(answer)

The current Ambassador_SEAS of the SAC is Tanuj Sarkar.
