In [25]:
import sys
import os

from Files.csvLoader import loadCSV
from Files.docsLoader import loadDOCS
from Files.htmlLoader import loadHTML
from Files.jsonLoader import loadJSON
from Files.mdLoader import loadMD
from Files.pdfLoader import loadPDF
from Files.txtLoader import loadTXT

from Websites.urlLoader import loadURL
from Websites.seleniumLoader import loadSELENIUM
from Websites.recursiveLoader import loadRECURSIVE

from Youtube.youtubeLoader import loadYOUTUBE

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [26]:
os.environ["OPENAI_API_KEY"] = "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"

documents = []

In [27]:
def load_document(file_path: str):
    try:
        if file_path.endswith(".pdf"):
            return loadPDF(file_path)
        elif file_path.endswith(".docx"):
            return loadDOCS(file_path)
        elif file_path.endswith(".txt"):
            return loadTXT(file_path)
        elif file_path.endswith(".csv"):
            return loadCSV(file_path)
        elif file_path.endswith(".md"):
            return loadMD(file_path)
        elif file_path.endswith(".html"):
            return loadHTML(file_path)
        elif file_path.endswith(".json"):
            return loadJSON(file_path)
    except Exception as e:
        print(f"Error while loading {file_path}: {e}")

def docData(dir: str = "file/"):
    documents = []
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        data = load_document(file_path)
        if data:
            documents.extend(data if isinstance(data, list) else [data])
    
    print("Files loaded successfully\n")
    return documents

In [28]:
def loadWebsites(file_path: str):
    try:
        with open(f'{file_path}/websites.txt', 'r') as file:
            lines = file.readlines()
        websites_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Websites loaded successfully\n")
        return loadURL(websites_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [29]:
def loadYoutubeVideos(file_path: str):
    try:
        with open(f'{file_path}/links.txt', 'r') as file:
            lines = file.readlines()
        links_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Youtube Videos loaded successfully\n")
        return loadYOUTUBE(links_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [30]:
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\\n",
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [31]:
def get_vectorstore(text_chunks):
    # embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

In [32]:
def create_localDB():
    try:
        documents = []
        documents.extend(docData("userData"))
        documents.extend(loadWebsites("userData"))
        documents.extend(loadYoutubeVideos("userData"))

        text = ""
        for docx in documents:
            text += (str(docx))

        text_chunks = get_text_chunks(text)
        db = Chroma.from_texts(text_chunks, embedding=OpenAIEmbeddings(), persist_directory="./userData_embedded")
        db.persist()
        return db
    except Exception as e:
        print("Error while creating chroma databse: ", str(e))

In [33]:
def load_localDB():
    try:
        db = Chroma(persist_directory="./userData_embedded", embedding_function=OpenAIEmbeddings())
        db.get()
        return db
    except Exception as e:
        print("Error while retrieving an embedded database: ", str(e))

In [34]:
documents.extend(docData("userData"))

Files loaded successfully



In [11]:
documents.extend(loadWebsites("userData"))

Websites loaded successfully



In [12]:
documents.extend(loadYoutubeVideos("userData"))

Youtube Videos loaded successfully

[youtube] Extracting URL: https://youtu.be/lK8gYGg0dkE?feature=shared
[youtube] lK8gYGg0dkE: Downloading webpage
[youtube] lK8gYGg0dkE: Downloading ios player API JSON
[youtube] lK8gYGg0dkE: Downloading android player API JSON
[youtube] lK8gYGg0dkE: Downloading m3u8 information
[info] lK8gYGg0dkE: Downloading 1 format(s): 140
[download] /home/dgxuser16/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a has already been downloaded
[download] 100% of    4.44MiB


ERROR: Postprocessing: ffprobe and ffmpeg not found. Please install or provide the path using --ffmpeg-location


In [13]:
text = ""
for docx in documents:
    text += (str(docx))

In [14]:
chunks = get_text_chunks(text)

Created a chunk of size 2590, which is longer than the specified 1000


In [15]:
print(len(chunks))

59


In [16]:
vectordb = get_vectorstore(chunks)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


  return torch._C._cuda_getDeviceCount() > 0


In [17]:
# localDB = create_localDB()

In [18]:
# loadedDB = load_localDB()

In [19]:
# query = "What is the marvel name of Amey Khare?"
# docs = vectordb.similarity_search(query)
# # print(docs[0].page_content)
# print(docs)

In [20]:
# qa_chain = RetrievalQA.from_chain_type(
#     llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
#     chain_type="stuff",
#     retriever=vectordb.as_retriever(),
# )

# query = "what is the marvel name for Shivansh Goel?"
# answer = qa_chain.run(query)
# print(answer)

In [41]:
def driverFunc():
    os.environ["OPENAI_API_KEY"] = "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"

    database_folderName = "userData_embedded"
    current_directory = os.getcwd()
    folder_path = os.path.join(current_directory, database_folderName)
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        db = load_localDB()
        print("Loaded Database Successfully")
    else:
        db = create_localDB()
        print("Created Database Successfully")
    
    # documents = []
    # documents.extend(docData("userData"))
    # documents.extend(loadWebsites("userData"))
    # documents.extend(loadYoutubeVideos("userData"))

    # text = ""
    # for docx in documents:
    #     text += (str(docx))

    # chunks = get_text_chunks(text)
    
    # db = create_localDB(chunks)
    
    # qa_chain = RetrievalQA.from_chain_type(
    # llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    # chain_type="stuff",
    # retriever=db.as_retriever())

    # query = input("User: ")
    # answer = qa_chain.run(query)
    # return f"EngiPal: {answer}"

    pdf_qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
    vectordb.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False)

    chat_history = []
    print('Welcome to the EngiPal. Your Engineering Pal!\n')
    while True:
        query = input("Prompt: ")
        if query == "exit" or query == "quit" or query == "q" or query == "f":
            print('Exiting')
            return chat_history
            sys.exit()
        if query == '':
            continue
        result = pdf_qa({"question": query, "chat_history": chat_history})
        print("Answer: " + result["answer"])
        chat_history.append((query, result["answer"]))
        
    return chat_history

In [47]:
chat = []
chat = driverFunc()

Loaded Database Successfully
Welcome to the EngiPal. Your Engineering Pal!



KeyboardInterrupt: Interrupted by user

In [44]:
print(chat)

[]


In [None]:
answer = driverFunc()

In [None]:
print(answer)

EngiPal
Checklist:

1. Database to store data locally (Completed)
2. custom model from huggingface (Completed)
3. custom model for embedding (Completed)
5. GUI

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM 
import torch

In [None]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="model/", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="model/")

device = "cuda"
model.to(device)

In [None]:
import transformers

In [None]:
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1024,
)

In [None]:
from langchain.llms import HuggingFacePipeline

In [None]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [None]:
qa_chain = RetrievalQA.from_chain_type(
    llm=mistral_llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever())

query = input("User: ")
answer = qa_chain.run(query)
print(f"EngiPal: {answer}")

In [None]:
del(model)
del(tokenizer)