In [1]:
import sys
import os

from Files.csvLoader import loadCSV
from Files.docsLoader import loadDOCS
from Files.htmlLoader import loadHTML
from Files.jsonLoader import loadJSON
from Files.mdLoader import loadMD
from Files.pdfLoader import loadPDF
from Files.txtLoader import loadTXT

from Websites.urlLoader import loadURL
from Websites.seleniumLoader import loadSELENIUM
from Websites.recursiveLoader import loadRECURSIVE

from Videos.youtubeLoader import loadYOUTUBE

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [2]:
os.environ["OPENAI_API_KEY"] = "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"

documents = []

In [3]:
def load_document(file_path: str):
    try:
        if file_path.endswith(".pdf"):
            return loadPDF(file_path)
        elif file_path.endswith(".docx"):
            return loadDOCS(file_path)
        elif file_path.endswith(".txt"):
            return loadTXT(file_path)
        elif file_path.endswith(".csv"):
            return loadCSV(file_path)
        elif file_path.endswith(".md"):
            return loadMD(file_path)
        elif file_path.endswith(".html"):
            return loadHTML(file_path)
        elif file_path.endswith(".json"):
            return loadJSON(file_path)
    except Exception as e:
        print(f"Error while loading {file_path}: {e}")

def docData(dir: str = "file/"):
    documents = []
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        data = load_document(file_path)
        if data:
            documents.extend(data if isinstance(data, list) else [data])
    
    print("Files loaded successfully\n")
    return documents

In [4]:
def loadWebsites(file_path: str):
    try:
        with open(f'{file_path}/websites.txt', 'r') as file:
            lines = file.readlines()
        websites_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Websites loaded successfully\n")
        return loadURL(websites_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [5]:
def loadYoutubeVideos(file_path: str):
    try:
        with open(f'{file_path}/links.txt', 'r') as file:
            lines = file.readlines()
        links_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Youtube Videos loaded successfully\n")
        return loadYOUTUBE(links_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [6]:
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\\n",
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [7]:
def get_vectorstore(text_chunks):
    # embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

In [8]:
def create_localDB():
    try:
        documents = []
        documents.extend(docData("userData"))
        documents.extend(loadWebsites("userData"))
        documents.extend(loadYoutubeVideos("userData"))

        text = ""
        for docx in documents:
            text += (str(docx))

        text_chunks = get_text_chunks(text)
        db = Chroma.from_texts(text_chunks, embedding=OpenAIEmbeddings(), persist_directory="./userData_embedded")
        db.persist()
        return db
    except Exception as e:
        print("Error while creating chroma databse: ", str(e))

In [9]:
def load_localDB():
    try:
        db = Chroma(persist_directory="./userData_embedded", embedding_function=OpenAIEmbeddings())
        db.get()
        return db
    except Exception as e:
        print("Error while retrieving an embedded database: ", str(e))

In [10]:
documents.extend(docData("userData"))

Files loaded successfully



In [11]:
documents.extend(loadWebsites("userData"))

Websites loaded successfully



In [12]:
documents.extend(loadYoutubeVideos("userData"))

Youtube Videos loaded successfully



2023-12-29 13:29:54.974413: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-29 13:29:55.009956: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Using the following model:  openai/whisper-large
[youtube] Extracting URL: https://youtu.be/lK8gYGg0dkE?feature=shared
[youtube] lK8gYGg0dkE: Downloading webpage
[youtube] lK8gYGg0dkE: Downloading ios player API JSON
[youtube] lK8gYGg0dkE: Downloading android player API JSON
[youtube] lK8gYGg0dkE: Downloading m3u8 information
[info] lK8gYGg0dkE: Downloading 1 format(s): 140
[download] /root/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a has already been downloaded
[download] 100% of    4.44MiB
[ExtractAudio] Not converting audio /root/Downloads/YouTube/President Franklin D. Roosevelt Declares War on Japan (Full Speech) ｜ War Archives.m4a; file is already in target format m4a
[youtube] Extracting URL: https://youtu.be/gjT2NvQo0n4?feature=shared
[youtube] gjT2NvQo0n4: Downloading webpage
[youtube] gjT2NvQo0n4: Downloading ios player API JSON
[youtube] gjT2NvQo0n4: Downloading android player API JSON
[youtube] gjT2NvQo0n4: Download

In [13]:
text = ""
for docx in documents:
    text += (str(docx))

In [14]:
chunks = get_text_chunks(text)

In [15]:
# print(len(chunks))

In [16]:
vectordb = get_vectorstore(chunks)

load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512


In [None]:
# localDB = create_localDB()

In [None]:
# loadedDB = load_localDB()

In [None]:
# query = "What is the marvel name of Amey Khare?"
# docs = vectordb.similarity_search(query)
# # print(docs[0].page_content)
# print(docs)

In [None]:
# qa_chain = RetrievalQA.from_chain_type(
#     llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
#     chain_type="stuff",
#     retriever=vectordb.as_retriever(),
# )

# query = "what is the marvel name for Shivansh Goel?"
# answer = qa_chain.run(query)
# print(answer)

In [None]:
def driverFunc():
    os.environ["OPENAI_API_KEY"] = "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"

    database_folderName = "userData_embedded"
    current_directory = os.getcwd()
    folder_path = os.path.join(current_directory, database_folderName)
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        db = load_localDB()
        print("Loaded Database Successfully")
    else:
        db = create_localDB()
        print("Created Database Successfully")
    
    # documents = []
    # documents.extend(docData("userData"))
    # documents.extend(loadWebsites("userData"))
    # documents.extend(loadYoutubeVideos("userData"))

    # text = ""
    # for docx in documents:
    #     text += (str(docx))

    # chunks = get_text_chunks(text)
    
    # db = create_localDB(chunks)
    
    # qa_chain = RetrievalQA.from_chain_type(
    # llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0),
    # chain_type="stuff",
    # retriever=db.as_retriever())

    # query = input("User: ")
    # answer = qa_chain.run(query)
    # return f"EngiPal: {answer}"

    pdf_qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
    db.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False)

    chat_history = []
    print('Welcome to the EngiPal. Your Engineering Pal!\n')
    while True:
        query = input("Prompt: ")
        if query == "exit" or query == "quit" or query == "q" or query == "f":
            print('Exiting')
            return chat_history
            sys.exit()
        if query == '':
            continue
        result = pdf_qa({"question": query, "chat_history": chat_history})
        print("Answer: " + result["answer"])
        chat_history.append((query, result["answer"]))

In [None]:
chat = []
chat = driverFunc()

In [None]:
print(chat)

In [None]:
answer = driverFunc()

In [None]:
print(answer)

EngiPal
Checklist:

1. Database to store data locally (Completed)
2. custom model from huggingface (Completed)
3. custom model for embedding (Completed)
5. GUI

In [17]:
from transformers import AutoTokenizer, AutoModelForCausalLM 
import torch

In [18]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir="model/", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="model/")

device = "cuda"
model.to(device)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRMSNorm()
  

In [19]:
import transformers

In [20]:
text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1024,
)

In [21]:
from langchain.llms import HuggingFacePipeline

In [22]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

In [24]:
qa_chain = RetrievalQA.from_chain_type(
    llm=mistral_llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever())

query = input("User: ")
answer = qa_chain.run(query)
print(f"EngiPal: {answer}")

User:  explain the day of infamy


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


EngiPal:  On December 7, 1941, the United States was suddenly and deliberately attacked by the naval and air forces of the Empire of Japan. This event, known as the Day of Infamy, occurred despite the fact that the United States was at peace with Japan and was still in negotiations for maintaining peace in the Pacific. The attack resulted in significant damage to American naval and military forces and led to the loss of many American lives. President Franklin D. Roosevelt addressed the nation and declared that a state of war existed between the United States and the Japanese Empire.


In [None]:
del(model)
del(tokenizer)