In [None]:
import sys
import os

from Files.csvLoader import loadCSV
from Files.docsLoader import loadDOCS
from Files.htmlLoader import loadHTML
from Files.jsonLoader import loadJSON
from Files.mdLoader import loadMD
from Files.pdfLoader import loadPDF
from Files.txtLoader import loadTXT

from Websites.urlLoader import loadURL
from Websites.seleniumLoader import loadSELENIUM
from Websites.recursiveLoader import loadRECURSIVE

from Youtube.youtubeLoader import loadYOUTUBE

from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.prompts.chat import SystemMessagePromptTemplate

In [None]:
os.environ["OPENAI_API_KEY"] = "sk-xxxx"

documents = []

In [None]:
def load_document(file_path: str):
    try:
        if file_path.endswith(".pdf"):
            return loadPDF(file_path)
        elif file_path.endswith(".docx"):
            return loadDOCS(file_path)
        elif file_path.endswith(".txt"):
            return loadTXT(file_path)
        elif file_path.endswith(".csv"):
            return loadCSV(file_path)
        elif file_path.endswith(".md"):
            return loadMD(file_path)
        elif file_path.endswith(".html"):
            return loadHTML(file_path)
        elif file_path.endswith(".json"):
            return loadJSON(file_path)
    except Exception as e:
        print(f"Error while loading {file_path}: {e}")

def docData(dir: str = "file/"):
    documents = []
    for file in os.listdir(dir):
        file_path = os.path.join(dir, file)
        data = load_document(file_path)
        if data:
            documents.extend(data if isinstance(data, list) else [data])
    
    print("Files loaded successfully\n")
    return documents

In [None]:
def loadWebsites(file_path: str):
    try:
        with open(f'{file_path}/websites.txt', 'r') as file:
            lines = file.readlines()
        websites_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Websites loaded successfully\n")
        return loadURL(websites_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [None]:
def loadYoutubeVideos(file_path: str):
    try:
        with open(f'{file_path}/links.txt', 'r') as file:
            lines = file.readlines()
        links_list = [line.strip() for line in lines]
    except Exception as e:
        print("Error in reading files: ", str(e))
    try:
        print("Youtube Videos loaded successfully\n")
        return loadYOUTUBE(links_list)
    except Exception as e:
        print("Error in loading in URLs\n")
        return f"Error in loadURL: {e}"        

In [None]:
def get_text_chunks(text):
    text_splitter = CharacterTextSplitter(
        separator="\\n",
        chunk_size=1000,
        chunk_overlap=50,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [None]:
def get_vectorstore(text_chunks):
    # embeddings = OpenAIEmbeddings()
    embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
    return vectorstore

In [None]:
def create_localDB():
    try:
        documents = []
        documents.extend(docData("userData"))
        documents.extend(loadWebsites("userData"))
        documents.extend(loadYoutubeVideos("userData"))

        text = ""
        for docx in documents:
            text += (str(docx))

        text_chunks = get_text_chunks(text)
        db = Chroma.from_texts(text_chunks, embedding=OpenAIEmbeddings(), persist_directory="./userData_embedded")
        db.persist()
        return db
    except Exception as e:
        print("Error while creating chroma databse: ", str(e))

In [None]:
def load_localDB():
    try:
        db = Chroma(persist_directory="./userData_embedded", embedding_function=OpenAIEmbeddings())
        db.get()
        return db
    except Exception as e:
        print("Error while retrieving an embedded database: ", str(e))

In [None]:
def driverFunc():
    os.environ["OPENAI_API_KEY"] = "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"

    database_folderName = "userData_embedded"
    current_directory = os.getcwd()
    folder_path = os.path.join(current_directory, database_folderName)
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        db = load_localDB()
        print("Loaded Database Successfully")
    else:
        db = create_localDB()
        print("Created Database Successfully")
        
    pdf_qa = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0.9, model_name="gpt-3.5-turbo"),
    db.as_retriever(search_kwargs={'k': 6}),
    return_source_documents=True,
    verbose=False)

    chat_history = []
    print('Welcome to the EngiPal. Your Engineering Pal!\n')
    while True:
        query = input("Prompt: ")
        if query == "exit" or query == "quit" or query == "q" or query == "f":
            print('Exiting')
            return chat_history
            sys.exit()
        if query == '':
            continue
        result = pdf_qa({"question": query, "chat_history": chat_history})
        print("Answer: " + result["answer"])
        chat_history.append((query, result["answer"]))
        
    return chat_history

In [None]:
chat = []
chat = driverFunc()

In [None]:
chat

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM 
import torch

In [11]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

device = "cuda"
model.to(device)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

RuntimeError: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.

In [None]:
model.save_pretrained("mistral7B/")
tokenizer.save_pretrained("mistral7B/")

In [10]:

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", device_map="cuda", cache_dir="phi2")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", cache_dir="phi2")

device = "cuda"
model.to(device)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/phi-2 were not used when initializing PhiForCausalLM: ['model.layers.8.self_attn.q_proj.bias', 'model.layers.20.self_attn.v_proj.weight', 'model.layers.13.self_attn.q_proj.weight', 'model.layers.4.self_attn.q_proj.bias', 'model.layers.26.self_attn.q_proj.bias', 'model.layers.29.self_attn.k_proj.weight', 'model.layers.22.self_attn.q_proj.weight', 'model.layers.18.self_attn.k_proj.weight', 'model.layers.8.self_attn.v_proj.bias', 'model.layers.30.self_attn.v_proj.weight', 'model.layers.6.self_attn.k_proj.bias', 'model.layers.21.self_attn.k_proj.bias', 'model.layers.11.self_attn.q_proj.bias', 'model.layers.16.self_attn.k_proj.bias', 'model.layers.12.self_attn.v_proj.weight', 'model.layers.2.self_attn.v_proj.bias', 'model.layers.26.self_attn.k_proj.bias', 'model.layers.13.self_attn.k_proj.weight', 'model.layers.12.self_attn.k_proj.weight', 'model.layers.8.self_attn.v_proj.weight', 'model.layers.22.self_attn.k_proj.weight', 'model.layers.20.s

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

RuntimeError: The NVIDIA driver on your system is too old (found version 11040). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.