In [None]:
!pip install langchain
!pip install typing-inspect==0.8.0 typing_extensions==4.5.0
!pip install pydantic==1.10.11
# !pip install llama-cpp-python
# !pip install tensorflow
!pip install accelerate
!pip install bitsandbytes
!pip install -U tokenizers
!pip install transformers
!pip install sentence-transformers

In [None]:
# Installs the ctransformers package quietly (-q). Presumably this is the backend
# needed by the `CTransformers` wrapper imported from langchain.llms in the
# next cell -- confirm.
%pip install ctransformers -q

In [None]:
import glob
import pandas as pd
import sqlite3
import warnings
warnings.filterwarnings('ignore')
from langchain.llms import CTransformers
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler 

# Callback manager holding a single handler that streams output to stdout.
callback_manager = CallbackManager(
    [StreamingStdOutCallbackHandler()]
)

# Generation settings passed to the CTransformers model used for SQL
# generation; ';' is registered as a stop sequence.
config_sql = dict(
    max_new_tokens=512,
    repetition_penalty=1.1,
    temperature=0,
    stop=[';'],
)

In [None]:
file = "example.db"

# Probe that the database can be opened. The probe handle is closed right
# away so it is not leaked when `conn` is bound again below.
try:
    sqlite3.connect(file).close()
    print(f"Database {file} formed.")
except sqlite3.Error as exc:
    print(f"Database {file} not formed: {exc}")

# Write the DataFrame `df` into a table named 'df' (replacing any existing
# one). `df` is not defined in this cell and must come from an earlier one;
# if it is missing, the NameError is reported here instead of being hidden
# by a bare `except`.
conn = sqlite3.connect(file)
try:
    # `with conn` only commits / rolls back the transaction -- it does not
    # close the connection, hence the explicit close in `finally`.
    with conn:
        try:
            df.to_sql('df', conn, if_exists='replace', index=False)
            print('df table created')
        except Exception as exc:
            print(f'df table not created: {exc!r}')
finally:
    conn.close()

# List the tables now present in the database. `con` and `cursor` are left
# open on purpose so that later cells can keep using them.
con = sqlite3.connect(file)
cursor = con.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

In [None]:
def generate_sql_query(question, llm=None, schema_context=None):
    """Ask an LLM to write a SQL query that answers ``question``.

    Args:
        question: Natural-language question to translate into SQL.
        llm: Optional callable that takes the prompt string and returns the
            generated text. When omitted, a ``CTransformers`` model is loaded
            from 'openorca-platypus2-13b.Q4_K_M.gguf' using ``config_sql`` and
            ``callback_manager``. That load happens on every such call, so
            pass an already-loaded model when asking several questions.
        schema_context: Optional text placed at the top of the prompt. When
            omitted, the notebook-level ``context`` variable is used.

    Returns:
        Whatever ``llm(prompt)`` returns. The text is not validated here, so
        callers should treat it as untrusted before executing it.
    """
    if schema_context is None:
        # `context` is not defined in this cell; it must already exist in the
        # notebook namespace when the function is called without an override.
        schema_context = context

    # The prompt text (including its leading spaces) is kept unchanged so the
    # model sees exactly the same input as before.
    prompt=f"""
    {schema_context}
    Question: Write a sql query for the question '{question}'
    Answer:
    """

    if llm is None:
        llm = CTransformers(model='openorca-platypus2-13b.Q4_K_M.gguf',
                            config=config_sql,
                            callback_manager=callback_manager)
    return llm(prompt)

In [None]:
# Placeholder: fill in the natural-language question to translate into SQL.
question = ""
query_result = generate_sql_query(question)

# The SQL text is LLM output and therefore untrusted. Put the connection into
# query-only mode so a generated statement cannot modify the database, and
# close the connection afterwards (`with sqlite3.connect(...)` alone would
# only commit, not close).
conn = sqlite3.connect(file)
try:
    conn.execute("PRAGMA query_only = ON")
    df_result = pd.read_sql_query(query_result, conn)
finally:
    conn.close()

In [None]:
from langchain.docstore.document import Document 
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS,Chroma

class vectorstore:
    """Build, persist and query a FAISS index over one sheet of an Excel file.

    Every row of the sheet becomes one document whose text is the
    "column: value" lines of that row and whose metadata is the row itself.
    The index is stored locally under ``faiss_index_<sheet_name>``.
    """

    def __init__(self, sheet_name, file_path='All_Files.xlsx'):
        """Load the sheet and prepare the per-row records.

        Args:
            sheet_name: Sheet to read; also used in the index folder name.
            file_path: Excel workbook to read. Defaults to 'All_Files.xlsx'.
        """
        self.sheet_name = sheet_name
        self.df = pd.read_excel(file_path, sheet_name=sheet_name)
        # Missing cells become the literal text 'No Data'; everything is then
        # converted to str so values can be stripped and embedded as text.
        self.df = self.df.fillna('No Data')
        self.df = self.df.astype(str)

        # All columns are both embedded and kept as metadata.
        self.columns_to_embed = self.df.columns.values
        self.columns_to_metadata = self.df.columns.values

        self.df_columns_to_embed = self.df[self.columns_to_embed]
        self.df_columns_to_metadata = self.df[self.columns_to_metadata]

        self.to_embed_dict = self.df_columns_to_embed.to_dict('records')
        self.to_metadata_dict = self.df_columns_to_metadata.to_dict('records')

        # Filled in later; initialised here so methods can be called in any
        # order without hitting a missing attribute.
        self.docs = []
        self.hf = None
        self.db = None

    def _embeddings(self):
        """Return the embedding model, creating it on first use only."""
        if self.hf is None:
            self.hf = HuggingFaceEmbeddings()
        return self.hf

    def load_documents(self):
        """Rebuild ``self.docs`` with one Document per sheet row."""
        self.docs = []
        for embed_row, metadata_row in zip(self.to_embed_dict, self.to_metadata_dict):
            # str() guards against non-string column headers (e.g. numeric
            # ones), which have no .strip().
            page_content = "\n".join(
                f"{str(k).strip()}: {str(v).strip()}" for k, v in embed_row.items()
            )
            self.docs.append(Document(page_content=page_content, metadata=metadata_row))

    def document_splitter(self):
        """Split ``self.docs`` with a 2000-character, zero-overlap splitter."""
        text_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
        self.docs = text_splitter.split_documents(self.docs)

    def save_embedding(self):
        """Embed ``self.docs`` into a FAISS index and save it locally."""
        self.db = FAISS.from_documents(self.docs, self._embeddings())
        self.db.save_local(f"faiss_index_{self.sheet_name}")
        print(f'{self.sheet_name} saved')

    def embedding_load_and_search(self, query, num_of_matches):
        """Return the metadata of the best matches for ``query`` as a DataFrame.

        The index is read from disk only if this instance has not built or
        loaded one yet; later searches reuse it instead of reloading the
        index and the embedding model on every call.

        Args:
            query: Text to search for.
            num_of_matches: Number of results requested from the index (``k``).

        Returns:
            A DataFrame with one row per match, led by the sheet's columns.
            It is empty (but still has those columns) when nothing matches.
        """
        if self.db is None:
            # NOTE(review): recent langchain releases are reported to require
            # allow_dangerous_deserialization=True here because the stored
            # index is pickle-backed -- confirm for the installed version and
            # only load indexes produced by this notebook.
            self.db = FAISS.load_local(f"faiss_index_{self.sheet_name}", self._embeddings())

        results = self.db.similarity_search(query, k=num_of_matches)
        records = [match.metadata for match in results]

        header = pd.DataFrame(data=[], columns=self.columns_to_metadata)
        if not records:
            return header
        # One concat instead of one per match (the latter is quadratic).
        return pd.concat([header, pd.DataFrame(records)], axis=0, ignore_index=True)

    def load_split_save(self):
        """Run the full build: load rows, split them, embed and save."""
        self.load_documents()
        self.document_splitter()
        self.save_embedding()
        