In [2]:
# !pip install unstructured

In [3]:
import os
import json

# Path to the folder that holds the scraped JSON files
folder_path = 'catbotdata/'

# Function to recursively extract text from JSON data
def extract_text(data):
    """Collect every string found anywhere inside a decoded JSON value.

    Dicts contribute their values (keys are ignored) and lists contribute
    their items; both are walked recursively in iteration order. Any other
    non-string leaf (numbers, booleans, ``None``) contributes nothing.

    Args:
        data: A value such as the one returned by ``json.load``.

    Returns:
        A flat list of the strings encountered, in traversal order.
    """
    # A string is a leaf: wrap it so callers can always extend a list.
    if isinstance(data, str):
        return [data]

    # Pick the children to descend into; anything else has no text.
    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return []

    strings = []
    for child in children:
        strings += extract_text(child)
    return strings

In [4]:
# Function to load all JSON files from the given folder and extract their text
def load_all_json_files(folder_path):
    """Load every ``*.json`` file in a folder and collect the strings inside.

    Files are processed in sorted name order so that repeated runs produce the
    same output. Each file is decoded as UTF-8, parsed, and passed to
    ``extract_text``; a ``'\\n'`` entry is appended after each file's strings.
    A file that cannot be decoded or parsed is reported with ``print`` and
    skipped, so one bad file does not abort the whole load.

    Args:
        folder_path: Directory to scan (non-recursive) for ``.json`` files.

    Returns:
        A flat list of strings from all successfully loaded files, with a
        ``'\\n'`` element after each file's contribution.
    """
    # os.listdir gives no ordering guarantee; sort for reproducible output.
    json_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )

    extracted_text = []
    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)
        try:
            # Decode explicitly as UTF-8 rather than the platform default, so
            # non-ASCII text loads the same way on every machine.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error loading {json_file}: {e}")
            continue

        extracted_text.extend(extract_text(data))
        extracted_text.append('\n')
    return extracted_text

In [5]:
# Read every JSON file under folder_path and flatten the collected strings
# into one '-'-separated document.
extracted_text = '-'.join(load_all_json_files(folder_path))

# Return the collected data
# print(extracted_text)

In [6]:
# with open('training_data.txt', 'a') as fp:
#     fp.write(extracted_text)

In [7]:
#creating chunks of data
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the joined corpus into pieces of at most 1000 characters, with a
# 200-character overlap configured between consecutive pieces.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

chunks = splitter.split_text(extracted_text)

In [8]:
# !pip install langchain-huggingface

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import pickle
import glob 
from langchain.prompts import PromptTemplate
from langchain.chains import VectorDBQAWithSourcesChain

In [10]:
# Name the embedding model explicitly instead of relying on the library
# default: the FAISS index saved later is only meaningful when queries are
# embedded with the same model that produced it.
# NOTE(review): this is the documented default of HuggingFaceEmbeddings at the
# time of writing — confirm against the installed langchain-huggingface.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

  from tqdm.autonotebook import tqdm, trange


In [11]:
# Embed every chunk and build a FAISS-backed vector store from the results.
store=FAISS.from_texts(chunks, embedding=embeddings, )
# Persist the raw FAISS index to its own file first ...
faiss.write_index(store.index, 'docsind.index')
# ... then detach it so the remaining store object is pickled without it
# (presumably because the index object does not pickle cleanly — TODO confirm).
# Order matters: the index must be written before it is dropped here.
store.index=None
with open(r'faiss_store.pkl', 'wb') as fp:
    pickle.dump(store, fp)

In [12]:
from langchain.chains.conversation.memory import ConversationBufferMemory



In [13]:
# Reload the persisted vector store: read the FAISS index from its own file,
# unpickle the index-less store object, then re-attach the index to it.
index=faiss.read_index("docsind.index")
# NOTE: pickle.load can execute arbitrary code; only load a faiss_store.pkl
# that this notebook (or another trusted source) produced.
with open("faiss_store.pkl", "rb") as f:
    store=pickle.load(f)

store.index=index


In [14]:
# Sanity check: confirm the reloaded store returns documents for a sample query.
print(store.similarity_search("syllabus of automobile"))

[Document(page_content='-Curriculum-Automobile SyllabusB.Tech. in Automobile Engg. Admission Year 20-21 (B.Tech. 23-24)B.Tech. in Automobile Engg. Admission Year 21-22 (T.Y. 23-24)B.Tech. in Automobile Engg. Admission Year 22-23 (S.Y. 23-24)B.Tech. in Automobile Engg. Admission Year 23-24 (F.Y. 23-24) NEP SchemeF.E. (REV- 2019 – C Scheme)S.E. (REV- 2019 – C Scheme)T.E. (REV- 2019 – C Scheme)S.E. to B.E. (Rev- 2016 – CBCGS Scheme)-B.Tech. in Automobile Engg. Admission Year 20-21 (B.Tech. 23-24)-https://www.pce.ac.in/wp-content/uploads/2023/12/1-B.Tech-in-Automobile-Engg-Admission-Year-20-21-B.Tech-23-24.pdf-B.Tech. in Automobile Engg. Admission Year 21-22 (T.Y. 23-24)-https://www.pce.ac.in/wp-content/uploads/2023/12/2-B.Tech-in-Automobile-Engg-Admission-Year-21-22-TY-23-24.pdf-B.Tech. in Automobile Engg. Admission Year 22-23 (S.Y. 23-24)-https://www.pce.ac.in/wp-content/uploads/2023/12/3-B.Tech-in-Automobile-Engg-Admission-Year-22-23-SY-23-24.pdf-B.Tech. in Automobile Engg. Admission Ye

In [15]:
# print(store.similarity_search("syllabus of automobile"))

# Your predefined template
# Prompt skeleton sent to the model. {question} receives the user's question
# and {summaries} receives the retrieved document text. Written as explicit
# per-line literals so that significant whitespace (such as the trailing space
# after the quoted fallback sentence) is visible.
template = (
    "\n"
    "You are a chatbot assistant by Pillai College of Engineering that provides information about student services and the college.\n"
    "If you don't know the answer, just say \"sorry..!, I'm not sure about the answer. Please visit the website for further assistance.\" \n"
    "Don't try to make up an answer.\n"
    "\n"
    "HUMAN: {question}\n"
    "=========\n"
    "{summaries}\n"
    "=========\n"
    "CHATBOT:\n"
)


def generate_prompt(question, retrieved_docs):
    """Fill the module-level ``template`` with a question and retrieved text.

    Args:
        question: The user's question, substituted for ``{question}``.
        retrieved_docs: Iterable of objects exposing a ``page_content``
            attribute. Each becomes one ``CONTENT: ...`` line, and the lines
            are joined with newlines to form ``{summaries}``.

    Returns:
        The fully formatted prompt string.
    """
    content_lines = []
    for doc in retrieved_docs:
        content_lines.append(f"CONTENT: {doc.page_content}")

    return template.format(
        question=question,
        summaries="\n".join(content_lines),
    )




In [16]:
# Function to query LLaMA with the generated prompt
from huggingface_hub import InferenceClient

def query_llama(prompt, model="meta-llama/Meta-Llama-3-8B-Instruct", max_tokens=500):
    """Send a single-turn prompt to a hosted chat model and return its reply.

    The Hugging Face access token is read from the ``HF_TOKEN`` environment
    variable rather than being embedded in the notebook. Any token that was
    previously committed in this cell should be treated as leaked and revoked.

    Args:
        prompt: Full prompt text, sent as one ``user`` message.
        model: Model id passed to ``InferenceClient``.
        max_tokens: Upper bound on generated tokens for the reply.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # Never hardcode credentials. If HF_TOKEN is unset this passes None, and
    # huggingface_hub is expected to fall back to its own token resolution
    # (e.g. a locally saved login) — confirm against the installed version.
    hf_token = os.environ.get("HF_TOKEN")
    client = InferenceClient(model, token=hf_token)
    response = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        stream=False,  # one complete response instead of a token stream
    )
    return response['choices'][0]['message']['content']

# Main function to search vector store, generate prompt using the template, and query LLaMA
def query_with_template_and_sources(question, vectorstore):
    """Answer a question using documents retrieved from a vector store.

    Runs ``vectorstore.similarity_search(question)``, builds a prompt from the
    question and the returned documents with ``generate_prompt``, and sends
    that prompt to the model with ``query_llama``.

    Args:
        question: The user's question.
        vectorstore: Object exposing ``similarity_search(question)``.

    Returns:
        The model's answer as returned by ``query_llama``.
    """
    matching_docs = vectorstore.similarity_search(question)
    return query_llama(generate_prompt(question, matching_docs))

In [17]:
# Run the whole pipeline once with a sample question.
question = 'principal of pillai?'
print(query_with_template_and_sources(question=question, vectorstore=store))

HUMAN: principal of pillai?
The Principal of Pillai College of Engineering is Dr. Sandeep M. Joshi.


