# This notebook builds the vector databases for
# General Inquiry and Campus Navigation

# 1. Gen Inquiry

In [2]:
import os
import json

# Directory holding the JSON files for the General Inquiry knowledge base.
# It is passed to load_all_json_files(), which reads every '*.json' file in it.
folder_path = 'chatbotdata/'

# Function to recursively extract text from JSON data
def extract_text(data):
    """Flatten a decoded JSON value into the list of strings it contains.

    Dict values and list items are walked recursively in their natural
    order. Dict keys are ignored, as are non-string leaves (numbers,
    booleans, None).

    Args:
        data: Any value produced by ``json.load``.

    Returns:
        list[str]: Every string found, in depth-first traversal order.
    """
    # A string is a leaf: wrap it so callers can always extend() the result.
    if isinstance(data, str):
        return [data]

    if isinstance(data, dict):
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Numbers, booleans, None, ... carry no text.
        return []

    return [text for child in children for text in extract_text(child)]

In [3]:
def load_all_json_files(folder_path):
    """Collect every string found in the ``.json`` files directly inside a folder.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` gives no ordering guarantee). After each successfully
    parsed file a single newline string is appended as a separator entry.
    Files that cannot be decoded or parsed are reported with ``print`` and
    skipped, so one bad file does not abort the whole run.

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        list[str]: The strings extracted from all files, in file order, with
        a newline entry after each file's strings.
    """
    json_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.json')
    )

    extracted_text = []
    for json_file in json_files:
        file_path = os.path.join(folder_path, json_file)
        try:
            # JSON is UTF-8 by specification; do not depend on the platform's
            # default text encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error loading {json_file}: {e}")
            continue

        extracted_text.extend(extract_text(data))
        extracted_text.append('\n')

    return extracted_text

In [None]:
# Load every JSON file and fuse all extracted strings into one
# '-'-separated corpus string.
extracted_text = '-'.join(load_all_json_files(folder_path))

# Preview the first 500 characters as a quick sanity check
print(extracted_text[:500])

docLink-https://www.pce.ac.in/wp-content/uploads/2019/08/academic-and-administrative-planner-AAAP.pdf-
-Atal Ranking of Institutions on Innovation Achievements (ARIIA)-We are proud to announce that Pillai College of Engineering (ARI-C-33505) has gained All India rank Band Performer (Private or Self-Financed College / Institutes) in ATAL RANKING OF INSTITUTIONS ON INNOVATION ACHIEVEMENTS (ARIIA) 2021.view certificateThe result announcement function was held on 18th August in the presence of Hon. Vice President of India.In ARIIA, the institutes are judged based on the innovation and incubation facilities, startups supported, entrepreneurship and funding support at all India level. This is an initiative of MHRD, AICTE and MIC.2021-222020-212019-20view ARIIA 2021 Certificateview ARIIA 2020 Certificateview ARIIA 2020-21 Reportview ARIIA 2019-20 Report-view certificate-https://www.pce.ac.in/wp-content/uploads/2022/01/ARI-C-33505-ARIIA-2021.pdf-view ARIIA 2021 Certificate-https://www.pce.ac.i

In [5]:
#creating chunks of data
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Cut the corpus into windows of at most 800 characters, with 200 characters
# shared between neighbouring windows.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=200,
)

chunks = splitter.split_text(extracted_text)

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import pickle

# Name the embedding model explicitly instead of relying on the library's
# implicit default, so the saved index and any later query-time embeddings are
# guaranteed to come from the same model.
# NOTE(review): this is the documented default of HuggingFaceEmbeddings at the
# time of writing - confirm against the installed langchain_huggingface version.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange







# The next step below will create two files 
# [index_PCE.index]&[faiss_PCE.pkl]

In [7]:
# Embed the text chunks and build the FAISS-backed vector store.
store = FAISS.from_texts(chunks, embedding=embeddings)

# Part 1: the raw FAISS index is written to its own file.
faiss.write_index(store.index, 'index_PCE.index')

# Part 2: the rest of the store is pickled with the index detached, so the
# pickle does not contain the index object itself.
store.index = None
with open('faiss_PCE.pkl', 'wb') as pickle_file:
    pickle.dump(store, pickle_file)

# 2. Campus Navigation
Restart the kernel, then execute the code below to build the vector DB for Campus Navigation. Starting from a fresh kernel avoids errors caused by leftover state from the section above and keeps the results reliable.

In [8]:
import json
import os

# First, define the helper functions that were missing
def create_room_chunk(room_data, wing_name):
    """Render one room as a short block of descriptive text.

    Args:
        room_data: Mapping with the keys 'room_number', 'name', 'type',
            'nearby' (iterable of strings) and 'directions' (mapping whose
            values are strings).
        wing_name: Display name of the wing the room belongs to.

    Returns:
        str: A multi-line description. It starts with a newline, every line
        is indented by four spaces, and it ends with an indented empty line.
    """
    number = room_data['room_number']
    label = room_data['name']
    kind = room_data['type']
    nearby_rooms = ', '.join(room_data['nearby'])
    # Only the direction texts are used; the keys of 'directions' are dropped.
    route = ' '.join(room_data['directions'].values())

    lines = [
        "",
        f"    Room {number} ({label}) is located in {wing_name}.",
        f"    Type: {kind}",
        f"    Nearby rooms: {nearby_rooms}",
        f"    Directions: {route}",
        "    ",
    ]
    return "\n".join(lines)

def create_wing_chunk(wing_data):
    """Render a wing as a text block listing all of its rooms.

    Args:
        wing_data: Mapping with a 'name' string and a 'rooms' list, where
            each room is a mapping with string 'room_number' and 'name'.

    Returns:
        str: A multi-line summary. It starts with a newline, every line is
        indented by four spaces, and it ends with an indented empty line.
    """
    wing_name = wing_data['name']

    room_labels = []
    for room in wing_data['rooms']:
        room_labels.append(room['room_number'] + ' (' + room['name'] + ')')
    room_list = ', '.join(room_labels)

    # The heading line intentionally keeps its trailing space after the colon.
    return (
        "\n"
        f"    {wing_name} contains the following rooms: \n"
        f"    {room_list}\n"
        "    "
    )

def create_general_directions_chunk(directions):
    """Turn wing-to-wing directions into one sentence per line.

    Args:
        directions: Mapping whose keys look like '<from>_to_<to>' and whose
            values are the direction texts.

    Returns:
        str: One "To go from X Wing to Y Wing: ..." line per entry, joined
        by newlines, with the wing identifiers upper-cased. Empty string for
        an empty mapping.
    """
    sentences = []
    for route_key, instruction in directions.items():
        parts = route_key.split('_to_')
        origin = parts[0].upper()
        destination = parts[1].upper()
        sentences.append(
            f"To go from {origin} Wing to {destination} Wing: {instruction}"
        )
    return "\n".join(sentences)

In [9]:
# Main processing function
def process_college_data(json_data):
    """Convert one navigation JSON document into text chunks with metadata.

    For every wing in ``json_data['wings']`` this emits one wing-level chunk
    followed by one chunk per room of that wing. A final chunk built from
    ``json_data['general_directions']`` is added after all wings.

    Args:
        json_data: Mapping with a 'wings' mapping (wing id -> wing data with
            'name' and 'rooms') and a 'general_directions' mapping.

    Returns:
        list[dict]: Items of the form ``{"text": str, "metadata": dict}``.
    """
    records = []

    for wing_id, wing_data in json_data['wings'].items():
        # Wing overview first ...
        records.append({
            "text": create_wing_chunk(wing_data),
            "metadata": {"type": "wing", "wing": wing_id},
        })

        # ... then one entry for each room of that wing.
        records.extend(
            {
                "text": create_room_chunk(room, wing_data['name']),
                "metadata": {
                    "type": "room",
                    "wing": wing_id,
                    "room_number": room['room_number'],
                    "room_type": room['type'],
                },
            }
            for room in wing_data['rooms']
        )

    # Wing-to-wing directions go last, as a single chunk.
    directions_text = create_general_directions_chunk(json_data['general_directions'])
    records.append({
        "text": directions_text,
        "metadata": {"type": "directions"},
    })

    return records

In [10]:
# Process every navigation JSON file into retrieval chunks.
NAVIGATION_DIR = 'navigation'

final_chunks = []
# os.listdir returns names in arbitrary order; sorting keeps the chunk order
# (and therefore the built index) reproducible across machines and runs.
json_files = sorted(
    file for file in os.listdir(NAVIGATION_DIR) if file.endswith('.json')
)

for file in json_files:
    # Construct full file path
    file_path = os.path.join(NAVIGATION_DIR, file)

    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding when reading it.
    with open(file_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Each file yields a list of chunks; extend() keeps final_chunks flat.
    chunks = process_college_data(json_data)
    final_chunks.extend(chunks)

print(f"Processed {len(json_files)} files, created {len(final_chunks)} chunks")

Processed 5 files, created 90 chunks


In [11]:
final_chunks[1]

{'text': '\n    Room S101 (Computer Graphics Lab) is located in S Wing.\n    Type: lab\n    Nearby rooms: S102, Staircase\n    Directions: Facing towards reception, turn to your right and walk to the end. You can find the lift or grab the staircase for the first floor. When on the first floor, you can find the Computer Graphics Lab to your left.\n    ',
 'metadata': {'type': 'room',
  'wing': 'S',
  'room_number': 'S101',
  'room_type': 'lab'}}

In [12]:
# Split the chunk records into two parallel lists (same order, same length):
# the texts to embed and the metadata stored alongside each text.
texts = [entry["text"] for entry in final_chunks]
metadatas = [entry["metadata"] for entry in final_chunks]

In [13]:
# Optional sanity check: print the metadata of the third chunk as
# "key=value, " pairs on one line. Nothing later depends on this cell, so the
# notebook still works if it is skipped.
print(''.join(f"{key}={val}, " for key, val in metadatas[2].items()), end='')

type=room, wing=S, room_number=S102, room_type=lab, 

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss
import pickle

# Name the embedding model explicitly instead of relying on the library's
# implicit default, so the saved index and any later query-time embeddings are
# guaranteed to come from the same model.
# NOTE(review): this is the documented default of HuggingFaceEmbeddings at the
# time of writing - confirm against the installed langchain_huggingface version.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# The next step below will create two files 
# [index_nav.index]&[faiss_nav.pkl]

In [15]:
# Embed the navigation texts (with their metadata) and build the FAISS store.
store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
)

# Part 1: the raw FAISS index is written to its own file.
faiss.write_index(store.index, 'index_nav.index')

# Part 2: the rest of the store is pickled with the index detached, so the
# pickle does not contain the index object itself.
store.index = None
with open('faiss_nav.pkl', 'wb') as pickle_file:
    pickle.dump(store, pickle_file)

You are now good to go — just make sure all of the files above have been generated!

ALL THE BEST 👍👍