# Creating the Knowledge Base for RAG

In [1]:
%%capture
!pip install -q langchain faiss-gpu sentence-transformers langchain-huggingface langchain-community

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

web_data = pd.read_csv('/kaggle/input/miniproject/Scraped_Data.csv')
web_data.shape

(8895, 3)

In [3]:
web_data.head()

Unnamed: 0,url,text,length
0,https://www.nitt.edu/home/students/facilitiesn...,Training and Placement NIT Trichy is an equal ...,4011
1,https://www.nitt.edu/home/students/facilitiesn...,CENTRAL LIBRARY Library Catalog- Web OPAC | Ce...,4827
2,https://www.nitt.edu/home/academics/departments/,Academic Departments Architecture Fax: +91-431...,1279
3,https://www.nitt.edu/home/students/events/,Events The College calendar is interspersed wi...,669
4,https://www.nitt.edu/home/students/facilitiesn...,Computer Support Group The Octagon - Main CC b...,9078


In [4]:
import re
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
phone_pattern = r'\+?\d{1,4}[\s-]?\(?\d{1,4}\)?[\s-]?\d{1,4}[\s-]?\d{1,4}[\s-]?\d{1,9}'

def remove_emails_phones(text):
    text = re.sub(email_pattern, '', text)
    text = re.sub(phone_pattern, '', text)
    return text.strip()

web_data['text'] = web_data['text'].apply(remove_emails_phones)
web_data['length']=web_data['text'].str.len()
web_data.head()

Unnamed: 0,url,text,length
0,https://www.nitt.edu/home/students/facilitiesn...,Training and Placement NIT Trichy is an equal ...,3433
1,https://www.nitt.edu/home/students/facilitiesn...,CENTRAL LIBRARY Library Catalog- Web OPAC | Ce...,4750
2,https://www.nitt.edu/home/academics/departments/,Academic Departments Architecture Fax: Phone: ...,729
3,https://www.nitt.edu/home/students/events/,Events The College calendar is interspersed wi...,669
4,https://www.nitt.edu/home/students/facilitiesn...,Computer Support Group The Octagon - Main CC b...,8967


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [6]:
text_data = set(web_data['text'])
documents = [Document(page_content=text) for text in text_data]

# Create document chunks
text_splitter = RecursiveCharacterTextSplitter(separators=[" ", '.'],
                                              chunk_size=1024,
                                              chunk_overlap=256)
text_chunks = text_splitter.split_documents(documents)

In [7]:
# Load the vector embedding model
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
knowledge_base = FAISS.from_documents(text_chunks, embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# This function will help us to fetch the top 3 matching context
# for the given user query.
def fetch_context(query, k=3):
    top_docs = knowledge_base.similarity_search(query, k=k)
    unique_docs=set()
    for doc in top_docs:
        unique_docs.add(doc.page_content)
    return list(unique_docs)

In [9]:
query = "What is the placement opprtunities are available at NIT Trichy? \
Also tell me like the percentage placements as well as company who come to \
recruit."
top_docs = fetch_context(query)

for i, doc in enumerate(top_docs):
    print(f"Document {i+1}:")
    print(doc)
    print("\n" + "="*80 + "\n")

Document 1:
Placements Placement statistics for B.Tech Production Engineering Year--Percentage69.786.376.995.797..4Year--Percentage94.794.397..791.996.993.1Year--Percentage97.394.3------Placement statistics for M.Tech(Manufacturing Technology)Year--Percentage30.5854.554.581..484.6Year--Percentage6071.478.643.845.544..6Year--Percentage9681.8------Placement Statistics for M Tech (Industrial Engineering)Year--Percentage..576.9Year--Percentage68.884.283.338.172.252.252.992.0Year--Percentage91.376.2------


Document 2:
Training and Placement NIT Trichy is an equal opportunity Institution and urges Organizations to recognize the skillsets and innate abilities of Pw D/SLD students and provide them opportunities for an inclusive environment Mobile:  /  / Email: ,  Mobile:  /  / Email: ,  Telephone: , , The department of Training and Placement, the marketing division of the institute has these following functions and responsibilities:Nurtures Industry Institute interaction, by organizing and co

# Generating Instruction Prompt and Responses to User Query

In [10]:
import ctypes, gc
libc = ctypes.CDLL('libc.so.6')
def clear_memory():
    libc.malloc_trim(0)
    torch.cuda.empty_cache()
    gc.collect()
clear_memory()

In [11]:
def generate_prompt(query, k=3):
    contexts=fetch_context(query, k=k)

    prompt = ''
    context_data = ''
    for context in contexts:
        context_data+=context
        context_data+="\n"
        
    prompt += f"""
      ### Context:
      {context_data}

      ### Instruction:
      Based on the information provided above you are supposed to answer the below query. 
      Make sure that you answer the query in the most informative and descriptive manner possible.:
      {query}
        
      ### Response:
    """
    return prompt

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "mosaicml/mpt-7b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.36G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Creating API EndPoint

In [13]:
!pip install Flask pyngrok

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.1


In [14]:
from pyngrok import ngrok
!ngrok authtoken 2oZQ2IEQGpxV1O3iYMe6ioRz0lw_3f7RWhqMBf94h3NKBDqoi
tunnel = ngrok.connect(8501, "http", "us")
public_url = tunnel.public_url
print("Public URL:", public_url)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml                                
Public URL: https://cf37-34-151-81-214.ngrok-free.app


In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def get_response():
    clear_memory()
    query=request.json['query']
    print(query)
    
    # Generating Instruction Prompt
    instruction_prompt=generate_prompt(query, k=4)

    # Tokenize the input
    inputs = tokenizer(instruction_prompt,
                       max_length=4096,
                       truncation=True,
                       return_tensors="pt").to(device)
    
    # Generate a response
    clear_memory()
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        no_repeat_ngram_size=2,
    )
    
    # Convert the output the the responses.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response[len(instruction_prompt):]
    print(response)
    return jsonify({'response': response})

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=8501)

 * Serving Flask app '__main__'
 * Debug mode: off


# Evaluation of ChatBot

In [13]:
def get_response(query):
    clear_memory()
    
    # Generating Instruction Prompt
    instruction_prompt=generate_prompt(query, k=4)

    # Tokenize the input
    inputs = tokenizer(instruction_prompt,
                       max_length=4096,
                       truncation=True,
                       return_tensors="pt").to(device)
    
    # Generate a response
    clear_memory()
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,
        no_repeat_ngram_size=2,
    )
    
    # Convert the output the the responses.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = response[len(instruction_prompt):]
    return response

In [14]:
queries = [
    "I want to know what are some of the fest that happen in NIT Trichy around the year?",
    "What are the three most famous technical clubs in NIT Trichy?",
    "Can you tell me something about placements at NIT Trichy?",
    "Can I get Music related opprtunities in college?",
    "What are the courses offered by NIT Trichy?",
    "Can I get Dance related opprtunities in college?",
    "What are some of the things I should take care of before joining NIT Trichy?",
    "What are the main research areas that are being worked upon?",
    "Can I get sports related opprtunities in college?",
    "Which course is really famous in NIT Trichy?",
    "Suggest me some places near NIT Trichy which I can visit about joining NIT Trichy?",
    "Tell me about the library related facilities in NIT Trichy?",
    "Tell me about computer center and computer lab at NIT Trichy?",
    "Is there scope of doing research if I take admission in NIT Trichy?",
    "Tell me about the hostel and mess related information.",
    "Does the food in mess is edible? What are the different varieties of food provided?",
    "Shall I pursue M.Sc. from NIT Trichy? Can I secure good placements from there?",
    "Is this college good for M.C.A or B.Tech?",
    "Can you tell me more about the Computer Applications Department?",
    "What is the campus size of the NIT Trichy?"
]
len(queries)

20

In [15]:
for query in queries:
    response = get_response(query)
    print("="*100)
    print(f"Query: {query}")
    print(f"Response: {response}")
    print("="*100)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: I want to know what are some of the fest that happen in NIT Trichy around the year?
Response: The college calendar has many fest happening around year, some are technical and some cultural. Some of them are Pragnyan, Nittfest, festember, convocation day, alumni day and sports day.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: What are the three most famous technical clubs in NIT Trichy?
Response: The three technical club in National Institute of Technology Trichi are: 
    1. Robotics & Machine intelligence club (R&MI)
2. Designers club
3. PSI Racing club


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you tell me something about placements at NIT Trichy?
Response: richi has a very good placement record. The average package is around 8 lakhs per annum. Companies like TCS, Infosys, Wipro, HCL, Amazon, Flipkart, Microsoft, Oracle, Capgemini, Deloitte, Accenture, Cognizant, Tech Mahindra, etc visit the campus for placememt. Some of them are from core branches like ECE, EEE, ME, CE, IT, CS, EC, IN, EE, Civil, Mechanical, Aeronautical, Biotechnology, Bioinformatics, Physics, Chemistry, Mathematics, Statistics, Geology, Computer Science, Information Science and Architecture.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can I get Music related opprtunities in college?
Response: you can get a lot of opportunities in Music. You can join the music club and represent the college in various inter college events and competitions and also you will get to learn a new skill.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: What are the courses offered by NIT Trichy?
Response: The National Institute of Technology, Tiruchirappalli (NATIONAL INSTITUTE OF TECHNOLOGY, TRICHY) is a public engineering institution located in Tiruchi, Tamil Nadu, India. It was established in 1964 as Regional Engineering College, Madras. In 1970, it was renamed as National Engineering School, Karaikudi. On 1 May 1987, the institution was granted the status of a Deemed University by the UGC.
The institute has been accredited by NAAC with 'A' grade and is an ISO 9001:2008 certified institution. NATIONAL Institute Of Technology Trichi is one of India's top engineering colleges and ranks among the top 10 engineering institutes in India according to NIRF rankings.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can I get Dance related opprtunities in college?
Response:  you can get various opportunities in Dance. You can join the Dance Troupes in your college. Apart from that there will be various intercollege competitions where you will get a chance to showcase your talent.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: What are some of the things I should take care of before joining NIT Trichy?
Response: 1.  Make a list of all your requirements and expectations from the college.
2.   Make an informed decision based on your list. 
3.    Make arrangements for your stay.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: What are the main research areas that are being worked upon?
Response: The main areas of research are: 
    1. Design and Development of Machine Tools and Automation
2. Manufacturing Processes and Productivity Improvement
3. Design of Machines and Machine Elements
4. Computer Aided Design, Computer Integrated Manufacturing and Computer Simulation
5. Robotics and Mechatronics
6. Modelling and Simulations of Manufacturing Systems
7. Product Design for Manufacturing
8. Advanced Materials and Process Engineering
9. Energy Efficient Manufacturing


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can I get sports related opprtunities in college?
Response: , you can get various opportunities in sports like playing sports, organising sports events, joining sports clubs, etc.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Which course is really famous in NIT Trichy?
Response: he course which is famous and popular in National institute of technology, trichy, is Computer Science and Engineering.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Suggest me some places near NIT Trichy which I can visit about joining NIT Trichy?
Response: There are many places around Nit Trichi which you can explore. Some of them are: 
    1. Rock Fort Temple: This temple is very famous and is just 2 km away from Nitt Trichia.
2. Kallanai Dam: It's a beautiful dam which is around 20km away. One can enjoy boating in this dam. 
3. Srirangam Temple : This famous temple of Lord Ranganathar is at a distance of around 30km from nit trichy and can be reached by local buses.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Tell me about the library related facilities in NIT Trichy?
Response: he library of Nitt Trichi has a collection of more than 1,50,000 books, journals, and periodicals. The library has an open access system and is fully automated. There are separate reading rooms for students and faculty members. A total of 10,500 students are enrolled in various courses in this institute.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Tell me about computer center and computer lab at NIT Trichy?
Response: The National institute of technology, trichy has a centralized Computer center with 300 computers and a separate computer labs for each department with about 100 computers. The computer centre has all the software required for various branches of engineering.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Is there scope of doing research if I take admission in NIT Trichy?
Response: Yes, there is a lot of scope for doing Research in various fields if you take Admissions in National institute of technology trichy.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Tell me about the hostel and mess related information.
Response: The institute has excellent hostels for its student. It provides all amenities to make their life worthwhile. Hostellers are required to follow the rules and regulations of hostellers.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Does the food in mess is edible? What are the different varieties of food provided?
Response: The food served in NITT mess are edible and are prepared by professional chefs. The food is prepared in a hygienic environment and is served hot. There is a variety of North and South indian food.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Shall I pursue M.Sc. from NIT Trichy? Can I secure good placements from there?
Response: Nit Trichi is one of best engineering college in Tamilnadu. It is also one among top 10 engineering colleges in india. You can pursue your Msc from here. Placements are good as it is an Nit.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Is this college good for M.C.A or B.Tech?
Response: CA is not offered by this institution. However, B Tech in CSE is offered.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you tell me more about the Computer Applications Department?
Response: e Department is the pioneer in Information technology education.  It offers MCA, MSc Computer science, and Mtech Data analytics.



Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: What is the campus size of the NIT Trichy?
Response: The Nitt Trichi campus is spread over a vast area of about 1000 acres.


In [14]:
queries = [
    "Can you give me the names of the famous sports clubs in NIT Trichy?",
    "Can you give me the names of the famous music clubs in NIT Trichy?",
    "Can you give me the names of the famous dance clubs in NIT Trichy?",
    "Is there any famous restaurent in the NIT Trichy campus?",
    "How far is it from the Railway Station?",
    "Can you tell me about VLSI program in NIT Trichy?",
]
len(queries)

6

In [15]:
for query in queries:
    response = get_response(query)
    print("="*100)
    print(f"Query: {query}")
    print(f"Response: {response}")
    print("="*100)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you give me the names of the famous sports clubs in NIT Trichy?
Response: he famous sport clubs of National Institute of Technology, Tiruchi are: 
    1) Basketball
2) Cheass
3) Cricket
4) Hand Ball
5) Kabaddi
6) Table tennis
7) Tenni


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you give me the names of the famous music clubs in NIT Trichy?
Response: he famous Music clubs are Music troup, Amrutavarshini, The thespian society,  Music troupe.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you give me the names of the famous dance clubs in NIT Trichy?
Response: he famous Dance Club in National Institute of Technology Trichi is Dance Troup.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Is there any famous restaurent in the NIT Trichy campus?
Response: There is no famous restaurant in our college campus but there are many food joints in and around the college.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: How far is it from the Railway Station?
Response: The Nitt is 20 Km from railway station.


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query: Can you tell me about VLSI program in NIT Trichy?
Response: The VLSi program at Nit Trichi is a three year program. It is one of its kind in India. This program is unique in its curriculum and pedagogy. Students are exposed to a lot of industry oriented projects and are trained to work in teams. They are also trained in soft skills and communication.


In [16]:
response = get_response(
    "Is there NCC in the college as I would like to pursue my interest of NCC in college."
)
print(response)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Yes, there is Ncc in this college and you can join it.
