# Step 1: Mount Google Drive in Colab

In [1]:
# Mounting Google drive to access the NPTEL Data
# This brings in all Drive contents to Storage in Google Colab
# Folder link: https://drive.google.com/drive/folders/1DxFVBq2RQxmP_-N2hiAKxyY4FGHmwq2-?usp=sharing
# Copy the folder to your drive before using this notebook

from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!ls "/content/drive/MyDrive/Deep Learning"

 Assignments  'Lecture Slides'	'Lecture Transcripts.pdf'


# Step 2: Extract Text from PDFs

In [3]:
!pip install PyMuPDF



In [4]:
# Method to extract text from PDF

import fitz

def extract_text_from_pdf(pdf_path):
  """Return the text of every page of a PDF, concatenated in page order.

  Args:
    pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

  Returns:
    A single string made of each page's ``get_text()`` output, with no
    separator inserted between pages.
  """
  # The context manager closes the document even if a page fails to extract,
  # so file handles are not leaked when this runs over a whole folder.
  with fitz.open(pdf_path) as pdf:
    # join() avoids rebuilding the growing string once per page.
    return "".join(page.get_text() for page in pdf)

In [5]:
# Extract data from Assignments Folder

import os
assignment_folder = "/content/drive/MyDrive/Deep Learning/Assignments"
assignment_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# documents (and therefore of the chunks built from them) stable across runs.
for filename in sorted(os.listdir(assignment_folder)):
  # Only PDFs are handed to the extractor; anything else in the folder is skipped.
  if filename.endswith(".pdf"):
    pdf_path = os.path.join(assignment_folder, filename)
    text = extract_text_from_pdf(pdf_path)
    assignment_data.append({'title': filename, 'content': text})

In [6]:
assignment_data[0]

{'title': 'Assignment_1_2022.pdf',
 'content': '  \n \nNPTEL Online Certification Courses \nIndian Institute of Technology Kharagpur \nDeep Learning \nAssignment- Week 1 \nTYPE OF QUESTION:  MCQ/MSQ \nNumber of questions: 10  \n \n \n \n \n            Total mark: 10 X 2= 20  \n______________________________________________________________________________ \nQUESTION 1: \nSignature descriptor of an unknown shape is given in the figure, can you identify the unknown \nshape? \n \n \n \n \n \n \n \na. Circle \nb. Square \nc. Straight line \nd. Cannot be predicted  \nCorrect Answer: a \nDetailed Solution:  \nDistance from centroid to boundary is same for every value of ϴ. This is true for Circle \nwith a radius k. \n______________________________________________________________________________ \nQUESTION 2: \nTo measure the Smoothness, coarseness and regularity of a region we use which of the \ntransformation to extract feature? \na. Gabor Transformation \nb. Wavelet Transformation \nc. Both

In [7]:
# Extract Lecture Slides content

lecture_slides_path = "/content/drive/MyDrive/Deep Learning/Lecture Slides"
lecture_slides_data = []

# Sorted for a reproducible document order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(lecture_slides_path)):
  # Same filter as the assignments cell: a stray non-PDF entry in the folder
  # is skipped instead of being passed to the PDF extractor.
  if filename.endswith(".pdf"):
    pdf_path = os.path.join(lecture_slides_path, filename)
    text = extract_text_from_pdf(pdf_path)
    lecture_slides_data.append({'title': filename, 'content': text})


In [8]:
lecture_slides_data[0]

{'title': 'WEEK 1.pdf',
 'content': 'Course Name: Deep Learning \nFaculty Name: Prof. Prabir Kumar Biswas\nDepartment : E &  ECE, IIT Kharagpur\nTopic\nLecture 01:  Introduction\nConcepts Covered:\n\uf071Deep Learning: An Introduction\n\uf071Descriptors/ Feature Vectors\n\uf071Machine Learning vs. Deep Learning\n\uf071Discriminative/ Generative Model\n\uf071Challenges\n\uf071Power of Deep Learning\nWhat is \nlearning?\nImage Source: Internet\nImage Source: Internet\nCan You Recognize these \nPictures   ?\n• If Yes, How do you Recognize it?\nOrigin of Machine Learning?\nImage Source: Internet\n…..Lies in very early efforts of understanding \nIntelligence.\n• What is Intelligence?\n• It can be defined as the ability to comprehend; to \nunderstand and profit from experience. \n• Capability to acquire and Apply Knowledge.\nLearning?\nImage Source: Internet\n2300 Years ago….\n• Plato (427-347 BC )\n• The concept of Abstract Ideas \nare known to us a priori, through\na Mystic connection with

In [9]:
# Extract content from Lecture Transcripts

def extract_text_from_book(pdf_path,start,end):
  """Return the concatenated text of pages ``start``..``end`` of a PDF.

  Args:
    pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
    start: First page to extract, 1-based.
    end: Last page to extract, 1-based and inclusive.

  Returns:
    The ``get_text()`` output of each page in the range, joined with no
    separator. An empty string if ``end < start``.

  Raises:
    ValueError: If ``start`` is smaller than 1.
  """
  # start - 1 would become a negative page index for start < 1, which would
  # select the wrong page instead of failing; reject it explicitly.
  if start < 1:
    raise ValueError(f"start must be >= 1 (1-based page number), got {start}")

  # The context manager closes the document once the pages are read.
  with fitz.open(pdf_path) as pdf:
    # Convert the 1-based inclusive range to 0-based page indices.
    return "".join(pdf[page_num].get_text() for page_num in range(start-1,end))

In [10]:
# The transcripts are a single PDF; extract_text_from_book takes a 1-based,
# inclusive page range, so this reads pages 5 through 920.
# NOTE(review): presumably pages 1-4 are front matter and 920 is the last
# content page -- confirm against the PDF.
lecture_transcripts_path = "/content/drive/MyDrive/Deep Learning/Lecture Transcripts.pdf"
lecture_transcripts_data = extract_text_from_book(lecture_transcripts_path,5,920)

In [11]:
# Extract first 500 characters

lecture_transcripts_data[:500]

'Deep Learning\nProf. Prabir Kumar Biswas\nDepartment of Electronics And Electrical Communication Engineering\nIndian Institute of Technology, Kharagpur\nLecture – 01\nIntroduction\nHello. Welcome to the NPTEL certification course on Deep Learning. So, in today we are\ngoing to introduce the content of this course and we are going to talk about that what all we\nwill be covering in this lecture series on Deep Learning.\n(Refer Slide Time: 00:49)\nSo, the topics that I will covered is; obviously, the first '

# Step 3: Combine all the data and Chunk them  

In [12]:
# Merge the three sources into one list of {'type', 'title', 'content'} records:
# assignments first, then lecture slides, then the single transcripts document.
assignment_records = [
    {'type':'assignments','title': item['title'], 'content': item['content']}
    for item in assignment_data
]
slide_records = [
    {'type':'Lecture Slides','title': item['title'], 'content': item['content']}
    for item in lecture_slides_data
]
transcript_record = {
    'type':'Lecture Transcripts',
    'title':'Lecture Transcripts',
    'content': lecture_transcripts_data,
}

all_data = assignment_records + slide_records + [transcript_record]

In [13]:
all_data[0]

{'type': 'assignments',
 'title': 'Assignment_1_2022.pdf',
 'content': '  \n \nNPTEL Online Certification Courses \nIndian Institute of Technology Kharagpur \nDeep Learning \nAssignment- Week 1 \nTYPE OF QUESTION:  MCQ/MSQ \nNumber of questions: 10  \n \n \n \n \n            Total mark: 10 X 2= 20  \n______________________________________________________________________________ \nQUESTION 1: \nSignature descriptor of an unknown shape is given in the figure, can you identify the unknown \nshape? \n \n \n \n \n \n \n \na. Circle \nb. Square \nc. Straight line \nd. Cannot be predicted  \nCorrect Answer: a \nDetailed Solution:  \nDistance from centroid to boundary is same for every value of ϴ. This is true for Circle \nwith a radius k. \n______________________________________________________________________________ \nQUESTION 2: \nTo measure the Smoothness, coarseness and regularity of a region we use which of the \ntransformation to extract feature? \na. Gabor Transformation \nb. Wavelet 

In [14]:
# Install langchain, ensuring a successful installation
!pip install langchain



In [15]:
# Import the necessary class from the text_splitters submodule
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the splitter used to cut every document into retrieval-sized chunks
splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 50)

# chunk_size    - upper bound on the size of each chunk (500 here)
# chunk_overlap - amount of text from the end of the previous chunk that is
#                 repeated at the start of the next one (50 here), so that
#                 context and continuity are kept across chunk boundaries

In [16]:
# Split every document and tag each piece with the type/title of its source,
# keeping document order and, within a document, the splitter's chunk order.
chunks = []

for doc in all_data:
  pieces = splitter.split_text(doc['content'])
  chunks.extend(
      {'type': doc['type'], 'title': doc['title'], 'content': piece}
      for piece in pieces
  )

In [17]:
chunks[0]

{'type': 'assignments',
 'title': 'Assignment_1_2022.pdf',
 'content': 'NPTEL Online Certification Courses \nIndian Institute of Technology Kharagpur \nDeep Learning \nAssignment- Week 1 \nTYPE OF QUESTION:  MCQ/MSQ \nNumber of questions: 10  \n \n \n \n \n            Total mark: 10 X 2= 20  \n______________________________________________________________________________ \nQUESTION 1: \nSignature descriptor of an unknown shape is given in the figure, can you identify the unknown \nshape? \n \n \n \n \n \n \n \na. Circle \nb. Square \nc. Straight line \nd. Cannot be predicted'}

# Step 4: Generate Embeddings and store them in a database

In [18]:
# Install the community package for Langchain,
# which contains the SentenceTransformerEmbeddings
!pip install -U langchain-community



In [19]:
# NOTE(review): the previous cell installs langchain-community and describes it
# as the package that contains SentenceTransformerEmbeddings, yet the class is
# imported through `langchain.embeddings` here -- confirm this import path is
# still valid for the installed (unpinned) langchain version.
from langchain.embeddings import SentenceTransformerEmbeddings
# Initialise the embedder model
embedder = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")

# Generates embeddings of size 384 dimensions
# Lightweight model known for speed and accuracy

  embedder = SentenceTransformerEmbeddings(model_name = "all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
# Embed all chunks with one batched call instead of one embed_query call per
# chunk; the result is still a list with one vector per chunk, in chunk order.
embeddings = embedder.embed_documents([chunk['content'] for chunk in chunks])

In [21]:
# What does an embedding look like?
embeddings[0]

[-0.056526072323322296,
 0.0377962701022625,
 -0.021692384034395218,
 -0.050129108130931854,
 -0.057667382061481476,
 -0.040188979357481,
 -0.013110242784023285,
 0.06838851422071457,
 -0.055620260536670685,
 -0.0633893609046936,
 0.0020593483932316303,
 -0.07261589914560318,
 0.05651377514004707,
 0.021594932302832603,
 -0.05338096246123314,
 -0.03354936093091965,
 -0.051579877734184265,
 0.020002983510494232,
 -0.02720060385763645,
 0.034808773547410965,
 0.04330300912261009,
 0.003262357320636511,
 -0.009122271090745926,
 0.013676169328391552,
 0.03857679292559624,
 0.02318454161286354,
 0.1410757452249527,
 -0.024883223697543144,
 -0.020535718649625778,
 -0.09442716091871262,
 -0.028317080810666084,
 0.018512891605496407,
 -0.053184717893600464,
 0.03569025546312332,
 0.0547134131193161,
 -0.05006833001971245,
 -0.07479299604892731,
 0.11749596148729324,
 0.07243301719427109,
 -0.07464410364627838,
 0.019983438774943352,
 -0.021374864503741264,
 0.1021139919757843,
 0.0125287184491

In [22]:
# Store the embeddings in FAISS database/ index

!pip install faiss-cpu



In [23]:
import faiss
import numpy as np

# FAISS works on float32 matrices of shape (n_vectors, dimension). Build that
# matrix explicitly instead of relying on an implicit conversion from the
# float64 array that np.array() would create from Python floats.
embedding_matrix = np.array(embeddings, dtype="float32")

dimension = embedding_matrix.shape[1]
# Flat (exhaustive) index that ranks neighbours by L2 distance.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_matrix)

In [24]:
# One metadata record per chunk, in the same order as the vectors added to the
# index, so that position i in the index corresponds to metadata[i].
metadata = [{'type':chunk['type'],'title':chunk['title'],'content':chunk['content']} for chunk in chunks]

# Persist both halves of the store to the current working directory.
faiss.write_index(index,'deepbot_faiss_index')
# metadata is a list of dicts, so it is stored as a pickled object array;
# the reload below has to pass allow_pickle=True for that reason.
np.save("deepbot_metadata.npy",metadata)

# Step 5: Testing retrieval using Query

In [25]:
# Reload the index and its metadata from the files written in the previous cell

index = faiss.read_index("deepbot_faiss_index")
# NOTE(review): allow_pickle=True unpickles the file contents, which can run
# arbitrary code -- only load a metadata file that this notebook produced.
metadata = np.load("deepbot_metadata.npy",allow_pickle=True)

def retrieve(query, k = 3):
  """
      Retrieves the metadata records of the k chunks closest to the query.

      The query is embedded with the global `embedder` and searched in the
      global FAISS `index`; closeness is the index's L2 (Euclidean) distance.

      Args:
        query: Question text to embed and search for.
        k: Maximum number of chunks to return (default 3).

      Returns:
        A list of at most k entries of the global `metadata`, nearest first.
        It is shorter than k when the index reports fewer than k neighbours.
  """

  embed_query = embedder.embed_query(query)
  # FAISS expects a float32 matrix of shape (n_queries, dimension).
  query_matrix = np.asarray([embed_query], dtype="float32")
  distances, indices = index.search(query_matrix,k)
  # Returns the distances of the top k embeddings and
  # the indices of those embeddings in the index

  # FAISS fills missing neighbours with the id -1; skip those, otherwise
  # metadata[-1] would silently return the last chunk as a "result".
  results = [metadata[i] for i in indices[0] if i >= 0]
  # The corresponding chunks (which contain human understandable content)
  # are retrieved from metadata.

  return results

In [26]:
# Smoke-test the retriever: print the type, title and text of each hit,
# separated by a blank line.
query = "Explain topics in Week 1 slides"

results = retrieve(query)
for hit in results:
  for label, key in (("Type", "type"), ("Title", "title"), ("Content", "content")):
    print(label + ": " + hit[key])
  print()

Type: Lecture Transcripts
Title: Lecture Transcripts
Content: (Refer Slide Time: 00:49)
So, the topics that I will covered is; obviously, the first one is an introduction to deep
learning, then we will talk about that when we learn something how do we learn, in the sense
that we learned certain properties or certain features or certain descriptors using which we
recognize an event or we recognize an object. So, we are going to have a brief introduction to
what are descriptors or what are feature vectors.

Type: Lecture Transcripts
Title: Lecture Transcripts
Content: 22
(Refer Slide Time: 30:57)
Similarly, we can also compose videos with different styles.
(Refer Slide Time: 31:13)
So, all these are possible using modern day planning techniques and there were multiple
applications like in medical image processing, in object recognition, in speech recognition
and so on. So, we will talk about all these different aspects of these deep learning techniques
23
in subsequent lectures of this c

# Step 6: Creation of Generator

In [27]:
!pip install gradio transformers sentence-transformers



In [28]:
import gradio as gr
from transformers import AutoTokenizer,AutoModelForSeq2SeqLM

# Generator for the RAG pipeline: a sequence-to-sequence checkpoint loaded from
# the Hugging Face Hub together with its matching tokenizer. The first run
# downloads the weights (model.safetensors is about 3.13 GB in the log below).
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [37]:
# Function to generate response

def generate_response(prompt):
  """Generate an answer for `prompt` with the global tokenizer and model.

  The prompt is tokenized to PyTorch tensors and truncated to 512 tokens.
  Generation uses beam search (4 beams, early stopping) with a maximum
  length of 150, and the first returned sequence is decoded with special
  tokens removed.

  Args:
    prompt: Full prompt text to send to the model.

  Returns:
    The decoded text of the first generated sequence.
  """
  encoded = tokenizer(
      prompt,
      return_tensors = "pt",
      padding = True,
      truncation = True,
      max_length = 512,
  )
  generated = model.generate(
      **encoded,
      max_length = 150,
      num_beams = 4,
      early_stopping = True,
  )
  best_sequence = generated[0]
  return tokenizer.decode(best_sequence, skip_special_tokens = True)

# Step 8: Integrate the Retriever and Generator

In [31]:
def ask_rag(query,retriever,generator,system_prompt):
  """Answer `query` with retrieval-augmented generation.

  Args:
    query: The user's question.
    retriever: Callable taking the query and returning an iterable of chunk
      records, each with a "content" entry.
    generator: Callable taking the assembled prompt string and returning the
      model's answer.
    system_prompt: Instructions placed at the top of the prompt to guide the
      model's behaviour.

  Returns:
    Whatever `generator` returns for the assembled prompt.
  """

  # Use the retriever that was passed in rather than a hard-wired global, so
  # the parameter actually controls where the context comes from.
  retrieved_chunks = retriever(query)
  context = "\n".join(chunk["content"] for chunk in retrieved_chunks)
  # Prompt = System Prompt + Context (retrieved chunks) + Question (user query)
  prompt = f"{system_prompt}\n\nContext:{context}\n\nQuestion:{query}\n\nAnswer:"
  response = generator(prompt)

  return response

# Step 9: Create the interface with gradio

In [41]:
system_prompt = """
You are a kind, emotionally-aware teaching assistant trained on my course PDFs for the subject "Deep Learning" Your name is DeepBot.

Your behavior must replicate a supportive AI like ChatGPT...
"""
def ask_rag_interface(query):
  """Gradio callback: answer `query` through the full RAG pipeline.

  Binds the notebook's retriever, generator and system prompt so that the UI
  only has to supply the question text.
  """
  return ask_rag(query,retrieve,generate_response,system_prompt)

# Gradio Interface
# fn must be the RAG wrapper: wiring generate_response directly would send the
# raw question to the model with no retrieved context and no system prompt.
interface = gr.Interface(
    fn=ask_rag_interface,
    inputs=gr.Textbox(label="Ask your Question"),
    outputs=gr.Textbox(label="Answer"),
    title="DeepBot: Your Deep Learning TA",
    description="Ask DeepBot any question about Deep Learning!"
)

interface.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4b50abd83f545c2c04.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


