# Advanced AI Assistant for An Website

Scraping data using advanced loader (Unstructured Loader)

### Loading Data from Website URLS

In [28]:
# Define the list of pages of sonfmeets.com website
page_list = [
    "https://softmeets.com/",
    "https://softmeets.com/automation/",
    "https://softmeets.com/internet-of-things/",
    "https://softmeets.com/artificial-intelligence/",
    "https://softmeets.com/analytics/",
    "https://softmeets.com/about-us/",
    "https://softmeets.com/customers/",
    "https://softmeets.com/partners/",
    "https://softmeets.com/contact-us/",
    "https://softmeets.com/vetting-of-detail-project-report-of-automation-of-drinking-water-project-with-national-institute-of-technology-durgapur/",
    "https://softmeets.com/implementation-of-document-management-system-at-sail-bokaro-steel-plant/",
    "https://softmeets.com/application-development-system-modernization-and-support-upv-tms/",
    "https://softmeets.com/barqat/",
    "https://softmeets.com/rbms/",
    "https://softmeets.com/upv/",
    "https://softmeets.com/cms/",
    "https://softmeets.com/privacy-policy/"
]


In [29]:
# Import libraries
from unstructured.partition.html import partition_html
import requests
from langchain.schema import Document
from io import BytesIO

In [30]:
# Image filtering parameters
MIN_IMG_WIDTH = 100
MIN_IMG_HEIGHT = 100
EXCLUDE_KEYWORDS = ['logo', 'icon', 'favicon', 'social', 'profile', 'linkedin', 'twitter', 'facebook']

In [31]:
def is_valid_image(url):
    """Filter small/icon/logo images."""
    url_lower = url.lower()
    if any(kw in url_lower for kw in EXCLUDE_KEYWORDS):
        return False
    try:
        img = Image.open(BytesIO(requests.get(url, timeout=10).content))
        width, height = img.size
        return width >= MIN_IMG_WIDTH and height >= MIN_IMG_HEIGHT
    except:
        return False

In [33]:
documents = []
seen_texts = set()
seen_images = set()

for page in page_list:
    try:
        html = requests.get(page).text
        elements = partition_html(text=html)

        for el in elements:
            if el.category == 'Image' and el.metadata.image_url:
                image_url = el.metadata.image_url.strip()
                if image_url not in seen_images:
                    seen_images.add(image_url)
                    documents.append(Document(
                        page_content="",  # You will fill this later with OCR
                        metadata={"source": page, "image_url": image_url, "type": el.category}
                    ))

            elif el.category == "Table":
                table_content = el.metadata.text_as_html or el.text
                if table_content:
                    table_key = table_content.strip()
                    if table_key not in seen_texts:
                        seen_texts.add(table_key)
                        documents.append(Document(
                            page_content=table_key,
                            metadata={"source": page, "type": el.category}
                        ))

            elif el.category not in ["ListItem", "UncategorizedText"]:
                text = el.text.strip()
                if text and text not in seen_texts:
                    seen_texts.add(text)
                    documents.append(Document(
                        page_content=text,
                        metadata={"source": page, "type": el.category}
                    ))

    except Exception as e:
        print(f"[ERROR] Skipping page {page}: {e}")

In [34]:

len(documents)

364

In [35]:
documents

[Document(metadata={'source': 'https://softmeets.com/', 'image_url': '//softmeets.com/wp-content/uploads/2024/11/soft-logo-1.png', 'type': 'Image'}, page_content=''),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Your modernization journey starts here.'),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Make technology your growth partner, Optimize operations and improve efficiency.'),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Serving up transformation & modernization solutions using:'),
 Document(metadata={'source': 'https://softmeets.com/', 'image_url': 'https://softmeets.com/wp-content/uploads/2024/11/automation-icon.png', 'type': 'Image'}, page_content=''),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Automation'),
 Document(metadata={'source': 'https://softmeets.com/', 'image_url': 'https://softmeets.com/wp-conten

### Process the Image to extract Text from it

In [36]:
# Import libraries
import cv2
import numpy as np
from PIL import Image
from io import BytesIO
import pytesseract
from urllib.parse import urlparse
from transformers import BlipProcessor, BlipForConditionalGeneration


In [37]:
# Load captioning model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

In [38]:
# Normalize the Image URL
def normalize_url(url):
    if url and not urlparse(url).scheme:
        return 'https:' + url  # Default to https
    return url

In [39]:
# Preprocess image before OCR
def preprocess_image_for_ocr(image):
    # Convert to grayscale
    gray = image.convert('L')

    # Convert to OpenCV format
    open_cv_image = np.array(gray)
    open_cv_image = cv2.cvtColor(open_cv_image, cv2.COLOR_GRAY2BGR)

    # Resize to improve OCR accuracy
    open_cv_image = cv2.resize(open_cv_image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)

    # Thresholding
    _, thresh = cv2.threshold(cv2.cvtColor(open_cv_image, cv2.COLOR_BGR2GRAY), 150, 255, cv2.THRESH_BINARY)

    return Image.fromarray(thresh)

In [40]:
# Extract text from the images
def extract_text_from_image_url(image_url):
    try:
        image_url = normalize_url(image_url)
        response = requests.get(image_url)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        preprocessed_image = preprocess_image_for_ocr(image)
        ocr_text = pytesseract.image_to_string(preprocessed_image, config="--psm 6")
        return ocr_text.strip()
    except Exception as e:
        print(f"[OCR Error] {image_url} => {e}")
        return ""

In [41]:
def generate_caption(image):
    try:
        inputs = processor(image, return_tensors="pt")
        out = model.generate(**inputs)
        return processor.decode(out[0], skip_special_tokens=True)
    except:
        return None

In [42]:
ocr_image_docs = []

for doc in documents:
    if doc.metadata.get("type") == 'Image':
        image_url = doc.metadata.get("image_url")
        ocr_text = extract_text_from_image_url(image_url).strip()
        # Process image
        try:
            image = Image.open(BytesIO(requests.get(image_url).content)).convert('RGB')
            print(image_url)
        except Exception as e:
            print(f"[Image Reading Error] {image_url} => {e}")
            continue

        caption = generate_caption(image)

        if ocr_text and len(ocr_text) > 10:
            combined_text = f"OCR: {ocr_text}"
            if caption:
                combined_text += f"\nCaption: {caption}"
        else:
            combined_text = caption if caption else "Image with no readable text or caption"
        doc.page_content = combined_text
    ocr_image_docs.append(doc)


Bd
[Image Reading Error] //softmeets.com/wp-content/uploads/2024/11/soft-logo-1.png => Invalid URL '//softmeets.com/wp-content/uploads/2024/11/soft-logo-1.png': No scheme supplied. Perhaps you meant https:////softmeets.com/wp-content/uploads/2024/11/soft-logo-1.png?
orm,
« s
= a
a e
Giz an?
https://softmeets.com/wp-content/uploads/2024/11/automation-icon.png
TNS

\ \ aN

r\ —
|| \
if /
/ | /
f |

/ [4 c
https://softmeets.com/wp-content/uploads/2024/11/world.png
aS
https://softmeets.com/wp-content/uploads/2024/11/artificial-intelligence-icon.png
/
a“
a
a a?
https://softmeets.com/wp-content/uploads/2024/11/doughnut.png
| by 2
PO “sate My
Mh gam day,
ST a
on 1! *
att, et a
ig NG cau oe <
a AGS”,
i + a
|
https://softmeets.com/wp-content/uploads/2024/11/Rail-logo.png
S|
https://softmeets.com/wp-content/uploads/2024/11/Sail-logo.png
||
https://softmeets.com/wp-content/uploads/2024/11/Ministry-of-Communicationlogo.png
INDIAN ARMY
https://softmeets.com/wp-content/uploads/2024/11/indian-army-lo



e ”*
N hw
Ve a | [ y




https://softmeets.com/wp-content/uploads/2024/11/customer.png
! JENS: eg SME SYD): 3
[lags ‘how
ay fut | _/ ( ) ee LOs

IGS eK: Sy

eg Re OE ODN,
Sa rl LEAS) co a ~
https://softmeets.com/wp-content/uploads/2024/11/clw-indian-railways-logo.png
2
a
https://softmeets.com/wp-content/uploads/2024/11/Partners.jpg
-
https://softmeets.com/wp-content/uploads/2024/11/hp.png

https://softmeets.com/wp-content/uploads/2024/12/microsoft-logo.png

https://softmeets.com/wp-content/uploads/2024/12/Schneider_Electric-Logo.png
ELNOMA
https://softmeets.com/wp-content/uploads/2024/12/elnovapower_logo.jpg
Roof Slab
200mm thick
700mm thick TM 4m |{ Sm | 4m [don | 4m
SB ULC TUE ETERS IEEE RETR oe Gallery id cane imenpepnsameaiue an pany
a ae tials inbatetelbtatataiatatattatet ie ata 110cam thick abl ) Tige twgpeeeeceremeres
= — Floor Beam — LT Cofnma
| (1000 *1000)anm Se Ve all 200 * 400}
3m 3n | >
ros / Wall TNs
—j—-— 200mm thick ——— MP) 2 neste GA
EAT stam
; : SIA ev thick wall
° Cofuan = eA ee
(600° 600}ou

In [43]:
ocr_image_docs

[Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Your modernization journey starts here.'),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Make technology your growth partner, Optimize operations and improve efficiency.'),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Serving up transformation & modernization solutions using:'),
 Document(metadata={'source': 'https://softmeets.com/', 'image_url': 'https://softmeets.com/wp-content/uploads/2024/11/automation-icon.png', 'type': 'Image'}, page_content='OCR: orm,\n« s\n= a\na e\nGiz an?\nCaption: a computer with a gear wheel and a red gear wheel'),
 Document(metadata={'source': 'https://softmeets.com/', 'type': 'Title'}, page_content='Automation'),
 Document(metadata={'source': 'https://softmeets.com/', 'image_url': 'https://softmeets.com/wp-content/uploads/2024/11/world.png', 'type': 'Image'}, page_content='OCR: TNS\n\n

### Split the documents into chunks

In [44]:
# Import libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [45]:
splitter = RecursiveCharacterTextSplitter(chunk_size= 500, chunk_overlap= 50)

In [47]:
splitted_docs = splitter.split_documents(ocr_image_docs)

In [48]:
len(splitted_docs)

399

### Ingest in VectorDB

In [49]:
# import libraries
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

In [50]:
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [52]:
vector_db = Chroma.from_documents(
    documents = splitted_docs,
    embedding= embedding,
    collection_name="softmeets_web_info"
)

### Retrieve data and generate answer

In [53]:
retriever = vector_db.as_retriever()

In [58]:
# Import Libraries
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [59]:
llm = ChatOpenAI(model="gpt-4o")


In [60]:
# Define Prompt Template
template ="""
You are an assistant for question-answering task. Use the following pieces of retrieved context for answering the question.
If you don't know the answer, just say you don't know, don't try to make up an answer.
Your answer should be to the point and consice.
Question: {question}
Context: {context}
"""
prompt = ChatPromptTemplate.from_template(template)

In [61]:
# Define RAG Chain
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

### Test RAG chain

In [62]:
rag_chain.invoke("Tell me about Softmeets")

'Softmeets offers various analytics services that help businesses and organizations collect, process, analyze, and visualize data. These services are designed to enhance business intelligence, customer insights, operational efficiency, and predictive modeling.'

In [68]:
rag_chain.invoke("Tell me their Office address")

"I don't know."

In [67]:
rag_chain.invoke("Who are their clients?")

"I don't know who their clients are based on the provided context."

In [65]:
rag_chain.invoke("Any certfication do they have?")

'They have the ISO 9001:2015 certification and CMMI Maturity Level 5 certification.'

In [66]:
rag_chain.invoke("Tell me about the UPV?")

'UPV refers to the Unified Plant View, a system implemented at the SAIL IISCO Steel Plant.'

In [69]:
rag_chain.invoke("Which technologies they use?")

'They use SaaS, Automation, Internet of Things (IoT), Artificial Intelligence (AI), and Analytics technologies.'

In [71]:
rag_chain.invoke("Which tech stack they use for App development?")

"I don't know."