In [1]:
! pip install streamlit
! pip install whoosh
! pip install beautifulsoup
! pip install lxml
! pip install pyngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting streamlit
  Downloading streamlit-1.21.0-py2.py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Collecting pympler>=0.9
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog
  Downloading watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
Collecting blinker>=1.0.0
  Downloading blinker-1.6.1-py3-none-any.whl (13 kB)
Collect

In [2]:
import requests
import validators
from bs4 import BeautifulSoup
from queue import Queue

from whoosh.index import create_in, open_dir
from whoosh.fields import *
import os.path

In [3]:
# Prepare the Whoosh index that stores the crawled articles.
from whoosh.index import exists_in  # only needed by this setup step

# Schema describing what is indexed for every article:
# title, path (the article URL) and date are stored so they can be shown in results;
# content is indexed for searching but not stored.
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), date=TEXT(stored=True), content=TEXT)

# Make sure the folder holding the index files exists (no error if it already does).
os.makedirs("index", exist_ok=True)

# Re-open an existing index; otherwise create a fresh one. Checking for an actual
# index (not just the folder) keeps the cell working when "index" exists but is empty.
if exists_in("index"):
    ix = open_dir("index")
else:
    ix = create_in("index", schema)

# Writer used by the crawler to add documents; it must be committed afterwards.
writer = ix.writer()

In [4]:
# Crawl configuration: the start page and how many levels of links to follow
# Seed URL the crawl starts from
url = 'https://vnexpress.net/'
# NOTE(review): not referenced by any other visible cell — confirm before removing
already_searched = [url]

# Passed to retrieve_articles as max_level: the seed page counts as the first level
max_nlevel = 2

# Retrieve content of single article by its url
def retrieve_article(url: str) -> None:
    """Download one page and add it to the index through the module-level ``writer``.

    The page title, the text of every ``<p class="Normal">`` element and the text of
    the first ``<span class="date">`` element are extracted. When no date element is
    found the literal string ``"None"`` is stored as the date.

    Args:
        url: Address of the page to download and index; also stored as the
            document's ``path``.

    Returns:
        None. The document is added to ``writer`` but not committed here.
        A page that cannot be downloaded is reported and skipped.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Best effort: one unreachable page must not abort the crawl.
        print(f"Skipping {url}: {exc}")
        return
    soup = BeautifulSoup(response.content, 'lxml')

    # get page's title
    title = soup.title.text if soup.title else ''

    # get page's content; paragraphs are joined with a newline so the last word of
    # one paragraph is not glued to the first word of the next one in the index
    paragraphs = soup.find_all("p", {"class": "Normal"})
    text_contents = "\n".join(paragraph.get_text() for paragraph in paragraphs)

    # get page's published date
    date_tag = soup.find("span", {"class": "date"})
    date = date_tag.get_text() if date_tag is not None else "None"

    # add title, content of article for indexing purpose
    writer.add_document(title=title, path=url, date=date, content=text_contents)

# Crawl articles level by level (breadth-first) starting from a single url
def retrieve_articles(url: str, max_level: int) -> None:
    """Index the start page and the pages it links to, up to ``max_level`` levels.

    Level 1 is ``url`` itself, level 2 the pages linked from it, and so on. Every
    page of a level is handed to ``retrieve_article``; its links are then collected
    to form the next level. Each address is crawled at most once.

    Args:
        url: Address of the page the crawl starts from.
        max_level: Number of levels to crawl; ``0`` crawls nothing.

    Returns:
        None. Documents are added through ``retrieve_article``.
    """
    # Local import: resolves relative hrefs against the page they were found on.
    from urllib.parse import urljoin

    queue = Queue()
    queue.put(url)
    visited = {url}

    for level in range(max_level):
        if queue.empty():
            break
        # Links found on the last level would never be crawled, so collecting them
        # (which costs a second download of every page) is skipped there.
        is_last_level = level == max_level - 1

        # Only the pages queued right now belong to this level.
        for _ in range(queue.qsize()):
            page_url = queue.get()
            retrieve_article(page_url)

            if is_last_level:
                continue

            try:
                # A timeout keeps one unresponsive server from hanging the crawl.
                response = requests.get(page_url, timeout=10)
            except requests.RequestException as exc:
                # Best effort: skip the links of a page that cannot be downloaded.
                print(f"Skipping links of {page_url}: {exc}")
                continue
            soup = BeautifulSoup(response.content, 'lxml')

            for anchor in soup.find_all('a', href=True):
                # Drop the fragment, then make relative links absolute; without this
                # step relative hrefs fail URL validation and are silently lost.
                link = urljoin(page_url, anchor['href'].split('#')[0])
                if link not in visited and validators.url(link):
                    queue.put(link)
                    visited.add(link)

# Run the crawl, then persist everything that was added to the writer.
try:
    retrieve_articles(url, max_nlevel)
except BaseException:
    # The crawl failed or was interrupted: discard the pending documents and
    # release the writer so the index is not left locked, then re-raise.
    writer.cancel()
    raise
else:
    writer.commit()

In [5]:
%%writefile search_engine.py
# start streamlit
import streamlit as st
from whoosh.index import create_in, open_dir
from whoosh.fields import *
import os.path

# The browser tab title and the on-page heading share the same text
APP_TITLE = "VnExpress Articles Search Engine"
st.set_page_config(page_title=APP_TITLE, layout="wide")
st.title(APP_TITLE)

# Free-text query box; it starts out empty
text_search = st.text_input(label="Search Articles by title or content", value="")

# cache_resource (not cache_data): the index is a shared handle, not serializable data
@st.cache_resource
def assign_ix():
    """Open the Whoosh index stored in the ``index`` folder.

    Returns:
        The index object returned by ``open_dir("index")``.

    Raises:
        FileNotFoundError: If the ``index`` folder does not exist, i.e. the
            indexing step has not been run yet.
    """
    if not os.path.exists("index"):
        raise FileNotFoundError(
            "Index folder 'index' not found; run the crawling/indexing step first."
        )
    # open existing index files that were created by the indexing step
    return open_dir("index")

ix = assign_ix()

# start researching the document
from whoosh.query import *
from whoosh.qparser import QueryParser
from typing import List

# Full-text search over the indexed articles
def our_search(text_search: str) -> List[dict]:
    """Search the index for articles whose title or content match the query.

    Args:
        text_search: Query string typed by the user, in Whoosh query syntax.

    Returns:
        One dict per hit with the keys ``"title"``, ``"date"`` and ``"path"``,
        in the order returned by the searcher. An empty or whitespace-only
        query returns an empty list without touching the index.
    """
    # The search box starts out empty: there is nothing to look up yet.
    if not text_search or not text_search.strip():
        return []

    # Local import keeps this function self-contained.
    from whoosh.qparser import MultifieldParser

    with ix.searcher() as searcher:
        # Match against both fields, as the search box label promises.
        parser = MultifieldParser(["title", "content"], ix.schema)
        query = parser.parse(text_search)
        results = searcher.search(query)
        # Copy the stored fields out while the searcher is still open.
        return [
            {"title": hit["title"], "date": hit["date"], "path": hit["path"]}
            for hit in results
        ]



# Look up whatever is currently typed in the search box
results = our_search(text_search)

# Three side-by-side columns hold the results
col1, col2, col3 = st.columns(3)
columns = (col1, col2, col3)

# Deal the results out round-robin: the first goes to column 1, the second to
# column 2, the third to column 3, the fourth to column 1 again, and so on
for position, result in enumerate(results):
    with columns[position % 3]:
        st.markdown(f"***{result['title']}***")
        st.markdown(f"*{result['date']}*")
        st.markdown(f"**{result['path']}**")

        # horizontal rule separating this result from the next one in the column
        st.write("---")

Writing search_engine.py


In [6]:
# expose my streamlit app to the internet
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: the auth token is a credential and must not be stored in the notebook.
# It is read from the NGROK_AUTHTOKEN environment variable, falling back to a
# hidden interactive prompt.
# NOTE(review): a token that was ever saved in this notebook should be treated as
# leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)



In [7]:
# run streamlit at port 80 and connect ngrok to port 80 also
! nohup streamlit run search_engine.py --server.port 80 &
url = ngrok.connect(port='80')

# run the application by the public_url
print(url.public_url)

nohup: appending output to 'nohup.out'
http://8faa-34-90-10-104.ngrok-free.app
