# Webcrawl
ref: https://github.com/openai/openai-cookbook/blob/main/apps/web-crawl-q-and-a/web-qa.ipynb

In [1]:
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os

In [27]:
# Regex pattern to match an absolute http:// or https:// URL.
# `https?` accepts exactly one optional "s"; the previous `http[s]*` also
# accepted malformed schemes such as "httpss://".
HTTP_URL_PATTERN = r'^https?://.+'

# Root domain to crawl, the start URL, and the path prefix that crawled
# links must share
domain = "sjsu.edu"
full_url = "https://www.sjsu.edu/cmpe/"
subdomain = "/cmpe"

In [45]:
url_obj = urlparse("https://www.sjsu.edu/cmpe/research/tdd.pdf")
url_obj

In [46]:
url_obj.path.startswith("/cmpe")

In [47]:
url_obj.path.endswith(".pdf")

In [48]:
# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every ``<a>`` tag fed to the parser.

    After ``feed()`` returns, ``hyperlinks`` holds the raw href strings in
    document order. Duplicates are kept and no normalisation is applied.
    """

    def __init__(self):
        super().__init__()
        # Raw href values, in the order the anchors were encountered
        self.hyperlinks = []

    # Override the HTMLParser's handle_starttag method to get the hyperlinks
    def handle_starttag(self, tag, attrs):
        """Record the href of an anchor start tag.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is None for a valueless attribute such as
        ``<a href>``.
        """
        # The <a> tag defines a hyperlink; every other tag is ignored
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # A valueless href has no target to follow, and consumers of
        # ``hyperlinks`` expect strings, so None is never stored
        if href is not None:
            self.hyperlinks.append(href)

# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Fetch *url* and return the raw href values of its anchor tags.

    Returns an empty list when the URL cannot be opened or read, or when the
    response is not declared as ``text/html``. Errors are printed rather than
    raised so that one bad page does not abort a crawl.
    """
    try:
        # Open the URL and read the HTML
        with urllib.request.urlopen(url) as response:
            headers = response.info()

            # A missing Content-Type header is treated as "not HTML"
            content_type = headers.get('Content-Type') or ""
            if not content_type.startswith("text/html"):
                return []

            # Decode with the charset the server declares (UTF-8 when none is
            # given) and replace undecodable bytes, so that a page in another
            # encoding is still parsed instead of being discarded
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')
    except Exception as e:
        # Deliberately broad: any fetch/decode failure just skips this page
        print(e)
        return []

    # Create the HTML Parser and then Parse the HTML to get hyperlinks
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks

# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url, subdomain=""):
    """Return the de-duplicated links on *url* that stay inside *local_domain*.

    Args:
        local_domain: Host name (``netloc``) that links must belong to.
        url: Page whose anchors are inspected via ``get_hyperlinks``.
        subdomain: Optional path prefix (e.g. ``"/cmpe"``); when non-empty,
            only links whose path starts with it are kept.

    Returns:
        A list of absolute URLs without a trailing slash, in no particular
        order. Links ending in ``.pdf``, ``.jpg`` or ``.png`` (any letter
        case) are left out.
    """
    # Non-HTML assets that should never be queued for crawling
    skipped_extensions = (".pdf", ".jpg", ".png")

    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # A valueless ``<a href>`` may surface as None; nothing to follow
        if link is None:
            continue

        # Protocol-relative links ("//host/path") name their own host, so
        # promote them to absolute URLs and let the domain check below decide,
        # instead of gluing them onto local_domain as if they were paths
        if link.startswith("//"):
            link = "https:" + link

        # If the link is a URL, check if it is within the same domain
        if re.search(HTTP_URL_PATTERN, link):
            url_obj = urlparse(link)
            if url_obj.netloc != local_domain:
                continue
            if subdomain and not url_obj.path.startswith(subdomain):
                continue
            clean_link = link

        # If the link is not a URL, check if it is a relative link
        else:
            if subdomain and not link.startswith(subdomain):
                continue
            # Skip in-page fragments and anything carrying its own scheme
            # (mailto:, tel:, javascript:, ftp:, ...)
            if link.startswith("#") or urlparse(link).scheme:
                continue
            # NOTE(review): relative links are resolved against the domain
            # root, not against the directory of *url*
            path = link[1:] if link.startswith("/") else link
            clean_link = "https://" + local_domain + "/" + path

        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        if clean_link.lower().endswith(skipped_extensions):
            continue
        clean_links.add(clean_link)

    # Return the list of hyperlinks that are within the same domain
    return list(clean_links)

In [9]:
os.getcwd()

In [32]:
outputpath="./output"
# makedirs(exist_ok=True) lets this cell be re-run: os.mkdir raises
# FileExistsError once the directory is already there
os.makedirs(outputpath, exist_ok=True)
# The trailing "/" is kept on purpose: later code appends to this prefix
# with plain string concatenation
textoutputpath=os.path.join(outputpath,"text/")
textoutputpath

In [13]:
textoutputpath+local_domain+"/"

In [11]:
local_domain = urlparse(full_url).netloc

In [12]:
local_domain

In [50]:
#crawl the given url and output text files in textoutputpath
def crawl(url, textoutputpath, subdomain=""):
    """Crawl pages reachable from *url* and save each page's text to disk.

    Args:
        url: Start URL; its host name limits which links are followed.
        textoutputpath: Output directory prefix. It is concatenated directly
            with further path parts, so it should end with a path separator.
        subdomain: Optional path prefix passed to ``get_domain_hyperlinks``
            to restrict the crawl to one section of the site.

    Side effects:
        Creates ``<textoutputpath><host>/`` and ``<textoutputpath>processed``
        and writes one UTF-8 ``.txt`` file per fetched page, named after the
        URL with its scheme removed and every ``/`` replaced by ``_``.
        Progress and fetch failures are printed.
    """
    # Parse the URL and get the domain
    local_domain = urlparse(url).netloc

    # Create a queue to store the URLs to crawl
    queue = deque([url])

    # Create a set to store the URLs that have already been seen (no duplicates)
    seen = {url}

    # Create the directories for the text files and for the csv files.
    # makedirs also creates missing parents and tolerates existing folders.
    page_dir = textoutputpath + local_domain + "/"
    for directory in (textoutputpath, page_dir, textoutputpath + "processed"):
        os.makedirs(directory, exist_ok=True)

    # While the queue is not empty, continue crawling
    while queue:

        # pop() takes the most recently queued URL, so the traversal is
        # depth-first
        url = queue.pop()
        print(url) # for debugging and to see the progress

        # Fetch before opening the output file so that a failed request
        # neither leaves an empty file behind nor aborts the whole crawl.
        # The timeout keeps one unresponsive server from hanging the crawl.
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as e:
            print("Unable to fetch page " + url + ": " + str(e))
            continue

        # Get the text from the URL using BeautifulSoup, without the tags
        soup = BeautifulSoup(response.text, "html.parser")
        text = soup.get_text()

        # A page that requires JavaScript is reported, but its (placeholder)
        # text is still written and the crawl continues
        if ("You need to enable JavaScript to run this app." in text):
            print("Unable to parse page " + url + " due to JavaScript being required")

        # Save text from the url to a <url>.txt file. The scheme is removed
        # whatever its length (http:// or https://), and UTF-8 is used
        # explicitly so characters outside the platform's default encoding
        # cannot make the write fail.
        filename = url.split("://", 1)[-1].replace("/", "_") + ".txt"
        with open(page_dir + filename, "w", encoding="utf-8") as f:
            f.write(text)

        # Get the hyperlinks from the URL and add them to the queue
        for link in get_domain_hyperlinks(local_domain, url, subdomain):
            if link not in seen:
                queue.append(link)
                seen.add(link)

In [51]:
subdomain

In [52]:
crawl(full_url, textoutputpath, subdomain)

CSV is a common format for storing embeddings. You can use this format with Python by converting the raw text files (which are in the text directory) into Pandas data frames.

Converting the text to CSV requires looping through the text files in the text directory created earlier. After opening each file, remove the extra spacing and append the modified text to a list. Then, add the text with the new lines removed to an empty Pandas data frame and write the data frame to a CSV file.

In [53]:
def remove_newlines(serie):
    """Flatten line breaks in a pandas string Series.

    Real newline characters and literal backslash-n sequences are replaced
    by a space, then double spaces are collapsed in two passes (so runs of
    up to four spaces become one).

    Args:
        serie: pandas Series of strings.

    Returns:
        A new Series with the replacements applied.
    """
    # regex=False pins literal matching. The default of ``regex`` changed in
    # pandas 2.0; under the older regex default, '\\n' is the regex for a
    # newline and never matched a literal backslash followed by "n".
    serie = serie.str.replace('\n', ' ', regex=False)
    serie = serie.str.replace('\\n', ' ', regex=False)
    serie = serie.str.replace('  ', ' ', regex=False)
    serie = serie.str.replace('  ', ' ', regex=False)
    return serie

In [54]:
textoutputpath

In [56]:
domain

In [76]:
import pandas as pd
import csv
def processtxtfiles(inputfolder, outputpath):
    """Collect crawled ``.txt`` files into a DataFrame and save it as CSV.

    Args:
        inputfolder: Directory holding the per-page text files written by
            the crawl. Its base name is expected to be the crawled host
            name, which also prefixes every file name.
        outputpath: Directory that receives ``scraped.csv``; created if
            missing.

    Returns:
        DataFrame with columns ``fname`` (title derived from the file name)
        and ``text`` (``fname`` + ". " + page text with newlines removed).
    """
    os.makedirs(outputpath, exist_ok=True)

    # File names look like "<host>_<path>.txt" inside a folder named <host>,
    # so the prefix to drop is derived from the folder name rather than
    # hard-coded as a fixed number of characters
    prefix = os.path.basename(os.path.normpath(inputfolder)) + "_"

    # Create a list to store (title, text) pairs
    texts = []

    # Get all the text files in the text directory
    for file in os.listdir(inputfolder):
        # Only crawl output is of interest; skip saved .doc/.docx links too
        if not file.endswith(".txt"):
            continue
        if file.endswith(("doc.txt", "docx.txt")):
            continue

        # Open the file and read the text; undecodable bytes are replaced so
        # that one odd file cannot abort the whole run
        with open(os.path.join(inputfolder, file), "r", encoding="utf-8", errors="replace") as f:
            text = f.read()

        # Drop the ".txt" suffix and the host prefix, turn - and _ into
        # spaces, and remove any "#update" marker
        stem = file[:-4]
        if stem.startswith(prefix):
            stem = stem[len(prefix):]
        title = stem.replace('-', ' ').replace('_', ' ').replace('#update', '')
        texts.append((title, text))

    # Create a dataframe from the list of texts
    df = pd.DataFrame(texts, columns = ['fname', 'text'])

    # Set the text column to be the raw text with the newlines removed
    df['text'] = df.fname + ". " + remove_newlines(df.text)
    df.to_csv(outputpath+'/scraped.csv', escapechar='\\')
    return df

In [77]:
inputfolder="./output/text/www.sjsu.edu"
outputpath="./output/text/processed"
df=processtxtfiles(inputfolder, outputpath)

In [78]:
df.head()

In [79]:
df['fname'][3]

In [80]:
df['text'][3]

In [81]:
import tiktoken

# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

In [82]:
df0 = pd.read_csv(outputpath+'/scraped.csv', index_col=0)
df0.head()

In [91]:
df0.columns = ['title', 'text']
df0.head()

In [92]:
len(df0)

In [93]:
# Tokenize the text and save the number of tokens to a new column
df0['n_tokens'] = df0.text.apply(lambda x: len(tokenizer.encode(x)))

# Visualize the distribution of the number of tokens per row using a histogram
df0.n_tokens.hist()

In [94]:
df0.head()

In [95]:
len(df0)

Split the longer texts into smaller chunks so that each one stays within the token limit.

In [96]:
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):
    """Split *text* into sentence-aligned chunks of at most *max_tokens* tokens.

    The text is split on ``". "`` and sentences are packed greedily into
    chunks. A single sentence that by itself exceeds *max_tokens* is dropped.

    Args:
        text: The string to split.
        max_tokens: Token budget per chunk, measured with the module-level
            ``tokenizer``.

    Returns:
        A list of chunk strings, each ending with a period. Every sentence
        that fits the budget is included, the trailing ones too.
    """
    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    def flush(parts):
        # Join buffered sentences into one chunk. Nothing is emitted for an
        # empty buffer, and a period is added only when one is missing.
        joined = ". ".join(parts).rstrip()
        if joined:
            chunks.append(joined if joined.endswith(".") else joined + ".")

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If adding this sentence would exceed the budget, close the current
        # chunk and start a new one
        if tokens_so_far + token > max_tokens:
            flush(chunk)
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the
        # max number of tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of
        # tokens to the total (+1 for the ". " separator)
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Emit whatever is still buffered; without this the tail of every text
    # would be silently lost
    flush(chunk)

    return chunks


shortened = []

# Loop through the dataframe
for _, row in df0.iterrows():
    text = row['text']

    # Skip rows without text. A frame loaded with read_csv holds NaN (not
    # None) for missing cells; pd.isna covers both.
    if pd.isna(text):
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row['n_tokens'] > max_tokens:
        shortened += split_into_many(text)

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(text)

Visualizing the updated histogram again can help to confirm if the rows were successfully split into shortened sections.

In [97]:
df = pd.DataFrame(shortened, columns = ['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()

In [98]:
df.head()

In [99]:
len(df)

In [100]:
sum(df['n_tokens'])

In [None]:
# NOTE(review): scratch cell with no execution count (never run). The frame
# built from `shortened` has only 'text' and 'n_tokens' columns, so this
# lookup would raise KeyError - complete or remove it.
df['']

The content is now broken down into smaller chunks and a simple request can be sent to the OpenAI API specifying the use of the new text-embedding-ada-002 model to create the embeddings:

In [101]:
outputpath

In [104]:
df[8:10]

In [102]:
df.to_csv(outputpath+'/split.csv', escapechar='\\')