# crawler

In [72]:
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from dateutil.parser import parse
import re
import nltk
import pandas as pd
import math
from collections import Counter
def get_html(url):
    response = requests.get(url)
    return response.content.decode('utf-8')

def extract_links(html, base_url):
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            abs_url = urljoin(base_url, href)
            links.add(abs_url)
    return links

def crawl(start_url, max_urls=30):
    urls_to_crawl = {start_url}
    crawled_urls = set()

    while urls_to_crawl and len(crawled_urls) < max_urls:
        url = urls_to_crawl.pop()
        if url in crawled_urls:
            continue

        try:
            html = get_html(url)
            links = extract_links(html, url)
            crawled_urls.add(url)
            urls_to_crawl.update(links)
        except:
            print(f"Error crawling {url}")
    
    return crawled_urls


In [137]:
crawlded=crawl("https://www.homeandlearn.co.uk/WD/wds5pA.html",100)

Error crawling javascript:void(0);


The get_html function simply sends a HTTP request to the given URL and returns the HTML content. 

The extract_links function uses BeautifulSoup to parse the HTML and extract all hyperlinks (i.e. a tags) in the document. It then converts any relative URLs to absolute URLs using the urljoin function, which takes care of handling any differences in the base URL. 

The crawl function implements a basic breadth-first search algorithm, starting from the start_url and recursively crawling any new URLs that are discovered. It keeps track of URLs that have already been crawled to avoid revisiting them. Finally, it returns the set of all crawled URLs.

# Dictionnary

In [61]:
def extract_index_entries(urls):
    index = defaultdict(list)
    stop_words = set(stopwords.words('english')) # set of stop words
    stemmer = PorterStemmer() # create stemmer object

    for url in urls:
        # Get the HTML code given the URL of a document
        response = requests.get(url)
        content = response.text

        # Extract the text content of the document
        text = extract_text(content)

        # Apply text segmentation
        sentences = segment_text(text)

        # Apply text normalization (lowercase, lemmatization, token filtering)
        for sentence in sentences:
            tokens = tokenize_text(sentence)
            tokens = filter_tokens(tokens, stop_words)
            tokens = lemmatize_stem_tokens(tokens, stemmer)
            for token in tokens:
                index[token].append(url)
    del index['']
    return index


def extract_text(content):
    # Use regular expressions to extract text content between HTML tags
    soup = BeautifulSoup(content, 'html.parser')
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()
    return text


def segment_text(text):
    # Use NLTK to segment text into sentences
    sentences = word_tokenize(text.lower())
    return sentences


def tokenize_text(text):
    # Use NLTK to tokenize text into words
    tokens = word_tokenize(text)
    return tokens


def filter_tokens(tokens, stop_words):
    # Filter out stop words and other unwanted tokens
    filtered_tokens = []
    for token in tokens:
        # Filter out stop words
        if token.lower() not in stop_words:
            # Filter out certain characters
            token = re.sub(r'[^\w\s]+', '', token)
            # Filter out digits
            if not token.isdigit():
                # Add the filtered token to the list of filtered tokens
                filtered_tokens.append(token)
    return filtered_tokens

def lemmatize_stem_tokens(tokens, stemmer):
    # Use NLTK to lemmatize/stem tokens
    normalized_tokens = []
    for token in tokens:
        # Lemmatize nouns
        if 'NN' in nltk.pos_tag([token])[0][1]:
            lemmatizer = WordNetLemmatizer()
            normalized_token = lemmatizer.lemmatize(token)
        # Stem verbs and other parts of speech
        else:
            normalized_token = stemmer.stem(token)
        normalized_tokens.append(normalized_token)
    return normalized_tokens

1-extract_index_entries(urls): This function takes in a list of URLs and returns a dictionary of index entries where the keys are words and the values are lists of URLs where the word appears.

2-extract_text(content): This function takes in HTML content and uses BeautifulSoup to extract the text content of the document by removing any script and style tags.

3-segment_text(text): This function takes in a string of text and uses the NLTK library to segment the text into sentences.

4-tokenize_text(text): This function takes in a string of text and uses the NLTK library to tokenize the text into individual words.

5-filter_tokens(tokens, stop_words): This function takes in a list of tokens and a set of stop words and filters out stop words and unwanted characters.

6-lemmatize_stem_tokens(tokens, stemmer): This function takes in a list of tokens and a stemmer object and uses the NLTK library to lemmatize nouns and stem other parts of speech.

In [138]:
indices=extract_index_entries(crawlded)

# Document indexing

In [39]:
def extract_content(url):
    all_words=[]
    index = defaultdict(list)
    stop_words = set(stopwords.words('english')) # set of stop words
    stemmer = PorterStemmer() # create stemmer object
    response = requests.get(url)
    content = response.text
    # Extract the text content of the document
    text = extract_text(content)
    # Apply text segmentation
    sentences = segment_text(text)
    for sentence in sentences:
        tokens = tokenize_text(sentence)
        tokens = filter_tokens(tokens, stop_words)
        tokens = lemmatize_stem_tokens(tokens, stemmer)            
        for token in tokens:
            all_words.append(token)
    all_words= list(filter(lambda x: x != '', all_words))
    return all_words

In [139]:
x=list(crawlded)

In [140]:
my_words=extract_content(x[0])

In [141]:
def document_indexing(dictionnary, content):

    tf = Counter(content)

    # initialize document representation
    boolean_model = [0] * len(dictionnary)
    tf_model = [0] * len(dictionnary)
    wf_model = [0] * len(dictionnary)
    tf_idf_model = [0] * len(dictionnary)
    wf_idf_model = [0] * len(dictionnary)

    # calculate document representation for each term
    for i, term in enumerate(dictionnary):
        # term occurrence (boolean model)
        if term in content:
            boolean_model[i] = 1

        # term frequency tf
        tf_model[i] = tf[term]

        # term frequency wf=1+logtf if tf≠0 and 0 otherwise
        if tf[term] != 0:
            wf_model[i] = 1 + math.log(tf[term], 10)

        # term frequency × inverse document frequency tf.idf
        if term in tf and tf[term] != 0:
            tf_idf_model[i] = tf[term] * math.log(len(dictionnary[term]) / len(dictionnary), 10)

        # term frequency × inverse document frequency wf.idf
        if tf[term] != 0:
            idf = math.log(len(dictionnary[term]) / len(dictionnary), 10)
            wf_idf_model[i] = (1 + math.log(tf[term], 10)) * idf

    return {
        'boolean_model': boolean_model,
        'tf_model': tf_model,
        'wf_model': wf_model,
        'tf_idf_model': tf_idf_model,
        'wf_idf_model': wf_idf_model
    }

In [142]:
my_models=document_indexing(indices,my_words)

In [143]:
import pandas as pd
df = pd.DataFrame(columns=indices.keys(),index=my_models.keys())
for model_name in my_models.keys():
    model_values = my_models[model_name]
    for i in range(len(indices.keys())):
        df.loc[model_name, list(indices.keys())[i]] = model_values[i]

In [144]:
df

Unnamed: 0,microsoft,excel,tutorial,copy,paste,en,fr,de,e,hr,...,sites2023,waymentions,légales,cgv,personnelles,intend,class1vb,convertpostcodevb,reuse,performs
boolean_model,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
tf_model,5.0,11.0,2.0,5.0,8.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
wf_model,1.69897,2.041393,1.30103,1.69897,1.90309,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
tf_idf_model,-4.851424,-8.30209,-3.350169,-9.822663,-16.903619,-1.468769,-1.562045,-1.041889,-1.548681,-1.562045,...,0,0,0,0,0,0,0,0,0,0
wf_idf_model,-1.648485,-1.540711,-2.179335,-3.337682,-4.021139,-1.468769,-1.562045,-1.041889,-1.548681,-1.562045,...,0,0,0,0,0,0,0,0,0,0


now we have a function called document indexing, that when giving it a list of words appeared in a document, it will give us a dictionnary of the document represantation on each model (boolean, tf,wf,tf_idf,wf_idf)

the above dictionnary is for the first link in the crawlded links

# Querying

In [151]:
query="i want to learn excel because i love excel"


In [146]:
urls_array=list(crawlded)
urls_indexing={}
for i in urls_array:
    my_words=extract_content(i)
    urls_indexing[i]=document_indexing(indices,my_words)

here we are building a dictionnary where the keys are the urls and the values is a dictionnary of document indexing in all weighting schemes


In [152]:
import numpy as np
def compute_similarity(query,urls_indexing):
    query_array=query.split(' ')
    my_query_indexing=document_indexing(indices,query_array)
    similarities={
        'boolean_model': [],
        'tf_model': [],
        'wf_model': [],
        'tf_idf_model': [],
        'wf_idf_model': []
    }
    for j in my_query_indexing.keys():
        for i in urls_indexing.keys():
            similarities[j].append([i,cosine_similarity(my_query_indexing[j],urls_indexing[i][j])])
    return similarities
        
            
def cosine_similarity(d1, d2):
    dot_product = np.dot(d1, d2)
    norm1 = np.linalg.norm(d1)
    norm2 = np.linalg.norm(d2)
    similarity = dot_product / (norm1 * norm2)
    return similarity

In [153]:
query_similarity=compute_similarity(query,urls_indexing)

In [154]:
for i in query_similarity.keys():
    query_similarity[i]=sorted(query_similarity[i], key=lambda x: x[1], reverse=True)

In [155]:
def suggest_links(query_similarity,max_suggested):
    for key, value in query_similarity.items():
        for v in range(max_suggested):
            print("the {} is suggesting {}".format(key, value[v][0]))
        print()
suggest_links(query_similarity,5)

the boolean_model is suggesting https://www.homeandlearn.co.uk/php/php5p6.html
the boolean_model is suggesting https://www.homeandlearn.co.uk/extras/image-information/add-windows-form-control-runtime.html
the boolean_model is suggesting https://www.homeandlearn.co.uk/php/php2p9.html
the boolean_model is suggesting https://www.homeandlearn.co.uk/games-programming/3d-games-programming.html
the boolean_model is suggesting https://www.homeandlearn.co.uk/php/php4p4.html

the tf_model is suggesting https://www.homeandlearn.co.uk/excel2007/Excel2007.html
the tf_model is suggesting https://www.homeandlearn.co.uk/excel2007/excel2007s1p1.html
the tf_model is suggesting https://www.homeandlearn.co.uk/excel2007/excel2007s7p2.html
the tf_model is suggesting https://www.homeandlearn.co.uk/excel2007/excel2007s7p4.html
the tf_model is suggesting https://www.homeandlearn.co.uk/excel2007/excel2007s8p1.html

the wf_model is suggesting https://www.homeandlearn.co.uk/excel2007/roundup-excel.html
the wf_mod

as we can see, after giving the query "i want to learn excel because i love excel" each model suggests some links , but as we can see the boolean is not efficient, the tf and tf_idf and wf are the best since they are suggesting only excel pages, and wf_idf is  suggesting only one excel page