In [83]:
# Defines the pageScraper class.
# Scrapes a single page and returns a dictionary with the URL, all backlinks and the raw text.
# Input : url   ------>>  Output : {
                                #     "url" : url,
                                #     "backlinks" : self.scrape_all_urls(raw_soup_html),
                                #     "rawText" : self.scrape_raw_text(raw_soup_html)
                                # }
# To Use : obj.scrape_page(url)
# 
# dev : Aingkk.
#                UML Diagram
#                +-----------------------------------+
#                | pageScraper                       |
#                +-----------------------------------+
#                | -allowed_domain: list             |
#                +-----------------------------------+
#                | +__init__()                       |
#                | +get_raw_html(url: str)           |
#                | +scrape_raw_text(html_text: str)  |
#                | +scrape_all_urls(html_text: str)  |
#                | +scrape_page(url: str)            |
#                +-----------------------------------+

import validators
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
import re

class pageScraper:
    """Scrape a single page.

    ``scrape_page(url)`` downloads the page and returns a dictionary holding
    the requested URL, the de-duplicated absolute links found in the page and
    the page text.
    """

    # Only absolute http(s) links are kept; relative links, mailto:, etc. are skipped.
    _HTTP_URL = re.compile(r"^(http://|https://)")
    # Links ending in an image extension are skipped.  The dot is escaped so
    # that a URL such as ".../mypng" (no extension at all) is NOT discarded.
    _IMAGE_SUFFIX = re.compile(r"\.(jpg|jpeg|png|gif)$")

    def __init__(self):
        # Set allowed domain
        # NOTE(review): no method of this class reads this list.
        self.allowed_domain = [
            "artyt.me",
            "www.35mmc.com",
            "www.dpreview.com"
        ]

    @classmethod
    def _is_page_url(cls, url):
        """Return True when ``url`` is an absolute http(s) link that is not an image file."""
        if not url:
            return False
        return bool(cls._HTTP_URL.match(url)) and not cls._IMAGE_SUFFIX.search(url)

    def get_raw_html(self, url, timeout=10):
        """Download ``url`` and return the ``requests`` response object.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before giving up; without
                a timeout ``requests.get`` can block indefinitely.

        Raises:
            requests.exceptions.RequestException: on connection errors or timeout.
        """
        return requests.get(url, timeout=timeout)

    def scrape_raw_text(self, html_text):
        """Return the text content of ``html_text`` (an HTML string) with markup removed."""
        soup = BeautifulSoup(html_text, 'html.parser')
        return soup.get_text()

    def scrape_all_urls(self, html_text):
        """Return the unique absolute http(s) links found in ``<a href>`` tags.

        Links pointing at .jpg/.jpeg/.png/.gif files are skipped.  The result
        keeps the order in which links first appear in the document, so the
        output is the same on every run.
        """
        soup = BeautifulSoup(html_text, 'html.parser')
        hrefs = (link.get('href') for link in soup.find_all('a'))
        # dict.fromkeys removes duplicates while preserving first-seen order
        return list(dict.fromkeys(href for href in hrefs if self._is_page_url(href)))

    def scrape_page(self, url):
        """Return a dictionary of url, all unrepeated backlinks and raw text"""
        raw_soup_html = self.get_raw_html(url).text
        return {
            "url" : url,
            "backlinks" : self.scrape_all_urls(raw_soup_html),
            "rawText" : self.scrape_raw_text(raw_soup_html)
        }

In [84]:
# Scrape one page and print every field of the returned dictionary in turn.
obj = pageScraper()
url = 'https://www.digitalcameraworld.com/news'
ans = obj.scrape_page(url)

for field in ('url', 'backlinks', 'rawText'):
    print(ans[field])

In [2]:
import sqlite3

In [4]:
# NOTE(review): `conn` is not created in this cell or in any cell above it; the
# only `conn = sqlite3.connect(...)` in this notebook is in the later cell
# numbered In [3], which must be run first.  The next cell uses `cursor`, not `curr`.
curr = conn.cursor()

In [5]:
# Create the three tables if they are missing.  Each execute() hands back the
# cursor, so the calls are chained and the cell's value is still the cursor.
(
    cursor
    .execute("CREATE TABLE IF NOT EXISTS Reference_Domain(Domain_Name, Ref_Count)")
    .execute("CREATE TABLE IF NOT EXISTS web_Data(Web_ID, URL, All_Word, Ref_To)")
    .execute("CREATE TABLE IF NOT EXISTS Inverted_Index(Word, Document_Freq, Inverted_Dict)")
)

<sqlite3.Cursor at 0x2977e9b66c0>

In [None]:
import sqlite3
from urllib.parse import urlparse

class LinkChecker:
    """Class for working on URLs"""

    def __init__(self, database_file):
        """Input Database file"""
        self.conn = sqlite3.connect(database_file)
        self.cursor = self.conn.cursor()

    @staticmethod
    def _ident(name):
        """Return ``name`` unchanged if it is a plain (optionally dotted) SQL identifier.

        Table and column names cannot be bound as ``?`` parameters, so they
        are validated before being placed into the SQL text.

        Raises:
            ValueError: if ``name`` contains anything but identifier characters.
        """
        text = str(name)
        if not all(part.isidentifier() for part in text.split(".")):
            raise ValueError(f"invalid SQL identifier: {name!r}")
        return text

    def alreadyScrape(self, url_to_check, table, column):
        """Check whether url already scrape, Return in True or false

        Args:
            url_to_check: value to look for.
            table: table to search, e.g. ``Reference_Domain``.
            column: column to compare against, e.g. ``Domain_Name``.

        Returns:
            True if at least one row has ``column`` equal to ``url_to_check``.

        Raises:
            ValueError: if ``table`` or ``column`` is not a valid identifier.
        """
        query_check = (
            f"SELECT 1 FROM {self._ident(table)} "
            f"WHERE {self._ident(column)} = ? LIMIT 1"
        )
        # The URL is bound as a parameter so quotes inside it cannot break
        # (or inject into) the statement.
        self.cursor.execute(query_check, (url_to_check,))
        return self.cursor.fetchone() is not None

    def checkAccessibility(self, url, timeout=10):
        """Check Whether URL is still accessible

        Returns False for HTTP error statuses as well as for connection
        failures, timeouts and malformed URLs.

        Args:
            url: address to request.
            timeout: seconds to wait for the server before giving up.
        """
        # Imported here so the class does not depend on another cell's import.
        import requests

        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Base class of HTTPError, ConnectionError, Timeout, ...
            return False
        return True

    def compareDomains(self, url1, url2):
        """Compare two url domain

        Returns True only when both URLs have a hostname and the hostnames
        are equal; URLs without a hostname never compare equal.
        """
        domain1 = urlparse(url1).hostname
        domain2 = urlparse(url2).hostname
        if domain1 is None or domain2 is None:
            return False
        return domain1 == domain2
    


In [None]:
import sqlite3

class dataPipeline:
    """Class of function for Update / Remove data.

    Owns one SQLite connection and keeps three tables up to date:
    ``Reference_Domain``, ``web_Data`` and ``Inverted_Index``.  Most methods
    take the table and column names as arguments; those names are validated
    and every data value is passed as a bound ``?`` parameter.
    """

    def __init__(self, database_file):
        """Input database file"""
        self.conn = sqlite3.connect(database_file)
        self.cursor = self.conn.cursor()
        self.createTable()

    @staticmethod
    def _ident(name):
        """Return ``name`` if it is a plain (optionally dotted) SQL identifier.

        Raises:
            ValueError: if ``name`` contains anything but identifier characters.
        """
        text = str(name)
        if not all(part.isidentifier() for part in text.split(".")):
            raise ValueError(f"invalid SQL identifier: {name!r}")
        return text

    @staticmethod
    def _parse_inverted_dict(text):
        """Turn the stored Inverted_Dict string into a ``{web_id: count}`` dict.

        ``ast.literal_eval`` is used instead of ``eval`` so database content is
        never executed as code.  Values written by string concatenation
        (``{1:2}{3:1}``) are merged into a single literal before parsing.

        Raises:
            ValueError / SyntaxError: if the stored text is not a dict literal.
        """
        import ast

        if not text:
            return {}
        parsed = ast.literal_eval(str(text).replace("}{", ", "))
        if not isinstance(parsed, dict):
            raise ValueError(f"stored inverted index is not a dict: {text!r}")
        return parsed

    def createTable(self):
        """Create the three tables used by the pipeline if they do not exist yet."""
        # Create table for keeping domain name of url and times of referenced to
        self.cursor.execute("CREATE TABLE IF NOT EXISTS Reference_Domain(Domain_Name, Ref_Count)")
        # Create a table for unique id for each url and list of all words in that url and list of url found on that page
        self.cursor.execute("CREATE TABLE IF NOT EXISTS web_Data(Web_ID, URL, All_Word, Ref_To)")
        # Create table for each word, number of documents that contain that word and a dictionary
        # whose keys are url ids and whose values are the number of times the word occurs there
        self.cursor.execute("CREATE TABLE IF NOT EXISTS Inverted_Index(Word, Document_Freq, Inverted_Dict)")
        self.conn.commit()

    def uncountRef(self, domain_name_list, tableName, domainColumn, countColumn):
        """For uncount referenced domain

        Decrements ``countColumn`` by one for every domain in
        ``domain_name_list`` and commits once at the end.
        """
        table = self._ident(tableName)
        domain_col = self._ident(domainColumn)
        count_col = self._ident(countColumn)
        self.cursor.executemany(
            f"UPDATE {table} SET {count_col} = {count_col} - 1 WHERE {domain_col} = ?",
            [(domain,) for domain in domain_name_list],
        )
        self.conn.commit()

    def removeInvertedIndex(self, table_name, doc_freq_col, inverted_dict_col, words, web_id):
        """Remove id from indexing and reduce docsfreq

        For every word whose stored dictionary contains ``web_id`` the entry is
        removed and the document frequency is lowered by one (never below 0).
        Words that are not in the table, or that do not reference ``web_id``,
        are left untouched.
        """
        table = self._ident(table_name)
        freq_col = self._ident(doc_freq_col)
        dict_col = self._ident(inverted_dict_col)

        for word in words:
            # Retrieve the current values of Document_Freq and Inverted_Dict
            self.cursor.execute(f"SELECT {freq_col}, {dict_col} FROM {table} WHERE Word=?", (word,))
            result = self.cursor.fetchone()
            if result is None:
                continue
            doc_freq, postings = result[0], self._parse_inverted_dict(result[1])

            # Keys may be stored as int or str; compare on the string form
            stale_keys = [key for key in postings if str(key) == str(web_id)]
            if not stale_keys:
                continue
            for key in stale_keys:
                del postings[key]
            doc_freq = max((doc_freq or 0) - 1, 0)

            # Update the values of Document_Freq and Inverted_Dict for the word
            self.cursor.execute(
                f"UPDATE {table} SET {freq_col}=?, {dict_col}=? WHERE Word=?",
                (doc_freq, str(postings), word),
            )

        # Commit the changes to the database
        self.conn.commit()

    def getUniqueID(self, table_name, web_id_column):
        """function for unique unused ID for a website

        Returns 1 for an empty table, otherwise the largest stored id plus one
        (which by construction is not in use).
        """
        table = self._ident(table_name)
        id_col = self._ident(web_id_column)
        self.cursor.execute(f"SELECT MAX({id_col}) FROM {table}")
        max_id = self.cursor.fetchone()[0]
        return 1 if max_id is None else max_id + 1

    def updateReferenceDomain(self, table_name, domain_col, ref_col, domains):
        """Update reference domain receiving a list of domain

        Each occurrence of a domain adds one to its reference count; unknown
        domains are inserted with a count of 1.
        """
        table = self._ident(table_name)
        name_col = self._ident(domain_col)
        count_col = self._ident(ref_col)

        for domain in domains:
            # Check if the domain already exists in the table
            self.cursor.execute(f"SELECT {count_col} FROM {table} WHERE {name_col}=?", (domain,))
            result = self.cursor.fetchone()

            if result:
                # If the domain already exists, increment the Ref_Count by 1
                ref_count = result[0] + 1
                self.cursor.execute(f"UPDATE {table} SET {count_col}=? WHERE {name_col}=?", (ref_count, domain))
            else:
                # If the domain doesn't exist, insert a new entry with Ref_Count set to 1
                self.cursor.execute(f"INSERT INTO {table} ({name_col}, {count_col}) VALUES (?, 1)", (domain,))

        # Commit the changes to the database
        self.conn.commit()

    def updateWebData(self, table_name, web_id_column, url_column, all_words_column, ref_to_column, url, web_id, words, domains):
        """Insert new url data into web_Data

        ``words`` are stored joined by a single space and ``domains`` joined
        by a comma.
        """
        all_words = " ".join(words)
        ref_to = ",".join(domains)
        columns = ", ".join(
            self._ident(col)
            for col in (web_id_column, url_column, all_words_column, ref_to_column)
        )

        self.cursor.execute(
            f"INSERT INTO {self._ident(table_name)} ({columns}) VALUES (?, ?, ?, ?)",
            (web_id, url, all_words, ref_to),
        )
        self.conn.commit()

    def updateInvertedIndexing(self, table_name, word_column, document_freq_column, inverted_dict_column, web_id, word_list):
        """Add the words of one document to the inverted index.

        For every distinct word in ``word_list`` the stored dictionary gets the
        entry ``web_id -> occurrences``.  The document frequency is raised by
        one only when ``web_id`` was not already listed for that word.  The
        dictionary is written back as one parseable dict literal.
        """
        table = self._ident(table_name)
        word_col = self._ident(word_column)
        freq_col = self._ident(document_freq_column)
        dict_col = self._ident(inverted_dict_column)

        word_count = {}
        for word in word_list:
            word_count[word] = word_count.get(word, 0) + 1

        for word, count in word_count.items():
            self.cursor.execute(f"SELECT {freq_col}, {dict_col} FROM {table} WHERE {word_col} = ?", (word,))
            result = self.cursor.fetchone()
            if result is None:
                self.cursor.execute(
                    f"INSERT INTO {table} ({word_col}, {freq_col}, {dict_col}) VALUES (?, 1, ?)",
                    (word, str({web_id: count})),
                )
                continue

            doc_freq, postings = result[0], self._parse_inverted_dict(result[1])
            already_indexed = any(str(key) == str(web_id) for key in postings)
            # Drop any int/str variant of this id before storing the new count
            postings = {key: val for key, val in postings.items() if str(key) != str(web_id)}
            postings[web_id] = count
            new_freq = (doc_freq or 0) + (0 if already_indexed else 1)
            self.cursor.execute(
                f"UPDATE {table} SET {freq_col} = ?, {dict_col} = ? WHERE {word_col} = ?",
                (new_freq, str(postings), word),
            )
        self.conn.commit()
        
    
    

In [None]:
url = "https://example.com/home"
# Use a separate name for the instance: rebinding `LinkChecker` would replace
# the class itself and make this cell fail the second time it is run.
link_checker = LinkChecker("test.db")

if link_checker.alreadyScrape(url, "Reference_Domain", "Domain_Name"):
    # URL has been crawled before
    if link_checker.checkAccessibility(url):
        pass
    else:
        # TODO: remove data
        pass

else:
    # URL has not been crawled yet
    if link_checker.checkAccessibility(url):
        # TODO: Insert New Data
        pass
    else:
        pass
    


In [3]:
import sqlite3

conn = sqlite3.connect("fur_zukunft/test.db")
cursor = conn.cursor()

# One CREATE statement per table, executed in order.
for create_statement in (
    # Domain name of a url and the number of times it has been referenced
    "CREATE TABLE IF NOT EXISTS Reference_Domain(Domain_Name, Ref_Count)",
    # Unique id for each url, all words found in that url and the urls found on that page
    "CREATE TABLE IF NOT EXISTS web_Data(Web_ID, URL, All_Word, Ref_To)",
    # Each word, the number of documents containing it, and a dictionary keyed by
    # url id holding how often the word occurs on that page
    "CREATE TABLE IF NOT EXISTS Inverted_Index(Word, Document_Freq, Inverted_Dict)",
):
    cursor.execute(create_statement)

# Show the tables now present in the database
print(cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall())

[('Reference_Domain',), ('web_Data',), ('Inverted_Index',)]


In [19]:
url = "https://example.com/home"
# execute() returns the cursor, so the row can be fetched in the same expression.
# NOTE(review): despite its name, `domain_name` holds the All_Word row for this URL.
domain_name = cursor.execute("SELECT All_Word FROM web_Data WHERE URL = ?", (url,)).fetchone()
# Second item of the stored string when split on ', '
domain_name[0].split(', ')[1]

'example'

In [25]:
# Join each collection with " , " as the separator.
# Both inputs are sets, so the order of the joined items is not fixed.
words = {"hello","world"}
domains = {"www.google.com","www.facebook.com"}
all_words, ref_to = (" , ".join(group) for group in (words, domains))

In [26]:
# Display both joined strings as this cell's output
all_words, ref_to

('hello , world', 'www.google.com , www.facebook.com')

In [None]:
import sqlite3

class dataPipeline:
    """Class of function for Update / Remove data.

    Owns one SQLite connection and keeps three tables up to date:
    ``Reference_Domain(Domain_Name, Ref_Count)``,
    ``web_Data(Web_ID, URL, All_Word, Ref_To)`` and
    ``Inverted_Index(Word, Document_Freq, Inverted_Dict)``.
    Every data value is passed to SQLite as a bound ``?`` parameter.
    """

    def __init__(self, database_file):
        """Input database file"""
        self.conn = sqlite3.connect(database_file)
        self.cursor = self.conn.cursor()
        self.createTable()

    @staticmethod
    def _parse_inverted_dict(text):
        """Turn the stored Inverted_Dict string into a ``{web_id: count}`` dict.

        ``ast.literal_eval`` is used instead of ``eval`` so database content is
        never executed as code.  Values written by string concatenation
        (``{1:2}{3:1}``) are merged into a single literal before parsing.

        Raises:
            ValueError / SyntaxError: if the stored text is not a dict literal.
        """
        import ast

        if not text:
            return {}
        parsed = ast.literal_eval(str(text).replace("}{", ", "))
        if not isinstance(parsed, dict):
            raise ValueError(f"stored inverted index is not a dict: {text!r}")
        return parsed

    def createTable(self):
        """Create the three tables used by the pipeline if they do not exist yet."""
        # Create table for keeping domain name of url and times of referenced to
        self.cursor.execute("CREATE TABLE IF NOT EXISTS Reference_Domain(Domain_Name, Ref_Count)")
        # Create a table for unique id for each url and list of all words in that url and list of url found on that page
        self.cursor.execute("CREATE TABLE IF NOT EXISTS web_Data(Web_ID, URL, All_Word, Ref_To)")
        # Create table for each word, number of documents that contain that word and a dictionary
        # whose keys are url ids and whose values are the number of times the word occurs there
        self.cursor.execute("CREATE TABLE IF NOT EXISTS Inverted_Index(Word, Document_Freq, Inverted_Dict)")
        self.conn.commit()

    def uncountRef(self, domain_name_list):
        """For uncount referenced domain

        Decrements ``Ref_Count`` by one for every domain in
        ``domain_name_list`` and commits once at the end.
        """
        self.cursor.executemany(
            "UPDATE Reference_Domain SET Ref_Count = Ref_Count - 1 WHERE Domain_Name = ?",
            [(domain,) for domain in domain_name_list],
        )
        self.conn.commit()

    def removeInvertedIndex(self, words, web_id):
        """Remove id from indexing and reduce docsfreq

        For every word whose stored dictionary contains ``web_id`` the entry is
        removed and ``Document_Freq`` is lowered by one (never below 0).  Words
        that are not in the table, or that do not reference ``web_id``, are
        left untouched.
        """
        for word in words:
            # Retrieve the current values of Document_Freq and Inverted_Dict
            self.cursor.execute("SELECT Document_Freq, Inverted_Dict FROM Inverted_Index WHERE Word=?", (word,))
            result = self.cursor.fetchone()
            if result is None:
                continue
            doc_freq, postings = result[0], self._parse_inverted_dict(result[1])

            # Keys may be stored as int or str; compare on the string form
            stale_keys = [key for key in postings if str(key) == str(web_id)]
            if not stale_keys:
                continue
            for key in stale_keys:
                del postings[key]
            doc_freq = max((doc_freq or 0) - 1, 0)

            # Update the values of Document_Freq and Inverted_Dict for the word
            self.cursor.execute(
                "UPDATE Inverted_Index SET Document_Freq=?, Inverted_Dict=? WHERE Word=?",
                (doc_freq, str(postings), word),
            )

        # Commit the changes to the database
        self.conn.commit()

    def getUniqueID(self):
        """function for unique unused ID for a website

        Returns 1 for an empty ``web_Data`` table, otherwise the largest stored
        ``Web_ID`` plus one (which by construction is not in use).
        """
        self.cursor.execute("SELECT MAX(Web_ID) FROM web_Data")
        max_id = self.cursor.fetchone()[0]
        return 1 if max_id is None else max_id + 1

    def updateReferenceDomain(self, domains):
        """Update reference domain receiving a list of domain

        Each occurrence of a domain adds one to its ``Ref_Count``; unknown
        domains are inserted with a count of 1.
        """
        for domain in domains:
            # Check if the domain already exists in the table
            self.cursor.execute("SELECT Ref_Count FROM Reference_Domain WHERE Domain_Name=?", (domain,))
            result = self.cursor.fetchone()

            if result:
                # If the domain already exists, increment the Ref_Count by 1
                ref_count = result[0] + 1
                self.cursor.execute("UPDATE Reference_Domain SET Ref_Count=? WHERE Domain_Name=?", (ref_count, domain))
            else:
                # If the domain doesn't exist, insert a new entry with Ref_Count set to 1
                self.cursor.execute("INSERT INTO Reference_Domain (Domain_Name, Ref_Count) VALUES (?, 1)", (domain,))

        # Commit the changes to the database
        self.conn.commit()

    def updateWebData(self, web_id, url, all_words, ref_to):
        """Insert new url data into web_Data

        Args:
            web_id: id to store in ``Web_ID``.
            url: address of the page.
            all_words: iterable of words (stored joined by a space) or an
                already joined string.
            ref_to: iterable of domains (stored joined by a comma) or an
                already joined string.
        """
        # Use the arguments themselves, not notebook-level variables
        if not isinstance(all_words, str):
            all_words = " ".join(all_words)
        if not isinstance(ref_to, str):
            ref_to = ",".join(ref_to)

        self.cursor.execute(
            "INSERT INTO web_Data (Web_ID, URL, All_Word, Ref_To) VALUES (?, ?, ?, ?)",
            (web_id, url, all_words, ref_to),
        )
        self.conn.commit()

    def updateInvertedIndexing(self, web_id, word_list):
        """Add the words of one document to the inverted index.

        For every distinct word in ``word_list`` the stored dictionary gets the
        entry ``web_id -> occurrences``.  ``Document_Freq`` is raised by one
        only when ``web_id`` was not already listed for that word.  The
        dictionary is written back as one parseable dict literal.
        """
        word_count = {}
        for word in word_list:
            word_count[word] = word_count.get(word, 0) + 1

        for word, count in word_count.items():
            self.cursor.execute("SELECT Document_Freq, Inverted_Dict FROM Inverted_Index WHERE Word = ?", (word,))
            result = self.cursor.fetchone()
            if result is None:
                self.cursor.execute(
                    "INSERT INTO Inverted_Index (Word, Document_Freq, Inverted_Dict) VALUES (?, 1, ?)",
                    (word, str({web_id: count})),
                )
                continue

            doc_freq, postings = result[0], self._parse_inverted_dict(result[1])
            already_indexed = any(str(key) == str(web_id) for key in postings)
            # Drop any int/str variant of this id before storing the new count
            postings = {key: val for key, val in postings.items() if str(key) != str(web_id)}
            postings[web_id] = count
            new_freq = (doc_freq or 0) + (0 if already_indexed else 1)
            self.cursor.execute(
                "UPDATE Inverted_Index SET Document_Freq = ?, Inverted_Dict = ? WHERE Word = ?",
                (new_freq, str(postings), word),
            )
        self.conn.commit()
        
    
    