# Scraper
```
State := Current site, collected data, links, times, etc
Init := (Initial site, 0, 0, 0, ...)
Goal := (*, specific amount of data, *, timeout, ...)
Action := Start | Collect | Travel | Stop | Exit
    Start := Execute the scraper in S if it is valid
    Collect := Collect useful data and links, checking whether each item is new
    Travel := Move to S2 via links
    Stop := Pause for a short time
    Exit := Exit the scraper
Transition :=
    (Start, S1) -> (Collect, S1) | (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Collect, S1) -> (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Travel, S1, S2) -> (Start, S2)
    (Stop, S1) -> (Collect, S1)
    (Exit, S1) -> Done
Action Cost := Equal
```

In [65]:
# Abstract Class
import time
import random
from functools import wraps
from operator import lt, eq
from copy import copy, deepcopy

from dataclasses import dataclass
from abc import ABCMeta, abstractmethod

import requests
from urllib.error import URLError
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser


@dataclass
class State:
    def __init__(self, site: str, data: dict = dict(), links: set = set(), times:float = 0.):
        self.site = site
        self.data = data
        self.links = links
        self.times = times
    
    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

class ABCScraper(metaclass=ABCMeta):
    """Skeleton of the scraper state machine (Start, Collect, Travel, Stop, Exit).

    Subclasses supply ``collect_by_custom`` (fetch and parse one page),
    ``select_link`` (choose where to go next) and ``is_exit`` (stop
    condition); this class wires those hooks into the action methods below.

    NOTE(review): the actions invoke each other recursively
    (start -> collect -> travel -> start ...), so every visited page adds
    stack frames; a long crawl can run into Python's recursion limit.
    """

    def __init__(self, init: "State", goal: "State"):
        """Store the initial and goal states and derive the working state.

        Args:
            init: Starting state; kept untouched.
            goal: Target state, exposed through ``goalState``.
        """
        self.__init = init
        # The scraper mutates a deep copy, never the caller's ``init`` object.
        self.__current = deepcopy(self.__init)
        self.__goal = goal
        self.lastStep = None

    @property
    def initState(self):
        """State the scraper was created with."""
        return self.__init

    @property
    def currentState(self):
        """Working state that the actions read and update."""
        return self.__current

    @property
    def goalState(self):
        """Goal state passed to the constructor."""
        return self.__goal

    def __canFetch():
        """Build ``canFetch`` around a robots.txt cache shared by all scrapers."""
        # Imported here because this cell never imports ``urljoin`` itself;
        # the method must not depend on a later cell having been run.
        from urllib.parse import urljoin

        parsers = dict()  # robots.txt URL -> RobotFileParser that was read

        def canFetch(self, path: str, ua: str = '*') -> bool:
            """Return True when robots.txt lets ``ua`` fetch ``path``.

            The parsed robots.txt is cached per site and the rule check is
            repeated for every ``path``.  (Caching the boolean instead would
            hand the verdict for the first page to every later page of the
            same site.)

            HTTP error statuses are handled inside ``RobotFileParser.read``
            and never surface here as an HTTPError.  Failures to reach the
            host (URLError) or an unusable URL propagate to the caller and
            are not cached.
            """
            robots_url = urljoin(path, "/robots.txt")
            parser = parsers.get(robots_url)
            if parser is None:
                parser = RobotFileParser(robots_url)
                parser.read()
                parsers[robots_url] = parser
            return parser.can_fetch(ua, path)

        return canFetch
    canFetch = __canFetch()

    @staticmethod
    def meet_unexpected_error(e: Exception):
        """Report an exception that no specific handler covered."""
        print("Unexpected Error")
        print(e)

    # Run
    def run(self):
        """Entry point: announce the run and perform the Start action."""
        print("Running Scraper")
        return self.start()

    # Action
    def start(self):
        """Start action: consult robots.txt, then collect (or pause first).

        Returns:
            Whatever the following chain of actions returns, i.e. the
            current state handed back by ``exit``.
        """
        # Valid URL test
        url = self.currentState.site
        try:
            # robots.txt is checked but deliberately not enforced: a denial
            # is only reported and collection goes ahead regardless.
            if not self.canFetch(url):
                print("Robots.txt doesn't permit a scraper")
            # About one start in ten pauses before collecting.
            return self.collect() if random.randint(1, 100) > 10 else self.stop()
        except URLError:
            print("ERROR: URLError")
            return self.exit()
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return self.exit()

    @abstractmethod
    def select_link(self):
        '''Return selected link or None'''
        pass

    def collect(self):
        """Collect action: run the subclass collector, then travel or exit.

        An HTTPError with a 5xx status leads to ``stop`` (pause, then collect
        again); any other HTTPError prints diagnostics and exits, as does any
        unexpected exception.  On success the scraper travels to the link
        chosen by ``select_link`` unless ``is_exit`` is true or no link is
        returned.
        """
        try:
            self.collect_by_custom()
        except HTTPError as e:
            if e.response.status_code // 100 == 5:  # 5xx
                return self.stop()
            print("Code", e.response.status_code)
            print("Reason", e.response.reason)
            print("Req Header", e.request.headers)
            return self.exit()
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return self.exit()

        else:
            if not self.is_exit():
                link = self.select_link()
                if link:
                    return self.travel(link)
            self.lastStep = 'Collect'
            return self.exit()

    @abstractmethod
    def collect_by_custom(self):
        '''make collect algorithm'''
        pass

    def travel(self, link):
        """Travel action: make ``link`` the current site and start there."""
        self.currentState.site = link
        return self.start()

    def stop(self):
        """Stop action: sleep 1-10 seconds, then go on to collect."""
        time.sleep(random.randint(1, 10))
        return self.collect()

    def exit(self):
        """Exit action: print a summary and return the current state."""
        print("Last step is", self.lastStep)
        print("Last site is", self.currentState.site)
        print("Collected data size is", len(self.currentState.data))
        print("Remaining links is", len(set(self.currentState.links)))
        print("Collect time is", self.currentState.times, 'secs')
        return self.currentState

    @abstractmethod
    def is_exit(self):
        '''make exit condition. True if exit'''
        pass

In [64]:
import re
import json
from collections import ChainMap
from requests.compat import urljoin
from bs4 import BeautifulSoup

# Scraper
class GMailScraper(ABCScraper):
    def __init__(self, init:State, goal:State, configPath:"Safari config file path" = "mail.google.com.har"):
        '''
        HTTP Gmail url: https://mail.google.com/mail/u/0/h/
        How to get config file:
            1. Go to Safari
            2. Click network in development-tools
            3. Click "export" at right-side
        '''
        super().__init__(init, goal)
        url, headers = self.__parse_config(configPath)
        if not url or not headers:
            print("GMailScraper cannot be initialized")
            return None
        self.currentState.site = url
        self.headers = headers
        
    def __parse_config(self, configPath)->("url", "headers"):
        try:
            fp = open(configPath, 'r')
            config = json.load(fp)
            fp.close()    
        except FileNotFoundError:
            print("Invalid configuration Path")
            return None, None
        except OSError:
            print("Invalid configuration Path")
            return None, None
        except Exception as e:
            Scraper.meet_unexpected_error(e)
            return None, None
            
        else:
            try:
                req = config['log']['entries'][0]['request']
                headers = {attr['name']:attr['value'] for attr in req['headers']}
                return req['url'], headers
            except KeyError:
                print("Invalid configuration file")
                return None, None
            except IndexError:
                print("Invalid configuration file")
                return None, None
            except Exception as e:
                Scraper.meet_unexpected_error(e)
                return None, None
                
        

    def collect_by_custom(self):
        try:
            res = requests.get(self.currentState.site, headers=self.headers)
            res.raise_for_status()
        except HTTPError as e:
            print(e.response.status_code)
            print(e.response.reason)
            print(e.request.headers)
        except URLError as e:
            print("URLError")
        except Exception as e:
            Scraper.meet_unexpected_error(e)
        else:
            self.__collect_data(res)
            self.__collect_links(res)
        return
    
    def __collect_data(self, res):
        dom = BeautifulSoup(res.text, 'lxml')
        mails = dom.find('table', {'class': 'th'}).find_all('tr')
        data = {mail.select('tr td:nth-of-type(2)')[0].get_text() : \
                    mail.select('tr td:nth-of-type(3)')[0].get_text() for mail in mails}

        halfSpace = r'\u200c'
        for k, v in data.items():
            newV = re.sub(halfSpace, ' ', v).strip()
            data[k] = newV
        self.currentState.data = ChainMap(self.currentState.data, data)
        return
    
    def __collect_links(self, res):
        dom = BeautifulSoup(res.text, 'lxml')
        links = dom.find('table', {'class': 'ft'}).previous_sibling.find_all('a', {'class':'searchPageLink'})
        
        previous, recent, latest = None, None, None
        for link in links:
            # raw val = 이전   ›
            form = re.sub(r'›', '', link.get_text()).strip()
            match form:
                case "이전":
                    previous = link['href']
                case "다음":
                    recent = link['href']
                case "처음":
                    latest = link['href']
                case _:
                    continue
        self.currentState.links.add((previous, recent, latest))
        return
        
    
    def select_link(self):
        if not len(self.currentState.links):
            return None
        previous, _, _ = self.currentState.links.pop()
        return urljoin(self.currentState.site, previous) if previous else None
    
    def is_exit(self):
        return True if not len(self.currentState.links) else False

In [12]:
# Build the scraper with empty init/goal sites; the start URL itself comes
# from the HAR config file.
x = GMailScraper(init=State(""), goal=State(""), configPath="mail.google.com.har")
x.run()

Running Scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt doesn't permit a scraper
Robots.txt d

State(site='https://mail.google.com/mail/u/0/h/17749qycwmik2/?&st=2600', data=ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap(ChainMap({}, {'Nordic-Benelux Center': '[노르딕-베네룩스 센터] 7/18 EU대사 초청 특강 안내 - 고려대학교 노르딕-베네룩스 센터와 장모네 EU 센터는 오는 7월 18일(월), 마리아 카스티요 페르난데즈 주한 유럽연합 대사와 김형진 전 주벨기에·유럽연합 대사를 모시고 아래와 같이 특강을 개최하오니, 많은 관심과 참석 부탁드립니다. · 주제: Building a New Trilateral Partnership: EU, NATO and KOREA · 강연자:', 'campustown': '[고려대학교 캠퍼스타운] 2022년 캠퍼스타운 취업사관학교 교육생 모집 - 가. 교육일정 : 2022. 8.1.(월) ~ 2023. 2.28.(화) (예정) 취업사관학교 프로그램(4개월) 종료 후 인턴십 프로그램(2개월) 진행 나. 교육대상 구분 대상 인원 4차 산업 기술인재 양성과정 

# DB

```
Relation Scheme(mail)

Data := json format [{"sender": sender, "date": date, "title": title, "content": content, "type":type}, ...]
Sender := CHAR(50)
Date := Date
Title := CHAR(100)
Content := TEXT
Type := BOOLEAN (TRUE if Spam else Normal)


Query := SaveQuery | SearchQuery
SaveQuery := ("SAVE", _)
SearchQuery := ("SEARCH", SQLSyntax)
```

```
State := (data, query)
Init := (data, SaveQuery) | (_, SearchQuery)
Goal := (_, SaveQuery) | (data, SearchQuery)

Action := start | connect | create | save | search | exit
    start := Start db jobs
    connect := Make a connection to db
    create := Create relation scheme
    save := Save data in db
    search := Execute searchQuery in db
    exit := Exit the db jobs.
    
Transition
    (start, S1) := (connect, S1) | (save, S1) | (search, S1) | (exit, S1)
    (connect, S1) := (start, S1) | (exit, S1)
    (save, S1) := (create, S1) | (exit, S1)
    (create, S1) := (save, S1) | (exit, S1)
    (search, S1) := (exit, S1)
    (exit, S1) := Done
    
Action Cost := Manually defined
```

In [66]:
import random
from abc import ABCMeta, abstractmethod, abstractproperty
from dataclasses import dataclass
from enum import Enum, auto, unique

@unique
class QueryType(Enum):
    """Kinds of job a query can request from the database layer."""
    SAVE = 1    # store data
    SEARCH = 2  # run a search query

@dataclass
class State:
    """State of a database job: the payload and the query to run on it.

    Attributes:
        data: Records to save, or records filled in by a search.
        query: Query tuple; its first item selects the job type.
    """
    # Annotated so @dataclass builds __repr__/__eq__ from real fields; with
    # no fields declared, every State compared equal to every other one.
    data: list
    query: tuple

    def __init__(self, data=None, query=()):
        """Create a state; ``data=None`` (the default) means a fresh empty list.

        A ``list()`` default in the signature would be created once and
        shared by every State built without explicit data.
        """
        self.data = [] if data is None else data
        self.query = query

class _SingletonMeta(ABCMeta):
    """Metaclass that creates at most one instance per class.

    Instance creation is intercepted by the *metaclass* ``__call__``.
    Defining ``__call__`` on an ordinary base class only makes its instances
    callable and never affects construction.
    """
    _instances = {}  # class -> its single instance

    def __call__(cls, *args, **kwargs):
        """Return the class's existing instance, creating it on first use.

        Arguments of later calls are ignored, because the first instance is
        returned as it is.
        """
        registry = _SingletonMeta._instances
        if cls not in registry:
            registry[cls] = super().__call__(*args, **kwargs)
        return registry[cls]


class ABCSingleton(metaclass=_SingletonMeta):
    """Base class for abstract-capable singletons.

    Every subclass gets its own single instance; abstract subclasses still
    cannot be instantiated.
    """
    
    
class ABCDataBase(ABCSingleton):
    def __init__(self, init:State, db:"DataBase"):
        '''
        Init := (data, saveQuery) | (_, SearchQuery)
        '''
        self.__state = init
        self.db = db
        self.__conns = []
        self.__conn = None
        self.lastStep = "Init"
    
    @property
    def state(self):
        return self.__state
    @property
    def conn(self):
        '''Current connection'''
        return self.__conn

    def get_connections(self):
        '''Connection pool'''
        return self.__conns
    
    def start(self):
        if not len(self.conns):
            return self.connect()
        self.conn = self.__conns.pop()
        
        queryType = self.status.query[0]
        match queryType:
            case QueryType.SAVE:
                return self.save()
            case QueryType.SEARCH:
                return self.search()
            case _:
                print("Unexpected querytype")
                self.lastStep = "Start"
                return self.exit()
        return
    
    @abstractmethod
    def connect(self):
        '''Make a connection to db & Save it in __conns'''
        pass
    
    @abstractmethod
    def save(self, relation = "mail"):
        '''Save data in relation'''
        pass
    
    @abstractmethod
    def create(self, relation = "mail"):
        '''Create relation Scheme'''
        pass
    
    @abstractmethod
    def search(self, relation = "mail"):
        '''Search query & Fill it data in status'''
        pass
        
        
    def exit(self):
        print("Last step is", self.lastStep)
        return self.status
    

In [73]:
import sqlite3

class SQLiteDB(ABCDataBase):
    """SQLite backend for the database job state machine."""

    def connect(self):
        """Open a connection to ``self.db``, pool it and resume ``start``.

        Any exception raised here -- including one raised further down the
        chain started by ``start`` -- is printed and ends the job via
        ``exit``.
        """
        try:
            conn = sqlite3.connect(self.db)
            # The pool attribute is name-mangled inside ABCDataBase, so a
            # subclass has to reach it through the accessor.
            self.get_connections().append(conn)
            return self.start()
        except Exception as e:
            print("Unexpected Error")
            print(e)
            self.lastStep = "Connect"
            return self.exit()

    def save(self, relation = "mail"):
        """Save action: create ``relation`` if it is missing, otherwise save.

        Returns the result of ``create`` when the table does not exist yet,
        else the state returned by ``exit``.
        """
        with self.conn as conn:
            cur = conn.cursor()
            # sqlite_master is the schema table name understood by every
            # SQLite version; the table name is bound as a parameter.
            cur.execute(
                "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?;",
                (relation,),
            )
            if cur.fetchone() is None:
                return self.create(relation)
            # Save Data (not implemented yet)

            self.lastStep = "Save"
            return self.exit()

    def create(self, relation = "mail"):
        """Create action -- not implemented yet; returns None."""
        pass

    def search(self, relation = "mail"):
        """Search action -- not implemented yet; returns None."""
        pass
    
# Scratch instantiation with placeholder arguments.
x = SQLiteDB(init="1", db="2")


'2'

In [101]:
import sqlite3
from functools import reduce

from contextlib import closing

# A sqlite3 connection used as a context manager only commits or rolls back;
# closing() is what actually releases it when the block ends.
with closing(sqlite3.connect(':memory:')) as conn, conn:
    cur = conn.cursor()
    cur.execute('''CREATE TABLE example(id, name)''')
    # sqlite_master is the schema table name understood by every SQLite
    # version (sqlite_schema is a newer alias).
    x = '''SELECT
    name
FROM
    sqlite_master
WHERE
    type ='table' AND
    name NOT LIKE 'sqlite_%';'''
    cur.execute(x)
    tables = cur.fetchall()  # rows are 1-tuples: [('example',)]
    # NOTE(review): the cell printed nothing although its recorded output is
    # ``True``; presumably it displayed this membership check -- confirm.
    print(all("example" in row for row in tables))

True


In [99]:
from functools import reduce


True

In [100]:
"1" in True

TypeError: argument of type 'bool' is not iterable