# Scraper
```
State := Current site, collected data, links, times, etc
Init := (Initial site, 0, 0, 0, ...)
Goal := (*, specific amount of data, *, timeout, ...)
Action := Start | Collect | Travel | Stop | Exit
    Start := Execute the scraper in S if it is valid
    Collect := Collect useful data and links. Test whether it is new or not
    Travel := Move to S2 via links
    Stop := Pause for a short time
    Exit := Exit the scraper
Transition :=
    (Start, S1) -> (Collect, S1) | (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Collect, S1) -> (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Travel, S1, S2) -> (Start, S2)
    (Stop, S1) -> (Collect, S1)
    (Exit, S1) -> Done
Action Cost := Equal
```

In [506]:
# Abstract Class
import datetime
import random
import time
from abc import ABCMeta, abstractmethod
from copy import copy, deepcopy
from dataclasses import dataclass
from functools import wraps
from operator import lt, eq
from urllib.error import URLError
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests
from requests.exceptions import HTTPError


@dataclass
class ScraperState:
    def __init__(self, site: str, data: dict = dict(), links: set = set(), times:float = 0.):
        self.site = site
        self.data = data
        self.links = links
        self.times = times
    
    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

class ABCScraper(metaclass=ABCMeta):
    """Template for a scraper driven as a small state machine.

    The actions call one another: ``start`` -> ``collect`` (or ``stop``) ->
    ``travel`` -> ``start`` ... until ``exit`` returns the final state.
    Subclasses supply ``select_link``, ``collect_by_custom`` and ``is_exit``.

    NOTE(review): because the actions chain through nested calls, every
    visited page adds stack frames; a very long crawl can reach the
    interpreter's recursion limit.
    """

    def __init__(self, init: "ScraperState", goal: "ScraperState"):
        """Store the initial/goal states and start from a deep copy of ``init``."""
        self.__init = init
        # Work on a copy so the caller's initial state is never mutated.
        self.__current = deepcopy(self.__init)
        self.__goal = goal
        self.lastStep = "Init"

    @property
    def initState(self):
        """The state the scraper was created with."""
        return self.__init

    @property
    def currentState(self):
        """The mutable state of the ongoing run."""
        return self.__current

    @property
    def goalState(self):
        """The goal state passed at construction."""
        return self.__goal

    def __canFetch():
        # One parsed robots.txt per robots URL, shared by all scrapers.
        # The parser is cached rather than a single verdict, because the
        # verdict depends on the requested path and user agent.
        parsers = dict()

        def canFetch(self, path: str, ua: str = '*') -> bool:
            """Tell whether robots.txt allows ``ua`` to fetch ``path``.

            ``RobotFileParser.read`` handles HTTP error statuses of
            robots.txt itself (401/403 deny everything, other 4xx allow
            everything). Other failures to reach the host, such as
            ``URLError``, propagate to the caller.
            """
            url = urljoin(path, "/robots.txt")
            parser = parsers.get(url)
            if parser is None:
                parser = RobotFileParser(url)
                parser.read()
                parsers[url] = parser
            return parser.can_fetch(ua, path)

        return canFetch
    canFetch = __canFetch()

    @staticmethod
    def meet_unexpected_error(e: Exception):
        """Report an exception that no specific handler covered."""
        print("Unexpected Error")
        print(e)

    # Run
    def run(self):
        """Entry point: announce the run and enter the ``start`` action."""
        print("Running Scraper")
        return self.start()

    # Action
    def start(self):
        """Check the current site, then collect from it (or pause first).

        Returns:
            The final state, as returned by ``exit``.
        """
        url = self.currentState.site
        try:
            # robots.txt is consulted but deliberately not enforced:
            # a denial is only reported.
            if not self.canFetch(url):
                print("Robots.txt doesn't permit a scraper")
            # About one visit in ten pauses before collecting.
            return self.collect() if random.randint(1, 100) > 10 else self.stop()
        except URLError:
            print("ERROR: URLError")
            return self.exit()
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return self.exit()

    @abstractmethod
    def select_link(self):
        '''Return selected link or None'''
        pass

    def collect(self):
        """Run the subclass collector, then travel to the next link or exit.

        A 5xx ``HTTPError`` leads to ``stop`` (pause, then collect again);
        any other error ends the run through ``exit``.
        """
        try:
            self.collect_by_custom()
        except HTTPError as e:
            if e.response.status_code // 100 == 5:  # 5xx
                return self.stop()
            print("Code", e.response.status_code)
            print("Reason", e.response.reason)
            print("Req Header", e.request.headers)
            return self.exit()
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return self.exit()
        else:
            if not self.is_exit():
                link = self.select_link()
                if link:
                    return self.travel(link)
            self.lastStep = 'Collect'
            return self.exit()

    @abstractmethod
    def collect_by_custom(self):
        '''make collect algorithm'''
        pass

    def travel(self, link):
        """Move the current site to ``link`` and start over there."""
        self.currentState.site = link
        return self.start()

    def stop(self):
        """Sleep for a random 1-10 seconds, then collect."""
        time.sleep(random.randint(1, 10))
        return self.collect()

    def exit(self):
        """Print a summary of the run and return the current state."""
        print("Last step is", self.lastStep)
        print("Last site is", self.currentState.site)
        print("Collected data size is", len(list(self.currentState.data)))
        print("Remaining links is", len(self.currentState.links))
        print("Collect time is", self.currentState.times, 'secs')
        return self.currentState

    @abstractmethod
    def is_exit(self):
        '''make exit condition. True if exit'''
        pass

In [507]:
import re
import json
from requests.compat import urljoin
from bs4 import BeautifulSoup
from enum import Enum, unique, auto

@unique
class MailType(Enum):
    """Classification attached to a collected mail."""

    SPAM = auto()
    NORMAL = auto()

    def __str__(self):
        """Return the bare member name: 'SPAM' or 'NORMAL'."""
        return self.name

@dataclass
class Data:
    """One collected mail."""

    sender: str
    date: datetime.date
    title: str
    content: str
    mail_type: MailType

    def to_dict(self):
        """Return the mail as a plain dict, with ``mail_type`` rendered via ``str``."""
        record = dict(
            sender=self.sender,
            date=self.date,
            title=self.title,
            content=self.content,
        )
        record['mail_type'] = str(self.mail_type)
        return record
    

# Scraper
class GMailScraper(ABCScraper):
    """Scraper for the HTML Gmail view, authenticated with headers from a HAR export.

    ``currentState.links`` holds tuples of
    ``(spam_mailbox_link, next_page_link, mail_links)``; ``currentState.data``
    is a list of ``Data`` records.
    """

    def __init__(self, init: ScraperState, goal: ScraperState, configPath: str = "mail.google.com.har"):
        '''
        HTTP Gmail url: https://mail.google.com/mail/u/0/h/
        How to get config file:
            1. Go to Safari
            2. Click network in development-tools
            3. Click "export" at right-side
        '''
        super().__init__(init, goal)
        # Set every attribute up front so the object is coherent (it simply
        # has nothing to visit) even when the configuration is rejected below.
        self.headers = {}
        self.mode = MailType.NORMAL
        self.currentState.data = []
        self.currentState.links = []

        url, headers = self.__parse_config(configPath)
        if not url or not headers:
            print("GMailScraper cannot be initialized")
            return

        self.headers = headers

        # Collect the links of the first listing page, then point the
        # scraper at the first mail.
        self.currentState.site = url
        res = requests.get(url, headers=self.headers)
        self.__collect_links(res)
        self.currentState.site = self.select_link()

    def __parse_config(self, configPath) -> "tuple[str | None, dict | None]":
        """Read the request URL and headers of the first HAR entry.

        Returns:
            ``(url, headers)``, or ``(None, None)`` when the file cannot be
            read or does not have the expected structure.
        """
        try:
            with open(configPath, 'r', encoding='utf-8') as fp:
                config = json.load(fp)
        except OSError:  # includes FileNotFoundError
            print("Invalid configuration Path")
            return None, None
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return None, None

        try:
            req = config['log']['entries'][0]['request']
            headers = {attr['name']: attr['value'] for attr in req['headers']}
            return req['url'], headers
        except (KeyError, IndexError):
            print("Invalid configuration file")
            return None, None
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
            return None, None

    def collect_by_custom(self):
        """Fetch the current site and extract its mail (and links if none are left).

        Request failures are reported and swallowed here. Parsing errors
        raised after a successful request propagate to the caller.
        """
        try:
            res = requests.get(self.currentState.site, headers=self.headers)
            res.raise_for_status()
        except HTTPError as e:
            print(e.response.status_code)
            print(e.response.reason)
            print(e.request.headers)
        except URLError as e:
            print("URLError")
        except Exception as e:
            ABCScraper.meet_unexpected_error(e)
        else:
            self.__collect_data(res)
            if not self.currentState.links:
                self.__collect_links(res)
        return

    def __collect_data(self, res):
        """Parse one mail page and append a ``Data`` record to the state.

        Raises:
            ValueError: if the date text does not match the expected
                "<year>년 <month>월 <day>일" format.
        """
        dom = BeautifulSoup(res.text, 'lxml')
        header = dom.find('table', {'class': 'h'})
        meta = header.next_sibling.find('table')

        sender = meta.find('h3').text
        date_text = meta.find('td').next_sibling.text
        matched = re.search(r'^(\d+)년 (\d+)월 (\d+)일', date_text)
        if matched is None:
            raise ValueError(f"Unrecognized mail date: {date_text!r}")
        date = datetime.date(*(int(part) for part in matched.groups()))

        title = header.find('h2').text

        content = dom.find('div', {'class': 'msg'}).text
        # Replace zero-width non-joiners and non-breaking spaces, then
        # collapse each whitespace run to its last character.
        content = re.sub(r'\u200c+', r' ', content)
        content = re.sub(r'\xa0+', r' ', content)
        content = re.sub(r'(\s)+', r'\1', content).strip()

        # A page whose search-page link text contains '스팸함' is treated as spam.
        if '스팸함' in dom.find('a', {'class': 'searchPageLink'}).text:
            mail_type = MailType.SPAM
        else:
            mail_type = MailType.NORMAL

        self.currentState.data.append(Data(sender, date, title, content, mail_type))

    def __collect_links(self, res):
        '''
        link := [next_main_page, mail_pages]
        if mail_pages == empty: next main_page is selected & collect mail_pages
        '''
        site = self.currentState.site
        dom = BeautifulSoup(res.text, 'lxml')

        spam_anchor = dom.find('table', {'class': 'm'}).find(string='스팸함').parent
        spam_mailbox_link = urljoin(site, spam_anchor['href'])

        pager = dom.find('table', {'class': 'ft'}).previous_sibling
        next_page_link = ''
        for link in pager.find_all('a', {'class': 'searchPageLink'}):
            # NOTE(review): '이전' is taken to label the link to the next
            # listing page - confirm against the live markup.
            if '이전' in link.get_text():
                next_page_link = urljoin(site, link['href'])
                break

        mail_links = {
            urljoin(site, span.parent['href'])
            for span in dom.find_all('span', {'class': 'ts'})
        }

        self.currentState.links.append((spam_mailbox_link, next_page_link, mail_links))

    def select_link(self):
        '''
        select mail_pages not main_page
        '''
        if not self.currentState.links:
            return None

        spam_mailbox_link, next_page_link, mail_links = self.currentState.links.pop()

        # Record
        print(len(mail_links), self.mode)
        if len(mail_links) != 0:
            # Scrape mail; put the remaining links back for later calls.
            mail_link = mail_links.pop()
            self.currentState.links.append((spam_mailbox_link, next_page_link, mail_links))
            return mail_link

        if next_page_link:
            # Move to the next listing page and collect its mail links.
            res = requests.get(next_page_link, headers=self.headers)
            self.__collect_links(res)
            return self.select_link()

        if self.mode == MailType.NORMAL:
            # Normal mailbox exhausted: switch to the spam mailbox.
            self.mode = MailType.SPAM
            res = requests.get(spam_mailbox_link, headers=self.headers)
            self.__collect_links(res)
            return self.select_link()

        # Done
        return None

    def is_exit(self):
        """Exit once no link tuple is left in the state."""
        return not self.currentState.links

# DB

```
Relation Scheme(mail)

Data := json format [{"sender": sender, "date": date, "title": title, "content": content, "mail_type": mail_type}, ...]
Sender := CHAR(100)
Date := DATE
Title := CHAR(200)
Content := TEXT
Mail_type := CHAR(10) ("SPAM" or "NORMAL")


Query := SaveQuery | SearchQuery
SaveQuery := ("SAVE", _)
SearchQuery := ("SEARCH", CONDITIONS SQL SYNTAX like title="test")
```

```
State := (data, query)
Init := (data, saveQuery) | (_, SearchQuery)
Goal := (_, saveQuery) | (data, SearchQuery)

Action := start | connect | create | save | search | exit
    start := Start db jobs
    connect := Make a connection to db
    create := Create relation scheme
    save := Save data in db
    search := Execute searchQuery in db
    exit := Exit the db jobs.
    
Transition
    (start, S1) := (connect, S1) | (save, S1) | (search, S1) | (exit, S1)
    (connect, S1) := (start, S1) | (exit, S1)
    (save, S1) := (create, S1) | (exit, S1)
    (create, S1) := (save, S1) | (exit, S1)
    (search, S1) := (exit, S1)
    (exit, S1) := Done
    
Action Cost := Manually defined
```

In [508]:
import random
from abc import ABCMeta, abstractmethod, abstractproperty
from dataclasses import dataclass
from enum import Enum, auto, unique
import datetime

@unique
class QueryType(Enum):
    """Kind of job a database run performs."""

    SAVE = auto()
    SEARCH = auto()

    def __str__(self):
        """Return the bare member name: 'SAVE' or 'SEARCH'."""
        return self.name

@dataclass
class DBState:
    """State of a database job.

    Attributes:
        data: Records to save, or the records filled in by a search.
        query: ``(query_type, payload)`` pair; index 0 selects the job.
    """

    # Annotations must be types; ``list()`` / ``tuple()`` are empty instances.
    data: list
    query: tuple
        
class _SingletonMeta(ABCMeta):
    """Metaclass that hands back one shared instance per concrete class."""

    _instances = {}

    def __call__(cls, *args, **kwargs):
        # Construction is intercepted here, on the metaclass. A ``__call__``
        # defined on an ordinary base class only runs when an *instance*
        # is called, so it cannot enforce a singleton.
        if cls not in _SingletonMeta._instances:
            _SingletonMeta._instances[cls] = super().__call__(*args, **kwargs)
        return _SingletonMeta._instances[cls]


class ABCSingleton(metaclass=_SingletonMeta):
    """Base class whose subclasses each have at most one instance.

    The first construction of a subclass creates and initializes the
    instance. Later constructions return that same object, and their
    arguments are ignored. Abstract methods are still enforced because the
    metaclass derives from ``ABCMeta``.
    """
    
    
class ABCDataBase(ABCSingleton):
    def __init__(self, init:DBState, db:"DataBase", table_name: "Relation"):
        '''
        Init := (data, saveQuery) | (_, SearchQuery)
        '''
        self.lastStep = "Init"
        self.__state = init
        self.db = db
        self.table_name = table_name
        
        self.__conns = []
        self.__conn = None
    
    @property
    def state(self):
        return self.__state
    
    @state.setter
    def state(self, new):
        self.__state = new
    
    @property
    def conn(self):
        '''Current connection'''
        return self.__conn
    @conn.setter
    def conn(self, new):
        self.__conn = new
    
    @property
    def conns(self):
        '''Connection pool'''
        return self.__conns
    
    def run(self):
        print("Run...")
        print(f"DB: {self.db}")
        print(f"Table: {self.table_name}")
        print(f"Mode: {str(self.state.query[0])}")
        return self.start()
    
    def start(self):
        self.lastStep = "Start"
        
        if not len(self.conns):
            return self.connect()
        self.conn = self.conns.pop()
        
        queryType = self.state.query[0]
        match queryType:
            case QueryType.SAVE:
                return self.save()
            case QueryType.SEARCH:
                return self.search()
            case _:
                print("Unexpected querytype")
                return self.exit()
    
    @abstractmethod
    def connect(self):
        '''Make a connection to db & Save it in __conns'''
        pass
    
    @abstractmethod
    def save(self):
        '''Save data in relation'''
        pass
    
    @abstractmethod
    def create(self):
        '''Create relation Scheme'''
        pass
    
    @abstractmethod
    def search(self):
        '''Search query & Fill it data in status'''
        pass
        
        
    def exit(self):
        print("Last step is", self.lastStep)
        return self.state
    

In [511]:
import sqlite3

class SQLiteDB(ABCDataBase):
    """SQLite implementation of the database job.

    ``self.db`` is passed to ``sqlite3.connect``. ``self.table_name`` is
    interpolated into SQL text and must therefore be a trusted identifier.
    """

    def connect(self):
        """Open a connection, add it to the pool and go back to ``start``."""
        self.lastStep = "Connect"
        try:
            conn = sqlite3.connect(self.db)
            self.conns.append(conn)
            return self.start()
        except sqlite3.Error as e:
            print("An error occured:", e.args[0])
            return self.exit()

    def _table_exists(self) -> bool:
        """Tell whether ``self.table_name`` exists as a table."""
        # sqlite_master is used because it exists in every SQLite version;
        # the sqlite_schema alias only exists in newer ones.
        row = self.conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?",
            (self.table_name,),
        ).fetchone()
        return row is not None

    def save(self):
        """Insert every record of ``state.data``, creating the table first if needed."""
        self.lastStep = "Save"
        try:
            if not self._table_exists():
                print("Enter create")
                return self.create()

            records = [datum.to_dict() for datum in self.state.data]
            if records:  # nothing to insert (and no keys to read) when empty
                columns = list(records[0])
                column_list = ','.join(columns)
                placeholders = ',:'.join(columns)
                # The connection context manager commits on success and
                # rolls back if the insert raises.
                with self.conn as conn:
                    conn.executemany(f'''
                        INSERT INTO {self.table_name}({column_list})
                        VALUES(:{placeholders});
                    ''', records)
        except sqlite3.Error as e:
            print("An error occured:", e.args[0])

        return self.exit()

    def create(self):
        """Create the mail table, then return to ``save``."""
        self.lastStep = "Create"
        try:
            with self.conn as conn:
                conn.execute(f'''
                    CREATE TABLE {self.table_name}(
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        sender CHAR(100) NOT NULL,
                        date DATE NOT NULL,
                        title CHAR(200) NOT NULL,
                        content TEXT,
                        mail_type CHAR(10) NOT NULL
                    );
                    ''')
            return self.save()
        except sqlite3.Error as e:
            print("An error occured:", e.args[0])
            return self.exit()

    def search(self):
        """Run the search query and store the matching rows in ``state.data``.

        ``state.query[1]`` is a raw SQL condition; an empty condition
        selects every row.
        """
        self.lastStep = "Search"
        try:
            condition = self.state.query[1]
            # NOTE(security): the condition is spliced into the statement
            # verbatim by design, so it must never come from untrusted input.
            statement = f'''SELECT * FROM {self.table_name}'''
            if condition:
                statement = f'''SELECT * FROM {self.table_name} WHERE {condition}'''
            with self.conn as conn:
                rows = conn.execute(statement).fetchall()
            self.state.data = [
                Data(sender, date, title, content, mail_type)
                for _, sender, date, title, content, mail_type in rows
            ]
        except sqlite3.Error as e:
            print("An error occured:", e.args[0])
        except Exception as e:
            print(e)

        # Returned after the try block, not from a ``finally`` clause, so
        # that KeyboardInterrupt / SystemExit are not silently discarded.
        return self.exit()
            



In [512]:
# Concat Scraper & DB

scraper = GMailScraper(ScraperState(""), ScraperState(""), "mail.google.com.har")
result = scraper.run()

# The query payload is unused for SAVE. Pass an explicit None instead of
# IPython's `_` (last cell output), which is undefined outside IPython.
db = SQLiteDB(DBState(result.data, (QueryType.SAVE, None)), "mails.db", "mail")
db.run()
# SEARCH with an empty condition selects every stored row; the data list
# starts empty and is filled by the search.
db.state = DBState([], (QueryType.SEARCH, ""))
result = db.run()
print(result)

5 NORMAL
Running Scraper
Robots.txt doesn't permit a scraper
4 NORMAL
Robots.txt doesn't permit a scraper
3 NORMAL
Robots.txt doesn't permit a scraper
2 NORMAL
Robots.txt doesn't permit a scraper
1 NORMAL
Robots.txt doesn't permit a scraper
0 NORMAL
1 SPAM
Robots.txt doesn't permit a scraper
0 SPAM
Last step is Collect
Last site is https://mail.google.com/mail/u/0/h/j4zmj06whgd2/?&th=181a70ad5e7a13af&v=c&s=m
Collected data size is 6
Remaining links is 0
Collect time is 0.0 secs
Run...
DB: mails.db
Table: mail
Mode: SAVE
Last step is Save
Run...
DB: mails.db
Table: mail
Mode: SEARCH
Last step is Search
DBState(data=[Data(sender='Google Payments', date='2022-06-23', title='Google Payments 사용자 정보가 변경되었습니다.', content='Google Payments 사용자 정보 변경 완료 알림\n관리자(Google)가 귀하의 사용자 정보를 변경했습니다. 아래의 변경사항을 검토하시기 바랍니다.\nPayments 프로필:\n윤영로\nPayments 프로필 ID:\n1171-6274-4141\n정보를 변경한 관리자:\nGoogle\n새 연락처 정보:\n윤영로\n새 권한:\n관리\n새 이메일 환경설정:\n모든 결제 이메일\n변경을 원하지 않을 경우 Payments 프로필 관리자에게 문의하세요.\n고객센터\n문의하기\n결제 프로필 