# Scraper
```
State := Current site, collected data, links, times, etc
Init := (Initial site, 0, 0, 0, ...)
Goal := (*, specific amount of data, *, timeout, ...)
Action := Start | Collect | Travel | Stop | Exit
    Start := Execute the scraper in S if it is valid
    Collect := Collect useful data and links. Test whether it is new or not
    Travel := Move to S2 via one of the collected links
    Stop := Pause for a short time
    Exit := Exit the scraper
Transition :=
    (Start, S1) -> (Collect, S1) | (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Collect, S1) -> (Travel, S1, S2 in links) | (Stop, S1) | (Exit, S1)
    (Travel, S1, S2) -> (Start, S2)
    (Stop, S1) -> (Collect, S1)
    (Exit, S1) -> Done
Action Cost := Manually defined
```

In [22]:
# Abstract Class
import time
import random
from functools import wraps
from operator import lt, eq
from copy import copy, deepcopy

from dataclasses import dataclass
from abc import ABCMeta, abstractmethod

import requests
from urllib.error import URLError
from requests.exceptions import HTTPError
from urllib.robotparser import RobotFileParser


@dataclass
class State:
    site: str
    data: dict
    links: set
    times: float #seconds
    
    def __init__(self, site: str, data: dict = dict(), links: set = set(), times:float = 0.):
        self.site = site
        self.data = data
        self.links = links
        self.times = times
    
    def __copy__(self):
        cls = self.__class__
        result = cls.__new__(cls)
        result.__dict__.update(self.__dict__)
        return result

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

class Scraper(metaclass=ABCMeta):
    """Base class for a page-to-page scraper driven by a small state machine.

    The actions call each other directly:
    ``start`` -> ``collect`` (sometimes via ``stop``) -> ``travel`` ->
    ``start`` ... until ``exit`` returns the current state.

    Subclasses provide ``collect_by_custom`` (how to gather data and links),
    ``select_link`` (where to go next) and ``is_exit`` (when to finish).

    NOTE: because each action *calls* the next one, the call stack grows
    with every visited page.
    """

    # Parsed robots.txt files, keyed by robots.txt URL and shared by all
    # scrapers so each host's file is downloaded only once.
    _robots_cache = {}

    def __init__(self, init: "State", goal: "State"):
        """Store the initial and goal states and start from a copy of ``init``."""
        self.__init = init
        # Work on a deep copy so changes to the current state never leak
        # back into the caller's initial state.
        self.__current = deepcopy(self.__init)
        self.__goal = goal
        self.lastStep = None

    @property
    def initState(self):
        """The initial state passed to the constructor."""
        return self.__init

    @property
    def currentState(self):
        """The state the scraper is currently working on."""
        return self.__current

    @property
    def goalState(self):
        """The goal state passed to the constructor."""
        return self.__goal

    def canFetch(self, path: str, ua: str = '*') -> bool:
        """Tell whether robots.txt allows ``ua`` to fetch ``path``.

        The robots.txt of the host is downloaded on first use and the parsed
        file is cached, so each path / user agent is evaluated individually.
        (Caching only the boolean verdict per robots.txt URL would make every
        later path on the same host reuse the answer of the first one.)

        Args:
            path: Absolute URL of the page to check.
            ua: User-agent name to check the rules for.

        Returns:
            True when fetching is allowed, False otherwise.

        Raises:
            URLError: if robots.txt cannot be retrieved (e.g. invalid URL or
                unreachable host). HTTP status errors do not raise: they are
                handled inside ``RobotFileParser.read()``.
        """
        # Imported here because this class's own import section does not
        # bring ``urljoin`` into scope.
        from urllib.parse import urljoin

        url = urljoin(path, "/robots.txt")
        parser = Scraper._robots_cache.get(url)
        if parser is None:
            parser = RobotFileParser(url)
            parser.read()
            # Cache only after a successful read so failures are retried.
            Scraper._robots_cache[url] = parser
        return parser.can_fetch(ua, path)

    @staticmethod
    def __meet_unexpected_error(e: Exception):
        """Report an exception that no specific handler dealt with."""
        print("Unexpected Error")
        print(e)

    # Run
    def run(self):
        """Entry point: announce the run and perform the first ``start``."""
        print("Running Scraper")
        return self.start()

    # Action
    def start(self):
        """Begin work on the current site and return the final state.

        Any exception escaping the downstream actions ends the run through
        ``exit``.
        """
        url = self.currentState.site
        try:
            # NOTE: robots.txt is consulted but deliberately NOT enforced;
            # a disallowed page only produces a warning.
            if not self.canFetch(url):
                print("Robots.txt doesn't permit a scraper")
            # Pause before collecting on roughly 10% of the pages.
            return self.collect() if random.randint(1, 100) > 10 else self.stop()
        except URLError:
            print("ERROR: URLError")
            return self.exit()
        except Exception as e:
            Scraper.__meet_unexpected_error(e)
            return self.exit()

    @abstractmethod
    def select_link(self):
        '''Return selected link or None'''

    def collect(self):
        """Collect from the current site, then travel on or exit.

        A 5xx HTTP error leads to ``stop`` (pause, then collect again); any
        other error reports details and exits.
        """
        try:
            self.collect_by_custom()
        except HTTPError as e:
            if e.response.status_code // 100 == 5:  # 5xx
                return self.stop()
            print("Code", e.response.status_code)
            print("Reason", e.response.reason)
            print("Req Header", e.request.headers)
            return self.exit()
        except Exception as e:
            Scraper.__meet_unexpected_error(e)
            return self.exit()

        else:
            if not self.is_exit():
                link = self.select_link()
                if link:
                    return self.travel(link)
            self.lastStep = 'Collect'
            return self.exit()

    @abstractmethod
    def collect_by_custom(self):
        '''make collect algorithm'''

    def travel(self, link):
        """Move to ``link`` and start over on that site."""
        self.currentState.site = link
        return self.start()

    def stop(self):
        """Sleep for a random 1-10 seconds, then collect."""
        time.sleep(random.randint(1, 10))
        return self.collect()

    def exit(self):
        """Print a summary of the run and return the current state."""
        print("Last step is", self.lastStep)
        print("Last site is", self.currentState.site)
        print("Collected data size is", len(self.currentState.data))
        print("Remaining links is", len(set(self.currentState.links)))
        print("Collect time is", self.currentState.times, 'secs')
        return self.currentState

    @abstractmethod
    def is_exit(self):
        '''make exit condition. True if exit'''

In [38]:
import re
import json
from collections import ChainMap
from requests.compat import urljoin
from bs4 import BeautifulSoup

# Scraper
class GMailScraper(Scraper):
    def __init__(self, init:State, goal:State, configPath:"Safari config file path" = "mail.google.com.har"):
        '''
        HTTP Gmail url: https://mail.google.com/mail/u/0/h/
        How to get config file:
            1. Go to Safari
            2. Click network in development-tools
            3. Click "export" at right-side
        '''
        super().__init__(init, goal)
        url, headers = self.__parse_config(configPath)
        if not url or not headers:
            print("GMailScraper cannot be initialized")
            return None
        self.currentState.site = url
        self.headers = headers
        
    def __parse_config(self, configPath)->("url", "headers"):
        try:
            fp = open(configPath, 'r')
            config = json.load(fp)
            fp.close()    
        except FileNotFoundError:
            print("Invalid configuration Path")
            return None, None
        except OSError:
            print("Invalid configuration Path")
            return None, None
        except Exception as e:
            super.__meet_unexpected_error(e)
            return None, None
            
        else:
            try:
                req = config['log']['entries'][0]['request']
                headers = {attr['name']:attr['value'] for attr in req['headers']}
                return req['url'], headers
            except KeyError:
                print("Invalid configuration file")
                return None, None
            except IndexError:
                print("Invalid configuration file")
                return None, None
            except Exception as e:
                super.__meet_unexpected_error(e)
                return None, None
                
        

    def collect_by_custom(self):
        try:
            res = requests.get(self.currentState.site, headers=self.headers)
            res.raise_for_status()
        except HTTPError as e:
            print(e.response.status_code)
            print(e.response.reason)
            print(e.request.headers)
        except URLError as e:
            print("URLError")
        except Exception as e:
            super.__meet_unexpected_error(e)
        else:
            self.__collect_data(res)
            self.__collect_links(res)
        return
    
    def __collect_data(self, res):
        dom = BeautifulSoup(res.text, 'lxml')
        mails = dom.find('table', {'class': 'th'}).find_all('tr')
        data = {mail.select('tr td:nth-of-type(2)')[0].get_text() : \
                    mail.select('tr td:nth-of-type(3)')[0].get_text() for mail in mails}

        halfSpace = r'\u200c'
        for k, v in data.items():
            newV = re.sub(halfSpace, ' ', v).strip()
            data[k] = newV
        self.currentState.data = ChainMap(self.currentState.data, data)
        return
    
    def __collect_links(self, res):
        dom = BeautifulSoup(res.text, 'lxml')
        links = dom.find('table', {'class': 'ft'}).previous_sibling.find_all('a', {'class':'searchPageLink'})
        
        previous, recent, latest = None, None, None
        for link in links:
            # raw val = 이전   ›
            form = re.sub(r'›', '', link.get_text()).strip()
            match form:
                case "이전":
                    previous = link['href']
                case "다음":
                    recent = link['href']
                case "처음":
                    latest = link['href']
                case _:
                    continue
        self.currentState.links.add((previous, recent, latest))
        return
        
    
    def select_link(self):
        if len(self.currentState.links):
            previous, _, _ = self.currentState.links.pop()
            return urljoin(self.currentState.site, nxt)
        else:
            return None
    
    def is_exit(self):
        return True if not len(self.currentState.links) else False

In [34]:
# Build a Gmail scraper with empty initial and goal states, configured from
# the exported HAR file, and run it; run() returns the final State.
x = GMailScraper(State(""), State(""), "mail.google.com.har")
x.run()

Running Scraper
Robots.txt doesn't permit a scraper
Unexpected Error
name 'nxt' is not defined
Last step is None
Last site is https://mail.google.com/mail/u/0/h/2zlhprnm75py/?zy=c&view&f=1
Collected data size is 6
Remaining links is 0
Collect time is 0.0 secs


State(site='https://mail.google.com/mail/u/0/h/2zlhprnm75py/?zy=c&view&f=1', data=ChainMap({}, {'YouTube Premium': '축하합니다 | YouTube Premium 회원이 되셨습니다 - 광고 없는 YouTube와 YouTube Music YouTube Premium YouTube Premium에 오신 것을 환영합니다 광고 없는 YouTube와 YouTube Music을 즐기세요. 멤버십 혜택을 모두 알아보세요. 광고 없는 감상과 백그라운드 재생 광고 없이 동영상을 시청하고 다른 앱을 사용하면서 계속 들을 수 있습니다 오프라인 저장 좋아하는', 'The Google Account Team': 'Xxxx님, Google 계정 설정을 확인하여 Mac 기기에서 다음 단계를 진행하세요 - Xxxx님, 안녕하세요 Mac 기기에서 Google에 로그인해 주셔서 감사합니다 Google 계정 설정이 알맞게 구성되어 있는지 확인해 주세요 개인 정보 보호 진단 완료하기 단계별 안내를 통해 나에게 알맞은 개인 정보 보호 설정을 선택하세요 완료하기 보안 진단 완료하기 보안 진단에서 계정 보안을 강화하기 위한 맞춤 권장사항을 확인하세요 완료하기 Google', 'YouTube': 'Welcome to YouTube Premium! - Hi Xxxx, Welcome to your YouTube Premium membership! Your 1-month trial begins immediately. Your payment method will be charged monthly once your trial ends. You can explore, manage, and cancel your', 'Google Play': 'Google Play 주문 영수증(2022. 6. 23.) - Google Play 감사합니다. Google Play에서 Google Ireland Limited의 무료 체험판을 신청하셨습

# DB
```
some structure
```

In [39]:
import sqlite3