In [1]:
# default_exp core.scraping.base

# Base

> Provides basic scraping functionality.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export

import requests 
from requests.models import Response
from bs4 import BeautifulSoup
from abc import ABC, abstractproperty
from dataclasses import dataclass
from typing import Optional

In [4]:
#exporti


class URL(ABC):
    
    @abstractproperty
    def url(self)->str:
        ...
        
@dataclass
class LinescoreURL:
    cz_event_id : int
        
    @property
    def url(self)->str:
        return 'https://curlingzone.com/event.php?view=Scores&eventid=%s#1'%self.cz_event_id
    
@dataclass
class BoxscoreURL:
    cz_event_id : int
    cz_draw_id: int
    
    
    @property
    def url(self)->str:
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)
    
def make_url(
    cz_event_id : Optional[int] = None
    ,cz_draw_id : Optional[int] = None

)->URL:
    if cz_event_id:
        return LinescoreURL(cz_event_id = cz_event_id)
    
    if cz_draw_id:
        return BoxscoreURL(cz_event_id = cz_event_id, cz_draw_id = cz_draw_id)
    
    raise ValueError('One of cz_event_id or cz_draw_id must be populated.')

In [5]:
# hide
print(LinescoreURL(cz_event_id = 5000).url)
print(BoxscoreURL(cz_event_id = 5000,cz_draw_id = 5).url)
print(make_url(cz_event_id = 5000).url)

https://curlingzone.com/event.php?view=Scores&eventid=5000#1
https://curlingzone.com/event.php?eventid=5000&view=Scores&showdrawid=5#1
https://curlingzone.com/event.php?view=Scores&eventid=5000#1


In [6]:
#export
    
def make_request_from(

     *
    ,url : URL
    ,**kwargs

)->Response:
    return requests.get(url=url.url,**kwargs)
        

def make_soup_from(
    
     *
    ,response : Response
    ,**kwargs

)->BeautifulSoup:
    """ Returns a Beautifulsoup object for the passed URL."""
    
    return BeautifulSoup(response.content,**kwargs)

In [7]:
# tests

url = make_url(cz_event_id = 6938)
parser = 'html.parser'

response = make_request_from(url = url)
soup = make_soup_from(response = response)
assert isinstance(soup,BeautifulSoup)

In [8]:
#export
    
def make_request_from(

     *
    ,url : URL
    ,**kwargs

)->Response:
    return requests.get(url=url.url,**kwargs)
        

def make_soup_from(
    
     *
    ,response : Response
    ,**kwargs

)->BeautifulSoup:
    """ Returns a Beautifulsoup object for the passed URL."""
    


In [9]:
#exporti
class Page(ABC):
        
    @abstractproperty
    def event_name(self)->str:
        ...

@dataclass
class LinescorePage(Page):
    soup : BeautifulSoup
        
    @property
    def event_name(self)->str:
        return soup.find('title').string
    
@dataclass
class BoxscorePage(Page):
    soup : BeautifulSoup
        
    @property
    def event_name(self)->str:
        return soup.find('h3',attrs={'class':'entry-title-widget'}).string


In [10]:
LinescorePage(soup = soup).event_name
BoxscorePage(soup = soup).event_name

'Hokkaido Bank Curling Classic'

In [11]:
#exporti
class Page(ABC):
        
    @abstractproperty
    def event_name(self)->str:
        ...
    
    def __post__init(self)->None:
        self.url = url
        response = make_request_from(url = url)
        self.soup = make_soup_from(response=response)

@dataclass
class LinescorePage(Page):
    url : URL
        
    @property
    def event_name(self)->str:
        return soup.find('title').string
    
    def __post__init(self)->None:
        super().__post__init()
    
@dataclass
class BoxscorePage(Page):
    url : URL
        
    @property
    def event_name(self)->str:
        return soup.find('h3',attrs={'class':'entry-title-widget'}).string
    
    def __post__init(self)->None:
        super().__post__init()


In [12]:
LinescorePage(url = make_url(cz_event_id = 5000)).event_name
BoxscorePage(url=make_url(cz_event_id = 5000,cz_draw_id = 5)).event_name

'Hokkaido Bank Curling Classic'