In [1]:
# default_exp core.scraping.base

# Base

> Provides basic scraping functionality.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export

import requests 
from requests.models import Response
from bs4 import BeautifulSoup,Tag
from abc import ABC, abstractproperty, abstractmethod
from dataclasses import dataclass
from typing import Optional, List, Any
from collections import defaultdict
from hashlib import sha256

In [4]:
#exporti
    
def make_request_from(
    *,
    url : str,
    **kwargs
)->Response:
    """Sends an HTTP GET request to `url` and returns the response.

    All extra keyword arguments are forwarded to `requests.get`.

    Unless the caller passes its own `timeout`, a default of 30 seconds is
    applied. Without one, `requests` waits indefinitely, so a single
    unresponsive server would hang the scraper. Pass `timeout=None` to
    explicitly restore the wait-forever behaviour.
    """
    # setdefault keeps any timeout supplied by the caller (including None).
    kwargs.setdefault('timeout', 30)
    return requests.get(url=url,**kwargs)
        

def make_soup_from(
    *,
    response : Response,
    **kwargs
)->BeautifulSoup:
    """Returns a BeautifulSoup object built from the body of `response`.

    `response.content` is passed to `BeautifulSoup`. The parser defaults to
    'html.parser'; pass `features=...` to select a different one. All other
    keyword arguments are forwarded to `BeautifulSoup` unchanged.
    """
    # Using setdefault (rather than a hard-coded keyword) lets callers pass
    # `features` themselves without a "multiple values" TypeError.
    kwargs.setdefault('features','html.parser')
    return BeautifulSoup(response.content,**kwargs)

In [5]:
#exporti

def _stripped_text(node)->str:
    """Returns the stripped `.string` of `node`, or '' when there is none."""
    text = getattr(node,'string',None)
    return '' if text is None else text.strip()


def generate_dict_from_table(

    table : Tag

)->defaultdict:
    """Helper function for returning the curling boxscore from a bs4 Tag object.

    Walks every `td` of `table` in document order. A cell whose class is
    exactly ['linescoreteam'] starts a new team entry; the cells that follow
    fill in that team's stats until the next team cell:

    * 'href'       - the `href` of the team cell's link
    * 'hammer'     - True when the 'linescorehammer' cell has no `.string`
    * 'score'      - list with the stripped text of each 'linescoreend' cell
    * 'finalscore' - stripped text of the 'linescorefinal' cell

    Cells with any other class are ignored. Cells without text yield ''.

    Returns a defaultdict keyed by team name.

    Raises ValueError when `table` is None, or when a stat cell appears before
    any team cell.
    """
    if table is None:
        raise ValueError('Table tag is NoneType.')

    d = defaultdict(list)
    stats = None # stats mapping of the team currently being filled in
    stat_classes = (['linescorehammer'],['linescoreend'],['linescorefinal'])

    # loop through tags in table
    for tag in table.find_all('td'):
        css_class = tag.attrs.get('class')

        if css_class == ['linescoreteam']:
            stats = defaultdict(list)
            stats['href'] = tag.a['href']
            d[tag.a.string] = stats
            continue

        if css_class not in stat_classes:
            continue

        if stats is None:
            # Previously this surfaced as an obscure TypeError from indexing a list.
            raise ValueError('Found a %s cell before any linescoreteam cell.'%css_class[0])

        if css_class == ['linescorehammer']:
            # Kept as in the original: the flag is True when the cell has no
            # `.string`. Presumably the hammer marker is a nested element
            # rather than plain text - TODO confirm against the page markup.
            stats['hammer'] = not bool(tag.string)
        elif css_class == ['linescoreend']:
            # `.string` is None for empty cells or nested markup; treat as ''.
            stats['score'].append(_stripped_text(tag))
        else:
            # Prefer the bold element; fall back to the cell itself if absent.
            bold = tag.b
            stats['finalscore'] = _stripped_text(tag if bold is None else bold)

    return d

In [6]:
#export
def flatten_boxscore(
    boxscore : defaultdict
)->List[List[Any]]:
    """Turns a boxscore mapping into one flat row per team.

    Each row starts with the team name and is followed by that team's stat
    values, in the order they are stored in the team's mapping.
    """
    rows = []
    for team_name, stats in boxscore.items():
        row = [team_name]
        row.extend(stats.values())
        rows.append(row)
    return rows

In [7]:
#exporti

def hash_obj(
    obj : Any,
    hash_type : str = 'sha256',
    encoding : str = 'utf-8'
)->str:
    """Returns the lowercase hex digest of `str(obj)`.

    `hash_type` and `encoding` are matched case-insensitively. Only 'sha256'
    is available; any other `hash_type` raises NotImplementedError.
    """
    algorithm = hash_type.lower()
    codec = encoding.lower()

    # Lookup table of the hash constructors this helper knows about.
    supported = {'sha256': sha256}
    if algorithm not in supported:
        raise NotImplementedError("Hash function %s not supported."%algorithm)

    payload = str(obj).encode(codec)
    digest = supported[algorithm](payload)
    return digest.hexdigest().lower()

In [8]:
# #exporti


# class Page(ABC):
    
#     def __post_init__(self)->None:
#         response = make_request_from(url = self.url)
#         self.soup = make_soup_from(response=response)
    
#     @abstractproperty
#     def url(self)->str:
#         ...
        
#     @abstractproperty
#     def event_name(self)->str:
#         ...
        
#     @abstractproperty
#     def event_date(self)->str:
#         ...
        
#     @abstractproperty
#     def draw(self)->str:
#         ...
        
#     @abstractproperty
#     def tables(self)->List[Tag]:
#         ...
        
#     @abstractproperty
#     def table(self)->Tag:
#         ...
        
#     @abstractproperty
#     def base_boxscore(self)->defaultdict: # TODO add better type hinting
#         ...
        
#     @abstractproperty
#     def boxscore_with_details(self)->dict:
#         ...
        
#     @abstractproperty
#     def boxscore(self)->dict:
#         ...

        
# # @dataclass
# # class LinescorePage(Page):
# #     cz_event_id : int
        
# #     @property
# #     def url(self)->str:
# #         return 'https://curlingzone.com/event.php?view=Scores&eventid=%s#1'%self.cz_event_id
    
# #     @property
# #     def event_name(self)->str:
# #         return self.soup.find('title').string
    
# #     @property
# #     def event_date(self)->str:
# #         return self.soup.find(name='div',attrs={'class':'badge-widget'}).string
    
# #     @property
# #     def linescore_table(self)->Tag:
# #         return self.soup.find(name = 'table',attrs={'class':'linescorebox'})
    
    
    

    
# @dataclass
# class LinescorePage(Page):
#     cz_event_id : int
#     cz_draw_id: int
#     cz_game_id : int
    
    
#     @property
#     def url(self)->str:
#         return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)
    
#     @property
#     def event_name(self)->str:
#         return self.soup.find('h3',attrs={'class':'entry-title-widget'}).string
    
#     @property
#     def event_date(self)->str:
#         return self.soup.find('div',attrs={'class':'badge-widget'}).string
    
#     @property
#     def draw(self)->str:
#         return self.soup.find(name='option',attrs={'selected':'selected'}).string
    
#     @property
#     def tables(self)->List[Tag]:
#         return self.soup.find_all(name = 'table',attrs={'class':'linescorebox'})
    
    
# # should I split the classes up here?
#     @property
#     def table(self)->Tag:
#         return self.tables[self.cz_game_id - 1]
    
#     @property
#     def base_boxscore(self)->defaultdict:
#         return generate_dict_from_table(table=self.table)
    
#     @property
#     def boxscore_with_details(self)->dict:
#         return {d[0]:{**d[-1],'date':self.event_date,'event':self.event_name,'draw':self.draw} for d in self.base_boxscore.items()}
    
#     @property
#     def boxscore(self)->dict:
#         boxscore_without_hash = self.boxscore_with_details
#         _hash = hash_obj(obj = boxscore_without_hash)
    
#         return {d[0]:{**d[-1],'hash':_hash} for d in boxscore_without_hash.items()} 
    
# linescore_page = LinescorePage(cz_event_id = 5000,cz_draw_id = 3,cz_game_id = 1)
# #print(boxscore_page.event_name,boxscore_page.event_date)
# #print(boxscore_page.base_boxscore)
# #print(boxscore_page.boxscore_with_details)
# print(linescore_page.boxscore)
# #print(boxscore_page.table)

In [9]:
# export


class Page(ABC):
    """Abstract base class for a scraped page.

    Subclasses are expected to be dataclasses: `__post_init__` is the hook a
    dataclass-generated `__init__` calls, and it is where the page is
    downloaded and parsed. After construction an instance exposes:

    * `soup`      - the parsed document for `url`
    * `boxscores` - the result of `generate_boxscores()`
    """

    def __post_init__(self)->None:
        # Download and parse the page once, then cache everything derived from it.
        response = make_request_from(url = self.url)
        self.soup = make_soup_from(response=response)
        self.boxscores = self.generate_boxscores()

    # `abc.abstractproperty` is deprecated; stacking `property` on top of
    # `abstractmethod` is the supported spelling and behaves the same way.
    @property
    @abstractmethod
    def url(self)->str:
        """Address the page is fetched from."""
        ...

    @property
    @abstractmethod
    def event_name(self)->str:
        """Name of the event, read from the parsed page."""
        ...

    @property
    @abstractmethod
    def event_date(self)->str:
        """Date of the event, read from the parsed page."""
        ...

    @property
    @abstractmethod
    def draw(self)->str:
        """Draw the page refers to, read from the parsed page."""
        ...

    @property
    @abstractmethod
    def tables(self)->List[Tag]:
        """Table tags of the parsed page that hold the scores."""
        ...

    @abstractmethod
    def generate_boxscores(self)->List[dict]:
        """Builds one boxscore mapping per table on the page."""
        ...
        
    
@dataclass
class LinescorePage(Page):
    """Linescore page of a single draw of a curlingzone.com event.

    cz_event_id : value placed in the `eventid` query parameter of the URL
    cz_draw_id  : value placed in the `showdrawid` query parameter of the URL
    """
    cz_event_id : int
    cz_draw_id: int


    @property
    def url(self)->str:
        """URL of the scores view for this event and draw."""
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)

    @property
    def event_name(self)->str:
        """Text of the first `h3` element with class 'entry-title-widget'."""
        return self.soup.find('h3',attrs={'class':'entry-title-widget'}).string

    @property
    def event_date(self)->str:
        """Text of the first `div` element with class 'badge-widget'."""
        return self.soup.find('div',attrs={'class':'badge-widget'}).string

    @property
    def draw(self)->str:
        """Text of the first `option` element marked as selected."""
        return self.soup.find(name='option',attrs={'selected':'selected'}).string

    @property
    def tables(self)->List[Tag]:
        """All `table` elements with class 'linescorebox', in page order."""
        return self.soup.find_all(name = 'table',attrs={'class':'linescorebox'})

    def generate_boxscores(self)->List[defaultdict]:
        """Builds one boxscore mapping per linescore table on the page."""
        return [generate_dict_from_table(table=table) for table in self.tables]

    def get_boxscore_from(self,cz_game_id : int)->defaultdict:
        """Returns the boxscore of the `cz_game_id`-th game on the page.

        Game ids are 1-based positions into `self.boxscores`.

        Raises ValueError when `cz_game_id` is below 1 or larger than the
        number of boxscores on the page.
        """
        if cz_game_id <= 0:
            raise ValueError('cz_game_id must be 1 or greater.')

        game_count = len(self.boxscores)
        if cz_game_id > game_count:
            raise ValueError(
                'cz_game_id %s is out of range; this page has %s game(s).'%(cz_game_id,game_count)
            )

        return self.boxscores[cz_game_id - 1]


In [10]:
# hide

# Build the page for draw 1 of event 5000. Constructing it issues a live HTTP
# request, so this cell needs network access.
linescore_page = LinescorePage(cz_event_id = 5000,cz_draw_id = 1)
# Not the last expression of the cell, so this line displays nothing.
linescore_page.boxscores
# Boxscore of the fifth game on the page (game ids are 1-based).
linescore_page.get_boxscore_from(cz_game_id = 5)

defaultdict(list,
            {'Connor Duhaime': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114946&profileid=9629#1',
                          'hammer': False,
                          'score': ['0', '0', '0', '0', 'X', '', '', ''],
                          'finalscore': '0'}),
             'Daryl Shane': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114941&profileid=9762#1',
                          'hammer': True,
                          'score': ['1', '1', '3', '1', 'X', '', '', ''],
                          'finalscore': '6'})})

In [11]:
# hide

# Flatten the fifth game's boxscore into one row per team.
flatten_boxscore(linescore_page.get_boxscore_from(cz_game_id = 5))

[['Connor Duhaime',
  'event.php?view=Team&eventid=5000&teamid=114946&profileid=9629#1',
  False,
  ['0', '0', '0', '0', 'X', '', '', ''],
  '0'],
 ['Daryl Shane',
  'event.php?view=Team&eventid=5000&teamid=114941&profileid=9762#1',
  True,
  ['1', '1', '3', '1', 'X', '', '', ''],
  '6']]

In [12]:
# hide 

# graveyard

In [13]:
# hide

# #exporti
# class Page(ABC):
        
#     @abstractproperty
#     def event_name(self)->str:
#         ...

# @dataclass
# class LinescorePage(Page):
#     soup : BeautifulSoup
        
#     @property
#     def event_name(self)->str:
#         return soup.find('title').string
    
# @dataclass
# class BoxscorePage(Page):
#     soup : BeautifulSoup
        
#     @property
#     def event_name(self)->str:
#         return soup.find('h3',attrs={'class':'entry-title-widget'}).string

    
# LinescorePage(soup = soup).event_name
# BoxscorePage(soup = soup).event_name

In [14]:
# hide

# #exporti
# class Page(ABC):
        
#     @abstractproperty
#     def event_name(self)->str:
#         ...
    
#     def __post__init(self)->None:
#         self.url = url
#         response = make_request_from(url = url)
#         self.soup = make_soup_from(response=response)

# @dataclass
# class LinescorePage(Page):
#     url : URL
        
#     @property
#     def event_name(self)->str:
#         return soup.find('title').string
    
#     def __post__init(self)->None:
#         super().__post__init()
    
# @dataclass
# class BoxscorePage(Page):
#     url : URL
        
#     @property
#     def event_name(self)->str:
#         return soup.find('h3',attrs={'class':'entry-title-widget'}).string
    
#     def __post__init(self)->None:
#         super().__post__init()

# LinescorePage(url = make_url(cz_event_id = 5000)).event_name
# BoxscorePage(url=make_url(cz_event_id = 5000,cz_draw_id = 5)).event_name