In [54]:
# default_exp core.scraping.base

# Base

> Provides basic scraping functionality.

In [55]:
#hide
from nbdev.showdoc import *

In [56]:
#export

from abc import ABC, abstractproperty, abstractmethod
from dataclasses import dataclass
from time import sleep
from typing import Optional, List, Any, Tuple, Union
from uuid import  uuid4

from bs4 import Tag

from czapi.core.errors import DifferentScoreLengthError,InvalidScoreError,InvalidEventError
from czapi.core.testing import TagLike
from czapi.core.utils import make_request_from, make_soup_from

In [57]:
#exporti

def generate_dict_from_table(

     table : Tag
    ,**kwargs

)->dict:
    """Helper function for returning the curling boxscore from a bs4 Tag object.

    Every ``td`` cell of ``table`` is visited in order and attached to the most
    recently seen team cell (class ``linescoreteam``).

    Parameters
    ----------
    table : Tag
        Tag-like object exposing ``find_all('td')``.
    **kwargs
        Extra key/value pairs copied into every team's entry (e.g. ``draw``).

    Returns
    -------
    dict
        Maps each team name to a dict holding ``kwargs`` plus ``href`` and
        ``score`` (list of non-empty end-score strings), and ``hammer`` /
        ``finalscore`` when the corresponding cells are present.

    Raises
    ------
    ValueError
        If ``table`` is ``None``.
    """
    if table is None:
        raise ValueError('Table tag is NoneType.')

    d = {}
    team = None

    for tag in table.find_all('td'):
        # the class attribute is compared as a single-element list
        css_class = tag.attrs.get('class')

        if css_class == ['linescoreteam']:
            team = tag.a.string
            d[team] = {**kwargs}
            d[team]['href'] = tag.a['href']
            d[team]['score'] = list() # start empty in case the game hasn't started

        elif css_class == ['linescorehammer']:
            # the flag is the inverse of the cell having text
            d[team]['hammer'] = not bool(tag.string)

        elif css_class == ['linescoreend']:
            # `.string` can be None (e.g. a cell with no text child); treat that
            # like an empty end instead of failing on None.strip()
            score = (tag.string or '').strip()
            if score:
                d[team]['score'].append(score) # skips empty strings

        elif css_class == ['linescorefinal']:
            d[team]['finalscore'] = tag.b.string.strip()

    return d

In [58]:
# hide
# Two-team linescore fixture: team1's hammer cell has text, team2's is empty.
test_table = TagLike(
    children = {'td':[
        TagLike(attrs = {'class':['linescoreteam']}, a = TagLike(href='team1_href',string='team1')),
        TagLike(attrs = {'class':['linescorehammer']}, string = 'this team does not have hammer'),
        TagLike(attrs = {'class':['linescoreend']}, string = '1'),
        TagLike(attrs = {'class':['linescoreend']}, string = '1'),
        TagLike(attrs = {'class':['linescorefinal']}, b = TagLike(string='2')),
        TagLike(attrs = {'class':['linescoreteam']}, a = TagLike(href='team2_href',string='team2')),
        TagLike(attrs = {'class':['linescorehammer']}, string = ''),
        TagLike(attrs = {'class':['linescoreend']}, string = '0'),
        TagLike(attrs = {'class':['linescoreend']}, string = '0'),
        TagLike(attrs = {'class':['linescorefinal']}, b = TagLike(string='0')),
    ]}
)

test_boxscore = generate_dict_from_table(table=test_table,draw='Draw 1')

# Assert the parsed result so this cell fails if parsing regresses,
# instead of only displaying whatever came back.
assert test_boxscore == {
    'team1': {'draw': 'Draw 1', 'href': 'team1_href', 'score': ['1', '1'], 'hammer': False, 'finalscore': '2'},
    'team2': {'draw': 'Draw 1', 'href': 'team2_href', 'score': ['0', '0'], 'hammer': True, 'finalscore': '0'},
}

test_boxscore

{'team1': {'draw': 'Draw 1',
  'href': 'team1_href',
  'score': ['1', '1'],
  'hammer': False,
  'finalscore': '2'},
 'team2': {'draw': 'Draw 1',
  'href': 'team2_href',
  'score': ['0', '0'],
  'hammer': True,
  'finalscore': '0'}}

In [59]:
# exporti
def normalize_scores(score_1 : List[str],score_2 : List[str])->Tuple[List[int],List[int]]:
    """
        Turn two per-end score lists into running score differentials.

        The first returned list starts at 0 and gains one entry per end holding
        `score_1`'s cumulative lead over `score_2`; the second list is its negation.
        The last end is never accumulated, and accumulation stops at the first
        end whose value is not an integer string (e.g. 'X' or '').

        Raises DifferentScoreLengthError when the inputs differ in length and
        InvalidScoreError when both teams score in the same end.
    """
    n_ends_1, n_ends_2 = len(score_1), len(score_2)
    if n_ends_1 != n_ends_2:
        raise DifferentScoreLengthError(score_1_len = n_ends_1,score_2_len = n_ends_2)

    relative = [0]

    # Walk every end except the last; end numbers are 1-based in error reports.
    for end_number, (raw_1, raw_2) in enumerate(zip(score_1[:-1], score_2[:-1]), start=1):
        try:
            points_1, points_2 = int(raw_1), int(raw_2)
        except ValueError:
            # non-numeric marker: nothing further to accumulate
            break

        if points_1 > 0 and points_2 > 0:
            raise InvalidScoreError(idx = end_number, val_1 = points_1,val_2=points_2)

        relative.append(relative[-1] + points_1 - points_2)

    return relative, [-diff for diff in relative]

In [60]:
# hide

# Fully numeric eight-end game: every end but the last is accumulated.
score_1 = ['0', '1', '1', '0', '0', '1', '1', '0']
score_2 = ['1', '0', '0', '4', '0', '0', '0', '0']
n_score_1, n_score_2 = normalize_scores(score_1=score_1, score_2=score_2)

assert (n_score_1, n_score_2) == (
    [0, -1, 0, 1, -3, -3, -2, -1],
    [0, 1, 0, -1, 3, 3, 2, 1],
)

n_score_1, n_score_2

([0, -1, 0, 1, -3, -3, -2, -1], [0, 1, 0, -1, 3, 3, 2, 1])

In [61]:
# hide

# Game with an 'X' end and blank ends: accumulation stops at the first non-numeric end.
score_1 = ['0', '1', '1', '0', 'X', '', '', '']
score_2 = ['1', '0', '0', '4', 'X', '', '', '']
n_score_1, n_score_2 = normalize_scores(score_1=score_1, score_2=score_2)

assert (n_score_1, n_score_2) == (
    [0, -1, 0, 1, -3],
    [0, 1, 0, -1, 3],
)

n_score_1, n_score_2

([0, -1, 0, 1, -3], [0, 1, 0, -1, 3])

In [62]:
# exporti
def get_hammer_progressions(hammer_start:bool,normalized_score:List[int])->Tuple[List[bool],List[bool]]:
    """
        Track which team holds the hammer across a normalized score list.

        Starting from `hammer_start`, the hammer is lost when the tracked team
        holds it and the normalized score rises, gained when the team lacks it
        and the score falls, and kept otherwise. The first score entry is skipped
        and comparisons start from a baseline of 0.

        Returns the tracked team's hammer flags and the element-wise negation
        for the other team.
    """
    has_hammer = hammer_start
    previous = 0
    progression = [hammer_start]

    for latest in normalized_score[1:]:
        if has_hammer:
            if latest > previous:
                has_hammer = False
        elif latest < previous:
            has_hammer = True
        previous = latest
        progression.append(has_hammer)

    return progression, [not flag for flag in progression]

In [63]:
# hide
# Uses n_score_1 from the preceding normalize_scores test cell.
hammer_1, hammer_2 = get_hammer_progressions(hammer_start=False, normalized_score=n_score_1)

assert hammer_1 == [False, True, False, False, True]
assert hammer_2 == [True, False, True, True, False]

hammer_1, hammer_2

([False, True, False, False, True], [True, False, True, True, False])

In [64]:
# exporti

@dataclass
class HalfBoxscore:
    """One team's half of a game boxscore.

    Built in generate_half_boxscore_pair from one (team name, results) item of a
    boxscore dict, so the fields other than team_name mirror that dict's keys.
    """
    team_name : str # key of the boxscore dict entry
    href : str # link target taken from the team cell
    hammer : bool # hammer flag parsed from the linescore table
    score : List[str] # non-empty end-score strings, in table order
    finalscore : str # stripped text of the final-score cell
    draw_num : int # supplied as an extra keyword when the boxscore dict is generated
    draw : str # supplied as an extra keyword when the boxscore dict is generated
    
        
    
@dataclass
class NormalizedHalfBoxscore(HalfBoxscore):
    """HalfBoxscore extended with the derived per-end progressions."""
    hammer_progression : List[bool] # this team's list from get_hammer_progressions
    normalized_score : List[int] # this team's list from normalize_scores

In [65]:
#exporti

def generate_half_boxscore_pair(boxscore:dict)->Tuple[NormalizedHalfBoxscore,NormalizedHalfBoxscore]:
    """Build the two normalized half boxscores for one game.

    Parameters
    ----------
    boxscore : dict
        Maps team name to that team's results dict; each results dict is
        unpacked into the HalfBoxscore constructor. The first two entries (in
        dict order) are used.

    Returns
    -------
    Tuple[NormalizedHalfBoxscore, NormalizedHalfBoxscore]
        The first and second team's half boxscores, each extended with its
        hammer progression and normalized score.

    Raises
    ------
    DifferentScoreLengthError, InvalidScoreError
        Propagated from normalize_scores.
    IndexError
        If ``boxscore`` holds fewer than two teams.
    """
    half_boxscores = [HalfBoxscore(team_name=team_name,**results) for team_name,results in boxscore.items()]
    first, second = half_boxscores[0], half_boxscores[1]

    normalized_first, normalized_second = normalize_scores(
         score_1 = first.score
        ,score_2 = second.score
    )

    # Only the first team's starting hammer flag is consulted; the second
    # team's progression is returned as the negation of the first's.
    hammer_first, hammer_second = get_hammer_progressions(
         hammer_start = first.hammer
        ,normalized_score = normalized_first
    )

    return (
        NormalizedHalfBoxscore(
            **vars(first)
            ,hammer_progression = hammer_first
            ,normalized_score = normalized_first
        ),
        NormalizedHalfBoxscore(
            **vars(second)
            ,hammer_progression = hammer_second
            ,normalized_score = normalized_second
        ),
    )


In [66]:
#exporti

@dataclass
class NormalizedBoxscore:
    """A game's boxscore dict together with its normalized, flattened forms.

    After construction the instance also carries:
    normalized_half_boxscore_pair -- result of generate_half_boxscore_pair,
    guid -- integer value of a fresh uuid4, shared by both rows of the game,
    flattened_normalized_boxscore -- one tuple per team: the half boxscore's
    field values followed by guid.
    """

    boxscore: dict

    def __post_init__(self)->None:
        pair = generate_half_boxscore_pair(boxscore=self.boxscore)
        game_guid = uuid4().int

        rows = []
        for half in pair:
            rows.append(tuple(vars(half).values()) + (game_guid,))

        self.normalized_half_boxscore_pair = pair
        self.guid = game_guid
        self.flattened_normalized_boxscore = rows


In [67]:
# exporti

class Page(ABC):
    """Abstract interface of a scraped page that can produce boxscores.

    Subclasses must provide every member below before they can be instantiated.
    The read-only members are declared with ``@property`` stacked on
    ``@abstractmethod``, the documented replacement for the deprecated
    ``abc.abstractproperty``.
    """

    @property
    @abstractmethod
    def url(self)->str:
        """Address of the page."""

    @property
    @abstractmethod
    def event_name(self)->str:
        """Name of the event the page belongs to."""

    @property
    @abstractmethod
    def event_date(self)->str:
        """Date text of the event."""

    @property
    @abstractmethod
    def draw(self)->str:
        """Label of the draw shown on the page."""

    @property
    @abstractmethod
    def draw_num(self)->int:
        """Number of the draw shown on the page."""

    @property
    @abstractmethod
    def tables(self)->List[Tag]:
        """Table tags that hold the page's linescores."""

    @abstractmethod
    def generate_boxscores(self)->List[dict]:
        """Return one boxscore dict per game on the page."""

In [68]:
# export
@dataclass
class LinescorePage(Page):
    """
        Represents a CurlingZone linescore page.
        Example page here: https://curlingzone.com/event.php?eventid=7795&view=Scores&showdrawid=25#1

        Creating an instance immediately requests `url`, parses the response into
        `soup`, and builds `boxscores` and `normalized_boxscores` from it.

        cz_event_id : value placed in the `eventid` query parameter.
        cz_draw_id : value placed in the `showdrawid` query parameter.
    """
    cz_event_id : int
    cz_draw_id: int


    def __post_init__(self)->None:
        response = make_request_from(url = self.url)
        self.soup = make_soup_from(response=response)
        self.boxscores = self.generate_boxscores()
        self.normalized_boxscores = self.generate_normalized_boxscores()


    @property
    def url(self)->str:
        """Linescore URL for this event and draw."""
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)

    @property
    def event_name(self)->str:
        """Text of the first `h3` with class `entry-title-widget`."""
        return self.soup.find('h3',attrs={'class':'entry-title-widget'}).string

    @property
    def event_date(self)->str:
        """Text of the first `div` with class `badge-widget`."""
        return self.soup.find('div',attrs={'class':'badge-widget'}).string

    @property
    def draw_num(self)->int:
        """1-based position of the selected `option` among the first `select`'s options."""
        options = self.soup.find(name='select').find_all(name='option')
        selected = self.soup.find(name='option',attrs={'selected':'selected'})
        return options.index(selected)+1

    @property
    def draw(self)->str:
        """Text of the selected `option`."""
        return self.soup.find(name='option',attrs={'selected':'selected'}).string

    @property
    def tables(self)->List[Tag]:
        """All `table` tags with class `linescorebox`."""
        return self.soup.find_all(name = 'table',attrs={'class':'linescorebox'})

    def generate_boxscores(self)->List[dict]:
        """Return one boxscore dict per linescore table, tagged with the draw and draw number."""
        return [generate_dict_from_table(table=table,draw=self.draw,draw_num=self.draw_num) for table in self.tables]

    def generate_normalized_boxscores(self)->List[NormalizedBoxscore]:
        """Wrap every entry of `boxscores` in a NormalizedBoxscore."""
        return [NormalizedBoxscore(boxscore=boxscore) for boxscore in self.boxscores]

    @staticmethod
    def _index_from_game_id(cz_game_id : int, game_count : int)->int:
        """Validate a 1-based game id against `game_count` and return its 0-based index."""
        if cz_game_id <= 0:
            raise ValueError('cz_game_id must be 1 or greater.')

        if cz_game_id > game_count:
            raise ValueError('cz_game_id %s is out of range; this page has %s game(s).'%(cz_game_id,game_count))

        return cz_game_id - 1

    def get_boxscore_from(self,cz_game_id : int)->dict:
        """Return the boxscore dict of the `cz_game_id`-th game (1-based).

        Raises ValueError when `cz_game_id` is below 1 or beyond the number of games.
        """
        return self.boxscores[self._index_from_game_id(cz_game_id, len(self.boxscores))]

    def get_normalized_boxscore_from(self,cz_game_id : int)->NormalizedBoxscore:
        """Return the NormalizedBoxscore of the `cz_game_id`-th game (1-based).

        Raises ValueError when `cz_game_id` is below 1 or beyond the number of games.
        """
        return self.normalized_boxscores[self._index_from_game_id(cz_game_id, len(self.normalized_boxscores))]


In [69]:
# exporti
class BadLinescorePage(Page):
    """Hand-populated Page used to feed crafted boxscores into the error-handling checks.

    Nothing is scraped: every value is whatever the caller passed in.
    """

    def __init__(
        self,
        url:str=None,
        event_name:str=None,
        event_date:str=None,
        draw:str=None,
        draw_num:int=None,
        tables:List[TagLike]=None,
        boxscores:List[dict]=None,
        normalized_boxscores:List[NormalizedBoxscore]=None,
    ):
        # These instance attributes share names with the methods defined below,
        # so reading them on an instance yields the supplied values.
        self.url = url
        self.event_name = event_name
        self.event_date = event_date
        self.draw = draw
        self.draw_num = draw_num
        self.tables = tables
        self.boxscores = boxscores
        self.normalized_boxscores = normalized_boxscores

    # The definitions below give the class a concrete member for each abstract
    # name declared on Page.

    def url(self)->str:
        return ''

    def event_name(self)->str:
        return ''

    def event_date(self)->str:
        return ''

    def draw_num(self)->int:
        return 1

    def draw(self)->str:
        return ''

    def tables(self)->List[Tag]:
        return None

    def generate_boxscores(self)->List[dict]:
        """Return the boxscores supplied at construction."""
        return self.boxscores
    

In [70]:
# hide

# testing error handling, mismatched linescore lengths

# Team 1 has three ends recorded, Team 2 only two.
bad_linescore_page = BadLinescorePage(boxscores=[{
    'Team 1': {
         'href':''
        ,'hammer':True
        ,'score':['1','2','3']
        ,'finalscore':'6'
        ,'draw' : 'draw 1'
        ,'draw_num' : 1
    },
    'Team 2': {
         'href':''
        ,'hammer':True
        ,'score':['0','X']
        ,'finalscore':'0'
        ,'draw':'draw 1'
        ,'draw_num':1
    },
}])

try:
    generate_half_boxscore_pair(bad_linescore_page.boxscores[0])
except DifferentScoreLengthError as e:
    print(e.message)
else:
    # without this branch the cell would pass silently if no error were raised
    raise AssertionError('DifferentScoreLengthError was not raised for mismatched score lengths.')

Input scores have length 3 and length 2 and do not match.


In [71]:
# hide

# testing error handling, invalid score

# Both teams are credited with a point in end 2.
bad_linescore_page = BadLinescorePage(boxscores=[{
    'Team 1': {
         'href':''
        ,'hammer':True
        ,'score':['1','1','3']
        ,'finalscore':'6'
        ,'draw' : 'draw 1'
        ,'draw_num' : 1
    },
    'Team 2': {
         'href':''
        ,'hammer':True
        ,'score':['0','1','0']
        ,'finalscore':'1'
        ,'draw':'draw 1'
        ,'draw_num':1
    },
}])

try:
    generate_half_boxscore_pair(bad_linescore_page.boxscores[0])
except InvalidScoreError as e:
    print(e.message)
else:
    # without this branch the cell would pass silently if no error were raised
    raise AssertionError('InvalidScoreError was not raised when both teams scored in the same end.')

Input end scores of 1 and 1 for end 2 are both greater than 0 and the end score is not valid.


In [72]:
# hide

# Network call: constructing the page requests and parses the linescore URL.
linescore_page = LinescorePage(cz_event_id = 5000,cz_draw_id = 1)
boxscore = linescore_page.get_boxscore_from(cz_game_id = 5)
boxscore

{'Connor Duhaime': {'draw': 'Draw: 1',
  'draw_num': 1,
  'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114946&profileid=9629#1',
  'score': ['0', '0', '0', '0', 'X'],
  'hammer': False,
  'finalscore': '0'},
 'Daryl Shane': {'draw': 'Draw: 1',
  'draw_num': 1,
  'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114941&profileid=9762#1',
  'score': ['1', '1', '3', '1', 'X'],
  'hammer': True,
  'finalscore': '6'}}

In [73]:
#exporti

# One row per team per game; positions line up with game_data_column_headers.
GameData = List[Tuple[str,str,bool,List[str],str,int,str,List[bool],List[int],int]]

def _get_flat_boxscores_from(linescore_page:LinescorePage)->GameData:
    """Collect every flattened row of every normalized boxscore on the page.

    Each row is rebuilt as a tuple of its first ten items.
    """
    rows = []
    for normalized_boxscore in linescore_page.normalized_boxscores:
        for row in normalized_boxscore.flattened_normalized_boxscore:
            rows.append(tuple(row[position] for position in range(10)))
    return rows

In [74]:
#export

# Column names for flat boxscore rows, in the same order as the Boxscore fields.
game_data_column_headers = (
    'team_name',
    'href',
    'hammer_start',
    'score',
    'final_score',
    'draw_num',
    'draw',
    'hammer_progression',
    'relative_score',
    'guid',
)

In [75]:
#export

def get_flat_boxscores_from(cz_event_id:int,cz_draw_id:int)->GameData:
    """Returns a list of tuples of boxscore information on a linescore page.

    A LinescorePage is created for the given event and draw ids (which requests
    the page) and its normalized boxscores are flattened into rows.
    """
    return _get_flat_boxscores_from(
        linescore_page = LinescorePage(cz_event_id = cz_event_id,cz_draw_id = cz_draw_id)
    )

In [76]:
# hide
linescore_page.get_normalized_boxscore_from(cz_game_id = 5).flattened_normalized_boxscore

[('Connor Duhaime',
  'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114946&profileid=9629#1',
  False,
  ['0', '0', '0', '0', 'X'],
  '0',
  1,
  'Draw: 1',
  [False, True, True, True, True],
  [0, -1, -2, -5, -6],
  65553062928611767012125523645969215508),
 ('Daryl Shane',
  'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114941&profileid=9762#1',
  True,
  ['1', '1', '3', '1', 'X'],
  '6',
  1,
  'Draw: 1',
  [True, False, False, False, False],
  [0, 1, 2, 5, 6],
  65553062928611767012125523645969215508)]

In [77]:
# hide
get_flat_boxscores_from(cz_event_id = 5000, cz_draw_id = 1)

[('Rob Retchless',
  'event.php?view=Team&eventid=5000&teamid=114936&profileid=9625#1',
  True,
  ['2', '0', '0', '0', 'X'],
  '2',
  1,
  'Draw: 1',
  [True, False, True, True, True],
  [0, 2, 1, -2, -5],
  296934272838794961332641294263686844601),
 ('Jordan Chandler',
  'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627#1',
  False,
  ['0', '1', '3', '3', 'X'],
  '7',
  1,
  'Draw: 1',
  [False, True, False, False, False],
  [0, -2, -1, 2, 5],
  296934272838794961332641294263686844601),
 ('Charlie Robert',
  'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627&eventid=5000&teamid=114939&profileid=10028#1',
  True,
  ['1', '0', '0', '0', '1', 'X', 'X'],
  '2',
  1,
  'Draw: 1',
  [True, False, True, True, True, False],
  [0, 1, -1, -2, -3, -2],
  166732447483580069111341860035710052625),
 ('Brent Ross',
  'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627&eventid=5000&teamid=114949&profileid=9887#1',
  False,
  ['0', '2', '1', '1', '0', 'X', 'X'],
  '

In [78]:
game_data_column_headers

('team_name',
 'href',
 'hammer_start',
 'score',
 'final_score',
 'draw_num',
 'draw',
 'hammer_progression',
 'relative_score',
 'guid')

In [79]:
GameData

typing.List[typing.Tuple[str, str, bool, typing.List[str], str, int, str, typing.List[bool], typing.List[int], int]]

In [80]:
#export

@dataclass
class Boxscore:
    """One team's row of a game, as a named record.

    Field order matches game_data_column_headers; Event.get_flat_boxscores
    builds instances positionally from flat boxscore rows.
    """
    team_name:str
    href:str
    hammer_start:bool
    score:List[str]
    final_score:str
    draw_num: int
    draw: str
    hammer_progression: List[bool]
    relative_score: List[int]
    guid:int # shared by the two rows that belong to the same game

In [81]:
#export


@dataclass
class Event:
    """All linescore pages of one CurlingZone event.

    Creating an instance requests the event's first draw page, checks that it
    is a valid event, then creates a LinescorePage for every draw and stores
    them in `pages`.

    cz_event_id : value placed in the `eventid` query parameter.
    delay : seconds to sleep before each draw page is requested.
    verbose : when True, print a progress line per draw.

    Raises InvalidEventError when the fetched page has no `select` element.
    """
    cz_event_id : int
    delay : int =0
    verbose: bool = False


    def __post_init__(self)->None:
        response = make_request_from(url = self.url)
        self.soup = make_soup_from(response=response)

        if not self.is_valid:
            raise InvalidEventError(cz_event_id = self.cz_event_id)

        pages = list()
        for draw_id in range(1, self.draws + 1):
            # Sleep before each draw request so that every pair of consecutive
            # requests, including the event request above and the first draw,
            # is separated by `delay`, with no wasted sleep after the last draw.
            sleep(self.delay)
            if self.verbose:
                print('Scraping draw %s.'%(draw_id))
            pages.append(LinescorePage(cz_event_id = self.cz_event_id,cz_draw_id = draw_id))

        self.pages = pages


    @property
    def is_valid(self)->bool:
        """True when the fetched page contains a `select` element."""
        return self.soup.find(name='select') is not None

    @property
    def url(self)->str:
        """URL of the event's first draw page."""
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=1#1'%(self.cz_event_id)

    @property
    def draws(self)->int:
        """Number of `option` entries in the page's first `select`."""
        return len(self.soup.find(name='select').find_all(name='option'))


    def get_flat_boxscores(self,flat=False)->Union[GameData,List[Boxscore]]:
        """Return every boxscore row of every page, in page order.

        With `flat=True` the rows are returned as plain tuples; otherwise each
        row is wrapped in a Boxscore.
        """
        rows = [row for linescore_page in self.pages for row in _get_flat_boxscores_from(linescore_page = linescore_page)]
        if flat:
            return rows
        return [Boxscore(*row) for row in rows]
   
       

In [82]:
#hide
# Network call: scrapes every draw page of event 5000, with delay=2 seconds per draw.
event = Event(cz_event_id = 5000,verbose=True,delay=2)
event

Scraping draw 1.
Scraping draw 2.
Scraping draw 3.
Scraping draw 4.
Scraping draw 5.
Scraping draw 6.
Scraping draw 7.
Scraping draw 8.
Scraping draw 9.
Scraping draw 10.
Scraping draw 11.
Scraping draw 12.
Scraping draw 13.
Scraping draw 14.
Scraping draw 15.


Event(cz_event_id=5000, delay=2, verbose=True)

In [83]:
# hide
event.pages

[LinescorePage(cz_event_id=5000, cz_draw_id=1),
 LinescorePage(cz_event_id=5000, cz_draw_id=2),
 LinescorePage(cz_event_id=5000, cz_draw_id=3),
 LinescorePage(cz_event_id=5000, cz_draw_id=4),
 LinescorePage(cz_event_id=5000, cz_draw_id=5),
 LinescorePage(cz_event_id=5000, cz_draw_id=6),
 LinescorePage(cz_event_id=5000, cz_draw_id=7),
 LinescorePage(cz_event_id=5000, cz_draw_id=8),
 LinescorePage(cz_event_id=5000, cz_draw_id=9),
 LinescorePage(cz_event_id=5000, cz_draw_id=10),
 LinescorePage(cz_event_id=5000, cz_draw_id=11),
 LinescorePage(cz_event_id=5000, cz_draw_id=12),
 LinescorePage(cz_event_id=5000, cz_draw_id=13),
 LinescorePage(cz_event_id=5000, cz_draw_id=14),
 LinescorePage(cz_event_id=5000, cz_draw_id=15)]

In [84]:
# hide
event.pages[0].boxscores

[{'Rob Retchless': {'draw': 'Draw: 1',
   'draw_num': 1,
   'href': 'event.php?view=Team&eventid=5000&teamid=114936&profileid=9625#1',
   'score': ['2', '0', '0', '0', 'X'],
   'hammer': True,
   'finalscore': '2'},
  'Jordan Chandler': {'draw': 'Draw: 1',
   'draw_num': 1,
   'href': 'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627#1',
   'score': ['0', '1', '3', '3', 'X'],
   'hammer': False,
   'finalscore': '7'}},
 {'Charlie Robert': {'draw': 'Draw: 1',
   'draw_num': 1,
   'href': 'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627&eventid=5000&teamid=114939&profileid=10028#1',
   'score': ['1', '0', '0', '0', '1', 'X', 'X'],
   'hammer': True,
   'finalscore': '2'},
  'Brent Ross': {'draw': 'Draw: 1',
   'draw_num': 1,
   'href': 'event.php?view=Team&eventid=5000&teamid=114942&profileid=9627&eventid=5000&teamid=114949&profileid=9887#1',
   'score': ['0', '2', '1', '1', '0', 'X', 'X'],
   'hammer': False,
   'finalscore': '4'}},
 {'John Willsey': {'draw': 

In [85]:
# hide
event.pages[-1].boxscores

[{'ShopWoodstock.com': {'draw': 'Draw: CF',
   'draw_num': 15,
   'href': 'event.php?view=Team&eventid=5000&teamid=114952&profileid=9703#1',
   'score': ['0', '0', '0', '0', '0', '0', '0', '0'],
   'hammer': False,
   'finalscore': '0'},
  'Richard Krell': {'draw': 'Draw: CF',
   'draw_num': 15,
   'href': 'event.php?view=Team&eventid=5000&teamid=116174&profileid=9670#1',
   'score': ['0', '0', '1', '0', '0', '1', '0', '1'],
   'hammer': True,
   'finalscore': '3'}}]

In [86]:
# hide
event.get_flat_boxscores()

[Boxscore(team_name='Rob Retchless', href='event.php?view=Team&eventid=5000&teamid=114936&profileid=9625#1', hammer_start=True, score=['2', '0', '0', '0', 'X'], final_score='2', draw_num=1, draw='Draw: 1', hammer_progression=[True, False, True, True, True], relative_score=[0, 2, 1, -2, -5], guid=109899279346624616126993479598011596735),
 Boxscore(team_name='Jordan Chandler', href='event.php?view=Team&eventid=5000&teamid=114942&profileid=9627#1', hammer_start=False, score=['0', '1', '3', '3', 'X'], final_score='7', draw_num=1, draw='Draw: 1', hammer_progression=[False, True, False, False, False], relative_score=[0, -2, -1, 2, 5], guid=109899279346624616126993479598011596735),
 Boxscore(team_name='Charlie Robert', href='event.php?view=Team&eventid=5000&teamid=114942&profileid=9627&eventid=5000&teamid=114939&profileid=10028#1', hammer_start=True, score=['1', '0', '0', '0', '1', 'X', 'X'], final_score='2', draw_num=1, draw='Draw: 1', hammer_progression=[True, False, True, True, True, False