In [1]:
# default_exp core.scraping.base

# Base

> Provides basic scraping functionality.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export

import requests 
from requests.models import Response
from bs4 import BeautifulSoup,Tag
from abc import ABC, abstractproperty, abstractmethod
from dataclasses import dataclass
from typing import Optional, List, Any, Tuple
from collections import defaultdict
from uuid import  uuid4

In [4]:
#exporti
    
def make_request_from(

     *
    ,url : str
    ,**kwargs

)->Response:
    return requests.get(url=url,**kwargs)
        

def make_soup_from(
    
     *
    ,response : Response
    ,**kwargs

)->BeautifulSoup:
    """ Returns a Beautifulsoup object for the passed URL."""
    
    return BeautifulSoup(response.content,features='html.parser',**kwargs)

In [5]:
#exporti

def generate_dict_from_table(

    table : Tag

)->defaultdict:
    """Helper function for returning the curling boxscore from a bs4 Tag object."""
    d = defaultdict(list)
    team = None
    
    # TODO : add error handling for when no table is passed / None
    
    if table is None:
        raise ValueError('Table tag is NoneType.')
    
    # loop through tags in table
    for tag in table.find_all('td'):
        if tag.attrs.get('class') == ['linescoreteam']:
            team = tag.a.string
            d[team] = defaultdict(list)
            d[team]['href'] = tag.a['href']
        elif tag.attrs.get('class') == ['linescorehammer']:
            d[team]['hammer'] = not bool(tag.string) # opposite for some reason
        elif tag.attrs.get('class') == ['linescoreend']:
            d[team]['score'].append(tag.string.strip())
        elif tag.attrs.get('class') == ['linescorefinal']:
            d[team]['finalscore'] = tag.b.string.strip()
        
    return d

In [6]:
# exporti
def normalize_scores(score_1 : List[str],score_2 : List[str])->List[int]:
    if len(score_1) != len(score_2):
        raise ValueError('') # TODO
        
    end_1 = []
    current_diff = 0
    
    for i in range(len(score_1)):
        try:
            val_1 = int(score_1[i])
            val_2 = int(score_2[i])
            
        except ValueError:
            break
            
        if val_1 > 0 and val_2 > 0:
            raise ValueError('') # TODO
        
        new_current_diff = current_diff + val_1 - val_2
        end_1.append(new_current_diff)
        current_diff = new_current_diff
        
    return end_1, list(map(lambda x: -1*x,end_1))




In [7]:
# hide

score_1, score_2 =  ['0', '0', '0', '0', 'X', '', '', ''], ['1', '1', '3', '1', 'X', '', '', '']
n_score_1,n_score_2 = normalize_scores(score_1 = score_1,score_2 =score_2)
n_score_1,n_score_2

([-1, -2, -5, -6], [1, 2, 5, 6])

In [8]:
# exporti
def get_hammer_progressions(hammer_start:bool,normalized_score:List[int])->List[bool]:
    current_hammer = hammer_start
    hammer_progression = [hammer_start]
    for i in range(1,len(normalized_score)):
        if current_hammer and (normalized_score[i] > normalized_score[i-1]):
            current_hammer = False
        if not current_hammer and (normalized_score[i] < normalized_score[i-1]):
            current_hammer = True
        hammer_progression.append(current_hammer)
        
    return hammer_progression, list(map(lambda x: not x, hammer_progression))

In [9]:
# hide
get_hammer_progressions(hammer_start=False,normalized_score=n_score_1)

([False, True, True, True], [True, False, False, False])

In [10]:
# exporti

@dataclass
class HalfBoxscore:
    team_name : str
    href : str
    hammer : bool
    score : List[str]
    finalscore : str
        
    
@dataclass
class NormalizedHalfBoxscore(HalfBoxscore):
    hammer_progression : List[bool]
    normalized_score : List[int]

In [11]:
#exporti

def generate_half_boxscore_pair(boxscore:defaultdict)->Tuple[NormalizedHalfBoxscore]:
    half_boxscores = [HalfBoxscore(team_name=team_name,**results) for team_name,results in boxscore.items()]
    
    normalized_scores = normalize_scores(
    
         score_1 = half_boxscores[0].score
        ,score_2 = half_boxscores[1].score
    
    )
    
    hammer_progressions = get_hammer_progressions(
    
         hammer_start = half_boxscores[0].hammer
        ,normalized_score = normalized_scores[0]
    
    )
    
    return NormalizedHalfBoxscore(**half_boxscores[0].__dict__,hammer_progression = hammer_progressions[0],normalized_score = normalized_scores[0]),NormalizedHalfBoxscore(**half_boxscores[1].__dict__,hammer_progression = hammer_progressions[1],normalized_score = normalized_scores[1] )


In [12]:
#exporti

@dataclass
class NormalizedBoxscore:

    boxscore: defaultdict
    
    def __post_init__(self)->None:
        self.normalized_half_boxscore_pair = generate_half_boxscore_pair(boxscore=self.boxscore)
        self.guid = uuid4().int
        self.flattened_normalized_boxscore = [list(half_score.__dict__.values())+[self.guid] for half_score in self.normalized_half_boxscore_pair]
        
#     @property
#     def flattened_normalized_boxscore(self)->List[List[Any]]:
#         return [list(half_score.__dict__.values())+[self.guid] for half_score in self.normalized_half_boxscore_pair]


In [13]:
# export


class Page(ABC):
    
    def __post_init__(self)->None:
        response = make_request_from(url = self.url)
        self.soup = make_soup_from(response=response)
        self.boxscores = self.generate_boxscores()
        self.normalized_boxscores = self.generate_normalized_boxscores()
    
    @abstractproperty
    def url(self)->str:
        ...
        
    @abstractproperty
    def event_name(self)->str:
        ...
        
    @abstractproperty
    def event_date(self)->str:
        ...
        
    @abstractproperty
    def draw(self)->str:
        ...
        
    @abstractproperty
    def tables(self)->List[Tag]:
        ...
        
    @abstractmethod
    def generate_boxscores(self)->List[dict]:
        ...
        
    
@dataclass
class LinescorePage(Page):
    cz_event_id : int
    cz_draw_id: int
    
    
    @property
    def url(self)->str:
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)
    
    @property
    def event_name(self)->str:
        return self.soup.find('h3',attrs={'class':'entry-title-widget'}).string
    
    @property
    def event_date(self)->str:
        return self.soup.find('div',attrs={'class':'badge-widget'}).string
    
    @property
    def draw(self)->str:
        return self.soup.find(name='option',attrs={'selected':'selected'}).string
    
    @property
    def tables(self)->List[Tag]:
        return self.soup.find_all(name = 'table',attrs={'class':'linescorebox'})
    
    def generate_boxscores(self)->List[defaultdict]:
        return [generate_dict_from_table(table=table) for table in self.tables]
    
    def generate_normalized_boxscores(self)->List[NormalizedBoxscore]:
        return [NormalizedBoxscore(boxscore=boxscore) for boxscore in self.boxscores]
    
    def get_boxscore_from(self,cz_game_id : int)->defaultdict:
        if cz_game_id <= 0:
            raise ValueError('cz_game_id must be 1 or greater.')
            
        if cz_game_id > len(self.boxscores):
            raise ValueError('') # TODO
            
        return self.boxscores[cz_game_id - 1]
    
    # repeated code but will re-factor later
    def get_normalized_boxscore_from(self,cz_game_id : int)->NormalizedBoxscore:
        if cz_game_id <= 0:
            raise ValueError('cz_game_id must be 1 or greater.')
            
        if cz_game_id > len(self.normalized_boxscores):
            raise ValueError('') # TODO
            
        return self.normalized_boxscores[cz_game_id - 1]


In [14]:
# hide

linescore_page = LinescorePage(cz_event_id = 5000,cz_draw_id = 1)
linescore_page.boxscores
boxscore = linescore_page.get_boxscore_from(cz_game_id = 5)
boxscore

defaultdict(list,
            {'Connor Duhaime': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114946&profileid=9629#1',
                          'hammer': False,
                          'score': ['0', '0', '0', '0', 'X', '', '', ''],
                          'finalscore': '0'}),
             'Daryl Shane': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114941&profileid=9762#1',
                          'hammer': True,
                          'score': ['1', '1', '3', '1', 'X', '', '', ''],
                          'finalscore': '6'})})

In [15]:
#exporti
def get_all_normalized_flattend_boxscores(linescore_page):
    flattened_boxscores = [boxscore.flattened_normalized_boxscore for boxscore in linescore_page.normalized_boxscores]
    return [[row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7]] for f in flattened_boxscores for row in f]



In [16]:
# hide

import pandas as pd
df = pd.DataFrame(data=get_all_normalized_flattend_boxscores(linescore_page))
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,Rob Retchless,event.php?view=Team&eventid=5000&teamid=114936...,True,"[2, 0, 0, 0, X, , , ]",2,"[True, True, True, True]","[2, 1, -2, -5]",170654087633026943912350084448178970820
1,Jordan Chandler,event.php?view=Team&eventid=5000&teamid=114942...,False,"[0, 1, 3, 3, X, , , ]",7,"[False, False, False, False]","[-2, -1, 2, 5]",170654087633026943912350084448178970820
2,Charlie Robert,event.php?view=Team&eventid=5000&teamid=114942...,True,"[1, 0, 0, 0, 1, X, X, ]",2,"[True, True, True, True, False]","[1, -1, -2, -3, -2]",204228979359787017598923357027711107376
3,Brent Ross,event.php?view=Team&eventid=5000&teamid=114942...,False,"[0, 2, 1, 1, 0, X, X, ]",4,"[False, False, False, False, True]","[-1, 1, 2, 3, 2]",204228979359787017598923357027711107376
4,John Willsey,event.php?view=Team&eventid=5000&teamid=114949...,False,"[0, 0, 0, 1, 0, 3, 0, 1, X]",5,"[False, True, True, False, False, False, True,...","[0, -1, -1, 0, 0, 3, 2, 3]",185725973966920893367803831433507868076
5,ShopWoodstock.com,event.php?view=Team&eventid=5000&teamid=114949...,True,"[0, 1, 0, 0, 0, 0, 1, 0, X]",2,"[True, False, False, True, True, True, False, ...","[0, 1, 1, 0, 0, -3, -2, -3]",185725973966920893367803831433507868076
6,Wayne Tuck Jr.,event.php?view=Team&eventid=5000&teamid=114952...,False,"[0, 0, 2, 2, 0, 0, 2, X]",6,"[False, True, False, False, False, True, False]","[0, -2, 0, 2, 2, 0, 2]",214944184407389351658188434654665459869
7,Charlie Richard,event.php?view=Team&eventid=5000&teamid=114952...,True,"[0, 2, 0, 0, 0, 2, 0, X]",4,"[True, False, True, True, True, False, True]","[0, 2, 0, -2, -2, 0, -2]",214944184407389351658188434654665459869
8,Connor Duhaime,event.php?view=Team&eventid=5000&teamid=114958...,False,"[0, 0, 0, 0, X, , , ]",0,"[False, True, True, True]","[-1, -2, -5, -6]",77281843292219585378295656623640214057
9,Daryl Shane,event.php?view=Team&eventid=5000&teamid=114958...,True,"[1, 1, 3, 1, X, , , ]",6,"[True, False, False, False]","[1, 2, 5, 6]",77281843292219585378295656623640214057


In [17]:
# hide
linescore_page.get_normalized_boxscore_from(cz_game_id = 5)

NormalizedBoxscore(boxscore=defaultdict(<class 'list'>, {'Connor Duhaime': defaultdict(<class 'list'>, {'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114946&profileid=9629#1', 'hammer': False, 'score': ['0', '0', '0', '0', 'X', '', '', ''], 'finalscore': '0'}), 'Daryl Shane': defaultdict(<class 'list'>, {'href': 'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114941&profileid=9762#1', 'hammer': True, 'score': ['1', '1', '3', '1', 'X', '', '', ''], 'finalscore': '6'})}))

In [18]:
# hide
linescore_page.get_normalized_boxscore_from(cz_game_id = 5).flattened_normalized_boxscore

[['Connor Duhaime',
  'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114946&profileid=9629#1',
  False,
  ['0', '0', '0', '0', 'X', '', '', ''],
  '0',
  [False, True, True, True],
  [-1, -2, -5, -6],
  77281843292219585378295656623640214057],
 ['Daryl Shane',
  'event.php?view=Team&eventid=5000&teamid=114958&profileid=0&eventid=5000&teamid=114941&profileid=9762#1',
  True,
  ['1', '1', '3', '1', 'X', '', '', ''],
  '6',
  [True, False, False, False],
  [1, 2, 5, 6],
  77281843292219585378295656623640214057]]