In [1]:
# default_exp core.scraping.base

# Base

> Provides basic scraping functionality.

In [2]:
#hide
from nbdev.showdoc import *

In [3]:
#export

import requests 
from requests.models import Response
from bs4 import BeautifulSoup,Tag
from abc import ABC, abstractproperty, abstractmethod
from dataclasses import dataclass
from typing import Optional, List, Any
from collections import defaultdict
from hashlib import sha256

In [4]:
#exporti
    
def make_request_from(

     *
    ,url : str
    ,**kwargs

)->Response:
    return requests.get(url=url,**kwargs)
        

def make_soup_from(
    
     *
    ,response : Response
    ,**kwargs

)->BeautifulSoup:
    """ Returns a Beautifulsoup object for the passed URL."""
    
    return BeautifulSoup(response.content,features='html.parser',**kwargs)

In [5]:
#exporti

def generate_dict_from_table(

    table : Tag

)->defaultdict:
    """Helper function for returning the curling boxscore from a bs4 Tag object."""
    d = defaultdict(list)
    team = None
    
    # TODO : add error handling for when no table is passed / None
    
    if table is None:
        raise ValueError('Table tag is NoneType.')
    
    # loop through tags in table
    for tag in table.find_all('td'):
        if tag.attrs.get('class') == ['linescoreteam']:
            team = tag.a.string
            d[team] = defaultdict(list)
            d[team]['href'] = tag.a['href']
        elif tag.attrs.get('class') == ['linescorehammer']:
            d[team]['hammer'] = not bool(tag.string) # opposite for some reason
        elif tag.attrs.get('class') == ['linescoreend']:
            d[team]['score'].append(tag.string.strip())
        elif tag.attrs.get('class') == ['linescorefinal']:
            d[team]['finalscore'] = tag.b.string.strip()
        
    return d

In [6]:
#exporti

def hash_obj(

     obj : Any
    ,hash_type : str = 'sha256'
    ,encoding : str='utf-8'
)->str:
    """Hashes an object according to the passed hash_type and encoding."""
    hash_type = hash_type.lower()
    encoding = encoding.lower()
    
    if hash_type == 'sha256':
        hash_func = sha256
    else:
        raise NotImplementedError("Hash function %s not supported."%hash_type)
        
    return hash_func(str(obj).encode(encoding)).hexdigest().lower()

In [7]:
# export


class Page(ABC):
    
    def __post_init__(self)->None:
        response = make_request_from(url = self.url)
        self.soup = make_soup_from(response=response)
        self.boxscores = self.generate_boxscores()
    
    @abstractproperty
    def url(self)->str:
        ...
        
    @abstractproperty
    def event_name(self)->str:
        ...
        
    @abstractproperty
    def event_date(self)->str:
        ...
        
    @abstractproperty
    def draw(self)->str:
        ...
        
    @abstractproperty
    def tables(self)->List[Tag]:
        ...
        
    @abstractmethod
    def generate_boxscores(self)->List[dict]:
        ...
        
    
@dataclass
class LinescorePage(Page):
    cz_event_id : int
    cz_draw_id: int
    
    
    @property
    def url(self)->str:
        return 'https://curlingzone.com/event.php?eventid=%s&view=Scores&showdrawid=%s#1'%(self.cz_event_id,self.cz_draw_id)
    
    @property
    def event_name(self)->str:
        return self.soup.find('h3',attrs={'class':'entry-title-widget'}).string
    
    @property
    def event_date(self)->str:
        return self.soup.find('div',attrs={'class':'badge-widget'}).string
    
    @property
    def draw(self)->str:
        return self.soup.find(name='option',attrs={'selected':'selected'}).string
    
    @property
    def tables(self)->List[Tag]:
        return self.soup.find_all(name = 'table',attrs={'class':'linescorebox'})
    
    def generate_boxscores(self)->List[defaultdict]:
        return [generate_dict_from_table(table=table) for table in self.tables]
    
    def get_boxscore_from(self,cz_game_id : int)->defaultdict:
        if cz_game_id <= 0:
            raise ValueError('cz_game_id must be 1 or greater.')
            
        if cz_game_id > len(self.boxscores):
            raise ValueError('') # TODO
            
        return self.boxscores[cz_game_id - 1]


In [8]:
# hide

linescore_page = LinescorePage(cz_event_id = 5000,cz_draw_id = 1)
linescore_page.boxscores
linescore_page.get_boxscore_from(cz_game_id = 5)

defaultdict(list,
            {'Connor Duhaime': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114946&profileid=9629#1',
                          'hammer': False,
                          'score': ['0', '0', '0', '0', 'X', '', '', ''],
                          'finalscore': '0'}),
             'Daryl Shane': defaultdict(list,
                         {'href': 'event.php?view=Team&eventid=5000&teamid=114941&profileid=9762#1',
                          'hammer': True,
                          'score': ['1', '1', '3', '1', 'X', '', '', ''],
                          'finalscore': '6'})})

In [9]:
# exporti

@dataclass
class HalfBoxscore:
    href : str
    hammer : bool
    score : List[str]
    finalscore : str
        
@dataclass
class NormalizedHalfBoxscore:
    href : str
    hammer_progression : List[bool]
    normalized_score : List[int]

In [10]:
# hide

boxscore = linescore_page.get_boxscore_from(cz_game_id = 3)
HalfBoxscore(**boxscore['John Willsey'])

HalfBoxscore(href='event.php?view=Team&eventid=5000&teamid=114940&profileid=9707#1', hammer=False, score=['0', '0', '0', '1', '0', '3', '0', '1', 'X'], finalscore='5')

In [11]:
# exporti
def normalize_score(score_1 : List[str],score_2 : List[str])->List[int]:
    if len(score_1) != len(score_2):
        raise ValueError('') # TODO
        
    end_1 = list()
    current_diff = 0
    
    for i in range(len(score_1)):
        try:
            val_1 = int(score_1[i])
            val_2 = int(score_2[i])
            
        except ValueError:
            break
            
        if val_1 > 0 and val_2 > 0:
            raise ValueError('') # TODO
        
        new_current_diff = current_diff + val_1 - val_2
        end_1.append(new_current_diff)
        current_diff = new_current_diff
        
    return end_1

In [12]:
# hide
score_1 = ['0','1','0','1']
score_2 = ['0','0','2','0']
new_score = normalize_score(score_1,score_2) 
new_score

[0, 1, -1, 0]

In [13]:
#hide

list(map(lambda x: -1*x,new_score))

[0, -1, 1, 0]

In [14]:
# exporti
def make_hammer_progression(hammer_start:bool,normalized_score:List[int])->List[bool]:
    current_hammer = hammer_start
    hammer_progression = [current_hammer]
    for i in range(len(normalized_score)-1):
        if current_hammer and normalized_score[i] > 0:
            current_hammer = False
        if not current_hammer and normalized_score[i] < 0:
            current_hammer = True
        hammer_progression.append(current_hammer)
        
    return hammer_progression

In [15]:
# hide
hammer_progression = make_hammer_progression(hammer_start=True,normalized_score = new_score)
hammer_progression

[True, True, False, True]

In [16]:
# hide
list(map(lambda x: not x, hammer_progression))

[False, False, True, False]

In [17]:
# hide

# test

boxscore = linescore_page.get_boxscore_from(cz_game_id = 3)

half_boxscores = [HalfBoxscore(**half_boxscore) for half_boxscore in boxscore.values()]

normalized_score = normalize_score(half_boxscores[0].score,half_boxscores[1].score)

hammer_progression = make_hammer_progression(half_boxscores[0].hammer,normalized_score = normalized_score)

print(normalized_score,hammer_progression)

normalized_boxscore_1 = NormalizedHalfBoxscore(href = half_boxscores[0].href,hammer_progression = hammer_progression,normalized_score = normalized_score)
normalized_boxscore_2 = NormalizedHalfBoxscore(href = half_boxscores[1].href,hammer_progression = list(map(lambda x: not x, hammer_progression)), normalized_score =  list(map(lambda x: -1*x,new_score)))


print(normalized_boxscore_1,normalized_boxscore_2)

[0, -1, -1, 0, 0, 3, 2, 3] [False, False, True, True, True, True, False, False]
NormalizedHalfBoxscore(href='event.php?view=Team&eventid=5000&teamid=114940&profileid=9707#1', hammer_progression=[False, False, True, True, True, True, False, False], normalized_score=[0, -1, -1, 0, 0, 3, 2, 3]) NormalizedHalfBoxscore(href='event.php?view=Team&eventid=5000&teamid=114952&profileid=9703#1', hammer_progression=[True, True, False, False, False, False, True, True], normalized_score=[0, -1, 1, 0])
