In [1]:
from bs4 import BeautifulSoup
import urllib3
import requests
import pandas as pd
import re
import random
import time
import datetime

In [2]:
# Calendar/results page for the 38e-journee of saison-2018-2019 (championnat-d-italie).
it_link = "https://www.lequipe.fr/Football/championnat-d-italie/saison-2018-2019/page-calendrier-resultats/38e-journee"

In [3]:
# Calendar/results page for the 34e-journee of saison-2018-2019; the URL targets
# championnat-d-allemagne (presumably "all" in the name is short for "allemagne").
all_link = "https://www.lequipe.fr/Football/championnat-d-allemagne/saison-2018-2019/page-calendrier-resultats/34e-journee"

In [7]:
# Download the page behind all_link. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops on an HTTP error
# instead of silently parsing an error page.
response = requests.get(all_link, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Elements selected by CSS class (presumably the short team names and the scores
# of ended matches -- confirm against the page markup).
team_box = soup.find_all("span", {"class": "TeamScore__nameshort"})
score_box = soup.find_all("div", {"class": "TeamScore__score TeamScore__score--ended"})

In [8]:
def pause(min_seconds=1, max_seconds=4):
    """
    Pause the execution between 2 scraping executions.

    :param min_seconds: int: shortest possible pause, in seconds (default 1)
    :param max_seconds: int: longest possible pause, in seconds (default 4)

    :return: None, after sleeping a random whole number of seconds taken from
             [min_seconds, max_seconds]; with the defaults: 1, 2, 3 or 4 seconds
    """
    # random.randint includes both bounds.
    time_break = random.randint(min_seconds, max_seconds)
    time.sleep(time_break)

def get_url_leg(url_template, season, nbr):
    """
    Fill a URL template for one leg of a season.

    :param url_template: str: template whose ``{leg}`` and ``{season}`` fields get filled in
    :param season: str: value substituted for ``{season}``
    :param nbr: int: leg number

    :return the completed url
    """
    # Any number of 1 or below maps to the "1ere" token; other legs become "<nbr>e".
    if nbr > 1:
        leg_token = str(nbr) + "e"
    else:
        leg_token = "1ere"
    return url_template.format(season=season, leg=leg_token)


def transform_box_to_data(box, regex_pattern, data_name, data_type):
    """
    Turn one scraped box into a two-column dataframe.

    :param box: BeautifulSoup box (iterable of iterable elements) corresponding to 1 part of the data
    :param regex_pattern: regexp applied to the string form of every child of every box element
    :param data_name: str: name of the column receiving the extracted values
    :param data_type: str: 'int' casts each extracted value to int, any other value keeps it as matched

    :return dataframe with a 'key' column (Home_0, Away_0, Home_1, ...) and a
            ``data_name`` column holding the first regexp match of each matching child
    """
    as_int = data_type == 'int'

    # Collect the first match of every child that matches; children without a match are skipped.
    values = []
    for node in box:
        for child in node:
            found = re.findall(regex_pattern, str(child))
            if not found:
                continue
            first = found[0]
            values.append(int(first) if as_int else first)

    # Values alternate Home / Away, two consecutive values sharing the same index.
    sides = ('Home', 'Away')
    keys = [sides[pos % 2] + '_' + str(pos // 2) for pos in range(len(values))]
    return pd.DataFrame(data={'key': keys, data_name: values})



In [9]:
# Run the extraction on team_box: the pattern captures the text sitting between
# a '>' and the closing '</span'; data_type='str' leaves the captured text as is.
transform_box_to_data(box=team_box, regex_pattern=r'>([0-9îê\'üöëéèâÉa-zA-Z-\s?]+)</span', 
                                    data_name='team', data_type='str')

Unnamed: 0,key,team
0,Home_0,Bayern Munich
1,Away_0,Eintracht Francfort
2,Home_1,Borussia M'Gladbach
3,Away_1,Borussia Dortmund
4,Home_2,Werder Brême
5,Away_2,RB Leipzig
6,Home_3,Hertha Berlin
7,Away_3,Bayer Leverkusen
8,Home_4,Wolfsburg
9,Away_4,Augsbourg


In [1]:
def _get_rank_position_scoring(nb_teams):
    bonus_position = {1: 300,
                      2: 280,
                      3: 270,
                      4: 260,
                      5: 220,
                      nb_teams: 350,
                      nb_teams - 1: 340,
                      nb_teams - 2: 300,
                      nb_teams - 3: 280}

    for rk in range(6, nb_teams - 3):
        upper = ((nb_teams / 2) - 3) ** 2
        lower = ((nb_teams / 2) - 5) ** 2
        if 2 * rk < nb_teams:
            bonus_position[rk] = int(220 * (rk - nb_teams / 2) ** 2 / lower)
        else:
            bonus_position[rk] = int(280 * ((rk - nb_teams / 2) ** 2) / upper)

    return bonus_position


def _get_rank_scoring(nb_teams):
    max_bonus = 250 * nb_teams
    return {1 + i // 2 if i % 2 == 0 else nb_teams - i // 2: max_bonus - i * 250 for i in range(nb_teams)}

In [6]:
# Display the rank -> bonus mapping for nb_teams=20, ordered by rank.
dict(sorted(_get_rank_scoring(nb_teams=20).items()))

{1: 5000,
 2: 4500,
 3: 4000,
 4: 3500,
 5: 3000,
 6: 2500,
 7: 2000,
 8: 1500,
 9: 1000,
 10: 500,
 11: 250,
 12: 750,
 13: 1250,
 14: 1750,
 15: 2250,
 16: 2750,
 17: 3250,
 18: 3750,
 19: 4250,
 20: 4750}