In [77]:
import random
import os
from pprint import pprint

import requests
from lxml.html import fromstring

import pandas as pd

In [15]:
def random_user_agent(file):
    """Pick a random User-Agent string from a text file.

    The file is read as one User-Agent per line. Surrounding whitespace is
    stripped and blank lines are ignored, so an empty string can never be
    selected (for instance from a trailing empty line).

    Args:
        file: Path to the User-Agent list.

    Returns:
        One randomly chosen, non-empty line, or ``None`` when the file does
        not exist or holds no usable entry.
    """
    if not os.path.exists(file):
        return None

    with open(file, 'r') as f:
        candidates = [line.strip() for line in f if line.strip()]

    # random.choice raises IndexError on an empty sequence; report "nothing
    # available" the same way a missing file is reported.
    if not candidates:
        return None

    return random.choice(candidates)

In [123]:
# Per-team statistics pages, one per stats category. The REPLACE_TEAM
# placeholder is substituted with a team's URL tag before the page is requested.
url_list = [
    'https://www.laliga.es/laliga-santander/REPLACE_TEAM/plantilla/' + category
    for category in ('clasicas', 'defensivas', 'disciplina', 'ofensivas', 'eficiencia')
]

In [124]:
def retrieve_team_list(timeout=30):
    """Fetch the URL tags of the teams linked from the LaLiga Santander page.

    Requests the league landing page, locates the children of the
    ``div#equipos`` team containers and returns each child's ``href`` with
    the league URL prefix removed.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would wait indefinitely.

    Returns:
        list[str]: One tag per linked team, in page order.

    Raises:
        ConnectionError: If the page answers with a status other than 200.
    """
    head = {
        "User-Agent": random_user_agent('resources/user_agent_list.txt'),
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "text/html,application/xml",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    url = "https://www.laliga.es/laliga-santander"

    req = requests.get(url, headers=head, timeout=timeout)

    if req.status_code != 200:
        raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

    root = fromstring(req.text)
    containers = root.xpath(".//div[@id='equipos']/div[contains(@class, 'laliga-santander')]")

    strip_from_url = 'https://www.laliga.es/laliga-santander/'

    teams = list()

    for container in containers:
        for child in container:
            href = child.get('href')

            # A child without an href yields None, on which .replace would
            # fail; such a node carries no team tag, so skip it.
            if href is None:
                continue

            teams.append(href.replace(strip_from_url, ''))

    return teams

In [180]:
def retrieve_team_dict(timeout=30):
    """Fetch the name and URL tag of every team linked from the league page.

    Requests the LaLiga Santander landing page, locates the children of the
    ``div#equipos`` team containers and describes each of them with its text
    content and its ``href`` (league URL prefix removed).

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would wait indefinitely.

    Returns:
        list[dict]: Despite the function name, a list with one
        ``{'name': ..., 'tag': ...}`` dict per linked team, in page order.

    Raises:
        ConnectionError: If the page answers with a status other than 200.
    """
    head = {
        "User-Agent": random_user_agent('resources/user_agent_list.txt'),
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "text/html,application/xml",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    url = "https://www.laliga.es/laliga-santander"

    req = requests.get(url, headers=head, timeout=timeout)

    if req.status_code != 200:
        raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

    root = fromstring(req.text)
    containers = root.xpath(".//div[@id='equipos']/div[contains(@class, 'laliga-santander')]")

    strip_from_url = 'https://www.laliga.es/laliga-santander/'

    teams = list()

    for container in containers:
        for child in container:
            href = child.get('href')

            # A child without an href yields None, on which .replace would
            # fail; such a node does not identify a team, so skip it.
            if href is None:
                continue

            teams.append({
                'name': child.text_content(),
                'tag': href.replace(strip_from_url, ''),
            })

    return teams

In [188]:
def retrieve_player_stats(team, timeout=30):
    """Download every statistics table of one team's squad.

    For each template in ``url_list`` the team's tag is substituted for
    ``REPLACE_TEAM``, the page is fetched and its ``datatable`` is converted
    into a DataFrame: column names come from the header cells (the ``title``
    attribute when present, otherwise the cell text) and every body row
    becomes one record of cell text contents. A leading ``Equipo`` column
    holding the team name is added to each frame.

    Args:
        team: Mapping with the keys ``'tag'`` (used in the URL) and
            ``'name'`` (written to the ``Equipo`` column).
        timeout: Seconds to wait for each response before giving up. Without
            a timeout ``requests`` would wait indefinitely.

    Returns:
        list[pandas.DataFrame]: One frame per entry of ``url_list``, in the
        same order.

    Raises:
        ConnectionError: If any page answers with a status other than 200.
        ValueError: Raised by pandas when a body row does not have as many
            cells as the table header.
    """
    table_xpath = ".//div[contains(@class, 'rotar-tabla')]/table[@class='datatable']"

    result = list()

    for url in url_list:
        url = url.replace('REPLACE_TEAM', team['tag'])

        # Built inside the loop so each request draws its own User-Agent.
        head = {
            "User-Agent": random_user_agent('resources/user_agent_list.txt'),
            "X-Requested-With": "XMLHttpRequest",
            "Accept": "text/html,application/xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

        req = requests.get(url, headers=head, timeout=timeout)

        if req.status_code != 200:
            raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

        root = fromstring(req.text)

        index = [
            header.get('title') if 'title' in header.attrib else header.text
            for header in root.xpath(table_xpath + "/thead/tr/th")
        ]

        rows = [
            [cell.text_content() for cell in table_row]
            for table_row in root.xpath(table_xpath + "/tbody/tr")
        ]

        # Build the frame once from all rows instead of growing it row by
        # row, which re-allocates the frame on every insertion.
        df = pd.DataFrame(rows, columns=index)

        df.insert(loc=0, column='Equipo', value=team['name'])

        result.append(df)

    return result

In [191]:
def concatenate_dataframes(df_list):
    """Join several frames side by side, keeping one column per name.

    The frames are concatenated along the column axis; whenever a column
    name occurs more than once, only its first occurrence is kept.

    Args:
        df_list: Iterable of DataFrames to place next to each other.

    Returns:
        pandas.DataFrame: The combined frame without repeated column names.
    """
    # NOTE(review): rows are matched by index label, i.e. by position for
    # freshly built frames - presumably every input lists the same players
    # in the same order; confirm against the scraped pages.
    side_by_side = pd.concat(list(df_list), axis=1)

    is_repeated = side_by_side.columns.duplicated()

    return side_by_side.loc[:, ~is_repeated]

In [192]:
def __launch__():
    """Scrape the player statistics of every team and store them as CSV.

    For each team returned by ``retrieve_team_dict`` the per-category tables
    from ``retrieve_player_stats`` are merged with ``concatenate_dataframes``.
    The per-team frames are then stacked into one frame, six columns flagged
    as faulty on the source site are dropped when present, and the result is
    written to ``../dataset/laliga_player_stats.csv``.

    Returns:
        pandas.DataFrame: The combined player statistics (an empty frame
        when no team was found).
    """
    team_frames = [
        concatenate_dataframes(retrieve_player_stats(team))
        for team in retrieve_team_dict()
    ]

    # DataFrame.append no longer exists in pandas 2.x; a single concat over
    # the collected frames also avoids re-copying the data for every team.
    # pd.concat rejects an empty list, hence the explicit empty frame.
    if team_frames:
        players = pd.concat(team_frames, ignore_index=True, sort=False)
    else:
        players = pd.DataFrame()

    # LaLiga Error -> https://www.laliga.es/laliga-santander/barcelona/plantilla/eficiencia
    # errors='ignore' keeps the run from failing after all pages were fetched
    # if some of these columns are not present in the scraped data.
    players = players.drop(columns=['Goles marcados por tiros realizados desde fuera del área',
                                    'Goles marcados por tiros realizados desde dentro del área',
                                    'Goles marcados por tiros realizados con el pie izquierdo',
                                    'Goles marcados por tiros realizados con el pie derecho',
                                    'Goles marcados por tiros realizados de cabeza',
                                    'Goles marcados por tiros realizados de jugada a balón parado'],
                           errors='ignore')

    players.to_csv('../dataset/laliga_player_stats.csv', encoding='utf-8', index=False)

    return players

In [193]:
# Run the full scrape: this issues the HTTP requests, writes the CSV file and
# keeps the returned DataFrame in `players`.
players = __launch__()