In [196]:
import random
import os
from pprint import pprint

import requests
from lxml.html import fromstring

import pandas as pd

In [197]:
def random_user_agent(file):
    """Pick a random User-Agent string from a text file with one entry per line.

    Parameters
    ----------
    file : str
        Path to the text file holding the candidate User-Agent strings.

    Returns
    -------
    str or None
        A randomly chosen non-blank line with surrounding whitespace removed,
        or ``None`` when the file does not exist or holds no usable line.
    """
    if not os.path.isfile(file):
        return None

    with open(file, 'r') as f:
        # Drop blank lines so an empty string is never chosen as User-Agent,
        # and strip whitespace so the value is a clean header value.
        candidates = [line.strip() for line in f if line.strip()]

    # random.choice raises IndexError on an empty sequence; report "nothing
    # available" the same way as a missing file instead.
    if not candidates:
        return None

    return random.choice(candidates)

In [395]:
# Spanish squad-statistics pages, one per statistics category.
# 'REPLACE_TEAM' is a placeholder that is substituted with a team tag
# before the URL is requested.
url_list = [
    'https://www.laliga.es/laliga-santander/REPLACE_TEAM/plantilla/' + category
    for category in (
        'clasicas',
        'defensivas',
        'disciplina',
        'ofensivas',
        'eficiencia',
    )
]

In [396]:
# English squad-statistics pages, one per statistics category.
# 'REPLACE_TEAM' is a placeholder that is substituted with a team tag
# before the URL is requested.
en_url_list = [
    'https://www.laliga.es/en/laliga-santander/REPLACE_TEAM/squad/' + category
    for category in (
        'standings',
        'defensives',
        'discipline',
        'ofensive',
        'efficiency',
    )
]

In [397]:
def retrieve_team_list():
    """Return the URL tag of every team listed on the LaLiga Santander page.

    The scraping itself is delegated to ``retrieve_team_dict`` so the request
    headers, the XPath and the URL stripping live in a single place; this
    function only projects the ``'tag'`` field out of each team entry.

    Returns
    -------
    list of str
        One tag per team (the team link with the
        'https://www.laliga.es/laliga-santander/' prefix removed), in page
        order.

    Raises
    ------
    ConnectionError
        Propagated from ``retrieve_team_dict`` when the page does not answer
        with HTTP 200.
    """
    return [team['tag'] for team in retrieve_team_dict()]

In [398]:
def retrieve_team_dict(timeout=30):
    """Scrape the LaLiga Santander landing page for its list of teams.

    Parameters
    ----------
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    list of dict
        One ``{'name': ..., 'tag': ...}`` entry per team link, in page order.
        ``name`` is the text content of the link and ``tag`` is its ``href``
        with the 'https://www.laliga.es/laliga-santander/' prefix removed.

    Raises
    ------
    ConnectionError
        If the page answers with any status code other than 200.
    """
    head = {
        "User-Agent": random_user_agent('resources/user_agent_list.txt'),
        "X-Requested-With": "XMLHttpRequest",
        "Accept": "text/html,application/xml",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
    }

    url = "https://www.laliga.es/laliga-santander"

    req = requests.get(url, headers=head, timeout=timeout)

    if req.status_code != 200:
        raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

    root = fromstring(req.text)
    containers = root.xpath(".//div[@id='equipos']/div[contains(@class, 'laliga-santander')]")

    strip_from_url = 'https://www.laliga.es/laliga-santander/'

    teams = []

    for container in containers:
        for link in container:
            href = link.get('href')

            # Children without an href cannot yield a team tag; skip them
            # instead of failing on None.replace(...).
            if href is None:
                continue

            teams.append({
                'name': link.text_content(),
                'tag': href.replace(strip_from_url, ''),
            })

    return teams

In [399]:
def retrieve_player_stats(team, timeout=30):
    """Download every Spanish statistics table for one team.

    One page is requested per entry of ``url_list`` (with 'REPLACE_TEAM'
    replaced by the team tag) and its 'datatable' table is converted into a
    DataFrame.

    Parameters
    ----------
    team : dict
        Team entry with a ``'tag'`` key (used in the URL) and a ``'name'`` key
        (written into the 'Equipo' column).
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    list of pandas.DataFrame
        One frame per URL in ``url_list``, in the same order. Each frame has
        'Equipo' as its first column, followed by the table's columns without
        'Foto'; every cell holds the text content of the HTML cell.

    Raises
    ------
    ConnectionError
        If a page answers with any status code other than 200.
    ValueError
        If a table row does not have exactly one cell per header column.
    KeyError
        If a table has no 'Foto' column to drop.
    """
    result = []

    for url in url_list:
        url = url.replace('REPLACE_TEAM', team['tag'])

        # Headers are rebuilt per request so each one gets a freshly drawn
        # User-Agent.
        head = {
            "User-Agent": random_user_agent('resources/user_agent_list.txt'),
            "X-Requested-With": "XMLHttpRequest",
            "Accept": "text/html,application/xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

        req = requests.get(url, headers=head, timeout=timeout)

        if req.status_code != 200:
            raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

        root = fromstring(req.text)
        header_cells = root.xpath(".//div[contains(@class, 'rotar-tabla')]/table[@class='datatable']/thead/tr/th")

        # Use the 'title' attribute as column name when the header cell has
        # one, and fall back to the cell text otherwise.
        index = [cell.get('title', cell.text) for cell in header_cells]

        body_rows = root.xpath(".//div[contains(@class, 'rotar-tabla')]/table[@class='datatable']/tbody/tr")

        rows = []

        for table_row in body_rows:
            cells = [cell.text_content() for cell in table_row]

            # Fail loudly on malformed rows rather than letting pandas pad
            # or misalign them silently.
            if len(cells) != len(index):
                raise ValueError(
                    "row with " + str(len(cells)) + " cells does not match "
                    + str(len(index)) + " header columns in " + url
                )

            rows.append(cells)

        # Build the frame in one go; appending row by row with df.loc copies
        # the frame on every insertion.
        df = pd.DataFrame(rows, columns=index)

        df.insert(loc=0, column='Equipo', value=team['name'])
        df = df.drop(columns=['Foto'])

        result.append(df)

    return result

In [400]:
def retrieve_player_columns_english(team, timeout=30):
    """Collect the English column headers of one team's statistics tables.

    One page is requested per entry of ``en_url_list`` (with 'REPLACE_TEAM'
    replaced by the team tag) and the header cells of its 'datatable' table
    are gathered into a single de-duplicated list.

    Parameters
    ----------
    team : dict
        Team entry; only its ``'tag'`` key is read here.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    list of dict
        ``{'title': ..., 'col': ...}`` entries in first-seen order. ``col`` is
        the header cell text and ``title`` is the cell's 'title' attribute
        when present, otherwise the cell text. Entries whose title is listed
        in ``errored_titles`` are left out, and identical entries appear once.

    Raises
    ------
    ConnectionError
        If a page answers with any status code other than 200.
    """
    # Titles that are excluded from the result: the picture column and the
    # "goals scored per attempt" columns.
    errored_titles = [
        'Pictures',
        'Goals scored per attempt (outside the box)',
        'Goals scored per attempt (inside the box)',
        'Goals scored per attempt (left foot)',
        'Goals scored per attempt (right foot)',
        'Goals scored per attempt (header)',
        'Goals scored per attempt (set piece)',
    ]

    result = []

    for url in en_url_list:
        url = url.replace('REPLACE_TEAM', team['tag'])

        # Headers are rebuilt per request so each one gets a freshly drawn
        # User-Agent.
        head = {
            "User-Agent": random_user_agent('resources/user_agent_list.txt'),
            "X-Requested-With": "XMLHttpRequest",
            "Accept": "text/html,application/xml",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
        }

        req = requests.get(url, headers=head, timeout=timeout)

        if req.status_code != 200:
            raise ConnectionError("connection error " + str(req.status_code) + ", try again later.")

        root = fromstring(req.text)
        header_cells = root.xpath(".//div[contains(@class, 'rotar-tabla')]/table[@class='datatable']/thead/tr/th")

        for cell in header_cells:
            column = {
                # Falls back to the cell text when there is no 'title' attribute.
                'title': cell.get('title', cell.text),
                'col': cell.text,
            }

            if column not in result and column['title'] not in errored_titles:
                result.append(column)

    return result

In [401]:
def concatenate_dataframes(df_list):
    """Join several DataFrames side by side, keeping each column name once.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Frames to place next to each other (concatenated along axis 1).

    Returns
    -------
    pandas.DataFrame
        The column-wise concatenation in which, for every repeated column
        name, only the first occurrence is kept.
    """
    side_by_side = pd.concat(list(df_list), axis=1)

    # duplicated() flags the second and later occurrences of a column name.
    first_occurrence = ~side_by_side.columns.duplicated()

    return side_by_side.loc[:, first_occurrence]

In [402]:
def __launch__():
    """Scrape the player statistics of every team and export them as CSV.

    Steps:

    1. Fetch the team list and, per team, the statistics tables, merged into
       one frame per team.
    2. Stack all team frames, drop the "goals per attempt" columns and write
       '../dataset/laliga_player_stats_spanish.csv'.
    3. Translate the position labels, rename the columns with the English
       headers scraped from the Barcelona pages and write
       '../dataset/laliga_player_stats_english.csv'.

    Returns
    -------
    pandas.DataFrame
        The English version of the player statistics, as written to the second
        CSV file.

    Raises
    ------
    ValueError
        If the number of English column names does not match the number of
        columns in the scraped data.
    """
    team_dict = retrieve_team_dict()
    team_frames = []

    for team in team_dict:
        print(team)
        df_list = retrieve_player_stats(team)

        team_frames.append(concatenate_dataframes(df_list))

    # Concatenate once at the end: growing a frame inside the loop copies all
    # accumulated rows on every iteration, and DataFrame.append was removed in
    # pandas 2.0. pd.concat rejects an empty list, hence the fallback.
    if team_frames:
        players = pd.concat(team_frames, ignore_index=True, sort=False)
    else:
        players = pd.DataFrame()

    # LaLiga Error -> https://www.laliga.es/laliga-santander/barcelona/plantilla/eficiencia
    players = players.drop(columns=['Goles marcados por tiros realizados desde fuera del área',
                                    'Goles marcados por tiros realizados desde dentro del área',
                                    'Goles marcados por tiros realizados con el pie izquierdo',
                                    'Goles marcados por tiros realizados con el pie derecho',
                                    'Goles marcados por tiros realizados de cabeza',
                                    'Goles marcados por tiros realizados de jugada a balón parado'])

    players.to_csv('../dataset/laliga_player_stats_spanish.csv', encoding='utf-8', index=False)

    players = players.replace(['Portero', 'Defensa', 'Centrocampista', 'Delantero'],
                              ['Goalkeeper', 'Defender', 'Midfielder', 'Forward'])

    # The English headers are taken from a single team's pages and applied
    # positionally to the columns of the combined frame.
    english_columns = retrieve_player_columns_english({'name': 'F.C. Barcelona', 'tag': 'barcelona'})

    english_columns_list = ['Team'] + [english_column['title'] for english_column in english_columns]

    # The rename is positional, so make a mismatch explicit instead of
    # relying on pandas' generic length-mismatch error.
    if len(english_columns_list) != len(players.columns):
        raise ValueError(
            "expected " + str(len(players.columns)) + " English column names, got "
            + str(len(english_columns_list))
        )

    players.columns = english_columns_list

    players.to_csv('../dataset/laliga_player_stats_english.csv', encoding='utf-8', index=False)

    # The notebook assigns the result of this call, so hand the final frame back.
    return players

In [403]:
# Run the full scrape: prints each team entry as it is processed and writes the
# Spanish and English CSV files under ../dataset/.
players = __launch__()

{'name': 'Athletic Club', 'tag': 'athletic'}
{'name': 'Atlético de Madrid', 'tag': 'atletico'}
{'name': 'CD Leganés', 'tag': 'leganes'}
{'name': 'D. Alavés', 'tag': 'alaves'}
{'name': 'FC Barcelona', 'tag': 'barcelona'}
{'name': 'Getafe CF', 'tag': 'getafe'}
{'name': 'Girona FC', 'tag': 'girona'}
{'name': 'Levante UD', 'tag': 'levante'}
{'name': 'R. Valladolid CF', 'tag': 'valladolid'}
{'name': 'Rayo Vallecano', 'tag': 'rayo'}
{'name': 'RC Celta', 'tag': 'celta'}
{'name': 'RCD Espanyol', 'tag': 'espanyol'}
{'name': 'Real Betis', 'tag': 'betis'}
{'name': 'Real Madrid', 'tag': 'real-madrid'}
{'name': 'Real Sociedad', 'tag': 'real-sociedad'}
{'name': 'SD Eibar', 'tag': 'eibar'}
{'name': 'SD Huesca', 'tag': 'huesca'}
{'name': 'Sevilla FC', 'tag': 'sevilla'}
{'name': 'Valencia CF', 'tag': 'valencia'}
{'name': 'Villarreal CF', 'tag': 'villarreal'}
