In [168]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
import warnings
import csv
from sklearn.preprocessing import MultiLabelBinarizer

warnings.filterwarnings('ignore')
pd.set_option("display.max_rows", None)

In [169]:
#Get html
url = "https://fbref.com/en/comps/Big5/stats/players/Big-5-European-Leagues-Stats"
html = requests.get(url=url).text

soup = BeautifulSoup(html, "html.parser")
table = soup.find("tbody")
rows = table.find_all("tr")

cols = ['player_id', 'name', 'country', 'position', 'club', 'league', 'games_played',\
       'games_started', 'minutes_played', 'minutes_per_90', 'goals', 'assists',\
       'non_penalty_goals', 'penalty_goals', 'penalty_attempted', 'yellow_cards',\
       'red_card', 'goals_per_90', 'assits_per_90', 'non_penalty_goals_per_90']


In [170]:
#create a dataframe with player stats
df = pd.DataFrame(columns=cols)

for row in rows:
    try:
        if row['class']:
            continue
    except:
        player_id = int(row.find("th").text)
        values = row.find_all("td")
        name = values[0].find('a').text
        country = values[1].text.split(' ')[len(values[1].text.split(' ')) - 1]
        position = values[2].text.split(',')
        club = values[3].text
        league = values[4].find('a').text
        games_played = float(values[7].text)
        games_started = float(values[8].text)
        minutes_played = float(''.join(values[9].text.split(',')))
        minutes_per_90 = float(values[10].text)
        goals = float(values[11].text) 
        assists = float(values[12].text)
        non_penalty_goals = float(values[13].text)
        penalty_goals = float(values[14].text)
        penalty_attempted = float(values[15].text)
        yellow_cards = float(values[16].text)
        red_card = float(values[17].text)
        goals_per_90 = float(values[18].text)
        assits_per_90 = float(values[19].text)
        non_penalty_goals_per_90 = float(values[20].text)

        # stats.append([index, name, country, position, club, league, games_played, games_started, minutes_played, minutes_per_90, goals, assists, non_penalty_goals, penalty_goals, penalty_attempted, yellow_cards, red_card, goals_per_90, assits_per_90, non_penalty_goals_per_90])
        
        stats = [player_id, name, country, position, club, league, games_played, games_started,\
                                minutes_played, minutes_per_90, goals, assists, non_penalty_goals, penalty_goals,\
                                penalty_attempted, yellow_cards, red_card, goals_per_90, assits_per_90,\
                                non_penalty_goals_per_90]
#         pd.DataFrame([stats], columns=cols).head()
#         df = df.append(pd.DataFrame([stats], columns=cols))
#         df = pd.conca([df, pd.DataFrame([index, name, country, position, club, league, games_played, games_started, minutes_played, minutes_per_90, goals, assists, non_penalty_goals, penalty_goals, penalty_attempted, yellow_cards, red_card, goals_per_90, assits_per_90, non_penalty_goals_per_90])], axis=0, ignore_index = True)
        df = pd.concat([df, pd.DataFrame([stats], columns=cols)])
    
    
df = df.set_index('player_id')
# df.to_csv("data.csv")
df.head()
df.shape

(2383, 19)

In [171]:
#one hot encoding position
mlb = MultiLabelBinarizer(sparse_output = True)
df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(df.pop('position')),
                index=df.index,
                columns=mlb.classes_))


In [172]:
df.to_csv("data.csv")

In [173]:
temp = df[['name', 'GK', 'DF', 'MF', 'FW']][0:20]
temp.head(20)

Unnamed: 0_level_0,name,GK,DF,MF,FW
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Brenden Aaronson,0,0,1,1
2,Yunis Abdelhamid,0,1,0,0
3,Himad Abdelli,0,0,1,1
4,Salis Abdul Samed,0,0,1,0
5,Laurent Abergel,0,0,1,0
6,Matthis Abline,0,0,1,1
7,Zakaria Aboukhlal,0,0,1,1
8,Tammy Abraham,0,0,0,1
9,Francesco Acerbi,0,1,0,0
10,Mohamed Achi,0,0,0,1


In [90]:
player_id = 19
player = temp.iloc[player_id]
print(player)
temp[['GK', 'DF', 'MF', 'FW']].head()
player[["GK", "DF", "MF", "FW"]].head()
dot_product = temp[['GK', 'DF', 'MF', 'FW']].dot(player[["GK", "DF", "MF", "FW"]])
dot_product.rename("similarity")
temp['similarity'] = dot_product
temp.head(20)

name    Michel Aebischer
GK                     0
DF                     0
MF                     1
FW                     1
Name: 20, dtype: Sparse[object, 0]


Unnamed: 0_level_0,name,GK,DF,MF,FW,similarity
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Brenden Aaronson,0,0,1,1,2
2,Yunis Abdelhamid,0,1,0,0,0
3,Himad Abdelli,0,0,1,1,2
4,Salis Abdul Samed,0,0,1,0,1
5,Laurent Abergel,0,0,1,0,1
6,Matthis Abline,0,0,1,1,2
7,Zakaria Aboukhlal,0,0,1,1,2
8,Tammy Abraham,0,0,0,1,1
9,Francesco Acerbi,0,1,0,0,0
10,Mohamed Achi,0,0,0,1,1


In [179]:
player_id = 10
player = df.iloc[player_id]

temp = df.copy()

temp['pos_similarity'] = temp[['GK', 'DF', 'MF', 'FW']].dot(player[["GK", "DF", "MF", "FW"]])
temp = temp[temp['pos_similarity'] > 0]
print(temp.shape)
temp.head()

(959, 23)


Unnamed: 0_level_0,name,country,club,league,games_played,games_started,minutes_played,minutes_per_90,goals,assists,...,yellow_cards,red_card,goals_per_90,assits_per_90,non_penalty_goals_per_90,DF,FW,GK,MF,pos_similarity
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Yunis Abdelhamid,MAR,Reims,Ligue 1,15.0,15.0,1350.0,15.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1,0,0,0,1
9,Francesco Acerbi,ITA,Inter,Serie A,9.0,7.0,660.0,7.3,0.0,1.0,...,1.0,0.0,0.0,0.14,0.14,1,0,0,0,1
11,Marcos Acuña,ARG,Sevilla,La Liga,10.0,7.0,551.0,6.1,0.0,0.0,...,4.0,1.0,0.0,0.0,0.0,1,0,0,0,1
15,Tosin Adarabioyo,ENG,Fulham,Premier League,10.0,10.0,900.0,10.0,1.0,0.0,...,0.0,0.0,0.1,0.0,0.1,1,0,0,0,1
22,Emmanuel Agbadou,CIV,Reims,Ligue 1,12.0,11.0,969.0,10.8,0.0,0.0,...,2.0,2.0,0.0,0.0,0.0,1,0,0,0,1


In [176]:
# z-normalize the stats
cols_for_norm = ['games_played', 'games_started',
       'minutes_played', 'minutes_per_90', 'goals', 'assists',
       'non_penalty_goals', 'penalty_goals', 'penalty_attempted',
       'yellow_cards', 'red_card', 'goals_per_90', 'assits_per_90',
       'non_penalty_goals_per_90']

player = temp.iloc[player_id]

for col in cols_for_norm:
    temp[col] = (temp[col] - temp[col].mean()) / temp[col].std()

In [177]:
temp.head()

Unnamed: 0_level_0,name,country,club,league,games_played,games_started,minutes_played,minutes_per_90,goals,assists,...,yellow_cards,red_card,goals_per_90,assits_per_90,non_penalty_goals_per_90,DF,FW,GK,MF,pos_similarity
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,Yunis Abdelhamid,MAR,Reims,Ligue 1,1.353805,1.643066,1.746353,1.746144,-0.467197,-0.486689,...,-0.325502,-0.306201,-0.433091,-0.329461,-0.492627,1,0,0,0,1
9,Francesco Acerbi,ITA,Inter,Serie A,0.011894,0.008098,0.084139,0.07686,-0.467197,0.703961,...,-0.325502,-0.306201,-0.433091,0.472887,0.217717,1,0,0,0,1
11,Marcos Acuña,ARG,Sevilla,La Liga,0.235546,0.008098,-0.178442,-0.183288,-0.467197,-0.486689,...,1.714738,2.75262,-0.433091,-0.329461,-0.492627,1,0,0,0,1
15,Tosin Adarabioyo,ENG,Fulham,Premier League,0.235546,0.621211,0.662301,0.662193,0.955157,-0.486689,...,-1.005582,-0.306201,0.662777,-0.329461,0.014761,1,0,0,0,1
22,Emmanuel Agbadou,CIV,Reims,Ligue 1,0.682849,0.825582,0.828522,0.835625,-0.467197,-0.486689,...,0.354578,5.81144,-0.433091,-0.329461,-0.492627,1,0,0,0,1


In [178]:
player_temp = player[['games_played', 'games_started',
       'minutes_played', 'minutes_per_90', 'goals', 'assists',
       'non_penalty_goals', 'penalty_goals', 'penalty_attempted',
       'yellow_cards', 'red_card', 'goals_per_90', 'assits_per_90',
       'non_penalty_goals_per_90']]

player_temp.head(15)

games_played                 10.0
games_started                 7.0
minutes_played              666.0
minutes_per_90                7.4
goals                         0.0
assists                       0.0
non_penalty_goals             0.0
penalty_goals                 0.0
penalty_attempted             0.0
yellow_cards                  1.0
red_card                      0.0
goals_per_90                  0.0
assits_per_90                 0.0
non_penalty_goals_per_90      0.0
Name: 33, dtype: Sparse[object, 0]

In [167]:
df_temp = temp[['games_played', 'games_started',
       'minutes_played', 'minutes_per_90', 'goals', 'assists',
       'non_penalty_goals', 'penalty_goals', 'penalty_attempted',
       'yellow_cards', 'red_card', 'goals_per_90', 'assits_per_90',
       'non_penalty_goals_per_90']].copy()
df_temp.head()


Unnamed: 0_level_0,games_played,games_started,minutes_played,minutes_per_90,goals,assists,non_penalty_goals,penalty_goals,penalty_attempted,yellow_cards,red_card,goals_per_90,assits_per_90,non_penalty_goals_per_90
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2,1.353805,1.643066,1.746353,1.746144,-0.467197,-0.486689,-0.473321,-0.068277,-0.102293,-0.325502,-0.306201,-0.433091,-0.329461,-0.492627
9,0.011894,0.008098,0.084139,0.07686,-0.467197,0.703961,-0.473321,-0.068277,-0.102293,-0.325502,-0.306201,-0.433091,0.472887,0.217717
11,0.235546,0.008098,-0.178442,-0.183288,-0.467197,-0.486689,-0.473321,-0.068277,-0.102293,1.714738,2.75262,-0.433091,-0.329461,-0.492627
15,0.235546,0.621211,0.662301,0.662193,0.955157,-0.486689,1.000429,-0.068277,-0.102293,-1.005582,-0.306201,0.662777,-0.329461,0.014761
22,0.682849,0.825582,0.828522,0.835625,-0.467197,-0.486689,-0.473321,-0.068277,-0.102293,0.354578,5.81144,-0.433091,-0.329461,-0.492627
