# What is Scouting and why is it so important?

# Import Required Libraries

In [None]:
# Data Manipulation libraries:
import numpy as np
import pandas as pd
from copy import deepcopy

# Scraping libraries
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Plotting libraries
import mplsoccer
from highlight_text import fig_text

# Scrape Data for Scouting

## Scrape Performance Data

In [None]:
# League identifiers appended to the understat league URL when scraping.
top5Leaguesids = [
    "EPL",
    "La_Liga",
    "Bundesliga",
    "Serie_A",
    "Ligue_1",
]

In [None]:
""" Scrape player stats """

# Connect to the page and obtain HTML code. The `with` block closes the HTTP
# response as soon as the document has been parsed.
scrapeUrl = "https://understat.com/league/EPL"
with urlopen(scrapeUrl) as pageConnect:
    pageHTML = BeautifulSoup(pageConnect, "html.parser")

# Obtain data string for Player Data in the league. The script tag is chosen
# by its position in the page.
# NOTE(review): index 3 depends on the page layout - confirm if this breaks.
listElement = 3
jsonRawString = pageHTML.find_all(name="script")[listElement].text
# The payload runs from the first backslash up to the first "')".
start_idx = jsonRawString.index("\\")
stop_idx = jsonRawString.index("')")

# Process the string and convert to dataframe: resolve the backslash escape
# sequences into characters, then parse the resulting JSON text.
jsonData = jsonRawString[start_idx:stop_idx]
jsonData = jsonData.encode("utf8").decode("unicode_escape")
tourPlayerData = pd.json_normalize(json.loads(jsonData))

In [None]:
# Preview the first rows of the scraped player table.
tourPlayerData.head()

In [None]:
# (rows, columns) of the scraped player table.
tourPlayerData.shape

In [None]:
# One dataframe per league is collected here and stacked once after the loop.
leagueFrames = []

""" Scrape player stats from all top 5 European Leagues"""

for tourName in top5Leaguesids:
    print("Scraping for ", tourName)
    # Connect to the page and obtain HTML code; the `with` block closes the
    # HTTP response once the document has been parsed.
    scrapeUrl = "https://understat.com/league/" + tourName
    with urlopen(scrapeUrl) as pageConnect:
        pageHTML = BeautifulSoup(pageConnect, "html.parser")

    # Obtain data string for Player Data in the league. The script tag is
    # chosen by its position in the page.
    # NOTE(review): index 3 depends on the page layout - confirm if this breaks.
    listElement = 3
    jsonRawString = pageHTML.find_all(name="script")[listElement].text
    start_idx = jsonRawString.index("\\")
    stop_idx = jsonRawString.index("')")

    # Process the string and convert to dataframe:
    jsonData = jsonRawString[start_idx:stop_idx]
    jsonData = jsonData.encode("utf8").decode("unicode_escape")
    tourPlayerData = pd.json_normalize(json.loads(jsonData))
    # Tag every row with the league it came from.
    tourPlayerData.insert(0, "tourName", tourName)
    print(tourPlayerData.shape)

    leagueFrames.append(tourPlayerData)

# Stack the per-league tables. DataFrame.append was removed in pandas 2.0;
# pd.concat keeps each frame's own row labels, exactly as append did.
finalPlayerData = pd.concat(leagueFrames) if leagueFrames else pd.DataFrame()

In [None]:
# Inspect the last rows of the combined table.
finalPlayerData.tail()

In [None]:
# Distinct league identifiers present in the combined table.
finalPlayerData["tourName"].unique()

In [None]:
# (rows, columns) of the combined table.
finalPlayerData.shape

In [None]:
# Preview only: the re-indexed frame is returned but not assigned, so
# finalPlayerData itself keeps its current row labels.
finalPlayerData.reset_index(drop=True)

In [None]:
# Replace the row labels with a fresh 0..n-1 range, discarding the old ones.
finalPlayerData = finalPlayerData.reset_index(drop=True)

In [None]:
# Persist the combined table as CSV, without writing the row labels.
outputCsvPath = "../data/understatDataTop5Leagues.csv"
finalPlayerData.to_csv(outputCsvPath, index=False)

In [None]:
# Distinct position strings that contain the letter "S".
positionHasS = finalPlayerData["position"].str.contains("S")
finalPlayerData.loc[positionHasS, "position"].unique()

In [None]:
# Rows whose position string contains "GK S".
positionHasGkS = finalPlayerData["position"].str.contains("GK S")
finalPlayerData[positionHasGkS]

In [None]:
# Rows whose position is exactly "S" and whose games count exceeds 10.
# At this point in the notebook the scraped columns are still text, so the
# games count is converted before comparing: comparing against the string "10"
# would be lexicographic (e.g. "9" > "10" is True, "100" > "10" only by length).
gamesPlayed = pd.to_numeric(finalPlayerData["games"])
finalPlayerData[(finalPlayerData["position"] == "S") & (gamesPlayed > 10)]

# Data Cleaning

In [None]:
# Print the column summary (dtypes, non-null counts) of the combined table.
finalPlayerData.info()

In [None]:
def _toNumericIfPossible(column):
    """Return *column* as a numeric Series, or unchanged if it cannot be converted.

    Explicit replacement for ``pd.to_numeric(..., errors="ignore")``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Preview only: show the dtypes the table would have after conversion.
finalPlayerData.apply(_toNumericIfPossible).info()

In [None]:
def _toNumericIfPossible(column):
    """Return *column* as a numeric Series, or unchanged if it cannot be converted.

    Explicit replacement for ``pd.to_numeric(..., errors="ignore")``, which is
    deprecated in recent pandas releases.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that parses as numbers; other columns are kept as-is.
finalPlayerData = finalPlayerData.apply(_toNumericIfPossible)

In [None]:
# Rows whose position is "S" or "F S" and whose games count exceeds 10.
positionMatches = finalPlayerData["position"].isin(["S", "F S"])
moreThanTenGames = finalPlayerData["games"] > 10
finalPlayerData[positionMatches & moreThanTenGames]

# Feature Engineering

## Per90 Columns

In [None]:
# List the column names currently in the table.
finalPlayerData.columns

In [None]:
# Columns for which a "<name>Per90" companion column is derived.
colsForPer90 = [
    "goals",
    "xG",
    "assists",
    "xA",
    "shots",
    "key_passes",
    "npg",
    "npxG",
    "xGChain",
    "xGBuildup",
]

In [None]:
# For each listed stat add "<stat>Per90" = stat / time * 90, rounded to 1 dp.
minutesPlayed = finalPlayerData["time"]
for statName in colsForPer90:
    per90Values = finalPlayerData[statName] / minutesPlayed * 90
    finalPlayerData[statName + "Per90"] = per90Values.round(1)

In [None]:
# Preview the first rows, now including the Per90 columns.
finalPlayerData.head()

## `discipline` Column

In [None]:
# Card score: each yellow card counts 1 and each red card counts 3.
yellowPoints = finalPlayerData["yellow_cards"] * 1
redPoints = finalPlayerData["red_cards"] * 3
finalPlayerData["discipline"] = (yellowPoints + redPoints).round(1)

In [None]:
# Name and card counts of players whose discipline score is above 10.
disciplineAboveTen = finalPlayerData["discipline"] > 10
finalPlayerData.loc[disciplineAboveTen, ["player_name", "yellow_cards", "red_cards"]]

In [None]:
# Preview the first values of the discipline column.
finalPlayerData["discipline"].head()

## `goalContributions` Column

In [None]:
# goals + assists, once on the raw columns and once on their Per90 versions.
for suffix in ("", "Per90"):
    finalPlayerData["gContri" + suffix] = (
        finalPlayerData["goals" + suffix] + finalPlayerData["assists" + suffix]
    )

In [None]:
# xG + xA, once on the raw columns and once on their Per90 versions.
for suffix in ("", "Per90"):
    finalPlayerData["xgContri" + suffix] = (
        finalPlayerData["xG" + suffix] + finalPlayerData["xA" + suffix]
    )

In [None]:
# shots + key_passes, once on the raw columns and once on their Per90 versions.
for suffix in ("", "Per90"):
    finalPlayerData["shotsContri" + suffix] = (
        finalPlayerData["shots" + suffix] + finalPlayerData["key_passes" + suffix]
    )

In [None]:
# npg + assists, once on the raw columns and once on their Per90 versions.
for suffix in ("", "Per90"):
    finalPlayerData["npgContri" + suffix] = (
        finalPlayerData["npg" + suffix] + finalPlayerData["assists" + suffix]
    )

In [None]:
# npxG + xA, once on the raw columns and once on their Per90 versions.
for suffix in ("", "Per90"):
    finalPlayerData["npxgContri" + suffix] = (
        finalPlayerData["npxG" + suffix] + finalPlayerData["xA" + suffix]
    )

In [None]:
# Display the goals-plus-assists column.
finalPlayerData["gContri"]

# Who will Replace Harry Kane at Spurs?

## Finding Similar Players

In [None]:
# List the column names, including the engineered feature columns.
finalPlayerData.columns

### Attacking Rating System

**Parameters for Rating**
1. gContriPer90
1. xgContriPer90
1. shotsContriPer90
1. npgContriPer90
1. npxgContriPer90
1. xGChainPer90
1. xGBuildupPer90

In [None]:
# Weighted sum of per-90 contribution columns, given as (column, weight) pairs.
# NOTE(review): the parameter list above this cell also names xGChainPer90 and
# xGBuildupPer90, which are not part of this formula - confirm that is intended.
ratingTerms = [
    ("gContriPer90", 2),
    ("xgContriPer90", 2),
    ("shotsContriPer90", 1.5),
    ("npgContriPer90", 3),
    ("npxgContriPer90", 3),
]

# Accumulate left to right, starting from the first weighted column.
firstColumn, firstWeight = ratingTerms[0]
weightedTotal = finalPlayerData[firstColumn] * firstWeight
for columnName, weight in ratingTerms[1:]:
    weightedTotal = weightedTotal + finalPlayerData[columnName] * weight

finalPlayerData["attRating"] = weightedTotal

In [None]:
# The 25 largest attRating values.
finalPlayerData["attRating"].nlargest(25)

In [None]:
# Name, team and rating for the 25 rows with the largest attRating.
topRatedIndex = finalPlayerData["attRating"].nlargest(25).index
finalPlayerData.loc[topRatedIndex, ["player_name", "team_title", "attRating"]]

### Filter Players

In [None]:
""" Only consider players who have played >900 mins (equivalent to 10 matches) """

# Keep rows with at least 900 minutes; the comparison is inclusive (>= 900).
# The result is an independent copy rather than a view of finalPlayerData.
playedEnoughMinutes = finalPlayerData["time"] >= 900
finalPlayerDataFiltered = finalPlayerData[playedEnoughMinutes].copy(deep=True)

In [None]:
# Name, team and rating for the 25 highest attRating rows of the filtered table.
topRatedIndex = finalPlayerDataFiltered["attRating"].nlargest(25).index
finalPlayerDataFiltered.loc[topRatedIndex, ["player_name", "team_title", "attRating"]]

In [None]:
# Name, team and rating for players whose name contains "Dominic".
nameHasDominic = finalPlayerDataFiltered["player_name"].str.contains("Dominic")
finalPlayerDataFiltered.loc[nameHasDominic, ["player_name", "team_title", "attRating"]]

### Level the Playing Field

In [None]:
# Preview only: build (without assigning) a two-column table holding one
# numeric value per league identifier.
pd.DataFrame([["EPL", 23.928],
              ["La_Liga", 19.375],
              ["Serie_A", 16.285],
              ["Bundesliga", 15.214],
              ["Ligue_1", 7.916],])

In [None]:
# One [league identifier, value] row per league; columns keep pandas' default
# integer labels here.
leagueValueRows = [
    ["EPL", 23.928],
    ["La_Liga", 19.375],
    ["Serie_A", 16.285],
    ["Bundesliga", 15.214],
    ["Ligue_1", 7.916],
]
leaguePointsUEFA = pd.DataFrame(leagueValueRows)

In [None]:
# Display the league table.
leaguePointsUEFA

In [None]:
# Label the two columns; "tourName" matches the key column of the player table.
uefaColumnNames = ["tourName", "uefaPoints"]
leaguePointsUEFA.columns = uefaColumnNames

In [None]:
# Preview only: left-join the league values onto the player rows by tourName;
# the result is displayed but not assigned.
pd.merge(finalPlayerDataFiltered, leaguePointsUEFA,
         how="left", on="tourName")

In [None]:
# Left-join the league values onto every player row, matching on tourName.
finalPlayerDataFiltered = finalPlayerDataFiltered.merge(
    leaguePointsUEFA,
    how="left",
    on="tourName",
)

In [None]:
# Preview only: attRating multiplied element-wise by uefaPoints.
finalPlayerDataFiltered["attRating"].multiply(finalPlayerDataFiltered["uefaPoints"])

In [None]:
# League-adjusted rating: attRating times the league's uefaPoints value.
finalPlayerDataFiltered["adjAttRating"] = (
    finalPlayerDataFiltered["attRating"] * finalPlayerDataFiltered["uefaPoints"]
)

In [None]:
# Smallest adjusted rating in the filtered table.
finalPlayerDataFiltered["adjAttRating"].min()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scaler that maps a column linearly onto the interval [1, 10].
scaler = MinMaxScaler(feature_range=(1, 10))

In [None]:
# Preview only: scale adjAttRating, passed as a single-column 2-D array;
# the result is displayed but not stored.
scaler.fit_transform(np.array(finalPlayerDataFiltered["adjAttRating"]).reshape(-1, 1))

In [1]:
# Overwrite adjAttRating with its scaled version. The scaler is given a
# single-column 2-D array, hence the reshape(-1, 1).
rawAdjRatings = finalPlayerDataFiltered["adjAttRating"].to_numpy().reshape(-1, 1)
finalPlayerDataFiltered["adjAttRating"] = scaler.fit_transform(rawAdjRatings)

NameError: name 'scaler' is not defined

In [None]:
# Name, team and both ratings for the 25 rows with the largest adjAttRating.
topAdjustedIndex = finalPlayerDataFiltered["adjAttRating"].nlargest(25).index
finalPlayerDataFiltered.loc[
    topAdjustedIndex,
    ["player_name", "team_title", "attRating", "adjAttRating"],
]

In [2]:
""" Possible Replacements """

# Player names compared against Harry Kane; each is matched exactly against
# the player_name column.
possibleReplacements = [
    "Luis Muriel",
    "Kelechi Iheanacho",
    "Rodrigo",
    "Patrick Bamford",
    "Gerard Moreno",
    "Timo Werner",
]

## Comparison of Replacements with Kane

### Goal Contributions

In [None]:
# List the column names of the filtered table.
finalPlayerDataFiltered.columns

In [None]:
# Columns used as the comparison parameters on the charts.
finalAttParams = [
    "goalsPer90",
    "xGPer90",
    "assistsPer90",
    "xAPer90",
    "npgPer90",
    "npxGPer90",
    "adjAttRating",
]

In [None]:
# Comparison-parameter values for the row(s) named "Harry Kane", as nested lists.
isHarryKane = finalPlayerDataFiltered["player_name"] == "Harry Kane"
finalPlayerDataFiltered.loc[isHarryKane, finalAttParams].values.tolist()

### Setting up parameters

In [None]:
# Parameter list used for every comparison chart (same list object as
# finalAttParams).
params = finalAttParams

# Harry Kane's values for those parameters, rounded to 1 dp and flattened to 1-D.
isHarryKane = finalPlayerDataFiltered["player_name"] == "Harry Kane"
kaneRows = finalPlayerDataFiltered.loc[isHarryKane, params]
values = kaneRows.round(1).values.flatten()

In [None]:
# Display Harry Kane's rounded parameter values.
values

### Comparative Analysis with each possible replacement

In [None]:
# Draw one comparison pizza chart per candidate: Harry Kane's `values` against
# the candidate's values for the same `params`.

# The axis ranges depend only on the filtered table, so compute them once
# instead of on every iteration.
minRange = finalPlayerDataFiltered[params].min().tolist()
maxRange = finalPlayerDataFiltered[params].max().tolist()

for replacement in possibleReplacements:

    replacementRows =\
        finalPlayerDataFiltered.loc[finalPlayerDataFiltered["player_name"] == replacement,
                                    params]

    # A name that matches no row (or several rows) would give a value array
    # whose length differs from `params`; report it and move on.
    if len(replacementRows) != 1:
        print("Skipping", replacement,
              "- expected exactly one matching row, found", len(replacementRows))
        continue

    values_2 = replacementRows.round(1).values.flatten()

    # instantiate PyPizza class
    baker = mplsoccer.PyPizza(
        params=params,                  # list of parameters
        min_range=minRange,
        max_range=maxRange,
        background_color="#383838",     # background color
        straight_line_color="#222222",  # color for straight lines
        straight_line_lw=1,             # linewidth for straight lines
        last_circle_lw=1,               # linewidth of last circle
        last_circle_color="#222222",    # color of last circle
        other_circle_ls="-.",           # linestyle for other circles
        other_circle_lw=1               # linewidth for other circles
    )

    # plot pizza
    fig, ax = baker.make_pizza(
        values,                     # list of values of Harry Kane
        compare_values=values_2,    # comparison values of replacement player
        figsize=(8, 8),             # adjust figsize according to your need
        kwargs_slices=dict(
            facecolor="#1A78CF", edgecolor="#222222",
            zorder=2, linewidth=1
        ),                          # values to be used when plotting slices
        kwargs_compare=dict(
            facecolor="#FF9300", edgecolor="#222222",
            zorder=2, linewidth=1,
        ),                          # values to be used when plotting comparison slices
        kwargs_params=dict(
            color="#ffffff", fontsize=12,
            va="center"
        ),                          # values to be used when adding parameter
        kwargs_values=dict(
            color="#000000", fontsize=12,
            zorder=3,
            bbox=dict(
                edgecolor="#000000", facecolor="cornflowerblue",
                boxstyle="round,pad=0.2", lw=1
            )
        ),                          # values to be used when adding parameter-values labels
        kwargs_compare_values=dict(
            color="#000000", fontsize=12, zorder=3,
            bbox=dict(edgecolor="#000000", facecolor="#FF9300", boxstyle="round,pad=0.2", lw=1)
        ),                          # values to be used when adding comparison-values labels
    )

    # adjust the texts
#     params_offset = [False, False, False, False, False, True, False]
#     # to adjust text for comparison-values-text pass adj_comp_values=True
#     baker.adjust_texts(params_offset, offset=-0.2)

    # add title; the <...> spans are coloured to match the two players' slices
    fig_text(
        0.515, 0.99, "<Harry Kane> vs <" + replacement + ">",
        size=20, fig=fig,
        highlight_textprops=[{"color": '#1A78CF'}, {"color": '#FF9300'}],
        ha="center", color="#F2F2F2"
    )

    # add subtitle
    fig.text(
        0.515, 0.932,
        "Scouting Goal Feature Comparison | 2020-21",
        size=15,
        ha="center", color="#ffffff"
    )