In [13]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time
import difflib

In [2]:
# Browser-like User-Agent header sent with every request made by this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
    ),
}

# Tournaments to scrape from transfermarkt.com.
# Every page follows the same URL pattern, so each tournament is described by
# its label, the path segment after the host, and the trailing identifier.
# NOTE(review): "World cup" and "Club world cup" resolve to the very same URL
# (fifa-klub-wm / KLUB), and the "Community Shield" label points at the
# uefa-supercup page -- confirm these are the intended competitions.
urls = {
    label: f"https://www.transfermarkt.com/{slug}/elfmeterschiessen/pokalwettbewerb/{code}"
    for label, slug, code in (
        ("Champions league", "uefa-champions-league", "CL"),
        ("Europa league", "europa-league", "EL"),
        ("Community Shield", "uefa-supercup", "USC"),
        ("World cup", "fifa-klub-wm", "KLUB"),
        ("Euro", "europameisterschaft-2024", "EM24"),
        ("Club world cup", "fifa-klub-wm", "KLUB"),
        ("Afcon", "afrika-cup", "AFCN"),
        ("Copa America", "copa-america-2024", "CAM4"),
        ("FA cup", "fa-cup", "FAC"),
        ("AFC Asian Cup", "afc-asian-cup-2023", "AM23"),
        ("Olympic", "olympische-spiele", "OLYM"),
    )
}

In [3]:
def first_shooter(game_url, timeout=30):
    """
    Return the team that started the penalty shootout of a match.

    The match page at ``game_url`` is downloaded, the element with id
    ``sb-elfmeterscheissen`` is located, and the ``title`` of the link inside
    its first ``div.sb-aktion-wappen`` entry is returned.

    Parameters
    ----------
    game_url : str
        URL of the match page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the whole scrape.

    Returns
    -------
    str
        The team name, or ``"Unknown"`` when the page cannot be fetched, does
        not have the expected structure, or the link carries no title.
    """
    try:
        game_res = requests.get(game_url, headers=headers, timeout=timeout)
        game_soap = BeautifulSoup(game_res.content, 'html.parser')
        penalties = game_soap.find(id="sb-elfmeterscheissen")
        teams = penalties.find_all("div", {"class": "sb-aktion-wappen"})
        title = teams[0].find("a").get("title")
    except Exception:
        # Deliberately best-effort: any fetch/parse failure is reported as
        # "Unknown". `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop a long-running scrape.
        return "Unknown"
    # A missing/empty title must not leak out as None: callers filter on the
    # literal "Unknown" to drop matches without a known first shooter.
    return title if title else "Unknown"

In [4]:
def first_shooter_win(home, away, score, first_shooter):
    """
    Tell whether the team that shot first also won the shootout.

    ``score`` is a "<home>:<away>" string; anything following the away number
    after whitespace is ignored. Returns 1 when the side with the higher
    number equals ``first_shooter``; returns 0 otherwise (including a level
    score or a ``first_shooter`` that matches neither team).
    """
    parts = score.split(":")
    home_goals = int(parts[0])
    away_goals = int(parts[1].split()[0])

    # Work out the winner first, then compare it with the first shooter.
    if home_goals > away_goals:
        winner = home
    elif away_goals > home_goals:
        winner = away
    else:
        return 0
    return 1 if winner == first_shooter else 0

In [5]:
# Extracting games information
# Every shootout row of every tournament is collected into one list of records
# and turned into a DataFrame once at the end. Compared with growing `df` via
# pd.concat inside the loop, this avoids repeated copying and yields a unique
# 0..N-1 index instead of per-tournament indices that restart at 0.
base_url = "https://www.transfermarkt.com"
games = []
for tournament, tournament_url in urls.items():
    res = requests.get(tournament_url, headers=headers, timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # that contains no rows.
    res.raise_for_status()
    soap = BeautifulSoup(res.content, 'html.parser')

    # Table rows carry the class "odd" or "even"; odd rows are visited first,
    # then even rows.
    rows = soap.find_all("tr", {"class": "odd"}) + soap.find_all("tr", {"class": "even"})

    for game in rows:
        # Positions 0, 1, 3 and 5 of the row's links hold stage, home team,
        # score and away team respectively.
        game_a = game.find_all("a")
        # Dedicated name so the tournament URL of the outer loop is not shadowed.
        game_url = base_url + game.find("td", class_="zentriert hauptlink").find("a")["href"]
        games.append({
            "tournament": tournament,
            "stage": game_a[0].text,
            "home": game_a[1].text,
            "score": game_a[3].text,
            "away": game_a[5].text,
            "url": game_url,
            # One extra request per match (see first_shooter).
            "first_shooter": first_shooter(game_url),
        })

# Explicit columns keep the expected schema even if nothing was scraped.
df = pd.DataFrame(
    games,
    columns=["tournament", "stage", "home", "score", "away", "url", "first_shooter"],
)

In [6]:
# Keep an independent snapshot of the scraped data before any filtering, so the
# cleaning cells below can be revisited without scraping again.
raw_df = df.copy(deep=True)

In [7]:
# Filtering out the matches with unknown first_shooter
df = df[df['first_shooter'] != 'Unknown']

# Including only the matches played on neutral ground:
#   mask_1 - every match of the tournaments listed below,
#   mask_2 - FA cup matches whose stage is 'Final' or 'Semi-Finals',
#   mask_3 - Champions league / Europa league matches whose stage is 'Final'.
# NOTE(review): which tournaments/stages count as neutral ground is an
# assumption encoded in these lists -- confirm against the competition formats.
neutral_ground_tournaments = ['World cup', 'Euro', 'Club world cup', 'Afcon', 'Copa America',
                              'AFC Asian Cup', 'Olympic', 'Community Shield']

mask_1 = (df['tournament'].isin(neutral_ground_tournaments))
mask_2 = (df['tournament'] == 'FA cup') & (df['stage'].isin(['Final', 'Semi-Finals']))
mask_3 = (df['tournament'].isin(['Champions league', 'Europa league'])) & (df['stage'] == 'Final')

# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells modify it directly instead of raising pandas'
# SettingWithCopyWarning on a filtered slice.
df = df[mask_1 | mask_2 | mask_3].copy()

In [8]:
# Sanity check: (rows, columns) of the frame left after filtering
df.shape

(195, 7)

In [14]:
# Adjusting the teams' names
# Some first_shooter values do not appear among the home/away names. Each such
# value is mapped onto whichever of the two team names of a match it appears in
# is the closest textual match, so it can later be compared with home/away.
teams = set(df['home']) | set(df['away'])
first_shooter_teams = list(df['first_shooter'].unique())

# first_shooter names that match no home/away name
teams_to_handle = [team for team in first_shooter_teams if team not in teams]

teams_to_change = {}
for team in teams_to_handle:
    # The home/away pair of the first match in which this name shot first.
    names = list(df.loc[df['first_shooter'] == team, ['home', 'away']].iloc[0].values)
    # cutoff=0.0 always yields the better of the two candidates. With a positive
    # cutoff the result list can be empty and `[0]` raises IndexError, although
    # the first shooter has to be one of the two teams of its own match.
    teams_to_change[team] = difflib.get_close_matches(team, names, n=1, cutoff=0.0)[0]

# assign() builds a new frame rather than writing into a possibly filtered
# slice, which avoids SettingWithCopyWarning.
df = df.assign(first_shooter=df['first_shooter'].replace(teams_to_change))

In [15]:
# Flag every match with 1 when the first-shooting team won it, otherwise 0
df['First_team_wins'] = df.apply(
    lambda match: first_shooter_win(
        match['home'], match['away'], match['score'], match['first_shooter']
    ),
    axis=1,
)

In [20]:
# Persist the cleaned shootout table as CSV (path is relative to the working
# directory of the notebook)
output_path = "./data/penalty_shootouts.csv"
df.to_csv(output_path)