In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd

import re

import numpy as np



### The first thing we need to do is get a list of all the match urls so we can access each one individually

In [37]:
# set some options for the scraper so it doesn't pull up the chrome tab every time
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# base url that each individual match's information in appended on to 
base_url = "https://www.fotmob.com/leagues/130/matches/mls?group=by-date&page="
match_urls = []

# acitvate the driver 
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [38]:
# fotmob has multiplee pages for each season, and these are the ones with the desired matches  
for page in range(1, 29):  # this is MW 11-29 (sort of; the pages don't align so you miss a few or have a few extra)
    driver.get(base_url + str(page))
    
    # Wait for links to load and then information is present on the page
    WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
    )
    
    # if the link on the page has 'matches' in it, it gets read as a match link 
    links = driver.find_elements(By.TAG_NAME, "a")
    page_urls = [a.get_attribute("href") for a in links if "/matches/" in a.get_attribute("href")]
    match_urls.extend(page_urls)

driver.quit()

In [39]:
# there are urls for the whole page (not individual matches) that have 130 in then (the MLS league ID on fotmob)
# so we can get rid of these because they aren't actual games 

match_urls = [x for x in match_urls if '130/matches' not in x]
len(match_urls)

390

In [2]:
match_urls

### 2025 VAR Events loop

### This one goes through every match url and takes the VAR information from the commentary 

In [41]:
# set options
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# initiate driver
driver_all = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# initiate some blank lists that will have things added on to them 
var_all = []
var_records = []

for url in match_urls:
    MLS_url = url + ":tab=ticker"  # this is on the end of the urls when you're on the commentary page on fotmob
    driver_all.get(MLS_url)

    # wait for ticker content to fully load, then use the CSS selector to find the commentary 
    try:
        WebDriverWait(driver_all, 10).until(
            lambda d: any(e.text.strip() != "" for e in 
                          d.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly"))
        )
    except:
        print(f"⚠️ No ticker text loaded for {MLS_url}")
        continue   # only skip if wait fails

    # grab team names from the title of the web page 
    title = driver_all.title
    if " vs " in title:
        home_team = title.split(" vs ")[0]
        away_team = title.split(" vs ")[1].split(" -")[0]   

    # grab commentary entries using the CSS selectors that i found by inspecting the web page 
    commentary_blocks = driver_all.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly")
    minute_blocks = driver_all.find_elements(By.CSS_SELECTOR, ".css-1c3t9g9-EventTimeMain")

    # refs are a little tougher because the text is technically inside a box that has an image in it as well 
    try:
        referee_elem = driver_all.find_element(By.CSS_SELECTOR, ".css-1q2gy5n-RevereeCSS")
        referee = referee_elem.get_attribute("innerText")
    except:
        referee = "NA"
    

    # initialize empty scores so we can keep track of them 
    score_pattern = re.compile(r"(\d+)")
    current_home_score = None
    current_away_score = None

    for i, entry in enumerate(commentary_blocks):
        text = entry.text.strip()
        
        # update score whenever a commentary line contains a score update
        if "," in text and re.search(r"\d", text):  # crude filter for score lines
            numbers = score_pattern.findall(text)
            if len(numbers) >= 2:  # last two numbers = home, away
                current_home_score = int(numbers[-2])
                current_away_score = int(numbers[-1])
        
        # get the minute of the VAR event
        if "VAR" in text:
            minute = minute_blocks[i].text if i < len(minute_blocks) else None

            # get the 3 events before the VAR event in the commentary 
            prev1 = commentary_blocks[i+1].text.strip() if i+1 >= 0 else None
            prev2 = commentary_blocks[i+2].text.strip() if i+2 >= 0 else None
            prev3 = commentary_blocks[i+3].text.strip() if i+3 >= 0 else None

            record = {
                "home_team": home_team,
                "away_team": away_team,
                "minute": minute,
                "var_text": text,
                "prev1": prev1,
                "prev2": prev2,
                "prev3": prev3,
                "home_score_at_time": current_home_score,
                "away_score_at_time": current_away_score,
                "referee": referee
                # "match_url": MLS_url
            }
            print(record)
            var_records.append(record)
            var_all.append(text)

driver_all.quit()
# print(var_all)

{'home_team': 'New England Revolution', 'away_team': 'Columbus Crew', 'minute': '90’', 'var_text': 'VAR Decision: No Penalty New England Revolution.', 'prev1': 'Attempt blocked. Carles Gil (New England Revolution) left footed shot from a difficult angle and long range on the right is blocked.', 'prev2': 'Foul by Malte Amundsen (Columbus Crew).', 'prev3': 'Luis Díaz (New England Revolution) wins a free kick on the right wing.', 'home_score_at_time': 0, 'away_score_at_time': 1, 'referee': 'Filip Dujic'}
{'home_team': 'Philadelphia Union', 'away_team': 'FC Cincinnati', 'minute': '33’', 'var_text': 'VAR Decision: Goal Philadelphia Union 2-0 FC Cincinnati (Tai Baribo).', 'prev1': 'Attempt missed. Quinn Sullivan (Philadelphia Union) left footed shot from the centre of the box is too high.', 'prev2': 'Foul by Evander (FC Cincinnati).', 'prev3': 'Tai Baribo (Philadelphia Union) wins a free kick in the defensive half.', 'home_score_at_time': 2, 'away_score_at_time': 0, 'referee': 'Allen Chapman

In [3]:
# make into a data frame 
var_df = pd.DataFrame(var_records)

In [43]:
# get rid of repeats by only including the instance of each event that has this string in the commentary 
# there's exactly one entry in the commentary with 'VAR Decision' for each VAR event
var_df = var_df[var_df['var_text'].str.contains('VAR Decision')]

In [44]:
# get the first 3 words after VAR decision, since in most cases that tells you enough information
# Ex. No penalty Seattle

# test_merge["var_decision"] = test_merge["var_text"].str.extract(r"(VAR Decision: \w+(?: \w+){0,2})")
var_df["var_decision"] = var_df["var_text"].str.extract(r"(VAR Decision: \w+(?: \w+){0,2})")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df["var_decision"] = var_df["var_text"].str.extract(r"(VAR Decision: \w+(?: \w+){0,2})")


In [47]:
# time to make some boolean columns that give us more information on the type of review 
var_df_final = var_df

# this tell us if the event was awarded and which of the 3 match altering events it was 
var_df_final['Not Given'] = var_df_final['var_decision'].str.contains("No")
var_df_final['Penalty'] = var_df_final['var_decision'].str.contains("Penalty")
var_df_final['Card'] = var_df_final['var_decision'].str.contains("Card")
var_df_final['Goal'] = var_df_final['var_decision'].str.contains("Goal")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df_final['Not Given'] = var_df_final['var_decision'].str.contains("No")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df_final['Penalty'] = var_df_final['var_decision'].str.contains("Penalty")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df_final['Card'] = var_df_final['var_decision'

In [48]:
# get rid of the ' on the minute 

var_df_final['minute'] = var_df_final['minute'].str.replace("’", "")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df_final['minute'] = var_df_final['minute'].str.replace("’", "")


In [49]:
# make the minute column numeric 

var_df_final['minute'] = pd.to_numeric(var_df_final['minute'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  var_df_final['minute'] = pd.to_numeric(var_df_final['minute'])


In [4]:
# seeing if var reviews came from set pieces by searching the previous events for some key words
# pattern for case-insensitive match of any keyword

pattern = r"(free kick|corner kick|goal kick|throw in)"

# check across multiple columns
var_df_final["Set Piece"] = var_df_final[["prev1", "prev2", "prev3"]].apply(
    lambda row: row.str.contains(pattern, flags=re.IGNORECASE, na=False).any(),
    axis=1
)


In [5]:
# these will help us start to make the gamestate column by indicating which team is winning
# at the time of the decision 
# it goes through and compares the scores and then decides who's in the lead 

var_df_final['home_score_at_time'] = pd.to_numeric(var_df_final['home_score_at_time'])
var_df_final['away_score_at_time'] = pd.to_numeric(var_df_final['away_score_at_time'])

conditions = [
    var_df_final['home_score_at_time'] > var_df_final['away_score_at_time'],
    var_df_final['home_score_at_time'] == var_df_final['away_score_at_time'],
    var_df_final['home_score_at_time'] < var_df_final['away_score_at_time']
]

choices = ['Home', 'Tied', 'Away']

var_df_final['Winning'] = np.select(conditions, choices)

In [52]:
# save the final data frame 

var_df_final.to_csv('/Users/arthurlennard//Desktop/MLS Data Science/var_2025_new.csv')

### Let's try to get some 2024 data (exact same process)

In [22]:
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Example league page
base_url_24 = "https://www.fotmob.com/leagues/130/matches/mls?season=2024&group=by-date&page="
match_urls_24 = []

driver_24 = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [23]:
for page in range(1, 37): 
    driver_24.get(base_url_24 + str(page))
    
    # Wait for links to load
    WebDriverWait(driver_24, 20).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
    )
    
    links = driver_24.find_elements(By.TAG_NAME, "a")
    page_urls = [a.get_attribute("href") for a in links if "/matches/" in a.get_attribute("href")]
    match_urls_24.extend(page_urls)

driver_24.quit()

In [132]:
match_urls_24

['https://www.fotmob.com/leagues/130/matches/mls?season=2024&group=by-date&page=34#main-content',
 'https://www.fotmob.com/leagues/130/matches/mls?season=2024',
 'https://www.fotmob.com/matches/new-york-city-fc-vs-nashville-sc/dmztmw7m#4387029',
 'https://www.fotmob.com/matches/fc-dallas-vs-portland-timbers/mnrhsos#4387036',
 'https://www.fotmob.com/matches/columbus-crew-vs-new-england-revolution/1b4jcj#4386785',
 'https://www.fotmob.com/matches/vancouver-whitecaps-vs-los-angeles-fc/8t3ytgca#4387019',
 'https://www.fotmob.com/matches/cf-montreal-vs-new-york-city-fc/36ydg5bv#4387317',
 'https://www.fotmob.com/matches/chicago-fire-fc-vs-nashville-sc/5fcjuiad#4387284',
 'https://www.fotmob.com/matches/dc-united-vs-charlotte-fc/ban4d2jn#4387285',
 'https://www.fotmob.com/matches/new-england-revolution-vs-inter-miami-cf/5yx5j0uy#4387275',
 'https://www.fotmob.com/matches/columbus-crew-vs-new-york-red-bulls/1ams94#4387287',
 'https://www.fotmob.com/matches/orlando-city-vs-atlanta-united/6xaa

In [24]:
match_urls_24 = [x for x in match_urls_24 if '130/matches' not in x]
len(match_urls_24)

492

In [25]:
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver_all_24 = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

var_all_24 = []
var_records_24 = []

for url in match_urls_24:  # change back to match when you want to do the whole thing
    MLS_url = url + ":tab=ticker"
    driver_all_24.get(MLS_url)

    # ✅ Wait for ticker content to fully load
    try:
        WebDriverWait(driver_all_24, 10).until(
            lambda d: any(e.text.strip() != "" for e in 
                          d.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly"))
        )
    except:
        print(f"⚠️ No ticker text loaded for {MLS_url}")
        continue   # ✅ only skip if wait fails

    # ✅ Grab team names
    title = driver_all_24.title
    if " vs " in title:
        home_team = title.split(" vs ")[0]
        away_team = title.split(" vs ")[1].split(" -")[0]   

    # ✅ Grab commentary entries
    commentary_blocks = driver_all_24.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly")
    minute_blocks = driver_all_24.find_elements(By.CSS_SELECTOR, ".css-1c3t9g9-EventTimeMain")

    try:
        referee_elem = driver_all_24.find_element(By.CSS_SELECTOR, ".css-1q2gy5n-RevereeCSS")
        referee = referee_elem.get_attribute("innerText")
    except:
        referee = "NA"

    score_pattern = re.compile(r"(\d+)")
    current_home_score = None
    current_away_score = None

    for i, entry in enumerate(commentary_blocks):
        text = entry.text.strip()
        
        # ✅ Update score whenever a commentary line contains a score update
        if "," in text and re.search(r"\d", text):  # crude filter for score lines
            numbers = score_pattern.findall(text)
            if len(numbers) >= 2:  # last two numbers = home, away
                current_home_score = int(numbers[-2])
                current_away_score = int(numbers[-1])
        
        if "VAR" in text:
            minute = minute_blocks[i].text if i < len(minute_blocks) else None

            prev1 = commentary_blocks[i+1].text.strip() if i+1 < len(commentary_blocks) else None
            prev2 = commentary_blocks[i+2].text.strip() if i+2 < len(commentary_blocks) else None
            prev3 = commentary_blocks[i+3].text.strip() if i+3 < len(commentary_blocks) else None

            record = {
                "home_team": home_team,
                "away_team": away_team,
                "minute": minute,
                "var_text": text,
                "prev1": prev1,
                "prev2": prev2,
                "prev3": prev3,
                "home_score_at_time": current_home_score,
                "away_score_at_time": current_away_score,
                "referee": referee
                # "match_url": MLS_url
            }
            print(record)
            var_records_24.append(record)
            var_all_24.append(text)

driver_all_24.quit()
# print(var_all_24)

{'home_team': 'FC Cincinnati', 'away_team': 'Toronto FC', 'minute': '5’', 'var_text': 'VAR Decision: No Penalty Toronto FC.', 'prev1': 'Foul by Obinna Nwobodo (FC Cincinnati).', 'prev2': 'Richie Laryea (Toronto FC) wins a free kick on the left wing.', 'prev3': 'Luciano Acosta (FC Cincinnati) wins a free kick in the attacking half.', 'home_score_at_time': 0, 'away_score_at_time': 0, 'referee': 'Jonathan Weiner'}
{'home_team': 'Nashville SC', 'away_team': 'New York Red Bulls', 'minute': '25’', 'var_text': 'VAR Decision: No Red Card Sean Davis (Nashville SC).', 'prev1': 'Delay over. They are ready to continue.', 'prev2': 'Delay in match (New York Red Bulls).', 'prev3': 'Peter Stroud (New York Red Bulls) wins a free kick in the attacking half.', 'home_score_at_time': 0, 'away_score_at_time': 0, 'referee': 'Cristian Campo'}
{'home_team': 'Real Salt Lake', 'away_team': 'Los Angeles FC', 'minute': '84’', 'var_text': 'VAR Decision: No Red Card Braian Ojeda (Real Salt Lake).', 'prev1': 'Delay i

In [26]:
var_df_24 = pd.DataFrame(var_records_24)

In [27]:
var_df_24 = var_df_24[var_df_24['var_text'].str.contains('VAR Decision')]

In [29]:
var_df_24["var_decision"] = var_df_24["var_text"].str.extract(r"(VAR Decision: \w+(?: \w+){0,2})")

In [30]:
var_df_24['Not Given'] = var_df_24['var_decision'].str.contains("No")
var_df_24['Penalty'] = var_df_24['var_decision'].str.contains("Penalty")
var_df_24['Card'] = var_df_24['var_decision'].str.contains("Card")
var_df_24['Goal'] = var_df_24['var_decision'].str.contains("Goal")

In [6]:
var_df_24["Set Piece"] = var_df_24[["prev1", "prev2", "prev3"]].apply(
    lambda row: row.str.contains(pattern, flags=re.IGNORECASE, na=False).any(),
    axis=1
)

In [32]:
var_df_24['minute'] = var_df_24['minute'].str.replace("’", "")

In [33]:
conditions_24 = [
    var_df_24['home_score_at_time'] > var_df_24['away_score_at_time'],
    var_df_24['home_score_at_time'] == var_df_24['away_score_at_time'],
    var_df_24['home_score_at_time'] < var_df_24['away_score_at_time']
]

choices = ['Home', 'Tied', 'Away']

var_df_24['Winning'] = np.select(conditions_24, choices)

In [34]:
var_df_24['minute'] = pd.to_numeric(var_df_24['minute'])

In [35]:
var_df_24.to_csv('/Users/arthurlennard//Desktop/MLS Data Science/var_2024.csv')

### Let's try to get some 2023 data (same process again)

In [2]:
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# Example league page
base_url_23 = "https://www.fotmob.com/leagues/130/matches/mls?season=2023&group=by-date&page="
match_urls_23 = []

driver_23 = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

In [3]:
for page in range(1, 37): 
    driver_23.get(base_url_23 + str(page))
    
    # Wait for links to load
    WebDriverWait(driver_23, 20).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, "a"))
    )
    
    links = driver_23.find_elements(By.TAG_NAME, "a")
    page_urls = [a.get_attribute("href") for a in links if "/matches/" in a.get_attribute("href")]
    match_urls_23.extend(page_urls)

driver_23.quit()

In [4]:
match_urls_23

['https://www.fotmob.com/leagues/130/matches/mls?season=2023&group=by-date&page=1#main-content',
 'https://www.fotmob.com/leagues/130/matches/mls?season=2023',
 'https://www.fotmob.com/matches/colorado-rapids-vs-seattle-sounders-fc/4f3iofg#4084677',
 'https://www.fotmob.com/matches/sporting-kansas-city-vs-portland-timbers/motubfd#4084686',
 'https://www.fotmob.com/matches/portland-timbers-vs-los-angeles-fc/8t3y49q7#4084676',
 'https://www.fotmob.com/matches/toronto-fc-vs-atlanta-united/4ee8htbc#4084807',
 'https://www.fotmob.com/matches/columbus-crew-vs-dc-united/1bah4k#4084667',
 'https://www.fotmob.com/matches/philadelphia-union-vs-inter-miami-cf/8h29qmsy#4084668',
 'https://www.fotmob.com/matches/new-england-revolution-vs-houston-dynamo-fc/1tk487#4084669',
 'https://www.fotmob.com/matches/new-york-red-bulls-vs-nashville-sc/5fec3a54#4084670',
 'https://www.fotmob.com/matches/orlando-city-vs-fc-cincinnati/695s1t43#4084671',
 'https://www.fotmob.com/matches/cf-montreal-vs-austin-fc/c5h

In [5]:
match_urls_23 = [x for x in match_urls_23 if '130/matches' not in x]
len(match_urls_23)

489

In [8]:
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver_all_23 = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

var_all_23 = []
var_records_23 = []

for url in match_urls_23:  # change back to match when you want to do the whole thing
    MLS_url = url + ":tab=ticker"
    driver_all_23.get(MLS_url)

    # ✅ Wait for ticker content to fully load
    try:
        WebDriverWait(driver_all_23, 10).until(
            lambda d: any(e.text.strip() != "" for e in 
                          d.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly"))
        )
    except:
        print(f"⚠️ No ticker text loaded for {MLS_url}")
        continue   # ✅ only skip if wait fails

    # ✅ Grab team names
    title = driver_all_23.title
    if " vs " in title:
        home_team = title.split(" vs ")[0]
        away_team = title.split(" vs ")[1].split(" -")[0]   

    # ✅ Grab commentary entries
    commentary_blocks = driver_all_23.find_elements(By.CSS_SELECTOR, ".css-uf1mns-LiveTickerTextOnly")
    minute_blocks = driver_all_23.find_elements(By.CSS_SELECTOR, ".css-1c3t9g9-EventTimeMain")

    try:
        referee_elem = driver_all_23.find_element(By.CSS_SELECTOR, ".css-1q2gy5n-RevereeCSS")
        referee = referee_elem.get_attribute("innerText")
    except:
        referee = "NA"

    score_pattern = re.compile(r"(\d+)")
    current_home_score = None
    current_away_score = None

    for i, entry in enumerate(commentary_blocks):
        text = entry.text.strip()
        
        # ✅ Update score whenever a commentary line contains a score update
        if "," in text and re.search(r"\d", text):  # crude filter for score lines
            numbers = score_pattern.findall(text)
            if len(numbers) >= 2:  # last two numbers = home, away
                current_home_score = int(numbers[-2])
                current_away_score = int(numbers[-1])
        
        if "VAR" in text:
            minute = minute_blocks[i].text if i < len(minute_blocks) else None

            prev1 = commentary_blocks[i+1].text.strip() if i+1 < len(commentary_blocks) else None
            prev2 = commentary_blocks[i+2].text.strip() if i+2 < len(commentary_blocks) else None
            prev3 = commentary_blocks[i+3].text.strip() if i+3 < len(commentary_blocks) else None

            record = {
                "home_team": home_team,
                "away_team": away_team,
                "minute": minute,
                "var_text": text,
                "prev1": prev1,
                "prev2": prev2,
                "prev3": prev3,
                "home_score_at_time": current_home_score,
                "away_score_at_time": current_away_score,
                "referee": referee
                # "match_url": MLS_url
            }
            print(record)
            var_records_23.append(record)
            var_all_23.append(text)

driver_all_23.quit()
# print(var_all_23)

{'home_team': 'Colorado Rapids', 'away_team': 'Sporting Kansas City', 'minute': '15’', 'var_text': 'VAR Decision: No Goal Colorado Rapids 0-0 Sporting Kansas City.', 'prev1': 'GOAL OVERTURNED BY VAR: Darren Yapi (Colorado Rapids) scores but the goal is ruled out after a VAR review.', 'prev2': 'Offside, Colorado Rapids. Sam Nicholson tries a through ball, but Darren Yapi is caught offside.', 'prev3': 'Attempt saved. Roger Espinoza (Sporting Kansas City) header from outside the box is saved in the centre of the goal. Assisted by Erik Thommy with a cross.', 'home_score_at_time': 0, 'away_score_at_time': 0, 'referee': 'Allen Chapman'}
{'home_team': 'Colorado Rapids', 'away_team': 'Sporting Kansas City', 'minute': '13’', 'var_text': 'GOAL OVERTURNED BY VAR: Darren Yapi (Colorado Rapids) scores but the goal is ruled out after a VAR review.', 'prev1': 'Offside, Colorado Rapids. Sam Nicholson tries a through ball, but Darren Yapi is caught offside.', 'prev2': 'Attempt saved. Roger Espinoza (Sp

In [9]:
var_df_23 = pd.DataFrame(var_records_23)

In [10]:
var_df_23 = var_df_23[var_df_23['var_text'].str.contains('VAR Decision')]

In [11]:
var_df_23["var_decision"] = var_df_23["var_text"].str.extract(r"(VAR Decision: \w+(?: \w+){0,2})")

In [12]:
var_df_23['Not Given'] = var_df_23['var_decision'].str.contains("No")
var_df_23['Penalty'] = var_df_23['var_decision'].str.contains("Penalty")
var_df_23['Card'] = var_df_23['var_decision'].str.contains("Card")
var_df_23['Goal'] = var_df_23['var_decision'].str.contains("Goal")

In [7]:
var_df_23["Set Piece"] = var_df_23[["prev1", "prev2", "prev3"]].apply(
    lambda row: row.str.contains(pattern, flags=re.IGNORECASE, na=False).any(),
    axis=1
)

In [16]:
var_df_23['minute'] = var_df_23['minute'].str.replace("’", "")

In [17]:
conditions_23 = [
    var_df_23['home_score_at_time'] > var_df_23['away_score_at_time'],
    var_df_23['home_score_at_time'] == var_df_23['away_score_at_time'],
    var_df_23['home_score_at_time'] < var_df_23['away_score_at_time']
]

choices = ['Home', 'Tied', 'Away']

var_df_23['Winning'] = np.select(conditions_23, choices)

In [18]:
var_df_23['minute'] = pd.to_numeric(var_df_23['minute'])

In [20]:
var_df_23.to_csv('/Users/arthurlennard//Desktop/MLS Data Science/var_2023.csv')