In [1]:
import pandas as pd
import datetime
from common import Data

In [9]:
df = Data.get_nwsl_matches()
df = df[
    ~df["season"].str.contains("Fall")
    & ~df["season"].str.contains("Challenge")
    & ~df["is_playoffs"]
]
df

Unnamed: 0,date,time,season,home,away,home_score,away_score,home_penalty_score,away_penalty_score,home_xg,away_xg,referee,attendance,stadium,is_forfeit,is_extra_time,is_pks,is_playoffs
0,2013-04-13,19:35,2013,KC,POR,1,1,,,,,Kari Seitz,6784.0,Shawnee Mission District Stadium,False,False,False,False
1,2013-04-14,17:00,2013,CHI,RGN,1,1,,,,,Josh Wilkens,1255.0,Village of Lisle-Benedictine University ...,False,False,False,False
2,2013-04-14,18:00,2013,NJNY,WNY,1,0,,,,,John McCloskey,2611.0,Yurcak Field,False,False,False,False
3,2013-04-14,18:30,2013,BOS,WAS,1,1,,,,,Hernan Aguilar,2634.0,Dilboy Stadium,False,False,False,False
4,2013-04-20,19:00,2013,WAS,WNY,1,1,,,,,Kari Seitz,4569.0,Maureen Hendricks Field at Maryland Socc...,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1272,2023-10-15,14:00,2023,LA,POR,5,1,,,1.5,0.6,Elvis Osmanovic,22000.0,BMO Stadium,False,False,False,False
1273,2023-10-15,16:00,2023,CHI,RGN,0,3,,,0.3,1.9,Anya Voigt,8004.0,SeatGeek Stadium,False,False,False,False
1274,2023-10-15,17:00,2023,ORL,HOU,1,0,,,3.1,1.2,Natalie Simon,8504.0,Exploria Stadium,False,False,False,False
1275,2023-10-15,17:00,2023,WAS,NC,0,1,,,1.0,1.2,Ricardo Fierro,15479.0,Audi Field,False,False,False,False


In [10]:
# Filter to desired columns
def get_points(team, opp):
    if team > opp:
        return 3
    elif team < opp:
        return 0
    else:
        return 1


home = df.copy()
home["points"] = home.apply(
    lambda x: get_points(x["home_score"], x["away_score"]), axis=1
)
home = home[["date", "home", "season", "points"]]

away = df.copy()
away["points"] = away.apply(
    lambda x: get_points(x["away_score"], x["home_score"]), axis=1
)
away = away[["date", "home", "season", "points"]]
df = pd.concat([home, away])
df = df.rename(columns={"home": "team"})
df

Unnamed: 0,date,team,season,points
0,2013-04-13,KC,2013,1
1,2013-04-14,CHI,2013,1
2,2013-04-14,NJNY,2013,3
3,2013-04-14,BOS,2013,1
4,2013-04-20,WAS,2013,1
...,...,...,...,...
1272,2023-10-15,LA,2023,0
1273,2023-10-15,CHI,2023,3
1274,2023-10-15,ORL,2023,0
1275,2023-10-15,WAS,2023,3


In [11]:
def cumulative_sum(lists):
    cu_list = []
    length = len(lists)
    cu_list = [sum(lists[0:x:1]) for x in range(0, length + 1)]
    return cu_list[1:]


def get_points(team, season):
    subset = df[(df["team"] == team) & (df["season"] == season)]
    return cumulative_sum(list(subset["points"]))

matches_per_season = {
    "2013": 22,
    "2014": 24,
    "2015": 20,
    "2016": 20,
    "2017": 24,
    "2018": 24,
    "2019": 24,
    "2021": 24,
    "2022": 22,
    "2023": 22,
}

# Aggregate by team, season
results = pd.DataFrame([], columns=["team", "season", "points"])
for i, row in df.groupby(["team", "season"]).count().reset_index().iterrows():
    points = get_points(row["team"], row["season"])
    if len(points) != matches_per_season[str(row["season"])]:
        raise Error("incorrect number of matches")
    results.loc[len(results.index)] = [
        row["team"],
        row["season"],
        points,
    ]

results = results.sort_values(["season", "team"])

  return array(a, dtype, copy=False, order=order)


In [8]:
# Write to a file
results.to_csv("final.csv", index=False)