In [85]:
import datetime
import itertools

import nwslpy
import pandas as pd

In [86]:
# Collect raw data: all matches from nwslpy
matches = nwslpy.load_matches()

# Keep only rows whose season label mentions neither "Fall" nor "Challenge".
is_regular_season = matches["season"].map(
    lambda label: not ("Fall" in label or "Challenge" in label)
)
matches = matches[is_regular_season]

# Team lookup table from nwslpy
teams = nwslpy.load_teams()

In [87]:
# Remove duplicate matches, identified by their index label.
# NOTE(review): presumably these ids are redundant entries for matches that
# also appear under another id — confirm against the nwslpy data.
duplicate_match_ids = [
    "angel-city-fc-vs-chicago-red-stars-2023-06-04",
    "chicago-red-stars-vs-north-carolina-courage-2023-06-09",
]
matches = matches.drop(duplicate_match_ids)

In [88]:
# Transform data into date, team, season, points

# Cutoff date per season label: a match that kicks off later than midnight at
# the start of this date is treated as a playoff game.
playoffs = {
    "2023 Challenge Cup": datetime.date(2023, 8, 7),
    "2023": datetime.date(2023, 10, 21),
    "2022 Challenge Cup": datetime.date(2022, 5, 3),
    "2022": datetime.date(2022, 10, 15),
    "2021 Challenge Cup": datetime.date(2021, 5, 7),
    "2021": datetime.date(2021, 11, 6),
    "2020 Fall Series": datetime.date(2020, 10, 18),
    "2020 Challenge Cup": datetime.date(2020, 7, 16),
    "2019": datetime.date(2019, 10, 19),
    "2018": datetime.date(2018, 9, 14),
    "2017": datetime.date(2017, 10, 6),
    "2016": datetime.date(2016, 9, 29),
}

# Remove playoff games
# Look up each match's cutoff in one vectorised pass instead of a row-wise
# apply, and keep the flag in a local mask rather than a scratch column.
playoff_cutoff = matches["season"].map(playoffs)
unknown_seasons = matches["season"][playoff_cutoff.isna()].unique()
if len(unknown_seasons) > 0:
    # Series.map yields NaN for labels missing from the dict; fail loudly
    # (as a direct dict lookup would) instead of silently keeping those rows.
    raise KeyError(
        f"no playoff cutoff defined for season(s): {list(unknown_seasons)}"
    )
is_playoff = matches["kickoff"] > pd.to_datetime(playoff_cutoff)
matches = matches[~is_playoff]

# Filter to desired columns
matches = matches[
    [
        "home_team_id",
        "away_team_id",
        "kickoff",
        "home_team_score",
        "away_team_score",
        "season",
    ]
]

def get_points(team, opp):
    """Return the points earned from a single match result.

    Args:
        team: Score of the team being credited.
        opp: Score of the opposing team.

    Returns:
        3 if ``team`` is greater than ``opp``, 0 if it is smaller, and 1 in
        every other case.
    """
    if team > opp:
        return 3
    return 0 if team < opp else 1


# Build one frame per side of each match: the same match is scored once from
# the home team's point of view and once from the away team's.
side_frames = []
for team_id_col, own_score_col, opp_score_col in (
    ("home_team_id", "home_team_score", "away_team_score"),
    ("away_team_id", "away_team_score", "home_team_score"),
):
    # Index by this side's team id so the index join pulls in that team's
    # row from `teams` (which supplies team_abbreviation).
    side_frame = matches.set_index(team_id_col).join(teams)
    side_frame["points"] = [
        get_points(own, opp)
        for own, opp in zip(side_frame[own_score_col], side_frame[opp_score_col])
    ]
    side_frames.append(
        side_frame[["kickoff", "team_abbreviation", "season", "points"]]
    )
home, away = side_frames

# Stack both sides, rename columns, reduce kickoff to a calendar date and
# order chronologically.
df = (
    pd.concat([home, away])
    .rename(columns={"kickoff": "date", "team_abbreviation": "team"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.date)
    .sort_values("date")
)
df

Unnamed: 0,date,team,season,points
2,2016-04-16,CHI,2016,0
9,2016-04-16,WAS,2016,3
10,2016-04-16,WNY,2016,3
3,2016-04-16,KC,2016,0
1,2016-04-16,BOS,2016,0
...,...,...,...,...
8,2023-10-15,POR,2023,1
11,2023-10-15,NC,2023,1
13,2023-10-15,KCC,2023,1
15,2023-10-15,LA,2023,1


In [89]:
# Add in fbref data
# NOTE: this cell needs the two-score get_points(team, opp) helper; a later
# cell redefines get_points with a different signature, so re-running this
# cell after that one will fail.
fbref = pd.read_csv("fbref.csv")
fbref = fbref[~fbref["is_playoff"]]

# Score every match once per side, exactly as done for the nwslpy matches.
fbref_sides = []
for team_col, own_score_col, opp_score_col in (
    ("home_team", "home_team_score", "away_team_score"),
    ("away_team", "away_team_score", "home_team_score"),
):
    side_frame = fbref.assign(
        points=[
            get_points(own, opp)
            for own, opp in zip(fbref[own_score_col], fbref[opp_score_col])
        ],
        team=fbref[team_col],
    )
    fbref_sides.append(side_frame[["kickoff", "team", "season", "points"]])
home, away = fbref_sides

df2 = (
    pd.concat([home, away])
    .rename(columns={"kickoff": "date"})
    .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.date)
    .sort_values("date")
)

# NOTE: not idempotent — every execution of this cell appends df2 to df
# again, so re-running it without re-running the cell that builds df
# duplicates the fbref rows.
df = pd.concat([df, df2])

In [91]:
def cumulative_sum(lists):
    """Return the running totals of a sequence of numbers.

    Args:
        lists: Iterable of numbers to accumulate, in order.

    Returns:
        A list with one entry per input element, where entry ``i`` is the
        sum of the first ``i + 1`` inputs. Empty input gives an empty list.
    """
    # Single pass; avoids re-summing a growing slice for every position.
    return list(itertools.accumulate(lists))


def get_points(team, season):
    """Return the cumulative points of one team across one season.

    Reads the module-level ``df`` and keeps the rows whose ``team`` and
    ``season`` columns equal the arguments, in their existing row order.

    NOTE: this definition shadows the earlier two-score
    ``get_points(team, opp)`` helper; cells that rely on that version must
    be run before this one.

    Args:
        team: Value to match against ``df["team"]``.
        season: Value to match against ``df["season"]``.

    Returns:
        A list of running point totals, one entry per matching row.
    """
    is_team = df["team"] == team
    is_season = df["season"] == season
    team_season_points = df[is_team & is_season]["points"]
    return cumulative_sum(list(team_season_points))

# Expected number of matches per team, keyed by season label; used below to
# validate each team/season before it is recorded.
matches_per_season = {
    "2013": 22,
    "2014": 24,
    "2015": 20,
    "2016": 20,
    "2017": 24,
    "2018": 24,
    "2019": 24,
    "2021": 24,
    "2022": 22,
    "2023": 22,
}

# Aggregate by team, season
# Collect plain records and build the frame once, rather than growing it row
# by row with .loc (which assigns a ragged [str, season, list] row each time).
records = []
for team, season in df.groupby(["team", "season"]).size().index:
    points = get_points(team, season)
    expected_matches = matches_per_season[str(season)]
    if len(points) != expected_matches:
        # ValueError instead of the undefined name `Error`, which would have
        # surfaced as a NameError and hidden the real problem.
        raise ValueError(
            f"incorrect number of matches for {team} {season}: "
            f"expected {expected_matches}, got {len(points)}"
        )
    records.append({"team": team, "season": season, "points": points})

results = pd.DataFrame(records, columns=["team", "season", "points"])
results = results.sort_values(["season", "team"])

  return array(a, dtype, copy=False, order=order)


In [92]:
# Write the aggregated results to a CSV file, omitting the row index
output_path = "final.csv"
results.to_csv(output_path, index=False)