In [11]:
import pandas as pd
import datetime

In [12]:
# Competitions to load, listed from the most recent year back to the oldest.
# Each entry is turned into a CSV filename by the loading cell.
seasons = [
    "2022", "2022 Challenge Cup",
    "2021", "2021 Challenge Cup",
    "2020 Fall Series", "2020 Challenge Cup",
    "2019",
    "2018",
    "2017",
    "2016",
]

In [13]:
# Load every season's goal log and stack them into one frame.
# Each file lives at data/<season with spaces replaced by underscores>_goals.csv.
season_frames = [
    pd.read_csv("data/" + season.replace(" ", "_") + "_goals.csv")
    for season in seasons
]

# Concatenate once instead of growing the frame inside the loop:
# DataFrame.append was removed in pandas 2.0, and repeated appends copy the
# accumulated data on every iteration. ignore_index=True renumbers the rows
# 0..n-1 across all seasons, as the original append call did.
df = pd.concat(season_frames, ignore_index=True)

df

Unnamed: 0,date,scorer,assist
0,2022-04-29T22:30:00,Vanessa Gilles,Jun Endo
1,2022-04-29T22:30:00,Jun Endo,Savannah McCaskill
2,2022-04-29T22:30:00,Debinha,Emily Gray
3,2022-04-30T18:00:00,Morgan Weaver,
4,2022-04-30T18:00:00,Christine Sinclair,
...,...,...,...
2115,2016-10-02T17:00:00,Lindsey Horan,Dagný Brynjarsdóttir
2116,2016-10-09T17:10:00,Crystal Dunn,Megan Montefusco
2117,2016-10-09T17:10:00,Sam Mewis,Lynn Williams
2118,2016-10-09T17:10:00,Crystal Dunn,Ali Krieger


In [14]:
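# Filter to the regular season.
# NOTE: this filter is currently disabled (the code below is commented out),
# so every loaded row is kept.
# df["date"] = pd.to_datetime(df['date'])
# df = df[df["date"] < datetime.datetime(2022, 10, 3)]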

In [15]:
# Goals table: one row per scorer with their total number of goals,
# ordered from most goals to fewest.
goals = (
    df.rename(columns={"scorer": "name"})
    .groupby("name")[["name"]]
    .count()
    .rename(columns={"name": "goals"})
    .sort_values(by="goals", ascending=False)
    .reset_index()
)
goals

Unnamed: 0,name,goals
0,Lynn Williams,62
1,Sam Kerr,57
2,Christine Sinclair,53
3,Debinha,51
4,Ashley Hatch,45
...,...,...
335,Gina Lewandowski (OG),1
336,Hailie Mace (OG),1
337,Haley Hanson,1
338,Hannah Betfort,1


In [16]:
# Assists table: one row per assisting player with their total number of
# assists, ordered from most assists to fewest. Rows with no recorded assist
# are left out because groupby drops missing keys.
assists = (
    df.rename(columns={"assist": "name"})
    .groupby("name")[["name"]]
    .count()
    .rename(columns={"name": "assist"})
    .sort_values(by="assist", ascending=False)
    .reset_index()
)
assists

Unnamed: 0,name,assist
0,Jessica McDonald,34
1,Lynn Williams,27
2,Sofia Huerta,26
3,Yuki Nagasato,23
4,Tobin Heath,21
...,...,...
307,Elizabeth Ball,1
308,Mariana Larroquette,1
309,Jordan Baggett,1
310,Manon Melis,1


In [17]:
# Per-player table combining the goal and assist tallies.
stats = (
    goals.set_index("name")
    # outer join keeps players who appear in only one of the two tables
    .join(assists.set_index("name"), how="outer")
    # a player absent from one table gets 0 there; cast back to whole numbers
    .fillna(0)
    .astype(int)
    # best players first: by goals, then by assists
    .sort_values(by=["goals", "assist"], ascending=False)
    .reset_index()
)
stats

Unnamed: 0,name,goals,assist
0,Lynn Williams,62,27
1,Sam Kerr,57,13
2,Christine Sinclair,53,11
3,Debinha,51,20
4,Ashley Hatch,45,5
...,...,...,...
404,Shuang Wang,0,1
405,Stephanie Cox,0,1
406,Sydney Miramontez,0,1
407,Tiffany Weimer,0,1


In [18]:
# Count goals per ordered (scorer, assist) pair: "A scored, B assisted" is
# tallied separately from "B scored, A assisted". Rows with any missing value
# (e.g. unassisted goals) are dropped first.
df_combo_goals = (
    df.dropna()
    .groupby(["scorer", "assist"])[["scorer"]]
    .count()
    .rename(columns={"scorer": "goals"})
    .reset_index()
)
df_combo_goals

Unnamed: 0,scorer,assist,goals
0,Abby Erceg,Carson Pickett,2
1,Abby Erceg,Jaelene Daniels,3
2,Abby Erceg,Jessica McDonald,2
3,Abby Erceg,Kristen Hamilton,1
4,Abby Erceg,Malia Berkely,1
...,...,...,...
1021,Yuki Nagasato,Vanessa DiBernardo,1
1022,Yuki Nagasato,Zoey Goralski,1
1023,Zoey Goralski,Morgan Gautrat,1
1024,Évelyne Viens,Ifeoma Onumonu,1


In [20]:
# Order every assisted goal so the higher-ranked player of the pair is p1.
# "Higher-ranked" means the smaller row label in `stats`, which is sorted by
# goals and then assists in descending order.
df_combos = df.dropna().rename(columns={"scorer": "p1", "assist": "p2"})

# Lookup from player name to their (first) row label in `stats`. Built once,
# instead of scanning `stats` twice for every row as a per-row loop would.
stats_rank = pd.Series(stats.index, index=stats["name"])
stats_rank = stats_rank[~stats_rank.index.duplicated(keep="first")]

p1_rank = df_combos["p1"].map(stats_rank)
p2_rank = df_combos["p2"].map(stats_rank)

# A player missing from `stats` cannot be ranked; fail loudly rather than
# silently leaving the pair unordered.
unranked = p1_rank.isna() | p2_rank.isna()
if unranked.any():
    missing_names = sorted(
        set(df_combos.loc[p1_rank.isna(), "p1"]) | set(df_combos.loc[p2_rank.isna(), "p2"])
    )
    raise LookupError(f"players missing from stats: {missing_names}")

# Swap the two names wherever the assisting player outranks the scorer.
# .to_numpy() strips the column labels so pandas does not re-align the values
# back onto their original columns (which would undo the swap).
swap = p2_rank < p1_rank
if swap.any():
    df_combos.loc[swap, ["p1", "p2"]] = df_combos.loc[swap, ["p2", "p1"]].to_numpy()

df_combos

Unnamed: 0,date,p1,p2
0,2022-04-29T22:30:00,Jun Endo,Vanessa Gilles
1,2022-04-29T22:30:00,Savannah McCaskill,Jun Endo
2,2022-04-29T22:30:00,Debinha,Emily Gray
5,2022-04-30T18:00:00,Sophia Smith,Meghan Klingenberg
6,2022-04-30T20:00:00,Danielle Colaprico,Ella Stevens
...,...,...,...
2115,2016-10-02T17:00:00,Lindsey Horan,Dagný Brynjarsdóttir
2116,2016-10-09T17:10:00,Crystal Dunn,Megan Montefusco
2117,2016-10-09T17:10:00,Lynn Williams,Sam Mewis
2118,2016-10-09T17:10:00,Crystal Dunn,Ali Krieger


In [21]:
# Combos table: how many goals each (p1, p2) pair combined for, regardless of
# who scored and who assisted, ordered from most to fewest.
combos = (
    df_combos.groupby(["p1", "p2"])[["p1"]]
    .count()
    .rename(columns={"p1": "combos"})
    .sort_values(by="combos", ascending=False)
    .reset_index()
)
combos

Unnamed: 0,p1,p2,combos
0,Sam Kerr,Yuki Nagasato,17
1,Lynn Williams,Jessica McDonald,15
2,Lynn Williams,Debinha,12
3,Lindsey Horan,Tobin Heath,9
4,Christine Sinclair,Lindsey Horan,8
...,...,...,...
898,CeCe Kizer,Jorian Baucom,1
899,CeCe Kizer,Hailie Mace,1
900,CeCe Kizer,Erin Simon,1
901,Kristen Hamilton,Julia Spetsmark,1


In [22]:
# Build one row per pair with the columns:
# p1, p2, combos, p1g, p1a, p2g, p2a, p1g_p2a, p2g_p1a, goals

# Attach p1's overall goals/assists.
merged = (
    combos.merge(stats, left_on="p1", right_on="name", how="left")
    .rename(columns={"goals": "p1g", "assist": "p1a"})
    .drop(columns=["name"])
)

# Attach p2's overall goals/assists.
merged = (
    merged.merge(stats, left_on="p2", right_on="name", how="left")
    .rename(columns={"goals": "p2g", "assist": "p2a"})
    .drop(columns=["name"])
)

# Goals scored by p1 with p2 assisting; pairs with no such goal get 0.
merged = (
    merged.merge(
        df_combo_goals,
        left_on=["p1", "p2"],
        right_on=["scorer", "assist"],
        how="left",
    )
    .rename(columns={"goals": "p1g_p2a"})
    .drop(columns=["scorer", "assist"])
    .fillna(0)
    .astype({"p1g_p2a": int})
)

# Goals scored by p2 with p1 assisting (note the swapped right-hand keys).
merged = (
    merged.merge(
        df_combo_goals,
        left_on=["p1", "p2"],
        right_on=["assist", "scorer"],
        how="left",
    )
    .rename(columns={"goals": "p2g_p1a"})
    .drop(columns=["scorer", "assist"])
    .fillna(0)
    .astype({"p2g_p1a": int})
)

# Combined overall goals of both players.
merged["goals"] = merged["p1g"] + merged["p2g"]
merged

Unnamed: 0,p1,p2,combos,p1g,p1a,p2g,p2a,p1g_p2a,p2g_p1a,goals
0,Sam Kerr,Yuki Nagasato,17,57,13,19,23,15,2,76
1,Lynn Williams,Jessica McDonald,15,62,27,40,34,10,5,102
2,Lynn Williams,Debinha,12,62,27,51,20,8,4,113
3,Lindsey Horan,Tobin Heath,9,31,14,13,21,7,2,44
4,Christine Sinclair,Lindsey Horan,8,53,11,31,14,5,3,84
...,...,...,...,...,...,...,...,...,...,...
898,CeCe Kizer,Jorian Baucom,1,15,5,1,0,0,1,16
899,CeCe Kizer,Hailie Mace,1,15,5,8,8,0,1,23
900,CeCe Kizer,Erin Simon,1,15,5,0,3,1,0,15
901,Kristen Hamilton,Julia Spetsmark,1,31,12,2,1,0,1,33


In [23]:
# Rank pairs by number of combos; ties are broken by the pair's combined
# goals and then by p1's goals (all descending).
sort_keys = ["combos", "goals", "p1g"]
merged = merged.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))
merged

Unnamed: 0,p1,p2,combos,p1g,p1a,p2g,p2a,p1g_p2a,p2g_p1a,goals
0,Sam Kerr,Yuki Nagasato,17,57,13,19,23,15,2,76
1,Lynn Williams,Jessica McDonald,15,62,27,40,34,10,5,102
2,Lynn Williams,Debinha,12,62,27,51,20,8,4,113
3,Lindsey Horan,Tobin Heath,9,31,14,13,21,7,2,44
4,Christine Sinclair,Lindsey Horan,8,53,11,31,14,5,3,84
...,...,...,...,...,...,...,...,...,...,...
598,Christina Gibbons,Desiree Scott,1,1,3,0,2,1,0,1
617,Joelle Anderson,Cali Jean Farquharson,1,1,0,0,3,1,0,1
712,Hannah Betfort,Madison Pogarch,1,1,0,0,3,1,0,1
754,Brooke Hendrix,Freja Olofsson,1,1,0,0,1,1,0,1


In [24]:
# Persist the ranked pair table; the row index is not written to the file.
merged.to_csv(path_or_buf="data/all_years_combos.csv", index=False)