# Chasing Aggregates

It looks like all of our `cml_` fields are hovering around `50%`. That makes is feel like we are calculating the for and against at the same time. This is wrong for all of these fields. 

## Steps to Investigate
- Set up test to `calculate aggregates` for a single year
- Print out some aggregate for that year, week by week for a team
  as well as the other meaningful data for that team

In [2]:
import utils.game_utils as gu
import utils.plot as guplot

import math
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from utils.data_boss import DataBoss
dboss = DataBoss()


nfld = gu.NFL_Data()

YEAR = 2018
WEEK = 14
TEAM = gu.TEAM_NAME['Saints']

## Calculate Aggregates

In [3]:
# TBD - once we have the data presenting below

## Print Aggregates for the week

Intent here is to see the data for a team by-week so that we can see the aggregation at work. A visualization may be helpful as well.

In [4]:
FIELDS = gu.COMMON_FIELDS + [
  'team_cml_pass_yards_before',
  'team_pass_yards',
  'team_cml_pass_yards_after',
]

teams_df = dboss.teams_df
year_df = gu.get_year(teams_df, YEAR)
team_df = year_df[year_df['team'] == TEAM]
team_df[FIELDS]

Unnamed: 0,date,year,week,team,team_score,opponent,opponent_score,win,home,team_cml_pass_yards_before,team_pass_yards,team_cml_pass_yards_after
4106,2018-09-09,2018,1,New Orleans Saints,40,Tampa Bay Buccaneers,48,0,1,0,439,439
4144,2018-09-16,2018,2,New Orleans Saints,21,Cleveland Browns,18,1,1,439,243,682
4250,2018-10-08,2018,5,New Orleans Saints,43,Washington Redskins,19,1,1,1305,363,1668
4358,2018-11-04,2018,9,New Orleans Saints,45,Los Angeles Rams,35,1,1,2044,346,2390
4412,2018-11-18,2018,11,New Orleans Saints,48,Philadelphia Eagles,7,1,1,2655,373,3028
4422,2018-11-22,2018,12,New Orleans Saints,31,Atlanta Falcons,17,1,1,3028,171,3199
4570,2018-12-23,2018,16,New Orleans Saints,31,Pittsburgh Steelers,28,1,1,3730,326,4056
4586,2018-12-30,2018,17,New Orleans Saints,14,Carolina Panthers,33,0,1,4056,118,4174


### Checking if accumulation is not working

The below goes through all `cml` fields to see if their `_after_` is equal to `_before + current`. At the time of this writing there were no examples where this was incorrect.

In [5]:
teams_df = dboss.teams_df
year_df = gu.get_year(teams_df, YEAR)

cols = teams_df.columns
team_cml_cols = pd.Series(cols[cols.str.contains('team_cml_')]).sort_values()
cml_cols = team_cml_cols.apply(lambda x: x.replace('team_', '')).values
# cml_cols = team_cml_cols.values

df = teams_df
for col in cml_cols:
  if "_perf" not in col:
    root_field = cml_root_field = col.replace('cml_', '').replace('_before', '').replace('_after', '')
    if root_field == 'points': 
      root_field = 'score'
    l = len(df[df[f'team_cml_{cml_root_field}_after'] != (df[f'team_cml_{cml_root_field}_before']+df[f'team_{root_field}'])])
    print(f"[{l}] {root_field}")

[0] first_downs
[0] first_downs
[0] fumble_gained
[0] fumble_gained
[0] fumble_lost
[0] fumble_lost
[0] interceptions_gained
[0] interceptions_gained
[0] interceptions_lost
[0] interceptions_lost
[0] pass_completions
[0] pass_completions
[0] pass_count
[0] pass_count
[0] pass_yards
[0] pass_yards
[0] penalty_count
[0] penalty_count
[0] score
[0] score
[0] rush_count
[0] rush_count
[0] rush_yards
[0] rush_yards
[0] sack_count
[0] sack_count
[0] sack_gained
[0] sack_gained
[0] top_sec
[0] top_sec
[0] total_yards
[0] total_yards
[0] turnovers_gained
[0] turnovers_gained
[0] turnovers_lost
[0] turnovers_lost


# TESTER : CML Values for all data

The below shows that all fields are 50%, 55%, 45% and the like. Rerunning the below should get us data much more separated once things are fixed.

In [6]:
###
### CML PERCENTAGES
###


def get_perc(df, field):
    '''
    Given a df and a field, return the number of times the 
    winning team lead in that field.
    '''
    wins_df = df[
        # home team won      &  home team lead in stat
        ((df['win'] == 1) & (df[f'team_{field}'] > df[f'opponent_{field}'])) |
        # away team won      &  away team lead in stat
        ((df['win'] == 0) & (df[f'team_{field}'] <= df[f'opponent_{field}']))
      ]
    return (len(wins_df) / len(df)) * 100


def work_fields(df, fields):
    data = {}
    for field in fields:
        data[field] = get_perc(df, field)
    return data


games_df = dboss.games_df
# games_df = dboss.teams_df

# read in all CML columns and build an
# array of them (without team_ and opponent_)
cols = games_df.columns
team_cml_cols = pd.Series(cols[cols.str.contains('team_cml_')]).sort_values()
cml_cols = team_cml_cols.apply(lambda x: x.replace('team_', '')).values

data = work_fields(gu.get_year(games_df, 2021), cml_cols)

# array each item so it can be DF'd
for item in data:
    data[item] = [data[item]]
pdf = pd.DataFrame(data)

# pdf = pdf.T.sort_values(by=0).T
# pdf.T[0].sort_values(ascending=False)
# pdf.T
percs_df = pdf.T

percs_df.sort_values(by=0, ascending=False)


Unnamed: 0,0
cml_points_after,70.30303
cml_turnovers_gained_after,65.454545
cml_interceptions_gained_after,64.848485
cml_comb_comp_perf_after,63.636364
cml_comb_def_perf_after,63.636364
cml_comb_off_perf_after,63.636364
cml_rush_def_perf_after,62.424242
cml_rush_off_perf_after,62.424242
cml_rush_comp_perf_after,62.424242
cml_first_downs_after,61.818182


In [7]:
percs_df.T[['cml_pass_count_before','cml_pass_count_after']]

Unnamed: 0,cml_pass_count_before,cml_pass_count_after
0,53.939394,46.060606


In [17]:
df = dboss.year_week(2018, 6)

home_wins_when_leading_df = df[((df['win'] == 1) & (df[f'team_cml_pass_count_before'] > df[f'opponent_cml_pass_count_before']))]
away_wins_when_leading_df = df[((df['win'] == 0) & (df[f'team_cml_pass_count_before'] <= df[f'opponent_cml_pass_count_before']))]
any_wins_when_leading_df = df[(
    ((df['win'] == 1) & (df[f'team_cml_pass_count_before'] > df[f'opponent_cml_pass_count_before']))
    |
    ((df['win'] == 0) & (df[f'team_cml_pass_count_before'] <= df[f'opponent_cml_pass_count_before']))
  )]

In [9]:
def cmp_fields(field):
  return [
    'year_week',
    'team_score',
    'opponent_score',
    'win',
    f'team_{field}',
    f'opponent_{field}',
    ]
home_wins_when_leading_df[cmp_fields('cml_pass_count_before')]

Unnamed: 0,year_week,team_score,opponent_score,win,team_cml_pass_count_before,opponent_cml_pass_count_before
4256,2018-6,34,29,1,190,124
4264,2018-6,27,17,1,226,186
4280,2018-6,33,30,1,216,162


In [10]:
away_wins_when_leading_df[cmp_fields('cml_pass_count_before')]

Unnamed: 0,year_week,team_score,opponent_score,win,team_cml_pass_count_before,opponent_cml_pass_count_before
4274,2018-6,0,21,0,150,218


In [18]:
any_wins_when_leading_df[cmp_fields('cml_pass_count_before')]

Unnamed: 0,year_week,team_score,opponent_score,win,team_cml_pass_count_before,opponent_cml_pass_count_before
4256,2018-6,34,29,1,190,124
4264,2018-6,27,17,1,226,186
4274,2018-6,0,21,0,150,218
4280,2018-6,33,30,1,216,162


## QUESTION

I am here wondering still about the balanced numbers above. It seems like what I need to do is spot-check a few like below. It looks like the `cml_pass_count_before` was calculated properly, and that the win by team or opponent is calculated properly... but when I spot check it with 1 week in a year there are few who fall into the "did win when" bucket. But maybe I'm not asking quite the right question.

In [39]:
def report(year, week=None):
  print("-----")
  if week is None:
    df = dboss.year(year)
    print(f"year: {year}")
  else:
    df = dboss.year_week(year, week)
    print(f"year: {year}, week: {week}")
  print("-----")

  home_wins_when_leading_df = df[((df['win'] == 1) & (df[f'team_cml_pass_count_before'] > df[f'opponent_cml_pass_count_before']))]
  away_wins_when_leading_df = df[((df['win'] == 0) & (df[f'team_cml_pass_count_before'] <= df[f'opponent_cml_pass_count_before']))]
  any_wins_when_leading_df = df[(
      ((df['win'] == 1) & (df[f'team_cml_pass_count_before'] > df[f'opponent_cml_pass_count_before']))
      |
      ((df['win'] == 0) & (df[f'team_cml_pass_count_before'] <= df[f'opponent_cml_pass_count_before']))
    )]

  home_perc = gu.perc_str(home_wins_when_leading_df, df)
  away_perc = gu.perc_str(away_wins_when_leading_df, df)
  win_perc = gu.perc_str(any_wins_when_leading_df, df)

  print(f"{home_perc} home_perc")
  print(f"{away_perc} away_perc")
  print(f"{win_perc} win_perc")
  return win_perc


year = 2017
percs = []
for i in range(1,18):
  percs.append(report(year, i))

report(year)

percs

-----
year: 2017, week: 1
-----
0.0 home_perc
53.33 away_perc
53.33 win_perc
-----
year: 2017, week: 2
-----
18.75 home_perc
25.0 away_perc
43.75 win_perc
-----
year: 2017, week: 3
-----
37.5 home_perc
25.0 away_perc
62.5 win_perc
-----
year: 2017, week: 4
-----
31.25 home_perc
31.25 away_perc
62.5 win_perc
-----
year: 2017, week: 5
-----
7.14 home_perc
35.71 away_perc
42.86 win_perc
-----
year: 2017, week: 6
-----
7.14 home_perc
57.14 away_perc
64.29 win_perc
-----
year: 2017, week: 7
-----
26.67 home_perc
6.67 away_perc
33.33 win_perc
-----
year: 2017, week: 8
-----
30.77 home_perc
15.38 away_perc
46.15 win_perc
-----
year: 2017, week: 9
-----
23.08 home_perc
23.08 away_perc
46.15 win_perc
-----
year: 2017, week: 10
-----
21.43 home_perc
21.43 away_perc
42.86 win_perc
-----
year: 2017, week: 11
-----
21.43 home_perc
28.57 away_perc
50.0 win_perc
-----
year: 2017, week: 12
-----
18.75 home_perc
18.75 away_perc
37.5 win_perc
-----
year: 2017, week: 13
-----
25.0 home_perc
18.75 away_pe

[53.33,
 43.75,
 62.5,
 62.5,
 42.86,
 64.29,
 33.33,
 46.15,
 46.15,
 42.86,
 50.0,
 37.5,
 43.75,
 50.0,
 50.0,
 50.0,
 37.5]

In [40]:
percs = np.array(percs)
percs.mean()

48.02764705882353

In [13]:
cml_cols

array(['cml_comb_comp_perf_after', 'cml_comb_comp_perf_before',
       'cml_comb_def_perf_after', 'cml_comb_def_perf_before',
       'cml_comb_off_perf_after', 'cml_comb_off_perf_before',
       'cml_first_downs_after', 'cml_first_downs_before',
       'cml_fumble_gained_after', 'cml_fumble_gained_before',
       'cml_fumble_lost_after', 'cml_fumble_lost_before',
       'cml_interceptions_gained_after',
       'cml_interceptions_gained_before', 'cml_interceptions_lost_after',
       'cml_interceptions_lost_before', 'cml_pass_comp_perf_after',
       'cml_pass_comp_perf_before', 'cml_pass_completions_after',
       'cml_pass_completions_before', 'cml_pass_count_after',
       'cml_pass_count_before', 'cml_pass_def_perf_after',
       'cml_pass_def_perf_before', 'cml_pass_off_perf_after',
       'cml_pass_off_perf_before', 'cml_pass_yards_after',
       'cml_pass_yards_before', 'cml_penalty_count_after',
       'cml_penalty_count_before', 'cml_points_after',
       'cml_points_before', '