### Calculates the probability of winning a tennis match from any given score, depending on the skill levels of the two players


In [1]:
## calculates probability of winning a tennis match from any given score dependent on the skill levels
## of the two players
#from tennisGameProbability import gameProb
#from tennisSetProbability import setGeneral
#from tennisTiebreakProbability import tiebreakProb

def fact(x):
    ## factorial of x as a plain running product
    ## 0 and 1 are answered directly; any other x below 2 leaves the
    ## loop with nothing to multiply, so the result is also 1
    if x == 0 or x == 1:
        return 1
    product = 1
    for factor in range(2, x + 1):
        product *= factor
    return product
 
def ch(a, b):
    ## "a choose b" assembled from fact()
    ## uses true division, so the value comes back as a float
    numerator = fact(a)
    denominator = fact(b) * fact(a - b)
    return numerator / denominator
 
def matchGeneral(e, v=0, w=0, s=3):
    ## probability of winning the match, evaluated at the start of a set
    ## e: p(winning any single set)
    ## v, w: sets already won by the player / by the opponent
    ## s: length of the match as "best of" s sets
    sets_needed = (s + 1) // 2 - v
    if sets_needed == 0:
        ## match already won
        return 1
    sets_available = s - v - w
    if sets_needed > sets_available:
        ## not enough sets left to reach the target
        return 0
    ## add up the chances of clinching on exactly the i-th remaining set:
    ## (sets_needed - 1) wins somewhere in the first (i - 1) sets, then a win
    total = 0
    for i in range(int(sets_needed), int(sets_available + 1)):
        ways = ch((i - 1), (sets_needed - 1))
        total += ways * (e ** (sets_needed - 1)) * ((1 - e) ** (i - sets_needed)) * e
    return total

def matchProb(s, t, gv=0, gw=0, sv=0, sw=0, mv=0, mw=0, sets=3):
    ## probability of winning the match from an arbitrary score
    ## inputs:
    ## s, t: p(server wins a service point), p(server wins return point)
    ## gv, gw: points won in the current game, e.g. 30-15 is 2, 1
    ## sv, sw: games won in the current set, e.g. 5, 4
    ## mv, mw: sets won in the match by each player
    ## the v values belong to the serving player, the w values to the returner
    ## sets: "best of" length of the match (defaults to best of 3)
    ## NOTE(review): setGeneral is called with two positional arguments here
    ## and with four further down -- confirm its signature accepts both forms
    game_from_s = gameProb(s)
    game_from_t = gameProb(t)
    fresh_set = setGeneral(s, t)

    game_not_started = (gv == 0 and gw == 0)
    if game_not_started and sv == 0 and sw == 0:
        ## no point score and no game score: only the set score matters
        return matchGeneral(fresh_set, v=mv, w=mw, s=sets)

    if game_not_started:
        ## mid-set with no point score
        set_win = setGeneral(game_from_s, game_from_t, s, t, v=sv, w=sw)
    elif sv == 6 and sw == 6:
        ## a point score at 6-6 is handed to the tiebreak calculation
        set_win = tiebreakProb(s, t, v=gv, w=gw)
    else:
        ## mid-game: weight the rest of the set by both outcomes of this game;
        ## the follow-up set calls take complemented probabilities and the
        ## game score in swapped order
        game_win = gameProb(s, v=gv, w=gw)
        game_loss = 1 - game_win
        set_win = game_win*(1 - setGeneral((1-game_from_t), (1-game_from_s), (1-t), (1-s), v=sw, w=(sv+1)))
        set_win += game_loss*(1 - setGeneral((1-game_from_t), (1-game_from_s), (1-t), (1-s), v=(sw+1), w=sv))
    set_loss = 1 - set_win

    ## combine the two ways the current set can end
    match_win = set_win*matchGeneral(fresh_set, v=(mv+1), w=mw, s=sets)
    match_win += set_loss*matchGeneral(fresh_set, v=mv, w=(mw+1), s=sets)
    return match_win

In [2]:
# The probability of winning a single set (this would be derived from your setGeneral function)
single_set_win_prob = 0.65

# Current match score in sets: the player leads 2 sets to 1
mv = 2
mw = 1

# Total number of sets in the match ("best of" 5 here)
sets = 5

# Using the matchGeneral function to find the probability of winning the match
match_winning_prob = matchGeneral(single_set_win_prob, v=mv, w=mw, s=sets)

# Printing the result as a percentage with two decimals
print(f"The probability of winning the match from the current score is: {match_winning_prob * 100:.2f}%")

The probability of winning the match from the current score is: 87.75%


### VISUALIZATIONS

In [3]:
from elo_vs_surface_elo import *


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [4]:
# Display `combined` as the cell output.
# NOTE(review): presumably provided by the star import from elo_vs_surface_elo -- confirm
combined

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [5]:
import os
# Show the current working directory; the relative data/ and output/ paths
# used by the cells below are resolved against it
os.getcwd()

'/Users/arnavboppudi/Desktop/FoundationsCS/TENNIS DASHBOARD/tennis_viz'

In [6]:
# -*- coding: utf-8 -*-
import altair as alt
import pandas as pd

## current Elo ratings available here: http://tennisabstract.com/reports/atp_elo_ratings.html
## (historical ratings and code to generate ratings are not public)

## csv contains weekly elo ratings back to the beginning of 2018 for three players
df = pd.read_csv('data/wimb_sfists_weekly_elos.csv')
## NOTE(review): astype returns a new frame and the result is not assigned,
## so df keeps the dtypes inferred by read_csv. Confirm whether
## `df = df.astype(...)` was intended -- a datetime 'Date' column would
## change how the chart below infers its x encoding.
df.astype({'Elo': 'float', 
           'Date': 'datetime64[ns]',
           }).dtypes

## get list of the first ranking date in every month, for x-axis labels
## (year_month is the first 7 characters of the date's string form;
## assumes dates are written YYYY-MM-DD -- TODO confirm)
dates_only = df.filter(['Date'], axis=1)
dates_only['year_month'] = dates_only.apply(lambda row: str(row['Date'])[:7], axis=1)
first_dates = dates_only.groupby('year_month').min()
first_date_list = first_dates['Date'].tolist()

## line chart of the Elo column over Date, one line per Player
## (color and dash pattern are both keyed on Player)
line = alt.Chart(df).mark_line().encode(
    alt.X('Date',
          axis=alt.Axis(title='Date',
                        values=first_date_list)),
    alt.Y('Elo',
          axis=alt.Axis(title='Overall Elo Rating'),
          scale=alt.Scale(domain=(1500,2050))),
    color='Player',
    strokeDash='Player'
).properties(
    width=800,
    height=300
)

## write the chart as a standalone html file (relative to the working directory)
line.save('output/wimb_sfists_weekly_elos.html')

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [7]:
## render the `line` chart object as the cell output
line

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [8]:
# -*- coding: utf-8 -*-
import altair as alt
import pandas as pd

## path to repo with relevant data
## ( https://github.com/JeffSackmann/tennis_slam_pointbypoint )
## the csv files are read relative to the working directory, so the repo
## has to be available at this relative location
data_prefix = '../tennis_slam_pointbypoint/'

player = 'Roger Federer'
tourney = 'wimbledon'

## collects [year, first-serve speed] pairs across all seasons
year_speeds = []
for year in ['2014', '2015', '2016', '2017', '2018', '2019', '2021']:
    matches = pd.read_csv(data_prefix + year + '-' + tourney + '-matches.csv')
    matches['year'] = matches['match_id'].str[:4]
    points = pd.read_csv(data_prefix + year + '-' + tourney + '-points.csv')

    ## add match metadata to points rows (merge joins on the columns the
    ## two files have in common); missing values become 0
    mpoints = pd.merge(points, matches).fillna(0)

    ## astype returns a new frame instead of converting in place, so the
    ## result must be assigned for the casts to take effect
    mpoints = mpoints.astype({'PointServer': 'int',
                              'ServeNumber': 'int',
                              'Speed_KMH': 'int'})

    ## skip  bad data with serve speed of zero
    mpoints = mpoints.loc[mpoints['Speed_KMH'] != 0]

    ## points with target player serving
    serving1 = mpoints.loc[(mpoints['PointServer'] == 1) & (mpoints['player1'] == player)]
    serving2 = mpoints.loc[(mpoints['PointServer'] == 2) & (mpoints['player2'] == player)]
    svpoints = pd.concat([serving1, serving2])

    ## first serve points from above
    fsv_points = svpoints.loc[svpoints['ServeNumber'] == 1]

    ## Rounds 1, 2, and 3 from above (round number is 3rd-to-last char in match_id)
    first_three_rounds = fsv_points.loc[fsv_points['match_id'].str[-3].isin(['1', '2', '3'])]

    ## reduce data to year and serve-speed only
    year_speeds += first_three_rounds[['year', 'Speed_KMH']].values.tolist()


df = pd.DataFrame(year_speeds, columns=['Year', 'KM/H'])
## same as above: keep the cast frame rather than discarding it
df = df.astype({'KM/H': 'float'})

## one box per year; the y axis is fixed to 120-220 km/h
serve_plot = alt.Chart(df).mark_boxplot().encode(
    alt.X('Year:N',
          ),
    alt.Y('KM/H:Q',
          scale=alt.Scale(domain=(120, 220))
          )
).properties(
    width=400,
    height=350
)

## output file name is derived from the tournament and player, spaces -> underscores
out_path = 'output/' + tourney + '_' + player.replace(' ', '_') + '_first_week_first_serves.html'
serve_plot.save(out_path)

FileNotFoundError: [Errno 2] No such file or directory: '../tennis_slam_pointbypoint/2014-wimbledon-matches.csv'

#### PLAYER SLAM DOMINANCE RATIO

In [3]:
# -*- coding: utf-8 -*-

import pandas as pd
import altair as alt
## NOTE: this renderer needs the separate altair_viewer package; without it
## this line raises ValueError and the rest of the cell does not run
alt.renderers.enable('altair_viewer') 
## path to repo with relevant data
## ( https://github.com/JeffSackmann/tennis_atp )
## NOTE: currently unused -- the yearly files are read from the raw
## githubusercontent URL inside the loop below
data_prefix = 'https://github.com/JeffSackmann/tennis_atp/'

player = 'Roger Federer'

## one row per slam: [tourney name, tourney date, DR, title flag]
player_slams = []
for y in range(2003,2022):
    ## load matches and reduce to the target player, in first four rounds at slams
    matches = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_" + str(y) + ".csv")

    pmatches = matches.loc[(matches['winner_name'] == player) | (matches['loser_name'] == player)]
    first_rounds = ['R128', 'R64', 'R32', 'R16']
    ## explicit copy: columns are added below, and writing into a filtered
    ## selection is what pandas warns about (SettingWithCopyWarning)
    tmatches = pmatches.loc[(pmatches['tourney_level'] == 'G') & (pmatches['round'].isin(first_rounds))].copy()

    ## get DR (dominance ratio: RPW / SPL) components for each match for the target player
    ## Series.where picks the winner-side stat when the player won and the
    ## loser-side stat otherwise; unlike a row-wise apply it also works when
    ## the filtered frame has no rows
    is_winner = tmatches['winner_name'] == player
    w_serve_won = tmatches['w_1stWon'] + tmatches['w_2ndWon']
    l_serve_won = tmatches['l_1stWon'] + tmatches['l_2ndWon']
    tmatches['pSvPt'] = tmatches['w_svpt'].where(is_winner, tmatches['l_svpt'])
    tmatches['pSPW'] = w_serve_won.where(is_winner, l_serve_won)
    tmatches['pRetPt'] = tmatches['l_svpt'].where(is_winner, tmatches['w_svpt'])
    tmatches['pRPW'] = (tmatches['l_svpt'] - l_serve_won).where(is_winner, tmatches['w_svpt'] - w_serve_won)

    ## list of slams from that year:
    slams = set(tmatches['tourney_id'].tolist())

    for slam in slams:
        ## check if player won this tournament (finals are only in pmatches);
        ## the flag feeds the highlighted title points in the chart
        titles = pmatches.loc[(pmatches['winner_name'] == player) & (pmatches['round'] == 'F') & (pmatches['tourney_id'] == slam)]
        won_tourney = 1 if len(titles) == 1 else 0
        ## get matches from this tournament and calculate aggregate DR
        ## (the mask is built from the same frame it filters)
        smatches = tmatches.loc[tmatches['tourney_id'] == slam]
        slam_total = smatches[['pSvPt', 'pSPW', 'pRetPt', 'pRPW']].sum(axis=0)
        rpw = slam_total['pRPW'].item() / slam_total['pRetPt'].item()
        spw = slam_total['pSPW'].item() / slam_total['pSvPt'].item()
        ## DR = share of return points won / share of service points lost
        dr = rpw / (1 - spw)
        row = [smatches.tail(1)['tourney_name'].item(), smatches.tail(1)['tourney_date'].item(), dr, won_tourney] 
        player_slams.append(row)                                                             
    
## add 2021 wimbledon by hand when the tennis_atp data does not have it;
## skipped once the loaded files contain it, to avoid a duplicated x category
if not any(name == 'Wimbledon' and str(date)[:4] == '2021' for name, date, _, _ in player_slams):
    player_slams.append(['Wimbledon', 20210628, 1.38, 0])

## sort slams ascending by date
player_slams = sorted(player_slams, key=lambda x: x[1])
    
df = pd.DataFrame(player_slams, columns=['Tourney', 'Date', 'DR', 'Title'])

## astype returns a new frame, so the result is assigned back
df = df.astype({'DR': 'float',
                'Title': 'int'
                })

## short tournament names for the x-axis labels ('Us Open' covers a
## differently-capitalized spelling of the same event)
slam_abvs = {'Wimbledon': 'Wimb',
             'US Open': 'USO',
             'Us Open': 'USO',
             'Australian Open': 'AO',
             'Roland Garros': 'RG'
             }
## label = first four characters of the date (the year) + abbreviation
df['FullName'] = df.apply(lambda row: str(row['Date'])[:4] + ' ' + slam_abvs[row['Tourney']], axis=1)

## store list in *date* order for the chart to use:
x_sort = df['FullName'].tolist()

## subset of the data with only tournaments where he won the title, for second layer
titles = df.loc[df['Title'] == 1]

## line chart with all tournaments
line = alt.Chart(df).mark_line(point=True).encode(
    alt.X('FullName',
          sort=x_sort,
          axis=alt.Axis(title='Tournament (first four rounds)')),
    alt.Y('DR',
          axis=alt.Axis(title='Dominance Ratio'),
          scale=alt.Scale(domain=(0.8,2.5))),
)
    
## mark larger, different-colored points for tournaments that he won
points = alt.Chart(titles).mark_point(filled=True, size=200, color='orange').encode(
    alt.X('FullName',
          sort=x_sort),
    alt.Y('DR')
)
    
#(line + points).save('output/federer_slam34343434_dr.html')
(line + points).show()


ValueError: 
To use the 'altair_viewer' renderer, you must install the altair_viewer
package; see http://github.com/altair-viz/altair_viewer/
for more information.


In [5]:
# -*- coding: utf-8 -*-
import altair as alt
import pandas as pd

## path to repo with relevant data
## ( https://github.com/JeffSackmann/tennis_atp )
## NOTE: currently unused -- the yearly files are read from the raw
## githubusercontent URL inside the loop below
data_prefix = '../tennis_atp/'

## note that the tennis_atp repo does not contain matches from the current week's
## tournaments and may not be completely up to date otherwise, so some matches 
## can be manually added below.

player = 'Novak Djokovic'

## column order matters: add_label unpacks rows positionally in this order
keep_columns = ['tourney_date', 'tourney_name', 'opp_rank', 'is_final', 'result', 'opponent', 'round']
player_slam_matches = []
for y in range(2005,2022):
    ## load matches and reduce to the target player's completed slam matches
    matches = pd.read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_" + str(y) + ".csv")
    ## astype returns a new frame, so the result is assigned back
    matches = matches.astype({'round': 'object'})
    pmatches = matches.loc[(matches['winner_name'] == player) | (matches['loser_name'] == player)]
    ## explicit copy: columns are added below, and writing into a filtered
    ## selection is what pandas warns about (SettingWithCopyWarning)
    pmatches = pmatches.loc[(pmatches['tourney_level'] == 'G') & (pmatches['score'] != 'W/O')].copy()
    
    ## add columns for opponent rank and winner/loser flag (from perspective of the target player)
    ## Series.where keeps the loser-side value when the player won and takes
    ## the winner-side value otherwise; unlike a row-wise apply this also
    ## works for a year in which the player has no slam matches
    is_winner = pmatches['winner_name'] == player
    pmatches['opp_rank'] = pmatches['loser_rank'].where(is_winner, pmatches['winner_rank'])
    pmatches['opponent'] = pmatches['loser_name'].where(is_winner, pmatches['winner_name'])
    pmatches['result'] = is_winner.map({True: 'W', False: 'L'})
    pmatches['is_final'] = (pmatches['round'] == 'F').map({True: 'Final', False: 'Other'})
    
    ## this method only works if replacing a single null opp_rank
    ## At 2007 Wimb, Kiefer was unranked, but had recently been 404
    pmatches['opp_rank'] = pmatches['opp_rank'].fillna(404.0)
    
    y_summary = pmatches[keep_columns].values.tolist()
    player_slam_matches += y_summary                                                          
    
## 2021 wimbledon, entered by hand for when it is not in the tennis_atp data
## (result '?' marks the final that had not been played yet)
manual_add = [[20210628, 'Wimbledon', 253, 'Other', 'W', 'Jack Draper', 'R128'],
              [20210628, 'Wimbledon', 102, 'Other', 'W', 'Kevin Anderson', 'R64'],
              [20210628, 'Wimbledon', 114, 'Other', 'W', 'Denis Kudla', 'R32'],
              [20210628, 'Wimbledon', 20, 'Other', 'W' ,'Cristian Garin', 'R16'],
              [20210628, 'Wimbledon', 48, 'Other', 'W', 'Marton Fucsovics', 'QF'],
              [20210628, 'Wimbledon', 12, 'Other', 'W', 'Denis Shapovalov', 'SF'],
              [20210628, 'Wimbledon', 9, 'Final', '?', 'Matteo Berrettini', 'F']
              ]
## only add the manual rows when the loaded files lack that tournament,
## otherwise every match would appear twice
has_2021_wimbledon = any(str(m[0])[:4] == '2021' and m[1] == 'Wimbledon'
                         for m in player_slam_matches)
if not has_2021_wimbledon:
    player_slam_matches += manual_add

## sort slams ascending by date
player_slam_matches = sorted(player_slam_matches, key=lambda x: x[0])

## short tournament names for the x-axis labels
slam_abvs = {'Wimbledon': 'Wimb',
             'US Open': 'USO',
             'Us Open': 'USO',
             'Australian Open': 'AO',
             'Roland Garros': 'RG'
             }

## abbreviated names of tournaments where the plot looks better
## with labels above the point instead of below it
manual_uppers = ['2007 Wimb', '2012 AO', '2013 USO', '2015 USO',
                 '2016 RG', '2017 Wimb', '2006 AO', '2018 RG',
                 '2005 Wimb', '2009 Wimb']

## nicknames used as point labels instead of the opponent's last name
name_abvs = {'Roger Federer': 'Fed',
            'Rafael Nadal': 'Rafa',
            'Andy Murray': 'Muzz',
            'Stan Wawrinka': 'Stan',
            'Pablo Carreno Busta': 'PCB',
            'Juan Martin del Potro': 'Delpo',
            'Philipp Kohlschreiber': 'Kohli',
            'Kei Nishikori': 'Kei'}

def add_label(row):
    ## turns one match row, laid out as
    ## [date, tournament, opp_rank, is_final, result, opponent, round],
    ## into row[2:] + [lower label, upper label, 'YYYY Abbrev']
    tdate, tname, opp_rank, _, result, opp_name, rd = row
    full_name = str(tdate)[:4] + ' ' + slam_abvs[tname]

    ## the opponent is named only for finals, anything that is not a win,
    ## and matches against the #1
    wants_label = result != 'W' or rd == 'F' or opp_rank == 1

    label = ''
    if wants_label:
        ## nickname from name_abvs when there is one, otherwise the last name
        if opp_name in name_abvs:
            label = name_abvs[opp_name]
        else:
            label = opp_name.split(' ')[-1]

    ## tournaments listed in manual_uppers get the label in the upper slot
    if full_name in manual_uppers:
        lower, upper = '', label
    else:
        lower, upper = label, ''
    return row[2:] + [lower, upper, full_name]

labeled_matches = [add_label(k) for k in player_slam_matches]    

## replace opp_rank = 1 with 1.25, to move matches vs #1 players up off of the bottom horizontal axis
matches = [k if k[0] != 1 else [1.25] + k[1:] for k in labeled_matches]
    
df = pd.DataFrame(matches, columns=['Opponent Rank', 'Round', 'Result', 'Opponent', 
                                    'round_short', 'Label Lower', 'Label Upper', 'FullName'])

## int() turns the shifted 1.25 back into 1 for display
df['Tooltip'] = df.apply(lambda row: row['FullName'] + ' ' + row['round_short'] + ' vs #' + str(int(row['Opponent Rank'])) + ' ' + row['Opponent'], axis=1)

## store list in *date* order for the chart to use:
x_sort = df['FullName'].tolist()

## separate title-winning finals from other matches in order to differently size the points
## (a pending final, result '?', is grouped with the title-winning finals)
title_finals = df.loc[(df['Result'] != 'L') & (df['Round'] == 'Final')]
other_matches = df.loc[(df['Result'] == 'L') | (df['Round'] != 'Final')]
## results only to be used for calculating won-loss record (exclude pending match)
results_only = df.loc[(df['Result'] != '?')]

chart_title = player + ' Matches at Grand Slams'

## interval brush shared by the scatter plot and the bar chart;
## selection_interval() replaces the deprecated alt.selection(type='interval')
brush = alt.selection_interval()

## line chart with all matches (except title-winning finals)
points = alt.Chart(other_matches).mark_point(filled=True, size=60).encode(
    alt.X('FullName',
          sort=x_sort,
          axis=alt.Axis(title='Tournament')),
    alt.Y('Opponent Rank',
          axis=alt.Axis(title='Opponent Ranks',
                        values=[5, 10, 20, 50, 100, 250]
                        ),
          scale=alt.Scale(domain=(1, 600),
                          type='log',
                          base=3)),
    color=alt.Color('Result',
                    scale=alt.Scale(
                    domain=['W', 'L', '?'],
                    range=['purple', 'red', 'orange'])),
    shape=alt.Shape('Round',
                    scale=alt.Scale(
                    domain=['Final', 'Other'],
                    range=['square', 'circle'])),
    tooltip='Tooltip:N'
)

## add bigger points for title-winning finals    
titles = alt.Chart(title_finals).mark_point(filled=True, size=120).encode(
    alt.X('FullName:N',
          sort=x_sort,
          ),
    alt.Y('Opponent Rank:Q'),
    color=alt.Color('Result',
                    scale=alt.Scale(
                    domain=['W', 'L', '?'],
                    range=['purple', 'red', 'orange'])),
    shape=alt.Shape('Round',
                    scale=alt.Scale(
                    domain=['Final', 'Other'],
                    range=['square', 'circle'])),
    tooltip='Tooltip:N',
)

## vertical lines for each tournament -- opacity level is *per match*,
## so opacity depends on how many matches played at that event. Could
## change that by using a list of unique tournament names only.
rules = alt.Chart(df).mark_rule(opacity=0.01).encode(
    alt.X('FullName:N',
          sort=x_sort,
          )
)
    
## four separate caption layers:
## for lower/upper and title-winning finals/other matches
text_lower = points.mark_text(
    align='left',
    baseline='line-top',
    dx=5,
    dy=1
).encode(
    text='Label Lower'
)
    
text_upper = points.mark_text(
    align='left',
    baseline='line-bottom',
    dx=5,
    dy=-1
).encode(
    text='Label Upper'
)
    
text_lower_titles = titles.mark_text(
    align='left',
    baseline='line-top',
    dx=6,
    dy=3
).encode(
    text='Label Lower'
)
    
text_upper_titles = titles.mark_text(
    align='left',
    baseline='line-bottom',
    dx=5,
    dy=-3
).encode(
    text='Label Upper'
)
    
layered = alt.layer(points, titles, text_lower, text_upper, text_lower_titles, text_upper_titles, rules).add_selection(
    brush
).properties(
    width=1200,
    height=400,
    title=chart_title
)
    
## bar chart with wins and losses from highlighted section
bars = alt.Chart(results_only).mark_bar().encode(
    alt.X('count(Result):Q',
          axis=alt.Axis(title='Won-Loss Record (highlight section above to update)',
                        labels=False,
                        ticks=False)
          ),
    alt.Y('Result:N',
          sort=['W', 'L']
          ),
    color='Result:N'
)

## text overlay for the bar chart  
bar_text = alt.Chart(results_only).mark_text(dx=-10, dy=3, color='white').encode(
    x=alt.X('count(Result):Q', stack='zero'),
    y=alt.Y('Result:N',
            sort=['W', 'L'],
            ),
    text=alt.Text('count(Result):Q', format='.0f')
)

## the bars only count matches inside the brushed interval
bar_layer = alt.layer(bars, bar_text).transform_filter(
    brush
).properties(
    width=1200
)
    
combined = alt.vconcat(layered, bar_layer).configure_axisX(
    labelAngle=300
).configure_legend(
    titleFontSize=14,
    labelFontSize=12
)
## save() returns None, so it is called as its own statement to keep
## `combined` bound to the chart rather than to None
combined.save('output/djokovic_slam_opponent_ranks_8888.html')

   Use 'selection_point()' or 'selection_interval()' instead; these functions also include more helpful docstrings.
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
