In [1]:
library(tidyverse)
library(testthat)
library(digest)
library(repr)
library(dplyr)
library(rvest)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘testthat’


The following object is masked from ‘package:dplyr’:

    matches


The following object is masked from ‘package:purrr’:

**Introduction:** <p> Our goal for this project is to predict the round of a tournament in which a tennis match occurred, from the round of 128 to the final, based on relevant match data.</p> <p> The first round of a tournament begins with 128 players, and after each round the number of players remaining is halved. This continues through the quarterfinals (QF) and semifinals (SF), ending with the final (F).</p> <p>The match data includes statistics for both the winner and the loser, as well as the length of the match. It was gathered from the 2021 matches played by members of the Association of Tennis Professionals (ATP).</p>



In [2]:
# Fix the RNG seed so the random train/test split below is reproducible;
# without it every run of the notebook produces a different partition and
# therefore different summaries and model results downstream.
set.seed(2021)

# Download the 2021 ATP match results, keep only the columns used in this
# analysis, and restrict the data to matches with `best_of == 5`.
tennis_data <- read_csv("https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_2021.csv") %>%
  select(
    winner_name, loser_name, round, best_of, minutes,
    w_ace, w_svpt, winner_rank, w_bpSaved,
    l_ace, l_svpt, l_bpSaved, loser_rank
  ) %>%
  filter(best_of == 5)
tennis_data

# Split 75% / 25% into training and testing sets, stratified by `round`.
tennis_split <- initial_split(tennis_data, prop = 0.75, strata = round)
tennis_train <- training(tennis_split)
tennis_test <- testing(tennis_split)

Parsed with column specification:
cols(
  .default = col_double(),
  tourney_id = [31mcol_character()[39m,
  tourney_name = [31mcol_character()[39m,
  surface = [31mcol_character()[39m,
  tourney_level = [31mcol_character()[39m,
  winner_entry = [31mcol_character()[39m,
  winner_name = [31mcol_character()[39m,
  winner_hand = [31mcol_character()[39m,
  winner_ioc = [31mcol_character()[39m,
  loser_entry = [31mcol_character()[39m,
  loser_name = [31mcol_character()[39m,
  loser_hand = [31mcol_character()[39m,
  loser_ioc = [31mcol_character()[39m,
  score = [31mcol_character()[39m,
  round = [31mcol_character()[39m
)

See spec(...) for full column specifications.



winner_name,loser_name,round,best_of,minutes,w_ace,w_svpt,winner_rank,w_bpSaved,l_ace,l_svpt,l_bpSaved,loser_rank
<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Novak Djokovic,Jeremy Chardy,R128,5,91,9,57,1,0,3,86,7,61
Frances Tiafoe,Stefano Travaglia,R128,5,111,6,87,64,4,8,83,4,60
Reilly Opelka,Yen Hsun Lu,R128,5,115,18,84,38,1,3,92,5,1009
Taylor Fritz,Albert Ramos,R128,5,203,11,143,31,11,6,142,6,46
Stan Wawrinka,Pedro Sousa,R128,5,96,7,74,18,0,2,91,9,107
Marton Fucsovics,Marc Polmans,R128,5,252,7,151,55,9,3,160,15,126
Corentin Moutet,John Millman,R128,5,225,1,148,71,3,3,154,6,39
Milos Raonic,Federico Coria,R128,5,95,17,70,14,2,2,98,11,92
Emil Ruusuvuori,Gael Monfils,R128,5,226,4,195,86,17,13,162,7,11
Pedro Martinez,Yoshihito Nishioka,R128,5,159,7,101,87,8,4,116,10,58


**Method** <p>Through classification

In [7]:
# Per-round mean of every numeric column in `tennis_comb` (NAs ignored).
# Rows are ordered by matching `round` against an explicit list of round
# labels (R128 through F) rather than by hard-coded row positions, so the
# ordering stays correct even if a round is absent from the data. `round`
# remains a character column; labels not in the list would sort last.
tennis_group <- tennis_comb %>%
  group_by(round) %>%
  summarise(
    across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
    .groups = "drop"
  ) %>%
  arrange(match(round, c("R128", "R64", "R32", "R16", "QF", "SF", "F")))
tennis_group

round,minutes,total_ace,total_svpt,avg_rank,total_bpSaved
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R128,157.0469,19.06771,229.625,89.33333,11.91146
R64,156.5789,20.0,225.4632,67.46842,11.56842
R32,145.4286,16.08163,208.2449,45.44898,10.0
R16,152.75,18.0,222.7368,34.85714,12.0
QF,157.8667,19.86667,214.8667,16.73333,10.73333
SF,165.0,18.57143,213.8571,14.07143,10.14286
F,124.5,15.5,168.5,2.0,7.5


In [4]:
# Build per-match features from the training set: keep `round` and `minutes`,
# and combine each pair of winner/loser columns into a single value
# (sums for aces, serve points and break points saved; mean for rank).
tennis_comb <- transmute(
  tennis_train,
  round,
  minutes,
  total_ace = w_ace + l_ace,
  total_svpt = w_svpt + l_svpt,
  avg_rank = (winner_rank + loser_rank) / 2,
  total_bpSaved = w_bpSaved + l_bpSaved
)
tennis_comb

round,minutes,total_ace,total_svpt,avg_rank,total_bpSaved
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
R128,91,12,143,31.0,7
R128,111,14,170,62.0,8
R128,115,21,176,523.5,6
R128,203,17,285,38.5,17
R128,252,10,311,90.5,24
R128,95,19,168,53.0,13
R128,226,17,357,48.5,24
R128,204,17,316,113.5,16
R128,141,8,187,67.0,9
R128,104,15,173,75.0,7


In [26]:
#tennis_grouped <- tennis_total %>%
#                  group_by(round) %>%
#                  mutate(avg_minutes = mean(minutes), na.rm = TRUE) 
#                  summarize(mean(total_ace), na.rm = TRUE) %>%
#                  summarize(mean(total_svpt), na.rm = TRUE) %>%
#                  summarize(mean(avg_rank), na.rm = TRUE) %>%
#                  summarize(mean(total_bpSaved), na.rm = TRUE)
#tennis_grouped

In [None]:
tennis_grouped <- tennis_train %>%
group_by(round) %>%
summarizse