In [1]:
library(engsoccerdata)
library(dplyr)
library(ggplot2)
library(gridExtra)
options(repr.plot.width=5, repr.plot.height=5)

“package ‘engsoccerdata’ was built under R version 3.3.2”
Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine



In [2]:
#FUNCTION -- summarybetweenteamsengland:: used to find the number of wins, losses and draws between any 2 teams
#between any yearly time period 
winLossDraw <- function(df,team1,team2,year1,year2,type = "summary"){
    df_team1_team2 <- games_between(df,teamname1 = team1,teamname2 = team2)
    df_team1_team2_yearly <- df_team1_team2 %>%
    filter(Season >= year1 & Season <= year2) %>% 
    mutate(homegoals = substr(FT,1,1)) %>% 
    mutate(awaygoals = substr(FT,3,3)) %>%
    mutate(winner = ifelse(  home == team1,
                             ifelse(homegoals > awaygoals,
                                   team1,ifelse(homegoals == awaygoals,
                                                   "Draw",team2)),
                             ifelse(homegoals > awaygoals,
                                   team2,ifelse(homegoals == awaygoals,
                                                             "Draw",team1))))
    ifelse(type == "summary",
           return(data.frame(result = c(paste(team1,"Wins"),paste(team2,"Wins"),"Draw"),
                      matches = c(sum((df_team1_team2_yearly$winner == team1) == TRUE),
                                  sum((df_team1_team2_yearly$winner == team2) == TRUE),
                                  sum((df_team1_team2_yearly$winner == "Draw") == TRUE)))),
           return(df_team1_team2_yearly))
}
x <- winLossDraw(england,team1 = "Arsenal",team2 = "Manchester United",year1 = 2006,year2 = 2016,type = "summary")
print(x)

                  result matches
1           Arsenal Wins       5
2 Manchester United Wins      10
3                   Draw       5


In [3]:
#FUNCTION -- winLossDrawHomeAway:: Used to find wins, losses, draws both overall & home and away for any 2 teams
#between any specific time period
winLossDrawHomeAway <- function(df,team1,team2,year1,year2,type = "summary"){
    df_win_loss <- winLossDraw(england,team1 = team1,team2 = team2,year1 = year1,year2 = year2,type = "dataframe")
    df_res <- winLossDraw(england,team1 = team1,team2 = team2,year1 = year1,year2 = year2,type = "summary")
    #print(df_res)
    t1_h <- sum((df_win_loss$winner == team1 & df_win_loss$home == team1) == TRUE)
    t2_h <- sum((df_win_loss$winner == team2 & df_win_loss$home == team2) == TRUE)
    t1_a <- sum((df_win_loss$winner == team1 & df_win_loss$visitor == team1) == TRUE)
    t2_a <- sum((df_win_loss$winner == team2 & df_win_loss$visitor == team2) == TRUE)
    t1_hd <- sum((df_win_loss$winner == "Draw" & df_win_loss$home == team1) == TRUE)
    t2_hd <- sum((df_win_loss$winner == "Draw" & df_win_loss$home == team2) == TRUE)
    t1_ad <- t2_hd
    t2_ad <- t1_hd
    return(rbind(df_res,data.frame(result = c(paste(team1,"Home Wins"),paste(team2,"Home Wins"),
                                              paste(team1,"Away Wins"),paste(team2,"Away Wins"),
                                             paste(team1,"Home Draws"),paste(team2,"Home Draws"),
                                             paste(team1,"Away Draws"),paste(team2,"Away Draws")),
                                   matches = c(t1_h,t2_h,t1_a,t2_a,t1_hd,t2_hd,t1_ad,t2_ad))))
}
x <- winLossDrawHomeAway(england,"Arsenal","Chelsea","2006","2015")
print(x)

               result matches
1        Arsenal Wins       4
2        Chelsea Wins      11
3                Draw       5
4   Arsenal Home Wins       2
5   Chelsea Home Wins       7
6   Arsenal Away Wins       2
7   Chelsea Away Wins       4
8  Arsenal Home Draws       4
9  Chelsea Home Draws       1
10 Arsenal Away Draws       1
11 Chelsea Away Draws       4


In [4]:
#FUNCTION -- querying records of teams against the bottom n teams in the league; returns points per game
bottomNheadtoheadEngland <- function(team,season,bottomn=5,tier=1,type = "points"){
    df_table <- maketable_eng(df = england,Season = season,tier = tier)
    bottom <- as.character(tail(df_table$team,bottomn))
    #ifelse(team %in% bottom,print("Team is already part of bottom 6, so record against other 5 teams is considered"),"")
    df_results <- tbl_df(england %>% filter(Season == season & tier == tier))
    df_results_top_bottom <- df_results %>% filter((home %in% team & visitor %in% bottom) | 
                                                    (home %in% bottom & visitor %in% team)) %>% 
    mutate(points = ifelse(hgoal == vgoal,1,ifelse(hgoal > vgoal & home %in% team,3,
                                               ifelse(hgoal < vgoal & home %in% team,0,
                                                      ifelse(vgoal > hgoal & visitor %in% team,3,0)))))
    res_df <- df_results_top_bottom %>% filter(home == team | visitor == team)
    sum <- sum(res_df$points/
              length(res_df$points))
    ifelse(type == "points",return(sum),return(res_df))
}
pts <- bottomNheadtoheadEngland("Manchester City",2015,bottomn=6)
pts

In [5]:
#FUNCTION -- querying records of teams against the top n teams in the league; returns points per game
topNheadtoheadEngland <- function(team,season,topn=5,tier=1,type = "points"){
    df_table <- maketable_eng(df = england,Season = season,tier = tier)
    top <- as.character(head(df_table$team,topn))
    #ifelse(team %in% top,print("Team is already part of top 6, so record against other 5 teams is considered"),"")
    df_results <- tbl_df(england %>% filter(Season == season & tier == tier))
    df_results_top_top <- df_results %>% filter((home %in% team & visitor %in% top) | 
                                               home %in% top & visitor %in% team) %>% 
    mutate(points = ifelse(hgoal == vgoal,1,ifelse(hgoal > vgoal & home %in% team,3,
                                               ifelse(hgoal < vgoal & home %in% team,0,
                                                      ifelse(vgoal > hgoal & visitor %in% team,3,0)))))
    sum <- sum((df_results_top_top %>% filter(home == team | visitor == team))$points/
               length((df_results_top_top %>% filter(home == team | visitor == team))$points))
    res_df <- df_results_top_top %>% filter(home == team | visitor == team)
    ifelse(type == "points",return(sum),return(res_df))
}
pts <- topNheadtoheadEngland("Liverpool",2015,topn=10)
pts

In [6]:
#FUNCTION -- getting the top n teams for any particular season in the english league
getTopNEngland <- function(n,season,tier = 1){
    df_table <- maketable_eng(df = england,Season = season,tier = tier)
    return(as.character(head(df_table$team,n)))
}
topn <- getTopNEngland(5,2015)
topn

#FUNCTION -- getting the bottom n teams for any particular season in the english league
getBottomNEngland <- function(n,season,tier = 1){
    df_table <- maketable_eng(df = england,Season = season,tier = tier)
    return(as.character(tail(df_table$team,n)))
}
bottomn <- getBottomNEngland(5,2015)
bottomn

In [7]:
#FUNCTION -- for each team in the top 4, for each season, 
#returns points per game against top n and bottom n teams as entered by the user.
top4PointPerGameTopAndBottomN <- function(season,topn,bottomn){
    df_res <- data.frame(season = integer(0),position = integer(0), team = character(0),PPGT = numeric(0),
                         PPGB = numeric(0))
    for(s in season){
        top4 <- getTopNEngland(4,s)
        j = 1
        for(i in top4){
            #print(c(as.integer(s),as.numeric(j),as.character(i),as.numeric(topNheadtoheadEngland(i,s,6))))
            df_temp <- data.frame(season = as.integer(s),position = as.integer(j),
                                  team = i,PPGT = topNheadtoheadEngland(i,s,topn),
                                 PPGB = bottomNheadtoheadEngland(i,s,bottomn))
            #print(df_temp)
            df_res <- rbind(df_res,df_temp)
            #print(df_res)
            j <- j+1
        }
    }
    return(df_res)
}
df_res <-top4PointPerGameTopAndBottomN(seq(1992,2015),10,10)
df_res

Unnamed: 0,season,position,team,PPGT,PPGB
1,1992,1,Manchester United,1.94444444444444,2.1
2,1992,2,Aston Villa,1.77777777777778,1.65
3,1992,3,Norwich City,1.33333333333333,1.95
4,1992,4,Blackburn Rovers,1.88888888888889,1.55
5,1993,1,Manchester United,2,2.4
6,1993,2,Blackburn Rovers,1.83333333333333,2.35
7,1993,3,Newcastle United,1.77777777777778,1.8
8,1993,4,Arsenal,1.66666666666667,1.95
9,1994,1,Blackburn Rovers,1.88888888888889,2.25
10,1994,2,Manchester United,1.94444444444444,2.25


In [10]:
#FUNCTION -- calculates the likelihood of finishing top of the table, having the best record against the topn and 
#bottomn teams seperately
getVictoryLikelihood <- function(year1,year2,topn,bottomn)
{
    df_res <-top4PointPerGameTopAndBottomN(seq(year1,year2),topn,bottomn)
    df_res <- df_res %>% 
    group_by(season) %>% 
    mutate(maxPPGT = max(PPGT)) %>%
    mutate(maxPPGB = max(PPGB))
    df_resPPGT <- df_res %>% 
    filter(maxPPGT == PPGT & position == 1)
    df_resPPGB <- df_res %>% 
    filter(maxPPGB == PPGB & position == 1)
    totals <- list(length(df_resPPGT$position),length(df_resPPGB$position))
    return(totals)
}

#The above function implemented between years 1955-2014 on a 10 year scale & overall
#l_top and l_bottom in df are likelihood of finishing first given best performance against top half & bottom half.
x1 <- getVictoryLikelihood(1955,1964,11,11)
x2 <- getVictoryLikelihood(1965,1974,11,11)
x3 <- getVictoryLikelihood(1975,1984,11,11)
x4 <- getVictoryLikelihood(1985,1994,11,11)
x5 <- getVictoryLikelihood(1995,2004,10,10)
x6 <- getVictoryLikelihood(2005,2014,10,10)
x7 <- getVictoryLikelihood(1955,2014,10,10)

df <- data.frame(start = c(1955,1965,1975,1985,1995,2005,1955),
                           end = c(1964,1974,1984,1994,2004,2014,2014))
df["topn"] <- c(x1[[1]],x2[[1]],x3[[1]],x4[[1]],x5[[1]],x6[[1]],x7[[1]])
df["bottomn"] <- c(x1[[2]],x2[[2]],x3[[2]],x4[[2]],x5[[2]],x6[[2]],x7[[2]])
df <- df %>% mutate(l_topn = topn/(end - start)) %>%  mutate(l_bottomn = bottomn/(end-start))
df

Unnamed: 0,start,end,topn,bottomn,l_topn,l_bottomn
1,1955.0,1964.0,8.0,5.0,0.8888889,0.5555556
2,1965.0,1974.0,4.0,7.0,0.4444444,0.7777778
3,1975.0,1984.0,6.0,5.0,0.6666667,0.5555556
4,1985.0,1994.0,9.0,6.0,1.0,0.6666667
5,1995.0,2004.0,7.0,6.0,0.7777778,0.6666667
6,2005.0,2014.0,9.0,4.0,1.0,0.4444444
7,1955.0,2014.0,43.0,31.0,0.7288136,0.5254237


As we see, the overall statistics, _in our example_, seem to suggest that performance against the top half are more likely to predict the winner of the league; with average likelihoods of 72% and 52% respectively between 1955 and 2014. 

No strong statistical results maybe derived from the results as yet, as the research here is premature, and is just an example. Although, further work could include testing this on different time period intervals applying advanced statistical methods to perhaps get more meaningful, and publish-able results. The work could be reproduced to other major European leagues as well, using the wonderful datasets available as a part of the engsoccerdata package.