In [25]:
# Switch to the project directory so the relative paths below resolve.
setwd("~/Projects/livemanager")
# Restore the objects stored in plm.RData (presumably including the `plm`
# player table used by the cells below -- TODO confirm).
load("plm.RData")
# Betting odds per match; judging by the file names, `odds` covers matches
# already played and `next_odds` the upcoming fixtures (verify against data).
odds <- read.csv("data/match_odds.csv")
next_odds <- read.csv("data/next_match_odds.csv")
# Print the column names and types of the odds table.
str(odds)

'data.frame':	234 obs. of  18 variables:
 $ LEAGUE   : Factor w/ 1 level "Bundesliga": 1 1 1 1 1 1 1 1 1 1 ...
 $ FTR      : Factor w/ 3 levels "A","D","H": 3 3 3 3 2 1 3 2 3 3 ...
 $ FTSC     : Factor w/ 28 levels "0:0","0:1","0:2",..: 12 6 25 17 14 3 6 20 12 12 ...
 $ MATCH_URL: Factor w/ 234 levels "http://www.oddsportal.com/soccer/germany/bundesliga/augsburg-b-monchengladbach-GncNw7cb/",..: 77 33 52 18 54 114 144 155 128 100 ...
 $ SEASON   : int  2015 2015 2015 2015 2015 2015 2015 2015 2015 2015 ...
 $ HOMETEAM : Factor w/ 18 levels "1. FC Koln","Augsburg",..: 7 4 5 3 6 10 12 13 11 9 ...
 $ TIME     : Factor w/ 10 levels "13:30","14:30",..: 5 2 6 2 2 2 2 2 10 5 ...
 $ FTHG     : int  2 1 5 3 2 0 1 3 2 2 ...
 $ AVOD     : num  5.14 3.97 11 4.55 3.18 3.31 3.7 3.31 3.24 3.15 ...
 $ AWAYTEAM : Factor w/ 18 levels "1. FC Koln","Augsburg",..: 14 9 17 8 2 1 18 16 15 11 ...
 $ AVOA     : num  8.85 5.44 24.43 6.21 2.75 ...
 $ COUNTRY  : Factor w/ 1 level "Germany": 1 1 1 1 1 1 1 1 1 1 ...


## Expand player data

### Define functions to aggregate predictor data

In [26]:
library('dplyr')

In [46]:
# Build one row of predictor features per player from all matchdays that lie
# strictly before `spieltag`.
#
# Args:
#   data:     player-by-matchday table; the code reads the columns id, Name,
#             Club, matchday, status, time_on_pitch, total_earnings and
#             init_Value. Defaults to the global `plm`.
#   spieltag: target matchday; only rows with matchday < spieltag are used.
#
# Returns:
#   An ungrouped table with one row per (id, Name, Club), sorted by id and
#   Name, holding rates, last-matchday values, last-3-matchday counts and
#   averages.
starterfeatures <- function(data = plm, spieltag)  {

    # Keep only the history available before the target matchday
    # (rows with a missing matchday are dropped).
    history <- data[which(data$matchday < spieltag), , drop = FALSE]

    # Small predicates shared by several summary columns.
    was_on_pitch <- function(minutes) minutes > 0
    was_starter <- function(state) state == "starter"
    # "Latest" and "last three" are relative to the newest matchday found
    # within each player's own group.
    on_latest <- function(md) md == max(md)
    in_last_three <- function(md) md >= max(md) - 2

    per_player <- summarise(
        group_by(history, id, Name, Club),
        played_rate = sum(was_on_pitch(time_on_pitch)) / max(matchday),
        starter_rate = sum(was_starter(status)) / max(matchday),
        last_week_status = status[on_latest(matchday)],
        last_week_points = total_earnings[on_latest(matchday)],
        played_last_3 = sum(was_on_pitch(time_on_pitch[in_last_three(matchday)])),
        started_last_3 = sum(was_starter(status[in_last_three(matchday)])),
        avg_mins = mean(time_on_pitch),
        avg_earnings = mean(total_earnings),
        init_Value = mean(init_Value)
    )

    arrange(ungroup(per_player), id, Name)
}

In [28]:
# Build one row of outcome labels per player for exactly matchday `spieltag`.
#
# Args:
#   data:     player-by-matchday table; the code reads the columns id,
#             matchday, status and time_on_pitch. Defaults to the global `plm`.
#   spieltag: matchday whose outcomes become the labels.
#
# Returns:
#   An ungrouped table with one row per id, sorted by id, with the columns
#   played, starter, status and matchday.
starterlabels <- function(data = plm, spieltag)  {

    # Rows of the target matchday only (missing matchdays are dropped).
    current <- data[which(data$matchday == spieltag), , drop = FALSE]

    # `starter` must be computed before `status` is replaced by its first
    # value, so the order of the summaries matters.
    per_player <- summarise(
        group_by(current, id),
        played = sum(time_on_pitch > 0),
        starter = sum(status == "starter"),
        status = first(status),
        matchday = first(matchday)
    )

    arrange(ungroup(per_player), id)
}

In [47]:
# Build the training table: for every target matchday, pair each player's
# features (computed from earlier matchdays) with that matchday's labels.
max_matchday <- max(plm$matchday)

# First matchday used as a prediction target; earlier ones are skipped
# (presumably so the features have a few matchdays of history -- confirm).
first_target_matchday <- 5

# Guard against a descending sequence: 5:4 would yield c(5, 4) rather than
# an empty range when fewer matchdays exist than first_target_matchday.
target_matchdays <- if (max_matchday >= first_target_matchday) {
    first_target_matchday:max_matchday
} else {
    integer(0)
}

# Collect one merged data frame per matchday and bind them once at the end,
# instead of growing plm.played with rbind() on every iteration.
rows_per_matchday <- vector("list", length(target_matchdays))

for (k in seq_along(target_matchdays)) {
    i <- target_matchdays[k]
    matchday_features <- starterfeatures(data = plm, spieltag = i)
    matchday_labels <- starterlabels(data = plm, spieltag = i)
    # Inner join on id: only players that have both features and labels are
    # kept. The result is stored under a name that does not shadow merge().
    rows_per_matchday[[k]] <- merge(matchday_features, matchday_labels, by = "id")
    cat("Completed matchday", i, "; ")
}

plm.played <- if (length(rows_per_matchday) > 0) {
    do.call(rbind, rows_per_matchday)
} else {
    as.data.frame(NULL)
}

Completed matchday 5 ; Completed matchday 6 ; Completed matchday 7 ; Completed matchday 8 ; Completed matchday 9 ; Completed matchday 10 ; Completed matchday 11 ; Completed matchday 12 ; Completed matchday 13 ; Completed matchday 14 ; Completed matchday 15 ; Completed matchday 16 ; Completed matchday 17 ; Completed matchday 18 ; Completed matchday 19 ; Completed matchday 20 ; Completed matchday 21 ; Completed matchday 22 ; Completed matchday 23 ; Completed matchday 24 ; Completed matchday 25 ; Completed matchday 26 ; 

In [37]:
# Preview the first six rows of the merged feature/label table.
head(plm.played, n = 6L)

Unnamed: 0,id,Name,Club,played_rate,starter_rate,last_week_status,last_week_points,played_last_3,started_last_3,avg_mins,avg_earnings,init_Value,played,starter,status,matchday
1,64467,T. Horn,KOE,1,1,starter,86000,3,3,90,79375,8.5,1,1,starter,5
2,64468,,,0,0,not_present,0,0,0,0,0,,0,0,bench,5
3,64469,,,0,0,bench,0,0,0,0,0,,0,0,not_present,5
4,64470,,,0,0,not_present,0,0,0,0,0,,0,0,not_present,5
5,64471,D. Maroh,KOE,0,0,not_present,0,0,0,0,0,8.5,0,0,bench,5
6,64472,J. Hector,KOE,1,1,starter,71500,3,3,90,95125,9.5,1,1,starter,5


### Format the odds data

In [2]:
## Express odds as chances
# Each outcome's inverse odds are divided by the sum of the three inverse odds
# (AVOH, AVOD, AVOA -- presumably average home / draw / away odds), so the
# implied probabilities of the three outcomes add up to 1.
add_win_probs <- function(odds_table) {
    inv_home <- 1 / odds_table$AVOH
    inv_draw <- 1 / odds_table$AVOD
    inv_away <- 1 / odds_table$AVOA
    inv_total <- inv_home + inv_draw + inv_away
    odds_table$Hprob <- inv_home / inv_total
    odds_table$Aprob <- inv_away / inv_total
    odds_table
}

next_odds <- add_win_probs(next_odds)
odds <- add_win_probs(odds)

In [3]:
## Align team names with PLM format
# PLM club codes, listed in the sorted order of the full team names used in
# the odds files (e.g. "1. FC Koln" -> "KOE", "Augsburg" -> "FCA").
plm_codes <- c("KOE", "FCA", "BMG", "B04", "FCB", "D98", "BVB", "SGE", "HSV", "H96",
               "BSC", "TSG", "FCI", "M05", "S04", "VFB", "BRE", "WOB")

# Team names before recoding. factor() makes this work whether read.csv()
# returned factors (R < 4.0) or character columns (R >= 4.0).
levels(factor(odds$HOMETEAM))

# Reference list of full team names, sorted exactly as factor levels would be.
# It is built from both columns of `odds` so that all teams are present.
odds_team_names <- levels(factor(c(as.character(odds$HOMETEAM),
                                   as.character(odds$AWAYTEAM))))
stopifnot(length(odds_team_names) == length(plm_codes))

# Recode by NAME rather than by level position. Assigning the 18 codes with
# `levels<-` mislabels teams whenever a column contains fewer than 18 distinct
# teams, and does not recode anything when the column is a character vector.
to_plm_code <- function(team) {
    team <- as.character(team)
    position <- match(team, odds_team_names)
    unknown <- unique(team[is.na(position) & !is.na(team)])
    if (length(unknown) > 0) {
        stop("Team name(s) without a PLM code: ",
             paste(unknown, collapse = ", "), call. = FALSE)
    }
    factor(plm_codes[position], levels = plm_codes)
}

next_odds$HOMETEAM <- to_plm_code(next_odds$HOMETEAM)
next_odds$AWAYTEAM <- to_plm_code(next_odds$AWAYTEAM)
odds$HOMETEAM <- to_plm_code(odds$HOMETEAM)
odds$AWAYTEAM <- to_plm_code(odds$AWAYTEAM)

# Team codes after recoding.
levels(odds$HOMETEAM)

In [4]:
## Add matchday column
# The numbering assumes 9 fixtures per matchday and no missing rows.
fixtures_per_matchday <- 9

# Number of matchdays already played and the last matchday with known odds.
matchdays <- nrow(odds) / fixtures_per_matchday
furthest_matchday <- matchdays + nrow(next_odds) / fixtures_per_matchday

# `odds` is labelled counting DOWN from the latest matchday, i.e. the rows are
# assumed to be ordered newest first (verify against the CSV).
odds$matchday <- rep(matchdays:1, each = fixtures_per_matchday)
# `next_odds` continues counting UP from the first unplayed matchday.
next_odds$matchday <- rep((matchdays + 1):furthest_matchday,
                          each = fixtures_per_matchday)

In [5]:
## Get odds by club & matchday --> new dataset 'probs'

# Common column layout for the long table. The vector is deliberately NOT
# called `names`, which would mask base::names() for value lookups.
prob_cols <- c("matchday", "Club", "Prob")

# Split the data by home / away and rename in one step: the home side gets
# the home-win probability, the away side the away-win probability.
probs.home <- setNames(odds[c("matchday", "HOMETEAM", "Hprob")], prob_cols)
probs.away <- setNames(odds[c("matchday", "AWAYTEAM", "Aprob")], prob_cols)
next_probs.home <- setNames(next_odds[c("matchday", "HOMETEAM", "Hprob")], prob_cols)
next_probs.away <- setNames(next_odds[c("matchday", "AWAYTEAM", "Aprob")], prob_cols)

# Generate the new dataset, including future data: one row per club and
# matchday.
probs <- rbind(probs.home, probs.away, next_probs.home, next_probs.away)

# Show the win probabilities for the first unplayed matchday.
subset(probs, probs$matchday == (matchdays + 1))

Unnamed: 0,matchday,Club,Prob
469,27,S04,0.4129104
470,27,HSV,0.4287797
471,27,BSC,0.4785659
472,27,KOE,0.06792041
473,27,BRE,0.408474
474,27,WOB,0.6708902
475,27,SGE,0.5265881
476,27,VFB,0.399586
477,27,FCA,0.1366051
496,27,BMG,0.3168847


In [52]:
### Merge probs with plm.played (for past only)
# Left join on matchday and club: every plm.played row is kept, and rows
# without a matching entry in `probs` get NA in the Prob column.
plm.played <- merge(
    x = plm.played,
    y = probs,
    by = c("matchday", "Club"),
    all.x = TRUE
)
#subset(plm.played, plm.played$id == 64471)

## Build dataset for next week

In [51]:
# Features for the first matchday without results, using every matchday
# available in plm as history.
upcoming_matchday <- max_matchday + 1
next_week <- starterfeatures(data = plm, spieltag = upcoming_matchday)
next_week$matchday <- upcoming_matchday
# Left join the clubs' win probabilities for that matchday.
next_week <- merge(
    x = next_week,
    y = probs,
    by = c("matchday", "Club"),
    all.x = TRUE
)
next_week

Unnamed: 0,matchday,Club,id,Name,played_rate,starter_rate,last_week_status,last_week_points,played_last_3,started_last_3,avg_mins,avg_earnings,init_Value,Prob
1,27,B04,64513,Wendell,0.9230769,0.7692308,starter,123500,3,3,78.73077,67057.69,10.0,0.3325226
2,27,B04,64923,J. Tah,0.9615385,0.9230769,starter,142500,2,2,84.80769,88750.0,7.5,0.3325226
3,27,B04,64509,R. Hilbert,0.4230769,0.1923077,not_present,0,1,0,28.57692,26961.54,10.0,0.3325226
4,27,B04,64506,B. Leno,1.0,1.0,starter,154000,3,3,90.0,78961.54,9.0,0.3325226
5,27,B04,64545,K. Kampl,0.6923077,0.5384615,not_present,0,0,0,55.96154,59711.54,6.5,0.3325226
6,27,B04,64512,S. Boenisch,0.3076923,0.1923077,not_present,0,1,1,21.07692,25269.23,7.5,0.3325226
7,27,B04,64889,A. Ramalho,0.4230769,0.1923077,sub_in,35000,3,2,21.38462,19519.23,7.5,0.3325226
8,27,B04,64522,K. Bellarabi,1.0,0.4230769,sub_out,73500,3,2,78.19231,79326.92,13.0,0.3325226
9,27,B04,64998,B. Henrichs,0.1153846,0.0,bench,0,1,0,1.153846,-1403.846,5.0,0.3325226
10,27,B04,64514,<96>. Toprak,0.5769231,0.3846154,starter,108000,1,1,45.07692,61115.38,10.5,0.3325226


In [53]:
# Persist the training table and the next-matchday table for later use.
save(list = c("plm.played", "next_week"), file = "formatted_data.RData")