In [1]:
setwd("~/Projects/livemanager/")
library("rpart")
load("plm.RData")
library("dplyr")


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



### Define Functions

In [2]:
starterfeatures <- function(data = plm, spieltag)  {
    
    data.subset <- subset(data, data$matchday < spieltag)
    data.features <- data.subset %>%
            group_by(id, Name) %>%
            summarise(played_rate = sum(time_on_pitch > 0)/max(matchday),
                      starter_rate = sum(status == "starter")/max(matchday),
                      last_week_status = status[matchday = max(matchday)],
                      last_week_points = total_earnings[matchday = max(matchday)],
                      played_last_3 = sum(time_on_pitch[matchday >= max(matchday) - 2] > 0),
                      started_last_3 = sum(status[matchday >= max(matchday) -2] == "starter"),
                      avg_mins = mean(time_on_pitch),
                      avg_earnings = mean(total_earnings),
                      init_Value = mean(init_Value)
                      ) %>%
            ungroup() %>%
            arrange(id, Name)

    return(data.features)
    
}

In [3]:
starterlabels <- function(data = plm, spieltag)  {
    
    data.subset <- subset(data, data$matchday == spieltag)
    data.labels <- data.subset %>%
            group_by(id) %>%
            summarise(played = sum(time_on_pitch > 0),
                      starter = sum(status == "starter"),
                      status = first(status),
                      matchday = first(matchday)
                      ) %>%
            ungroup() %>%
            arrange(id)

    return(data.labels)
    
}

### Build dataset, starting at Spieltag 5

In [4]:
max_matchday = max(plm$matchday)

plm.played.all <- as.data.frame(NULL)

for (i in 5:max_matchday) {
    features <- starterfeatures(data = plm, spieltag = i)
    labels <- starterlabels(data = plm, spieltag = i)
    merge <- merge(features, labels, by = "id")
    plm.played.all <- rbind(plm.played.all, merge)
    cat("Completed matchday", i, "; ")
}

Completed matchday 5 ; Completed matchday 6 ; Completed matchday 7 ; Completed matchday 8 ; Completed matchday 9 ; Completed matchday 10 ; Completed matchday 11 ; Completed matchday 12 ; Completed matchday 13 ; Completed matchday 14 ; Completed matchday 15 ; Completed matchday 16 ; Completed matchday 17 ; Completed matchday 18 ; Completed matchday 19 ; Completed matchday 20 ; Completed matchday 21 ; Completed matchday 22 ; Completed matchday 23 ; Completed matchday 24 ; Completed matchday 25 ; Completed matchday 26 ; 

In [5]:
dim(plm.played.all)

### Split training & test data

In [6]:
## Use 75% as train
smp_size <- floor(0.75 * nrow(plm.played.all))

## set the seed to make your partition reproductible
set.seed(123)
train_indices <- sample(seq_len(nrow(plm.played.all)), size = smp_size)

train <- plm.played.all[train_indices, ]
test <- plm.played.all[-train_indices, ]

In [7]:
names(train)

### Decision Tree - Played

In [8]:
library("rpart")
dtree.played <- rpart(played~played_rate+ starter_rate+ last_week_status + last_week_points+ played_last_3+ started_last_3+ avg_earnings+ init_Value,
                        data=train, method = "class")

In [9]:
#plot(dtree.played)
#text(dtree.played)
#plotcp(treemodel)
#printcp(treemodel)
#text(treemodel, cex = 0.75)
summary(dtree.played.prune)

ERROR: Error in summary(dtree.played.prune): object 'dtree.played.prune' not found


In [10]:
predictions <- predict(dtree.played, newdata = test, type = "class")
table(predictions, test$played) 

           
predictions    0    1
          0 1750  213
          1  251 1075

In [11]:
dtree.played.prune <- prune(dtree.played, cp = 0.011)
predictions.pruned <- predict(dtree.played.prune, newdata = test, type = "class")
table(predictions.pruned, test$played) 

                  
predictions.pruned    0    1
                 0 1750  213
                 1  251 1075

### Decision Tree: Starter

In [12]:
library("rpart")
dtree.starter <- rpart(starter~last_week_status + played_rate+ starter_rate+ last_week_points+ played_last_3+ started_last_3+ avg_earnings+ init_Value,
                        data=train, method = "class")

In [13]:
summary(dtree.starter)

Call:
rpart(formula = starter ~ last_week_status + played_rate + starter_rate + 
    last_week_points + played_last_3 + started_last_3 + avg_earnings + 
    init_Value, data = train, method = "class")
  n= 9866 

          CP nsplit rel error    xerror       xstd
1 0.30681337      0 1.0000000 1.0000000 0.01793970
2 0.06517139      1 0.6931866 0.6931866 0.01564119
3 0.04739738      2 0.6280152 0.6555226 0.01529237
4 0.01000000      3 0.5806179 0.5806179 0.01454450

Variable importance
  started_last_3     starter_rate    played_last_3      played_rate 
              27               17               16               14 
    avg_earnings last_week_status 
              14               13 

Node number 1: 9866 observations,    complexity param=0.3068134
  predicted class=0  expected loss=0.2395094  P(node) =1
    class counts:  7503  2363
   probabilities: 0.760 0.240 
  left son=2 (6421 obs) right son=3 (3445 obs)
  Primary splits:
      started_last_3 < 0.5       to the left,  improve=

In [14]:
dtree.starter.prune <- prune(dtree.starter, cp = 0.011)
summary(dtree.starter.prune)

Call:
rpart(formula = starter ~ last_week_status + played_rate + starter_rate + 
    last_week_points + played_last_3 + started_last_3 + avg_earnings + 
    init_Value, data = train, method = "class")
  n= 9866 

          CP nsplit rel error    xerror       xstd
1 0.30681337      0 1.0000000 1.0000000 0.01793970
2 0.06517139      1 0.6931866 0.6931866 0.01564119
3 0.04739738      2 0.6280152 0.6555226 0.01529237
4 0.01000000      3 0.5806179 0.5806179 0.01454450

Variable importance
  started_last_3     starter_rate    played_last_3      played_rate 
              27               17               16               14 
    avg_earnings last_week_status 
              14               13 

Node number 1: 9866 observations,    complexity param=0.3068134
  predicted class=0  expected loss=0.2395094  P(node) =1
    class counts:  7503  2363
   probabilities: 0.760 0.240 
  left son=2 (6421 obs) right son=3 (3445 obs)
  Primary splits:
      started_last_3 < 0.5       to the left,  improve=

In [15]:
prediction <- predict(dtree.starter, newdata = test, type = "class")
predict_prob <- predict(dtree.starter, newdata = test, type = "prob")
colnames(predict_prob) <- c("pnostart", "pstart")
table(prediction, test$starter)

          
prediction    0    1
         0 2288  239
         1  229  533

In [16]:
names(model19)

ERROR: Error in eval(expr, envir, enclos): object 'model19' not found


### Now with randomForest

In [17]:
library("randomForest")

# Note: Took out init_Value as random Forest doesn't like missing values. Init_Value wasn't important in DT, so it's ok
rforest <- randomForest(as.factor(starter)~played_rate+ starter_rate+ last_week_status + last_week_points+ played_last_3+ started_last_3+ avg_earnings,
                        data=train, ntree = 1001)
#Force Classifier outcome by making dependent variable factor

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:dplyr':

    combine



In [18]:
prediction <- predict(rforest, newdata = test, type = "class")
table(prediction, test$starter)

          
prediction    0    1
         0 2280  261
         1  237  511

In [19]:
starter_probs <- predict(rforest, newdata = plm.played.all, type = "prob")
starter_probs <- cbind(starter_probs, plm.played.all[c("id", "matchday")])
save(starter_probs, file = "starter_probs.RData")

### Old Code that I might want to reuse

In [20]:
colnames(rfpred.df) <- c("pnostart", "pstart")
ggplot(rfpred.df, aes(pstart)) + geom_histogram()

ERROR: Error in colnames(rfpred.df) <- c("pnostart", "pstart"): object 'rfpred.df' not found


ERROR: Error in eval(expr, envir, enclos): could not find function "ggplot"


In [21]:
# Should be easier to identify non-starters
threshold = 0.75
rfpred.df$vote[rfpred.df$pstart < threshold] = 0
rfpred.df$vote[rfpred.df$pstart >= threshold] = 1

#rfpred.df$vote
table(rfpred.df$vote, model19$started)

ERROR: Error in rfpred.df$vote[rfpred.df$pstart < threshold] = 0: object 'rfpred.df' not found


ERROR: Error in rfpred.df$vote[rfpred.df$pstart >= threshold] = 1: object 'rfpred.df' not found


ERROR: Error in table(rfpred.df$vote, model19$started): object 'rfpred.df' not found


In [22]:
#treemodel2 = prune(dtreeplayed, cp = 0.011)
#plot(treemodel2, uniform = TRUE)
#text(treemodel2)
#summary(treemodel2)

In [23]:
## Who are these 17?
test <- cbind(model19, prediction, predict_prob)
#names(test)
subset(test, test$prediction == TRUE & test$started == FALSE)

ERROR: Error in cbind(model19, prediction, predict_prob): object 'model19' not found


Unnamed: 0,id,Name,played_rate,starter_rate,last_week_status,last_week_points,played_last_3,started_last_3,avg_mins,avg_earnings,init_Value,played,starter,status,matchday


In [24]:
library("ROCR")
predictions_array = as.integer(as.logical(predictions)) # scores should be numeric

pred <- prediction(predictions_array, model19$played) 

perf <- performance(pred, "prec", "rec")

plot(perf)

ERROR: Error in library("ROCR"): there is no package called 'ROCR'


ERROR: Error in eval(expr, envir, enclos): could not find function "prediction"


ERROR: Error in eval(expr, envir, enclos): could not find function "performance"


ERROR: Error in plot(perf): object 'perf' not found
