In [1]:
setwd("~/Projects/livemanager/")
library("rpart")
load("plm.RData")
library("dplyr")


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



### Define Functions

In [2]:
starterfeatures <- function(data = plm, spieltag)  {
    
    data.subset <- subset(data, data$matchday < spieltag)
    data.features <- data.subset %>%
            group_by(id, Name) %>%
            summarise(played_rate = sum(time_on_pitch > 0)/max(matchday),
                      starter_rate = sum(status == "starter")/max(matchday),
                      last_week_status = status[matchday = max(matchday)],
                      last_week_points = total_earnings[matchday = max(matchday)],
                      played_last_3 = sum(time_on_pitch[matchday >= max(matchday) - 2] > 0),
                      started_last_3 = sum(status[matchday >= max(matchday) -2] == "starter"),
                      avg_mins = mean(time_on_pitch),
                      avg_earnings = mean(total_earnings),
                      init_Value = mean(init_Value)
                      ) %>%
            ungroup() %>%
            arrange(id, Name)

    return(data.features)
    
}

In [3]:
starterlabels <- function(data = plm, spieltag)  {
    
    data.subset <- subset(data, data$matchday == spieltag)
    data.labels <- data.subset %>%
            group_by(id) %>%
            summarise(played = sum(time_on_pitch > 0),
                      starter = sum(status == "starter"),
                      status = first(status),
                      matchday = first(matchday)
                      ) %>%
            ungroup() %>%
            arrange(id)

    return(data.labels)
    
}

### Build dataset, starting at Spieltag 5

In [4]:
max_matchday = max(plm$matchday)

plm.played.all <- as.data.frame(NULL)

for (i in 5:max_matchday) {
    features <- starterfeatures(data = plm, spieltag = i)
    labels <- starterlabels(data = plm, spieltag = i)
    merge <- merge(features, labels, by = "id")
    plm.played.all <- rbind(plm.played.all, merge)
    cat("Completed matchday ", i)
}

Completed matchday  5Completed matchday  6Completed matchday  7Completed matchday  8Completed matchday  9Completed matchday  10Completed matchday  11Completed matchday  12Completed matchday  13Completed matchday  14Completed matchday  15Completed matchday  16Completed matchday  17Completed matchday  18Completed matchday  19

### Split training & test data

In [5]:
## Use 75% as train
smp_size <- floor(0.75 * nrow(plm.played.all))

## set the seed to make your partition reproductible
set.seed(123)
train_indices <- sample(seq_len(nrow(plm.played.all)), size = smp_size)

train <- plm.played.all[train_indices, ]
test <- plm.played.all[-train_indices, ]

In [6]:
names(train)

### Decision Tree - Played

In [7]:
library("rpart")
dtree.played <- rpart(played~played_rate+ starter_rate+ last_week_status + last_week_points+ played_last_3+ started_last_3+ avg_earnings+ init_Value,
                        data=train, method = "class")

In [8]:
#plot(dtree.played)
#text(dtree.played)
#plotcp(treemodel)
#printcp(treemodel)
#text(treemodel, cex = 0.75)
summary(dtree.played.prune)

ERROR: Error in summary(dtree.played.prune): object 'dtree.played.prune' not found


In [9]:
predictions <- predict(dtree.played, newdata = test, type = "class")
table(predictions, test$played) 

           
predictions    0    1
          0 1163  140
          1  185  749

In [10]:
dtree.played.prune <- prune(dtree.played, cp = 0.011)
predictions.pruned <- predict(dtree.played.prune, newdata = test, type = "class")
table(predictions.pruned, test$played) 

                  
predictions.pruned    0    1
                 0 1163  140
                 1  185  749

### Decision Tree: Starter

In [11]:
library("rpart")
dtree.starter <- rpart(starter~last_week_status + played_rate+ starter_rate+ last_week_points+ played_last_3+ started_last_3+ avg_earnings+ init_Value,
                        data=train, method = "class")

In [12]:
summary(dtree.starter)

Call:
rpart(formula = starter ~ last_week_status + played_rate + starter_rate + 
    last_week_points + played_last_3 + started_last_3 + avg_earnings + 
    init_Value, data = train, method = "class")
  n= 6708 

         CP nsplit rel error    xerror       xstd
1 0.4477245      0 1.0000000 1.0000000 0.02158542
2 0.0100000      1 0.5522755 0.5522755 0.01715178

Variable importance
last_week_status   started_last_3 last_week_points     starter_rate 
              30               20               14               14 
   played_last_3     avg_earnings 
              12               10 

Node number 1: 6708 observations,    complexity param=0.4477245
  predicted class=0  expected loss=0.2423971  P(node) =1
    class counts:  5082  1626
   probabilities: 0.758 0.242 
  left son=2 (5076 obs) right son=3 (1632 obs)
  Primary splits:
      last_week_status splits as  -LLLRL-L, improve=996.4722, (0 missing)
      started_last_3   < 0.5       to the left,  improve=976.6058, (0 missing)
      s

In [13]:
dtree.starter.prune <- prune(dtree.starter, cp = 0.011)
summary(dtree.starter.prune)

Call:
rpart(formula = starter ~ last_week_status + played_rate + starter_rate + 
    last_week_points + played_last_3 + started_last_3 + avg_earnings + 
    init_Value, data = train, method = "class")
  n= 6708 

         CP nsplit rel error    xerror       xstd
1 0.4477245      0 1.0000000 1.0000000 0.02158542
2 0.0100000      1 0.5522755 0.5522755 0.01715178

Variable importance
last_week_status   started_last_3 last_week_points     starter_rate 
              30               20               14               14 
   played_last_3     avg_earnings 
              12               10 

Node number 1: 6708 observations,    complexity param=0.4477245
  predicted class=0  expected loss=0.2423971  P(node) =1
    class counts:  5082  1626
   probabilities: 0.758 0.242 
  left son=2 (5076 obs) right son=3 (1632 obs)
  Primary splits:
      last_week_status splits as  -LLLRL-L, improve=996.4722, (0 missing)
      started_last_3   < 0.5       to the left,  improve=976.6058, (0 missing)
      s

In [14]:
prediction <- predict(dtree.starter, newdata = test, type = "class")
predict_prob <- predict(dtree.starter, newdata = test, type = "prob")
colnames(predict_prob) <- c("pnostart", "pstart")
table(prediction, test$starter)

          
prediction    0    1
         0 1543  160
         1  158  376

In [15]:
names(model19)

ERROR: Error in eval(expr, envir, enclos): object 'model19' not found


### Now with randomForest

In [29]:
library("randomForest")

# Note: Took out init_Value as random Forest doesn't like missing values. Init_Value wasn't important in DT, so it's ok
rforest <- randomForest(as.factor(starter)~played_rate+ starter_rate+ last_week_status + last_week_points+ played_last_3+ started_last_3+ avg_earnings,
                        data=train, ntree = 1001)
#Force Classifier outcome by making dependent variable factor

In [30]:
prediction <- predict(rforest, newdata = test, type = "class")
table(prediction, test$starter)

          
prediction    0    1
         0 1564  165
         1  137  371

In [63]:
starter_probs <- predict(rforest, newdata = plm.played.all, type = "prob")
starter_probs <- cbind(starter_probs, plm.played.all[c("id", "matchday")])
save(starter_probs, file = "starter_probs.RData")

### Old Code that I might want to reuse

In [18]:
colnames(rfpred.df) <- c("pnostart", "pstart")
ggplot(rfpred.df, aes(pstart)) + geom_histogram()

ERROR: Error in colnames(rfpred.df) <- c("pnostart", "pstart"): object 'rfpred.df' not found


ERROR: Error in eval(expr, envir, enclos): could not find function "ggplot"


In [19]:
# Should be easier to identify non-starters
threshold = 0.75
rfpred.df$vote[rfpred.df$pstart < threshold] = 0
rfpred.df$vote[rfpred.df$pstart >= threshold] = 1

#rfpred.df$vote
table(rfpred.df$vote, model19$started)

ERROR: Error in rfpred.df$vote[rfpred.df$pstart < threshold] = 0: object 'rfpred.df' not found


ERROR: Error in rfpred.df$vote[rfpred.df$pstart >= threshold] = 1: object 'rfpred.df' not found


ERROR: Error in table(rfpred.df$vote, model19$started): object 'rfpred.df' not found


In [20]:
#treemodel2 = prune(dtreeplayed, cp = 0.011)
#plot(treemodel2, uniform = TRUE)
#text(treemodel2)
#summary(treemodel2)

In [21]:
## Who are these 17?
test <- cbind(model19, prediction, predict_prob)
#names(test)
subset(test, test$prediction == TRUE & test$started == FALSE)

ERROR: Error in cbind(model19, prediction, predict_prob): object 'model19' not found


Unnamed: 0,id,Name,played_rate,starter_rate,last_week_status,last_week_points,played_last_3,started_last_3,avg_mins,avg_earnings,init_Value,played,starter,status,matchday


In [22]:
library("ROCR")
predictions_array = as.integer(as.logical(predictions)) # scores should be numeric

pred <- prediction(predictions_array, model19$played) 

perf <- performance(pred, "prec", "rec")

plot(perf)

Loading required package: gplots

Attaching package: 'gplots'

The following object is masked from 'package:stats':

    lowess



ERROR: Error in is.data.frame(labels): object 'model19' not found


ERROR: Error in performance(pred, "prec", "rec"): object 'pred' not found


ERROR: Error in plot(perf): error in evaluating the argument 'x' in selecting a method for function 'plot': Error: object 'perf' not found

