In [None]:
library(tidyverse)
library(DBI)
library(RSQLite)

In [None]:
# Open the SQLite database file; `soccer` is the connection handle used by
# every query further down.
soccer <- dbConnect(
  drv = SQLite(),
  dbname = "../input/soccer/database.sqlite"
)

In [None]:
##### John's code
library(reshape2)
library(data.table)
library(dplyr)
library(knitr)

In [None]:
## list all tables, leaving out sqlite_sequence (contains table information)
tables <- dbListTables(soccer)
tables <- tables[!(tables %in% "sqlite_sequence")]

## read every remaining table completely; the list keeps the order of `tables`
lDataFrames <- lapply(tables, function(table_name) {
  dbGetQuery(
    conn = soccer,
    statement = paste0("SELECT * FROM '", table_name, "'")
  )
})

# create dataframes
# Each table is taken from `lDataFrames` by position, i.e. by the order in
# which dbListTables() returned the table names (minus sqlite_sequence).
# NOTE(review): this assumes positions 1, 2, 3, 4 and 6 are the country,
# league, match, player and team tables — confirm against `tables`.
# Position 5 is not used here.
country <-  data.frame(lDataFrames[1])
league  <-  data.frame(lDataFrames[2])
match   <-  data.frame(lDataFrames[3])
player  <-  data.frame(lDataFrames[4])
team    <-  data.frame(lDataFrames[6])

# Keep only the columns used downstream and rename the join keys on the way.

# country: `id` becomes the join key `country_id`
country <- select(country, country_id = id, country_name = name)

# league: joined to the other tables through `country_id`
league <- select(league, country_id, league_name = name)

# match: 11 fixture columns, the 22 line-up slots, then the event columns
match <- match %>%
  select(
    id, country_id, league_id, season, stage, date, match_api_id,
    home_team_api_id, away_team_api_id, home_team_goal, away_team_goal,
    home_player_1, home_player_2, home_player_3, home_player_4,
    home_player_5, home_player_6, home_player_7, home_player_8,
    home_player_9, home_player_10, home_player_11,
    away_player_1, away_player_2, away_player_3, away_player_4,
    away_player_5, away_player_6, away_player_7, away_player_8,
    away_player_9, away_player_10, away_player_11,
    goal, shoton, shotoff, foulcommit, card, cross, corner, possession
  )

# player: `player_api_id` is the join key
player <- player %>% select(player_api_id, player_name)

# team: `team_api_id` is the join key
team <- team %>% select(team_api_id, team_long_name, team_short_name)

# The first 11 columns of `match` describe the fixture; columns 12 to 33 are
# the 22 line-up slots (home_player_1 ... away_player_11).
others <- names(match)[1:11]
players <- names(match)[12:33]

# Long format: one row per (match, line-up slot); empty slots are dropped and
# the slot content is stored as `player_api_id`.
flatten <- melt(match,
                id.vars = others, measure.vars = players,
                na.rm = TRUE, value.name = "player_api_id")

# The slot name ("home_..." / "away_...") decides which team id applies.
flatten <- mutate(
  flatten,
  team_api_id = ifelse(grepl("home", variable), home_team_api_id,
                       ifelse(grepl("away", variable), away_team_api_id, NA))
)

# Attach the lookup tables, then split the season label at "/" into
# season_start / season_end (converted to numbers by `convert = TRUE`).
flatten <- flatten %>%
  left_join(country, by = "country_id") %>%
  left_join(league, by = "country_id") %>%
  left_join(team, by = "team_api_id") %>%
  left_join(player, by = "player_api_id") %>%
  separate(season,
           into = c("season_start", "season_end"),
           sep = "/", convert = TRUE)
head(flatten)

# Player/team membership per season: one row per line-up appearance.
team_players <- flatten %>%
  select(
    season_start, season_end,
    country_name, league_name,
    player_api_id,
    team_api_id, team_long_name, team_short_name,
    player_name
  )

# Split by the starting year of the season.
team_players_2015 <- filter(team_players, season_start == 2015)
team_players_2014 <- filter(team_players, season_start == 2014)

In [None]:
##### my code
# Read the player master table and the player attribute table in full.
# Note that `player` replaces the two-column version created earlier.
player <- as.data.frame(dbReadTable(soccer, "Player"))
player_attr <- as.data.frame(dbReadTable(soccer, "Player_Attributes"))

# Keep only the first 10 characters of the date string; the season filters
# compare it against 'yyyy-mm-dd' literals.
player_attr$date <- substr(player_attr$date, start = 1, stop = 10)

In [None]:
# filter season date
# Attribute records dated inside each season window. The comparison is done on
# the date strings; which() drops rows whose date is missing.
player_attr_2014 <- player_attr[
  which(player_attr$date >= '2014-07-18' & player_attr$date <= '2015-05-31'),
]
player_attr_2015 <- player_attr[
  which(player_attr$date >= '2015-07-17' & player_attr$date <= '2016-05-25'),
]

In [None]:
# remove negative values
# Replace negative numbers by NA, column by column.
# Only numeric columns are checked: comparing a whole data frame with 0 also
# compares its character columns as strings, which depends on the locale's
# collation and can blank out text values.
na_if_negative <- function(df) {
  numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  for (col in numeric_cols) {
    # which() ignores entries that are already NA
    negative <- which(df[[col]] < 0)
    df[[col]][negative] <- NA
  }
  df
}

player <- na_if_negative(player)
player_attr_2014 <- na_if_negative(player_attr_2014)
player_attr_2015 <- na_if_negative(player_attr_2015)

In [None]:
# merge values of one player to one
# Collapse the values of one group into a single value.
#
# x: a vector holding one column's values for one group.
# Returns the mean of the non-missing values for numeric input (NA when every
# value is missing) and the first element for any other type.
aggregate_process <- function(x) {
  if (!is.numeric(x)) {
    return(x[1])
  }
  # mean(numeric(0)) would be NaN; report an all-missing group as NA instead
  if (all(is.na(x))) {
    return(NA_real_)
  }
  # a single missing record must not wipe out the whole average
  mean(x, na.rm = TRUE)
}

# Collapse the 2014 window to one row per player_api_id.
player_attr_2014_aggregate <- aggregate(
  x = player_attr_2014,
  by = list(player_attr_2014$player_api_id),
  FUN = aggregate_process
)
# aggregate() puts the grouping column first; drop it again
player_attr_2014 <- player_attr_2014_aggregate[-1]

# Same for the 2015 window.
player_attr_2015_aggregate <- aggregate(
  x = player_attr_2015,
  by = list(player_attr_2015$player_api_id),
  FUN = aggregate_process
)
player_attr_2015 <- player_attr_2015_aggregate[-1]

In [None]:
# remove outliers
# normalize
# Blank out outliers and standardise every numeric feature column.
#
# data: a data frame.
# For each numeric column that is not an identifier, values more than two
# standard deviations away from the column mean are set to NA, and the column
# is then centred and scaled with scale(). Other columns are left untouched.
# Returns the modified data frame.
preprocess <- function(data) {
  for (i in seq_len(ncol(data))) {
    # Identifier columns are named "id" or end in "_id". A plain substring
    # match on "id" would also skip features such as "defenceTeamWidth".
    is_id <- grepl("(^|_)id$", names(data)[i])
    if (is.numeric(data[[i]]) && !is_id) {
      values <- data[[i]]
      # na.rm = TRUE: otherwise one missing value makes both statistics NA and
      # the outlier filter silently does nothing
      centre <- mean(values, na.rm = TRUE)
      spread <- sd(values, na.rm = TRUE)
      outlier <- values > centre + 2 * spread | values < centre - 2 * spread
      values[which(outlier)] <- NA
      # scale() returns a one-column matrix; store it as a plain vector
      data[[i]] <- as.numeric(scale(values))
    }
  }
  data
}

# Run preprocess() on the player master table and on both seasonal attribute
# tables; each result replaces its input.
player <- preprocess(player)
player_attr_2014 <- preprocess(player_attr_2014)
player_attr_2015 <- preprocess(player_attr_2015)

In [None]:
# merge with players' basic info
# Full outer join of the player master data with columns 2 to 42 of the
# seasonal attributes, matched on both player id columns.
player_2014 <- merge(
  x = player,
  y = player_attr_2014[, 2:42],
  by = c('player_api_id', 'player_fifa_api_id'),
  all = TRUE
)
player_2015 <- merge(
  x = player,
  y = player_attr_2015[, 2:42],
  by = c('player_api_id', 'player_fifa_api_id'),
  all = TRUE
)

In [None]:
# filter only 3 leagues
# Keep only players whose name occurs in that season's line-ups.
player_name_2014 <- team_players_2014$player_name
player_2014 <- player_2014[player_2014$player_name %in% player_name_2014, ]

player_name_2015 <- team_players_2015$player_name
player_2015 <- player_2015[player_2015$player_name %in% player_name_2015, ]

In [None]:
# change dummies
# Turn the categorical attributes into 0/1 indicator columns.
# 'None' in attacking_work_rate is first recoded to NA, so the two attack
# indicators are NA for those players.
player_2014 <- player_2014 %>%
  mutate(
    left_foot = ifelse(preferred_foot == 'left', 1, 0),
    right_foot = ifelse(preferred_foot == 'right', 1, 0),
    attacking_work_rate = replace(
      attacking_work_rate, attacking_work_rate == 'None', NA
    ),
    attack_high = ifelse(attacking_work_rate == 'high', 1, 0),
    attack_medium = ifelse(attacking_work_rate == 'medium', 1, 0),
    defend_high = ifelse(defensive_work_rate == 'high', 1, 0),
    defend_medium = ifelse(defensive_work_rate == 'medium', 1, 0)
  )

player_2015 <- player_2015 %>%
  mutate(
    left_foot = ifelse(preferred_foot == 'left', 1, 0),
    right_foot = ifelse(preferred_foot == 'right', 1, 0),
    attacking_work_rate = replace(
      attacking_work_rate, attacking_work_rate == 'None', NA
    ),
    attack_high = ifelse(attacking_work_rate == 'high', 1, 0),
    attack_medium = ifelse(attacking_work_rate == 'medium', 1, 0),
    defend_high = ifelse(defensive_work_rate == 'high', 1, 0),
    defend_medium = ifelse(defensive_work_rate == 'medium', 1, 0)
  )

In [None]:
#### Jaja's code

## Preprocessing Team & Team Attributes
# Team table: keep only the two id columns (replaces the earlier `team`).
team <- data.frame(lDataFrames[6]) %>%
  select(id, team_api_id)

# Team attributes: cut the date string to its first 10 characters, then keep
# the ids, the date and the tactical attributes used later.
team_attr <- data.frame(lDataFrames[7]) %>%
  mutate(date = substr(date, 1, 10)) %>%
  select(
    id, team_api_id, date,
    buildUpPlaySpeed, buildUpPlayDribbling, buildUpPlayPassing,
    chanceCreationPassing, chanceCreationCrossing, chanceCreationShooting,
    chanceCreationPositioningClass,
    defencePressure, defenceAggression, defenceTeamWidth,
    defenceDefenderLineClass
  )

#filter season date
# String comparison on the date; which() drops rows whose date is missing.
team_attr_2014 <- team_attr[
  which(team_attr$date >= '2014-07-18' & team_attr$date <= '2015-05-31'),
]
team_attr_2015 <- team_attr[
  which(team_attr$date >= '2015-07-17' & team_attr$date <= '2016-05-25'),
]

# remove negative values
# Replace negative numbers by NA, column by column.
# Only numeric columns are checked: comparing a whole data frame with 0 also
# compares its character columns (date and the *Class columns) as strings,
# which depends on the locale's collation.
na_if_negative <- function(df) {
  numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  for (col in numeric_cols) {
    # which() ignores entries that are already NA
    negative <- which(df[[col]] < 0)
    df[[col]][negative] <- NA
  }
  df
}

team_attr_2014 <- na_if_negative(team_attr_2014)
team_attr_2015 <- na_if_negative(team_attr_2015)

# merge values of one team to one
# Collapse the values of one group into a single value.
#
# x: a vector holding one column's values for one group.
# Returns the mean of the non-missing values for numeric input (NA when every
# value is missing) and the first element for any other type.
aggregate_process <- function(x) {
  if (!is.numeric(x)) {
    return(x[1])
  }
  # mean(numeric(0)) would be NaN; report an all-missing group as NA instead
  if (all(is.na(x))) {
    return(NA_real_)
  }
  # a single missing record must not wipe out the whole average
  mean(x, na.rm = TRUE)
}

# Collapse the 2014 window to one row per team_api_id.
team_attr_2014_aggregate <- aggregate(
  x = team_attr_2014,
  by = list(team_attr_2014$team_api_id),
  FUN = aggregate_process
)
# aggregate() puts the grouping column first; drop it again
team_attr_2014 <- team_attr_2014_aggregate[-1]

# Same for the 2015 window.
team_attr_2015_aggregate <- aggregate(
  x = team_attr_2015,
  by = list(team_attr_2015$team_api_id),
  FUN = aggregate_process
)
team_attr_2015 <- team_attr_2015_aggregate[-1]

# remove outliers and normalize
# Blank out outliers and standardise every numeric feature column.
#
# data: a data frame.
# For each numeric column that is not an identifier, values more than two
# standard deviations away from the column mean are set to NA, and the column
# is then centred and scaled with scale(). Other columns are left untouched.
# Returns the modified data frame.
preprocess <- function(data) {
  for (i in seq_len(ncol(data))) {
    # Identifier columns are named "id" or end in "_id". A plain substring
    # match on "id" would also skip features such as "defenceTeamWidth".
    is_id <- grepl("(^|_)id$", names(data)[i])
    if (is.numeric(data[[i]]) && !is_id) {
      values <- data[[i]]
      # na.rm = TRUE: otherwise one missing value makes both statistics NA and
      # the outlier filter silently does nothing
      centre <- mean(values, na.rm = TRUE)
      spread <- sd(values, na.rm = TRUE)
      outlier <- values > centre + 2 * spread | values < centre - 2 * spread
      values[which(outlier)] <- NA
      # scale() returns a one-column matrix; store it as a plain vector
      data[[i]] <- as.numeric(scale(values))
    }
  }
  data
}

# Run preprocess() on both seasonal team attribute tables; each result
# replaces its input.
team_attr_2014 <- preprocess(team_attr_2014)
team_attr_2015 <- preprocess(team_attr_2015)

In [None]:
# Team-level features for 2014: average the player attributes of everyone who
# appeared for a team, then attach the team's own tactical attributes.
full_team_stat_14 <- team_players_2014 %>%
  left_join(player_2014, by = 'player_api_id') %>%
  select(-c('player_name.y', 'player_name.x', 'birthday', 'preferred_foot', 'date', 'id')) %>%
  # group by season, country, league and team (selected by column position)
  group_by_at(c(1:4, 6:8)) %>%
  # Positions 9:47 are counted among the non-grouping columns.
  # mean(..., na.rm = TRUE) skips missing values; the former
  # ave(., na.rm = T) treated `na.rm` as a grouping variable, so NAs were
  # never removed. One row per group results, so no distinct() is needed.
  summarise_at(vars(9:47), mean, na.rm = TRUE)
full_team_stat_14 <- full_team_stat_14 %>%
  left_join(team_attr_2014, by = 'team_api_id') %>%
  select(-c('chanceCreationPositioningClass', 'defenceDefenderLineClass'))

In [None]:
# Team-level features for 2015: average the player attributes of everyone who
# appeared for a team, then attach the team's own tactical attributes.
full_team_stat_15 <- team_players_2015 %>%
  left_join(player_2015, by = 'player_api_id') %>%
  select(-c('player_name.y', 'player_name.x', 'birthday', 'preferred_foot', 'date', 'id')) %>%
  # group by season, country, league and team (selected by column position)
  group_by_at(c(1:4, 6:8)) %>%
  # Positions 9:47 are counted among the non-grouping columns.
  # mean(..., na.rm = TRUE) skips missing values; the former
  # ave(., na.rm = T) treated `na.rm` as a grouping variable, so NAs were
  # never removed. One row per group results, so no distinct() is needed.
  summarise_at(vars(9:47), mean, na.rm = TRUE)
full_team_stat_15 <- full_team_stat_15 %>%
  left_join(team_attr_2015, by = 'team_api_id') %>%
  select(-c('chanceCreationPositioningClass', 'defenceDefenderLineClass'))

In [None]:
# Country whose league is modelled. This rebinds `country`, which until here
# held the country lookup table.
country <- "Spain"

In [None]:
# 2014 team features of the selected country, without the season/league labels
# and the team-attribute row id.
full_team_stat_14_league <- full_team_stat_14 %>%
  ungroup() %>%
  filter(season_start == 2014, country_name == country) %>%
  select(-c('season_start', 'season_end', 'country_name', 'league_name', 'id'))
full_team_stat_14_league

In [None]:
# 2015 team features of the selected country, without the season/league labels
# and the team-attribute row id.
full_team_stat_15_league <- full_team_stat_15 %>%
  ungroup() %>%
  filter(season_start == 2015, country_name == country) %>%
  select(-c('season_start', 'season_end', 'country_name', 'league_name', 'id'))
full_team_stat_15_league

In [None]:
# Matches of the 2014/2015 season.
match_14 <- match %>% filter(season == '2014/2015')
# Attach the 2014 team features twice: once for the home team and once for the
# away team. right_join() keeps every row of the team-feature table.
# The columns are then picked by position.
# NOTE(review): 7:11 presumably are the match/team ids and both goal counts,
# 44:92 the home-team features and 95:143 the away-team features — confirm
# against colnames() of the joined table. The two remaining date columns are
# dropped by name.
match_stat_14 <- match_14 %>% right_join(full_team_stat_14_league, by = c('home_team_api_id'='team_api_id')) %>%
                right_join(full_team_stat_14_league, by = c('away_team_api_id'='team_api_id')) %>%
                select_at(c(7:11, 44:92, 95:143)) %>%
                select(-c('date', 'date.y'))

In [None]:
# Matches of the 2015/2016 season. They are paired with the 2015 team features
# below and serve as the evaluation set, so they must not be the 2014/2015
# matches the model is fitted on.
match_15 <- match %>% filter(season == '2015/2016')
# Attach the 2015 team features for the home and for the away team, pick the
# columns by position, drop the two remaining date columns and every row that
# still contains a missing value.
# NOTE(review): the positional ranges assume the same column layout as the
# joined 2014 table — confirm against colnames() of the joined table.
match_stat_15 <- match_15 %>%
  right_join(full_team_stat_15_league, by = c('home_team_api_id' = 'team_api_id')) %>%
  right_join(full_team_stat_15_league, by = c('away_team_api_id' = 'team_api_id')) %>%
  select_at(c(7:11, 44:92, 95:143)) %>%
  select(-c('date', 'date.y')) %>%
  drop_na()

In [None]:
# Binary outcome: 1 when the home team scored more goals than the away team.
match_stat_14$result <- with(
  match_stat_14,
  ifelse(away_team_goal < home_team_goal, 1, 0)
)
# free-standing copy of the 2014 outcome, used when building the formula
result <- match_stat_14$result

match_stat_15$result <- with(
  match_stat_15,
  ifelse(away_team_goal < home_team_goal, 1, 0)
)
# Three-way outcome: 1 = home win, 0.5 = draw, 0 = away win.
match_stat_15$result_2 <- with(
  match_stat_15,
  ifelse(away_team_goal < home_team_goal, 1,
         ifelse(away_team_goal == home_team_goal, 0.5, 0))
)

In [None]:
# Correlation matrix of all columns of match_stat_14, computed on the rows
# without any missing value (use = "complete.obs").
cor(match_stat_14, use = "complete.obs")

In [None]:
# Show the names of columns 6 to 98, the ones used as predictors below.
names(match_stat_14)[6:98]

In [None]:
# Put the outcome in front of predictor columns 6 to 98 and build the formula
# `result ~ <all predictors>` from the column names.
d <- cbind(result, match_stat_14[6:98])
lmf <- reformulate(termlabels = names(d)[-1], response = names(d)[1])

# Logistic regression on the 2014 matches; rows with missing values are omitted.
model.fit <- glm(
  formula = lmf,
  family = "binomial",
  data = match_stat_14,
  na.action = na.omit
)

In [None]:
# Names of the coefficients that could be estimated (non-NA), without the
# first entry of that list.
nn <- names(which(!is.na(coef(model.fit))))[-1]
nn

In [None]:
# Refit the logistic regression using only the predictors kept in `nn`.
lmf <- reformulate(termlabels = nn, response = "result")
model.fit <- glm(
  formula = lmf,
  family = "binomial",
  data = match_stat_14,
  na.action = na.omit
)

In [None]:
# Predicted probabilities (response scale) for the rows of match_stat_15.
model.prob <- predict(model.fit, newdata = match_stat_15, type = "response")

In [None]:
# conf_mat() and its autoplot() method come from yardstick, which is otherwise
# attached only in a later cell.
library(yardstick)

# Predict a home win (1) when the predicted probability exceeds 0.5.
model.pred <- rep(0, nrow(match_stat_15))
model.pred[model.prob > .5] <- 1

# Cross-tabulate predictions against the observed outcome. Both factors share
# the same levels, so the table stays square even when one class is never
# predicted; conf_mat() needs matching row and column levels.
a <- table(
  Prediction = factor(model.pred, levels = c(0, 1)),
  Truth = factor(match_stat_15$result, levels = c(0, 1))
)
# share of correctly predicted matches
mean(model.pred == match_stat_15$result)

# The confusion matrix from a single assessment set (i.e. fold)
cm <- conf_mat(a)
autoplot(cm, type = "heatmap") +
  scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
  theme(text = element_text(size = 20))

In [None]:
# Show all column names of match_stat_14.
names(match_stat_14)

In [None]:
# Attach the plotting/metric packages before their first use in this cell.
library(yardstick)
library(ggplot2)

# Three-way prediction: away win (0) up to 0.45, draw (0.5) above 0.45,
# home win (1) above 0.55.
model.pred <- rep(0, nrow(match_stat_15))
model.pred[model.prob > .45] <- 0.5
model.pred[model.prob > .55] <- 1
# Shared factor levels keep the table square even when a class (typically the
# draw) is never predicted; conf_mat() needs matching row and column levels.
a <- table(
  Prediction = factor(model.pred, levels = c(0, 0.5, 1)),
  Truth = factor(match_stat_15$result_2, levels = c(0, 0.5, 1))
)
mean(model.pred == match_stat_15$result_2)

# Binary prediction with the 0.5 cut-off, for comparison.
model.pred <- rep(0, nrow(match_stat_15))
model.pred[model.prob > .5] <- 1
table(model.pred, match_stat_15$result)
mean(model.pred == match_stat_15$result)

# The confusion matrix from a single assessment set (i.e. fold)
# NOTE(review): `a` is the three-way table built above, so the heatmap shows
# the three-way result although the binary predictions were computed last —
# confirm which one is meant to be plotted.
cm <- conf_mat(a)
autoplot(cm, type = "heatmap") +
  scale_fill_gradient(low = "#D6EAF8", high = "#2E86C1") +
  theme(text = element_text(size = 20))

In [None]:
library(vcdExtra)
# Run vcdExtra's HLtest() (Hosmer-Lemeshow goodness-of-fit test) on the refitted
# logistic regression model.
HLtest(model = model.fit)