In [None]:
library(tidyverse)
library(reshape2)

In [None]:
library("RSQLite")

## Open the SQLite database file.
con <- dbConnect(
  drv = RSQLite::SQLite(),
  dbname = "../input/soccer/database.sqlite"
)

## Collect the table names, leaving out the sqlite_sequence entry.
tables <- dbListTables(con)
tables <- tables[tables != "sqlite_sequence"]

## Read every remaining table in full. The result is an unnamed list of
## data frames in the same order as `tables`, so later code can keep
## addressing tables by position.
lDataFrames <- lapply(tables, function(table_name) {
  dbGetQuery(
    conn = con,
    statement = paste0("SELECT * FROM '", table_name, "'")
  )
})



In [None]:
# Pull two tables out of the loaded list by position. The positions follow
# the order of `tables` (as returned by dbListTables after dropping
# sqlite_sequence) -- presumably 3 is the match table and 6 the team table;
# verify against `tables` if the database changes.
team <- data.frame(lDataFrames[6L])
match <- data.frame(lDataFrames[3L])
# head(match,2)
# head(team,2)


In [None]:
# Keep only the columns used below: match identity, date, the two team ids
# and the goals each side scored.
match <- match[c(
  "match_api_id",
  "date",
  "home_team_api_id",
  "away_team_api_id",
  "home_team_goal",
  "away_team_goal"
)]

# For teams, keep the id and the long name only.
team <- team[c("team_api_id", "team_long_name")]


In [None]:
# Goal difference from the home side's point of view; the away side's score
# is the same number with the sign flipped.
match[["home_score"]] <- match[["home_team_goal"]] - match[["away_team_goal"]]
match[["away_score"]] <- -match[["home_score"]]

# 1 when that side won (positive goal difference), 0 otherwise. A draw gives
# 0 for both sides.
match[["home_win_rate"]] <- as.integer(match[["home_score"]] > 0)
match[["away_win_rate"]] <- as.integer(match[["away_score"]] > 0)
head(match, n = 6)


In [None]:
# Split each match into a home-side row and an away-side row that share the
# same column names (team_api_id, score, win_rate) so they can be stacked.
match_home <- select(
  match,
  match_api_id,
  date,
  team_api_id = home_team_api_id,
  score = home_score,
  win_rate = home_win_rate
)
match_away <- select(
  match,
  match_api_id,
  date,
  team_api_id = away_team_api_id,
  score = away_score,
  win_rate = away_win_rate
)

In [None]:
# Stack the home and away rows: one row per team per match.
total <- rbind(match_home, match_away)

# Convert the date column to Date so it can be compared against date bounds.
total[["date"]] <- as.Date(total[["date"]])

# Preview: sorted by match id, the two rows of a match appear together.
total %>%
  arrange(match_api_id) %>%
  head()

In [None]:
# Rows dated within 2014: from 2014-01-01 inclusive up to, but not
# including, 2015-01-01.
match14 <- total[
  total$date >= as.Date("2014-01-01") & total$date < as.Date("2015-01-01"),
]

# Rows dated within 2015, using the same half-open interval.
match15 <- total[
  total$date >= as.Date("2015-01-01") & total$date < as.Date("2016-01-01"),
]
                            

In [None]:
# The date has served its purpose for the yearly split; drop it and keep
# every other column.
df14 <- match14[, names(match14) != "date", drop = FALSE]
df15 <- match15[, names(match15) != "date", drop = FALSE]

In [None]:
# Mean goal difference per team for each year. Passing one-column data
# frames keeps the column names (team_api_id, score) in the result.
team_ranking14_byscore <- aggregate(
  df14["score"],
  by = df14["team_api_id"],
  FUN = mean
)
team_ranking15_byscore <- aggregate(
  df15["score"],
  by = df15["team_api_id"],
  FUN = mean
)

In [None]:
# Share of matches won per team for each year: the mean of the 0/1 win flag.
# Passing one-column data frames keeps the column names (team_api_id,
# win_rate) in the result.
team_ranking14_bywin <- aggregate(
  df14["win_rate"],
  by = df14["team_api_id"],
  FUN = mean
)
team_ranking15_bywin <- aggregate(
  df15["win_rate"],
  by = df15["team_api_id"],
  FUN = mean
)

In [None]:
# Attach team names to the per-team mean score.
# The previous calls passed `all.team_ranking14_byscore = TRUE`, which is
# not an argument of merge(): it was absorbed by `...` and ignored, so the
# join silently dropped any ranked team missing from `team`. `all.y = TRUE`
# is presumably what was intended: keep every ranked team, with an NA name
# when `team` has no matching row.
team_ranking14 <- merge(
  team, team_ranking14_byscore,
  by = "team_api_id", all.y = TRUE
)
team_ranking15 <- merge(
  team, team_ranking15_byscore,
  by = "team_api_id", all.y = TRUE
)

# Add the win rate and order the teams from highest to lowest win rate.
team_ranking14 <- merge(
  team_ranking14, team_ranking14_bywin,
  by = "team_api_id"
) %>%
  arrange(desc(win_rate))
team_ranking15 <- merge(
  team_ranking15, team_ranking15_bywin,
  by = "team_api_id"
) %>%
  arrange(desc(win_rate))




In [None]:
# Show the six top-ranked teams (by win rate) for each year.
head(team_ranking14, n = 6L)
head(team_ranking15, n = 6L)

In [None]:
# Preprocessing Team & Team Attributes
# Rebuild `team` and `team_attr` from the loaded tables (positions 6 and 7
# of lDataFrames -- these depend on the order of `tables`).
team <- data.frame(lDataFrames[6])
team_attr <- data.frame(lDataFrames[7])

team <- team %>% select(team_api_id, team_long_name)
team_attr <- team_attr %>% select(-c(id, team_fifa_api_id))

# Text columns become factors.
is_text <- vapply(team_attr, is.character, logical(1))
team_attr[is_text] <- lapply(team_attr[is_text], as.factor)

# Numeric columns to clean. The team identifier is excluded: previously
# every numeric column was standardised and outlier-masked, which would
# also rewrite a numeric `team_api_id` key (assuming team_attr carries that
# column, as `team` does) and make it unusable for joining back to `team`.
is_measure <- vapply(team_attr, is.numeric, logical(1)) &
  !(names(team_attr) %in% "team_api_id")

# One pass per column:
#   1. negative values are treated as missing,
#   2. the column is standardised (mean 0, sd 1),
#   3. values more than 2 standard deviations from the mean become NA,
#   4. the result is a plain numeric vector (scale() returns a matrix).
team_attr[is_measure] <- lapply(team_attr[is_measure], function(x) {
  x[which(x < 0)] <- NA
  z <- as.numeric(scale(x))
  z[which(z > 2 | z < -2)] <- NA
  z
})
str(team_attr)