# Import data

In [None]:
library(tidyverse)
library(reshape2)

In [None]:
library("RSQLite")

## connect to db
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "../input/soccer/database.sqlite")

## list all tables
tables <- dbListTables(con)

## exclude sqlite_sequence (SQLite bookkeeping table, not part of the data)
tables <- tables[tables != "sqlite_sequence"]

## create a data.frame for each table
## The result is an unnamed list in the same order as `tables`; later cells
## pick tables by position (e.g. lDataFrames[3]). The list is deliberately left
## unnamed: data.frame() on a named one-element list would prefix every column
## name with the element name.
lDataFrames <- lapply(tables, function(tbl) {
  dbGetQuery(conn = con, statement = paste0("SELECT * FROM '", tbl, "'"))
})

## everything is in memory now, so release the database handle
dbDisconnect(con)



# Rank the teams

In [None]:
# Take the third loaded table (position in lDataFrames, i.e. order of `tables`)
# as a data.frame and display it.
# NOTE(review): the index is positional; presumably this is the match table --
# confirm against the table order returned by dbListTables().
match <- data.frame(lDataFrames[3])
match


In [None]:
# Extract the third and sixth loaded tables as data.frames.
# NOTE(review): both indices are positions in lDataFrames; presumably these are
# the match and team tables -- confirm against the table order.
# Note that `match` shadows base::match() for the rest of the session.
match <- data.frame(lDataFrames[3])
team <- data.frame(lDataFrames[6])
# head(match,2)
# head(team,2)


In [None]:
# Keep only the identifiers, season/date and final goal counts of each match.
match <- select(
  match,
  match_api_id, league_id, season, date,
  home_team_api_id, away_team_api_id,
  home_team_goal, away_team_goal
)
# Keep the team key and its display name.
team <- select(team, team_api_id, team_long_name)


In [None]:
# Convert a goal difference into a result weight.
#
# Args:
#   x: numeric goal difference(s) from one team's point of view
#      (positive = win, zero = draw, negative = loss). May be a single value
#      or a vector.
#
# Returns:
#   A numeric vector the same length as `x`: 1 for a win, 1/3 for a draw,
#   0 for a loss, and NA where `x` is NA.
scoring <- function(x) {
  # Weights indexed by sign(x) + 2: loss -> 1, draw -> 2, win -> 3.
  weights <- c(0, 1 / 3, 1)
  weights[sign(x) + 2L]
}

In [None]:
# Goal difference from each side's point of view: the away score is simply the
# negated home score.
match <- mutate(
  match,
  home_score = home_team_goal - away_team_goal,
  away_score = -home_score
)
head(match)

In [None]:
# Map every goal difference to its result weight via scoring(), one value per
# match, for the home and the away side respectively.
match$home_win_rate <- vapply(match$home_score, scoring, FUN.VALUE = numeric(1))
match$away_win_rate <- vapply(match$away_score, scoring, FUN.VALUE = numeric(1))
head(match)


In [None]:
# Split each match into one row per side, using the same column names
# (team_api_id, score, win_rate) for both so the two frames can be stacked.
match_home <- select(
  match,
  match_api_id, league_id, season, date,
  team_api_id = home_team_api_id,
  score = home_score,
  win_rate = home_win_rate
)
match_away <- select(
  match,
  match_api_id, league_id, season, date,
  team_api_id = away_team_api_id,
  score = away_score,
  win_rate = away_win_rate
)

In [None]:
# Stack the home and away rows into one long table (two rows per match) and
# convert the date column to Date.
total <- rbind(match_home, match_away)
total[["date"]] <- as.Date(total[["date"]])
total %>%
  arrange(match_api_id) %>%
  head()

In [None]:
# Alternative split by calendar year, kept for reference:
# match14 = with(total, total[(date >= "2014-01-01" & date < "2015-01-01"),])
# match15 = with(total, total[(date >= "2015-01-01" & date < "2016-01-01"),])

# Split by the season label instead of by calendar year.
match14 <- total[total$season == "2014/2015", ]
match15 <- total[total$season == "2015/2016", ]
head(match14)

In [None]:
# The season is fixed within each frame, so drop `date` and `season` and keep
# every other column in its original order.
df14 <- match14[, setdiff(names(match14), c("date", "season")), drop = FALSE]
df15 <- match15[, setdiff(names(match15), c("date", "season")), drop = FALSE]
head(df14)

In [None]:
# Mean goal difference per team for each season; aggregate() names the value
# column `x`, which is renamed to `score`.
team_ranking14_byscore <- rename(
  aggregate(df14$score, by = list(team_api_id = df14$team_api_id), FUN = mean),
  score = x
)
team_ranking15_byscore <- rename(
  aggregate(df15$score, by = list(team_api_id = df15$team_api_id), FUN = mean),
  score = x
)

In [None]:
# Mean result weight per team for each season; aggregate() names the value
# column `x`, which is renamed to `win_rate`.
team_ranking14_bywin <- rename(
  aggregate(df14$win_rate, by = list(team_api_id = df14$team_api_id), FUN = mean),
  win_rate = x
)
team_ranking15_bywin <- rename(
  aggregate(df15$win_rate, by = list(team_api_id = df15$team_api_id), FUN = mean),
  win_rate = x
)

In [None]:
# Preview the first rows of the per-team win-rate table (team_api_id, win_rate).
head(team_ranking14_bywin)

In [None]:
# Build one ranking table per season: team columns + mean score + mean
# win rate + league_id, ordered by win rate (highest first).
#
# Every merge() here is a plain inner join on team_api_id. The previous calls
# passed `all.team_ranking14_byscore = TRUE`, which is not a merge() argument
# (the real ones are all / all.x / all.y); it was silently ignored, so removing
# it leaves the result unchanged while dropping a misleading option.
#
# merge() re-sorts its result by the key on every call, so sorting between the
# merges had no effect; the ordering is applied once, after the last merge.
# match14 / match15 hold one row per team per match, so the league lookup
# repeats each team many times; distinct() collapses the duplicates.
team_ranking14 <- team %>%
  merge(team_ranking14_byscore, by = "team_api_id") %>%
  merge(team_ranking14_bywin, by = "team_api_id") %>%
  merge(subset(match14, select = c(team_api_id, league_id)), by = "team_api_id") %>%
  arrange(-win_rate) %>%
  distinct()

team_ranking15 <- team %>%
  merge(team_ranking15_byscore, by = "team_api_id") %>%
  merge(team_ranking15_bywin, by = "team_api_id") %>%
  merge(subset(match15, select = c(team_api_id, league_id)), by = "team_api_id") %>%
  arrange(-win_rate) %>%
  distinct()




In [None]:
# Preview the first rows of each season's team ranking table.
head(team_ranking14)
head(team_ranking15)

# Rank by players


Code from John:

In [None]:
# create dataframes
# The indices are positions in lDataFrames. `match` and `team` are rebuilt from
# the raw tables here, replacing the versions modified in earlier cells.
country <- data.frame(lDataFrames[1])
league <- data.frame(lDataFrames[2])
match <- data.frame(lDataFrames[3])
player <- data.frame(lDataFrames[4])
team <- data.frame(lDataFrames[6])

# select relevant columns
# use country_id as key for join
country <- select(country, country_id = id, country_name = name)
# use country_id as key for join
league <- select(league, country_id, league_name = name)
# Column order matters: the first 11 columns are match-level fields and the
# next 22 are the line-up slots.
match <- select(
  match,
  id, country_id, league_id, season, stage, date, match_api_id,
  home_team_api_id, away_team_api_id, home_team_goal, away_team_goal,
  home_player_1, home_player_2, home_player_3, home_player_4,
  home_player_5, home_player_6, home_player_7, home_player_8,
  home_player_9, home_player_10, home_player_11,
  away_player_1, away_player_2, away_player_3, away_player_4,
  away_player_5, away_player_6, away_player_7, away_player_8,
  away_player_9, away_player_10, away_player_11,
  goal, shoton, shotoff, foulcommit, card, cross, corner, possession
)
# use player_api_id as key for join
player <- player %>% select(player_api_id, player_name)
# use team_api_id as key for join
team <- team %>% select(team_api_id, team_long_name, team_short_name)

In [None]:
# Column names by position in `match`: the first 11 are the id variables,
# columns 12 to 33 are the 22 line-up slots.
match_cols <- names(match)
others <- match_cols[1:11]
players <- match_cols[12:33]
rm(match_cols)

In [None]:
# flatten the data
# One row per (match, line-up slot): the slot columns are melted into
# `variable` (slot name) and `player_api_id` (value), dropping empty slots.
# The team is taken from the home or away id depending on the slot name, then
# country, league, team and player names are attached, and the season label is
# split at "/" into two converted columns.
flatten <- match %>%
  melt(
    id.vars = others,
    measure.vars = players,
    na.rm = TRUE,
    value.name = "player_api_id"
  ) %>%
  mutate(
    team_api_id = ifelse(
      grepl("home", variable),
      home_team_api_id,
      ifelse(grepl("away", variable), away_team_api_id, NA)
    )
  ) %>%
  left_join(country, by = "country_id") %>%
  left_join(league, by = "country_id") %>%
  left_join(team, by = "team_api_id") %>%
  left_join(player, by = "player_api_id") %>%
  separate(
    season,
    into = c("season_start", "season_end"),
    sep = "/",
    convert = TRUE
  )
head(flatten)

In [None]:
# Reduce the flattened table to the season, location, team and player columns.
team_players <- flatten %>%
  select(
    season_start, season_end,
    country_name, league_name,
    player_api_id, team_api_id,
    team_long_name, team_short_name,
    player_name
  )
head(team_players)

In [None]:
# Player/team rows for the season starting in 2015, then for the one starting
# in 2014, each followed by a preview.
team_players_2015 <- filter(team_players, season_start == 2015)
head(team_players_2015)
team_players_2014 <- filter(team_players, season_start == 2014)
head(team_players_2014)

## Spearman Correlation:

In [None]:
# Player attribute snapshots (fifth loaded table, by position); only the
# columns used below are kept.
player_att <- data.frame(lDataFrames[5])
player_att <- player_att %>% select(player_api_id, date, overall_rating, potential)

# Snapshots inside each season window, with the date column dropped afterwards.
# NOTE(review): `date` is compared against string literals as loaded; if it
# carries a time suffix, rows dated exactly on the upper bound may be
# excluded -- confirm.
player_att14 <- player_att[
  player_att$date >= "2014-07-18" & player_att$date <= "2015-05-31",
] %>%
  select(-date)
player_att15 <- player_att[
  player_att$date >= "2015-07-17" & player_att$date <= "2016-05-25",
] %>%
  select(-date)

# ---- season 2014 ----
# Average potential and overall_rating by player over the season.
# NOTE(review): means are taken without na.rm, so a missing rating propagates
# into the player's and then the team's average -- confirm this is intended.
player14 <- summarise_all(group_by(player_att14, player_api_id), "mean")

# Match players with teams (one row per distinct player/team pair).
team14 <- team_players_2014 %>%
  select(player_api_id, team_api_id) %>%
  merge(player14, by = "player_api_id") %>%
  distinct()

# Average player of each team for that season.
player_ranking14 <- team14 %>%
  select(-player_api_id) %>%
  group_by(team_api_id) %>%
  summarise_all("mean")

# Team win rate next to the team's average player ratings, best first.
final14 <- merge(team_ranking14, player_ranking14, by = "team_api_id") %>%
  arrange(-win_rate)

# ---- season 2015 ----
player15 <- summarise_all(group_by(player_att15, player_api_id), "mean")

team15 <- team_players_2015 %>%
  select(player_api_id, team_api_id) %>%
  merge(player15, by = "player_api_id") %>%
  distinct()

player_ranking15 <- team15 %>%
  select(-player_api_id) %>%
  group_by(team_api_id) %>%
  summarise_all("mean")

final15 <- merge(team_ranking15, player_ranking15, by = "team_api_id") %>%
  arrange(-win_rate)

In [None]:
# Preview the first rows of each season's combined win-rate / player-rating table.
head(final14)
head(final15)

In [None]:
library(ggplot2)

# Scatter plots of team win rate against the team's average player rating,
# each with a straight-line (lm) fit and no confidence band.

# final14: overall rating, then potential
ggplot(data = final14, mapping = aes(x = win_rate, y = overall_rating)) +
  geom_point(size = 4, color = "#2980B9") +
  geom_smooth(method = lm, color = "#2C3E50", se = FALSE, fullrange = TRUE)
ggplot(data = final14, mapping = aes(x = win_rate, y = potential)) +
  geom_point(size = 4, color = "#2980B9") +
  geom_smooth(method = lm, color = "#2C3E50", se = FALSE, fullrange = TRUE)

# final15: overall rating, then potential
ggplot(data = final15, mapping = aes(x = win_rate, y = overall_rating)) +
  geom_point(size = 4, color = "#2980B9") +
  geom_smooth(method = lm, color = "#2C3E50", se = FALSE, fullrange = TRUE)
ggplot(data = final15, mapping = aes(x = win_rate, y = potential)) +
  geom_point(size = 4, color = "#2980B9") +
  geom_smooth(method = lm, color = "#2C3E50", se = FALSE, fullrange = TRUE)

In [None]:
# Correlate by Overall-rating and Winning-rate
# Spearman rank correlation between team win rate and the team's average
# overall rating, one test per season table.
# Both vectors of a test must come from the same table: the second test
# previously paired final15$win_rate with final14$overall_rating, i.e. rows of
# two different seasons.
# NOTE(review): per the cor.test documentation conf.level is only used for the
# Pearson coefficient, so it has no effect here; kept for parity.
corr14 <- cor.test(
  x = final14$win_rate, y = final14$overall_rating,
  method = "spearman", conf.level = 0.95
)
corr15 <- cor.test(
  x = final15$win_rate, y = final15$overall_rating,
  method = "spearman", conf.level = 0.95
)

corr14
corr15

In [None]:
# Correlate by Potential and Winning-rate
# Spearman rank correlation between team win rate and the team's average
# potential, one test per season table. These assignments overwrite any
# earlier corr14 / corr15 values.
# Both vectors of a test must come from the same table: the second test
# previously paired final15$win_rate with final14$potential, i.e. rows of two
# different seasons.
# NOTE(review): per the cor.test documentation conf.level is only used for the
# Pearson coefficient, so it has no effect here; kept for parity.
corr14 <- cor.test(
  x = final14$win_rate, y = final14$potential,
  method = "spearman", conf.level = 0.95
)
corr15 <- cor.test(
  x = final15$win_rate, y = final15$potential,
  method = "spearman", conf.level = 0.95
)

corr14
corr15
