In [1]:
# Load the raw FIFA 23 player and team tables from the working directory.
players23 <- read.csv(file = "./fifa23_players_data.csv")
teams23 <- read.csv(file = "./fifa23_teams_data.csv")

In [2]:
# Columns of the raw player table that the analysis keeps.
relevantCols <- c(
  "Overall", "Value.in.Euro.", "Best.Position",
  "Height.in.cm.", "Weight.in.kg.",
  "Club.Name", "Club.Jersey.Number",
  "Crossing", "Sprint.Speed", "Acceleration",
  "Preferred.Foot", "Skill.Moves"
)

# Column-only selection: no rows are filtered or altered at this step.
# NOTE(review): the data is assumed to be already cleaned upstream -- confirm.
playersCleaned23 <- players23[relevantCols]

# Preview the first rows, then show the table's dimensions.
head(playersCleaned23)
dim(playersCleaned23)

Overall,Value.in.Euro.,Best.Position,Height.in.cm.,Weight.in.kg.,Club.Name,Club.Jersey.Number,Crossing,Sprint.Speed,Acceleration,Preferred.Foot,Skill.Moves
91,54000000,CAM,169,67,Paris Saint-Germain,30,84,76,87,Left,4
91,64000000,CF,185,81,Real Madrid CF,9,75,80,79,Right,4
91,84000000,ST,185,81,FC Barcelona,9,71,75,76,Right,4
91,107500000,CM,181,70,Manchester City,17,94,73,76,Right,4
91,190500000,ST,182,73,Paris Saint-Germain,7,78,97,97,Right,5
90,115500000,RW,175,71,Liverpool,11,80,91,89,Left,4


In [3]:
# Keep only the club name (used as the join key below) and the IntPrestige column.
teamsCleaned23 <- teams23[c("Name", "IntPrestige")]
head(teamsCleaned23)

Name,IntPrestige
AFC Bournemouth,1
AFC Richmond,3
Arsenal,7
Aston Villa,3
Brentford,1
Brighton & Hove Albion,2


In [4]:
# Left join players to teams on club name: every player row is kept, and
# players whose club has no match in teamsCleaned23 get NA for IntPrestige.
# NOTE(review): if a club name occurs more than once in teamsCleaned23, the
# matching player rows are duplicated -- confirm Name is unique.
playersTeamCombined <- merge(
  x = playersCleaned23,
  y = teamsCleaned23,
  by.x = "Club.Name",
  by.y = "Name",
  all.x = TRUE
)
head(playersTeamCombined)
dim(playersTeamCombined)

Club.Name,Overall,Value.in.Euro.,Best.Position,Height.in.cm.,Weight.in.kg.,Club.Jersey.Number,Crossing,Sprint.Speed,Acceleration,Preferred.Foot,Skill.Moves,IntPrestige
1. FC Heidenheim 1846,68,2400000,RM,175,70,16,67,84,85,Right,3,1
1. FC Heidenheim 1846,67,2100000,CB,186,80,27,42,61,65,Right,2,1
1. FC Heidenheim 1846,68,1000000,CB,178,73,8,60,55,71,Right,2,1
1. FC Heidenheim 1846,71,1800000,RB,182,80,2,69,93,83,Right,2,1
1. FC Heidenheim 1846,67,1900000,CB,187,80,4,48,63,59,Left,2,1
1. FC Heidenheim 1846,67,1100000,LM,176,72,17,62,80,88,Right,4,1


# Data Cleaning

Extract the relevant columns and rename them

In [5]:
# Columns kept from the merged player/team table. Club.Name is no longer
# listed; IntPrestige (brought in by the merge) is.
relevantCols <- c(
  "Overall",
  "Value.in.Euro.",
  "Best.Position",
  "Height.in.cm.",
  "Weight.in.kg.",
  "Club.Jersey.Number",
  "Crossing",
  "Sprint.Speed",
  "Acceleration",
  "IntPrestige",
  "Preferred.Foot",
  "Skill.Moves"
)

In [6]:
# Project the merged table onto the columns listed in relevantCols.
playersTeamCleaned <- playersTeamCombined[relevantCols]
head(playersTeamCleaned)
dim(playersTeamCleaned)

Overall,Value.in.Euro.,Best.Position,Height.in.cm.,Weight.in.kg.,Club.Jersey.Number,Crossing,Sprint.Speed,Acceleration,IntPrestige,Preferred.Foot,Skill.Moves
68,2400000,RM,175,70,16,67,84,85,1,Right,3
67,2100000,CB,186,80,27,42,61,65,1,Right,2
68,1000000,CB,178,73,8,60,55,71,1,Right,2
71,1800000,RB,182,80,2,69,93,83,1,Right,2
67,1900000,CB,187,80,4,48,63,59,1,Left,2
67,1100000,LM,176,72,17,62,80,88,1,Right,4


In [7]:
# Replace the raw CSV headers with short names. The assignment is positional,
# so the order here must match the current column order of playersTeamCleaned.
names(playersTeamCleaned) <- c(
  "Overall", "Value", "BestPosition", "Height", "Weight", "JerseyNumber",
  "Crossing", "Speed", "Acceleration", "IntPrestige", "PreferredFoot",
  "SkillMoves"
)
head(playersTeamCleaned)

Overall,Value,BestPosition,Height,Weight,JerseyNumber,Crossing,Speed,Acceleration,IntPrestige,PreferredFoot,SkillMoves
68,2400000,RM,175,70,16,67,84,85,1,Right,3
67,2100000,CB,186,80,27,42,61,65,1,Right,2
68,1000000,CB,178,73,8,60,55,71,1,Right,2
71,1800000,RB,182,80,2,69,93,83,1,Right,2
67,1900000,CB,187,80,4,48,63,59,1,Left,2
67,1100000,LM,176,72,17,62,80,88,1,Right,4


Remove missing (NA) values

In [8]:
# Count the missing (NA) values in each column.
colSums(is.na(playersTeamCleaned))

In [9]:
# Remove rows with missing values. The target is the missing IntPrestige
# values, but note that na.omit() drops a row when ANY column is NA, not
# only IntPrestige.
playersTeamCleaned <- na.omit(playersTeamCleaned)
# Re-count NAs per column to confirm none remain.
colSums(is.na(playersTeamCleaned))

In [10]:
# Drop players whose Value is 0; keep every row with a non-zero Value.
hasValue <- playersTeamCleaned$Value != 0
playersTeamCleaned <- playersTeamCleaned[hasValue, ]

In [11]:
# Per-column summary of the cleaned table.
summary(playersTeamCleaned)

    Overall          Value            BestPosition      Height     
 Min.   :47.00   Min.   :     9000   CB     :3632   Min.   :155.0  
 1st Qu.:62.00   1st Qu.:   500000   ST     :2548   1st Qu.:177.0  
 Median :66.00   Median :  1000000   CAM    :2300   Median :182.0  
 Mean   :65.83   Mean   :  2894661   GK     :2039   Mean   :181.5  
 3rd Qu.:70.00   3rd Qu.:  2000000   RM     :1434   3rd Qu.:186.0  
 Max.   :91.00   Max.   :190500000   CDM    :1395   Max.   :206.0  
                                     (Other):5059                  
     Weight        JerseyNumber      Crossing         Speed      
 Min.   : 49.00   7      :  598   Min.   : 6.00   Min.   :15.00  
 1st Qu.: 70.00   8      :  598   1st Qu.:39.00   1st Qu.:57.00  
 Median : 75.00   11     :  591   Median :54.00   Median :68.00  
 Mean   : 75.17   4      :  586   Mean   :49.48   Mean   :64.85  
 3rd Qu.: 80.00   6      :  584   3rd Qu.:63.00   3rd Qu.:75.00  
 Max.   :105.00   10     :  578   Max.   :94.00   Max.   :97

In [12]:
# Rows and columns remaining after cleaning.
dim(playersTeamCleaned)

# Skill Moves vs Overall

## Normality Assumption Check

In [13]:
library("repr")
# library("ggplot2")

In [14]:
# Checking normality for Overall: look at the number of rows first, since the
# sample size decides whether shapiro.test() can be applied to the full data.
dim(playersTeamCleaned)

Since the sample size exceeds 5000, the Shapiro-Wilk test cannot be used here (R's `shapiro.test` accepts at most 5000 observations). By the Central Limit Theorem, the sampling distribution of each group's mean can be approximated as normal.

In [15]:
# checking normality for each group skill moves 1 - 5

In [16]:
# Five-number summary (plus mean) of SkillMoves, showing the range of group labels.
summary(playersTeamCleaned$SkillMoves)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   2.000   2.366   3.000   5.000 

In [17]:
# Overall ratings split by Skill Moves level, one vector per level (1 to 5).
skillMoves1 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 1]
skillMoves2 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 2]
skillMoves3 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 3]
skillMoves4 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 4]
skillMoves5 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 5]

# Group sizes, in order of Skill Moves level.
length(skillMoves1)
length(skillMoves2)
length(skillMoves3)
length(skillMoves4)
length(skillMoves5)

In [18]:
# Overall ratings of players with Skill Moves = 5, followed by a Shapiro-Wilk
# normality test on that group and the group's size.
skillMoves5 <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == 5]
shapiro.test(skillMoves5)
length(skillMoves5)


	Shapiro-Wilk normality test

data:  skillMoves5
W = 0.96312, p-value = 0.1133


Since each of the Skill Moves groups 1-5 has more than 30 samples, by the CLT their group means can be assumed to be approximately normal.

## Homogeneity of Variance

In [19]:
# F test setup: the five Skill Moves levels and every unordered pair of them.
group <- as.numeric(1:5)
groupC <- combn(x = group, m = 2, simplify = FALSE)

In [20]:
# Pairwise F tests (var.test) comparing the variance of Overall between every
# pair of Skill Moves levels listed in groupC. Results are built with vapply()
# rather than by growing vectors with append() inside a loop, and the p-value
# is read by name ($p.value) rather than by list position.

# Label for each pair, e.g. "1 2".
pair <- vapply(groupC, function(g) paste(g[1], g[2]), character(1))

# p-value of the F test for each pair, rounded to 3 decimals.
pvalue <- vapply(
  groupC,
  function(g) {
    x <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == g[1]]
    y <- playersTeamCleaned$Overall[playersTeamCleaned$SkillMoves == g[2]]
    round(var.test(x, y)$p.value, 3)
  },
  numeric(1)
)

variancedf <- data.frame(pair, pvalue)
# Display every pair; a small p-value is evidence that the two groups do not
# have the same variance.
variancedf

pair,pvalue
1 2,0.0
1 3,0.0
1 4,0.0
1 5,0.119
2 3,0.0
2 4,0.041
2 5,0.942
3 4,0.001
3 5,0.198
4 5,0.599


The homogeneity-of-variance assumption of ANOVA is violated for several pairs of groups, so the parametric test assumptions do not hold; we therefore use non-parametric tests.

## Conducting Kruskal-Wallis Test

H0: Overall Rating is independent of Skill Moves

H1: Overall Rating is not independent of Skill Moves

In [21]:
# Kruskal-Wallis rank sum test of Overall across the SkillMoves groups.
kruskal.test(playersTeamCleaned$Overall, playersTeamCleaned$SkillMoves)


	Kruskal-Wallis rank sum test

data:  playersTeamCleaned$Overall and playersTeamCleaned$SkillMoves
Kruskal-Wallis chi-squared = 3012.1, df = 4, p-value < 2.2e-16


Since the p-value (< 2.2e-16) is far below 0.05, there is enough evidence to reject the null hypothesis that Overall Rating is independent of Skill Moves.

In [22]:
# Post-hoc pairwise Wilcoxon rank sum tests of Overall between SkillMoves groups.
# NOTE(review): p.adjust.method='none' leaves the 10 pairwise p-values
# uncorrected for multiple comparisons -- consider "holm" or "bonferroni".
pairwise.wilcox.test(playersTeamCleaned$Overall, playersTeamCleaned$SkillMoves, p.adjust.method='none')


	Pairwise comparisons using Wilcoxon rank sum test 

data:  playersTeamCleaned$Overall and playersTeamCleaned$SkillMoves 

  1       2       3       4      
2 0.00028 -       -       -      
3 < 2e-16 < 2e-16 -       -      
4 < 2e-16 < 2e-16 < 2e-16 -      
5 < 2e-16 < 2e-16 < 2e-16 1.8e-10

P value adjustment method: none 