In [1]:
# https://goo.gl/seiSek
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Obtain the Spark session for this notebook (an existing session is reused,
# otherwise one is created with this application name).
spark = (
    SparkSession.builder
    .appName("FIFA Players joins")
    .getOrCreate()
)

In [8]:
# Load the player master data from CSV, using the first line as column names.
# No schema is supplied or inferred, so every column is read as a string.
player = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/player.csv")
)

In [9]:
# Print the column names and types of the player DataFrame.
player.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [10]:
# Number of rows in the player DataFrame.
player.count()

11060

In [11]:
# NOTE(review): describe() only builds a summary DataFrame; without a
# following .show() the statistics themselves are not displayed, only the
# DataFrame's column list.
player.describe()

DataFrame[summary: string, id: string, player_api_id: string, player_name: string, player_fifa_api_id: string, birthday: string, height: string, weight: string]

In [12]:
# Peek at the first five player rows.
player.show(n=5)

+---+-------------+------------------+------------------+-------------------+------+------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|  1|       505942|Aaron Appindangoye|            218353|1992-02-29 00:00:00|182.88|   187|
|  2|       155782|   Aaron Cresswell|            189615|1989-12-15 00:00:00|170.18|   146|
|  3|       162549|       Aaron Doran|            186170|1991-05-13 00:00:00|170.18|   163|
|  4|        30572|     Aaron Galindo|            140161|1982-05-08 00:00:00|182.88|   198|
|  5|        23780|      Aaron Hughes|             17725|1979-11-08 00:00:00|182.88|   154|
+---+-------------+------------------+------------------+-------------------+------+------+
only showing top 5 rows



In [17]:
# Load the per-player attribute records from CSV, using the first line as
# column names. As with the player file, all columns come back as strings.
player_attributes = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/player_attributes.csv")
)

In [18]:
# Print the column names and types of the player_attributes DataFrame.
player_attributes.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- potential: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- attacking_work_rate: string (nullable = true)
 |-- defensive_work_rate: string (nullable = true)
 |-- crossing: string (nullable = true)
 |-- finishing: string (nullable = true)
 |-- heading_accuracy: string (nullable = true)
 |-- short_passing: string (nullable = true)
 |-- volleys: string (nullable = true)
 |-- dribbling: string (nullable = true)
 |-- curve: string (nullable = true)
 |-- free_kick_accuracy: string (nullable = true)
 |-- long_passing: string (nullable = true)
 |-- ball_control: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- sprint_speed: string (nullable = true)
 |-- agility: string (nullable = true)
 |-- reactions: string (nullable = true)

In [19]:
# Number of attribute records (several per player).
player_attributes.count()

183978

In [24]:
# Count the distinct players that appear in the attribute records.
(
    player_attributes
    .select("player_api_id")
    .distinct()
    .count()
)

11060

In [23]:
# Row count of player, for comparison with the number of distinct
# player_api_id values found in player_attributes.
player.count()

11060

In [25]:
# Count the distinct player_fifa_api_id values in the attribute records.
# The recorded result is two higher than the number of player rows, so this
# column has two more distinct ids than there are players.
(
    player_attributes
    .select("player_fifa_api_id")
    .distinct()
    .count()
)

11062

In [27]:
# Preview of player without the id and player_fifa_api_id columns. The result
# is not assigned, so `player` itself keeps all of its columns.
player.drop("id", "player_fifa_api_id")

DataFrame[player_api_id: string, player_name: string, birthday: string, height: string, weight: string]

In [28]:
# Discard the identifier columns and the attributes that the rest of this
# notebook does not read.
player_attributes = player_attributes.drop(*[
    'id', 'player_fifa_api_id',
    'preferred_foot', 'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'jumping', 'sprint_speed', 'balance', 'aggression',
    'short_passing', 'potential',
])

In [29]:
# Remove every row that has a null in any column.
player_attributes = player_attributes.dropna()
# Build `players` from the loaded `player` DataFrame: no earlier cell defines
# `players`, so it cannot be derived from itself on a fresh kernel.
players = player.dropna()

In [30]:
from pyspark.sql.functions import udf

In [31]:
# Python UDF returning the text before the first '-' of a date string
# (presumably the year of a 'YYYY-MM-DD ...' value). With no return type
# given, udf() produces a string column.
year_extract_udf = udf(lambda raw_date: raw_date.split('-', 1)[0])

In [32]:
# Derive `year` as the text before the first '-' of `date`, using Spark's
# built-in column functions rather than a Python UDF: the work stays inside
# the JVM and null dates yield null instead of raising. The result is a string.
player_attributes = player_attributes.withColumn(
    'year',
    F.split(player_attributes['date'], '-').getItem(0),
)

In [34]:
# Check the derived year values on the first five rows.
player_attributes.select("year").show(n=5)

+----+
|year|
+----+
|2016|
|2015|
|2015|
|2015|
|2007|
+----+
only showing top 5 rows



In [35]:
# DataFrame.drop returns a new DataFrame, so the result has to be assigned for
# `date` to actually be removed; `year` carries the needed information.
player_attributes = player_attributes.drop('date')
# Display the remaining columns.
player_attributes

DataFrame[player_api_id: string, overall_rating: string, finishing: string, heading_accuracy: string, volleys: string, dribbling: string, curve: string, free_kick_accuracy: string, long_passing: string, ball_control: string, acceleration: string, agility: string, reactions: string, shot_power: string, stamina: string, strength: string, long_shots: string, interceptions: string, positioning: string, vision: string, penalties: string, marking: string, standing_tackle: string, sliding_tackle: string, gk_diving: string, gk_handling: string, gk_kicking: string, gk_positioning: string, gk_reflexes: string, year: string]

In [39]:
# Look up a player by partial name as a spot check of the data.
player.where(player.player_name.contains('Iniesta')).show()

+---+-------------+--------------+------------------+-------------------+------+------+
| id|player_api_id|   player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+--------------+------------------+-------------------+------+------+
|742|        30955|Andres Iniesta|                41|1984-05-11 00:00:00|170.18|   150|
+---+-------------+--------------+------------------+-------------------+------+------+



In [40]:
# #### Filter to get all players who were active in the year 2016

# In[17]:


# `year` is a string column, so compare it with a string literal instead of
# relying on Spark's implicit string-to-number cast.
pa_2016 = player_attributes.filter(player_attributes.year == "2016")


# In[18]:


# Number of attribute records dated 2016.
pa_2016.count()


# In[19]:


# Number of distinct players with at least one 2016 record.
pa_2016.select(pa_2016.player_api_id).distinct().count()


# #### Find the best striker in the year 2016
# 
# * Consider the scores for finishing, shot_power and acceleration to determine this
# * There can be more than one entry for a player in the year (multiple seasons, some teams make entries per quarter)
# * Find the average scores across the multiple records

# In[20]:


# Average the three striker-related attributes per player, since one player
# can have several records within the year. The result columns are named
# avg(<attribute>).
pa_striker_2016 = (
    pa_2016
    .groupBy('player_api_id')
    .agg(dict.fromkeys(('finishing', 'shot_power', 'acceleration'), 'avg'))
)
# In[21]:


pa_striker_2016.count()


# In[22]:


pa_striker_2016.show(n=5)


# In[23]:


# Rename the avg(...) aggregate columns back to the plain attribute names so
# they can be accessed as DataFrame attributes.
pa_striker_2016 = (
    pa_striker_2016
    .withColumnRenamed("avg(finishing)", "finishing")
    .withColumnRenamed("avg(shot_power)", "shot_power")
    .withColumnRenamed("avg(acceleration)", "acceleration")
)




+-------------+-----------------+-----------------+---------------+
|player_api_id|   avg(finishing)|avg(acceleration)|avg(shot_power)|
+-------------+-----------------+-----------------+---------------+
|       309726|75.44444444444444|74.11111111111111|           76.0|
|        26112|             53.0|             51.0|           76.0|
|        38433|            68.25|             74.0|           74.0|
|       295060|             25.0|             62.0|           40.0|
|       161396|             29.0|             72.0|           69.0|
+-------------+-----------------+-----------------+---------------+
only showing top 5 rows



In [41]:
# #### Find an aggregate score to represent how good a particular player is
# 
# * Each attribute has a weighting factor
# * Find a total score for each striker

# In[24]:


# Relative weight of each attribute in the striker grade; shot power counts
# twice as much as finishing or acceleration.
weight_finishing, weight_shot_power, weight_acceleration = 1, 2, 1

# Sum of the weights, used as the divisor of the weighted average.
total_weight = sum((weight_finishing, weight_shot_power, weight_acceleration))


# In[27]:


# striker_grade is the weighted average of the three per-player attribute
# averages.
strikers = pa_striker_2016.withColumn(
    "striker_grade",
    (
        pa_striker_2016["finishing"] * weight_finishing
        + pa_striker_2016["shot_power"] * weight_shot_power
        + pa_striker_2016["acceleration"] * weight_acceleration
    ) / total_weight,
)


# In[28]:


# Keep only the player id and the combined grade.
strikers = strikers.drop('finishing', 'acceleration', 'shot_power')


# In[31]:


# Keep players graded above 70 and order them from highest to lowest grade.
strikers = (
    strikers
    .filter(strikers["striker_grade"] > 70)
    .sort(strikers["striker_grade"].desc())
)

strikers.show(n=10)


+-------------+-----------------+
|player_api_id|    striker_grade|
+-------------+-----------------+
|        20276|            89.25|
|        37412|             89.0|
|        38817|            88.75|
|        32118|            88.25|
|        31921|             87.0|
|        30834|            86.75|
|       303824|85.10714285714286|
|       129944|             85.0|
|       158263|            84.75|
|       150565|            84.75|
+-------------+-----------------+
only showing top 10 rows



In [42]:
# #### Find name and other details of the best strikers
# 
# * The information is present in the *players* dataframe
# * Will involve a join operation between *players* and *strikers*

# In[33]:


# Row counts of the two DataFrames that are joined next.
strikers.count(), players.count()


# #### Joining dataframes

# In[35]:


# Join on an explicit equality condition. With a condition expression the
# result carries the player_api_id column from both sides.
striker_details = players.join(
    strikers,
    on=players["player_api_id"] == strikers["player_api_id"],
)


# In[36]:


striker_details.columns


# In[37]:


striker_details.count()


# In[38]:


# Join again, this time naming the key column: the result then contains a
# single player_api_id column.
striker_details = players.join(strikers, on=['player_api_id'])


# In[39]:


striker_details.show(n=5)


+-------------+----+--------------+------------------+-------------------+------+------+-------------+
|player_api_id|  id|   player_name|player_fifa_api_id|           birthday|height|weight|striker_grade|
+-------------+----+--------------+------------------+-------------------+------+------+-------------+
|        20276|4283|          Hulk|            189362|1986-07-25 00:00:00|180.34|   187|        89.25|
|        37412|9674| Sergio Aguero|            153079|1988-06-02 00:00:00|172.72|   163|         89.0|
|        38817|1581|  Carlos Tevez|            143001|1984-02-05 00:00:00|172.72|   157|        88.75|
|        32118|6400|Lukas Podolski|            150516|1985-06-04 00:00:00|182.88|   183|        88.25|
|        31921|3660|   Gareth Bale|            173731|1989-07-16 00:00:00|182.88|   163|         87.0|
+-------------+----+--------------+------------------+-------------------+------+------+-------------+
only showing top 5 rows

