In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [32]:
spark= SparkSession.builder.getOrCreate()

In [33]:
players= spark.read.format('csv')\
    .option("header", "True")\
    .option("inferSchema", "True")\
    .load("/home/jovyan/work/data/players.csv")

In [34]:
players.show()

+---------+---------+---------+-----+---+---+---+---+---+---+---+---+---+---+-------+-------+----+------+---+----+---+------+-------+------+-------+------+----------+---+
| LastName|FirstName|     Date|Start|Pos|Min|  G|  A| PK|PKA|  S|SoT| YK| RK|Touches|Tackles|Ints|Blocks| xG|npxG|xAG|Passes|PassesA|PrgPas|Carries|PrgCar|      Line|  C|
+---------+---------+---------+-----+---+---+---+---+---+---+---+---+---+---+-------+-------+----+------+---+----+---+------+-------+------+-------+------+----------+---+
| Bellerin|   Hector|8/11/2017|    1| WB| 90|  0|  0|  0|  0|  1|  1|  0|  0|     79|      3|   0|     0|0.3| 0.3|0.0|    61|     70|     3|     51|     1|  Defender|  0|
|   Elneny|  Mohamed|8/11/2017|    1| CM| 66|  0|  1|  0|  0|  1|  0|  0|  0|     82|      4|   0|     2|0.0| 0.0|0.1|    65|     72|     4|     57|     0|Midfielder|  0|
|  Holding|      Rob|8/11/2017|    1| CB| 66|  0|  0|  0|  0|  0|  0|  0|  0|     75|      1|   1|     0|0.0| 0.0|0.0|    50|     60|     4|     

In [35]:
players.columns

['LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA',
 'S',
 'SoT',
 'YK',
 'RK',
 'Touches',
 'Tackles',
 'Ints',
 'Blocks',
 'xG',
 'npxG',
 'xAG',
 'Passes',
 'PassesA',
 'PrgPas',
 'Carries',
 'PrgCar',
 'Line',
 'C']

In [36]:
players.schema

StructType([StructField('LastName', StringType(), True), StructField('FirstName', StringType(), True), StructField('Date', StringType(), True), StructField('Start', IntegerType(), True), StructField('Pos', StringType(), True), StructField('Min', IntegerType(), True), StructField('G', IntegerType(), True), StructField('A', IntegerType(), True), StructField('PK', IntegerType(), True), StructField('PKA', IntegerType(), True), StructField('S', IntegerType(), True), StructField('SoT', IntegerType(), True), StructField('YK', IntegerType(), True), StructField('RK', IntegerType(), True), StructField('Touches', IntegerType(), True), StructField('Tackles', IntegerType(), True), StructField('Ints', IntegerType(), True), StructField('Blocks', IntegerType(), True), StructField('xG', DoubleType(), True), StructField('npxG', DoubleType(), True), StructField('xAG', DoubleType(), True), StructField('Passes', IntegerType(), True), StructField('PassesA', IntegerType(), True), StructField('PrgPas', Intege

In [37]:
players.createOrReplaceTempView("players")

In [38]:
spark.sql("""
    select concat(firstname, " ", lastname) as fullname
    from players

""").count()

2741

In [39]:
distinct_players= spark.sql("""
    select distinct concat(firstname, " ", lastname) as fullname
    from players

""")
distinct_players.show(10)

+----------------+
|        fullname|
+----------------+
|Emile Smith Rowe|
| Folarin Balogun|
| Hector Bellerin|
|     Joe Willock|
|  William Saliba|
|    Aaron Ramsey|
|     Bukayo Saka|
|  Kieran Tierney|
|Shkodran Mustafi|
| Daniel Ceballos|
+----------------+
only showing top 10 rows



In [40]:
players_Dates= spark.sql("""
    select count(distinct Date) 
    from players

""")
players_Dates.show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                 214|
+--------------------+



## Here we make sure that the distinct count date for matches in DimMatches Equal the distinct count Date for DimPlayers 214

In [41]:
distinct_players= distinct_players.withColumn("PlayerID", monotonically_increasing_id())

In [42]:
players= players.withColumn('fullname', concat_ws(" ", col('FirstName'),col('LastName')))
players.select("fullname").show(5, False)

+-------------------+
|fullname           |
+-------------------+
|Hector Bellerin    |
|Mohamed Elneny     |
|Rob Holding        |
|Sead Kolasinac     |
|Alexandre Lacazette|
+-------------------+
only showing top 5 rows



In [43]:
dimPlayer= players.join(distinct_players, on ='fullname', how="inner")

In [44]:
dimPlayer.write.csv('/home/jovyan/work/data/Dimplayers.csv', header=True, mode="overwrite")

In [45]:
dimPlayer.select('PlayerID',
'LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA' 
).show()

+--------+---------+---------+---------+-----+---+---+---+---+---+---+
|PlayerID| LastName|FirstName|     Date|Start|Pos|Min|  G|  A| PK|PKA|
+--------+---------+---------+---------+-----+---+---+---+---+---+---+
|       2| Bellerin|   Hector|8/11/2017|    1| WB| 90|  0|  0|  0|  0|
|      49|   Elneny|  Mohamed|8/11/2017|    1| CM| 66|  0|  1|  0|  0|
|      26|  Holding|      Rob|8/11/2017|    1| CB| 66|  0|  0|  0|  0|
|      18|Kolasinac|     Sead|8/11/2017|    1| CB| 90|  0|  1|  0|  0|
|      10|Lacazette|Alexandre|8/11/2017|    1| FW| 90|  1|  0|  0|  0|
|      21|  Monreal|    Nacho|8/11/2017|    1| CB| 90|  0|  0|  0|  0|
|      53|     Ozil|    Mesut|8/11/2017|    1| AM| 90|  0|  0|  0|  0|
|      13|  Welbeck|    Danny|8/11/2017|    1| AM| 74|  1|  0|  0|  0|
|      44|    Xhaka|   Granit|8/11/2017|    1| CM| 90|  0|  2|  0|  0|
|       5|   Ramsey|    Aaron|8/11/2017|    0| DM| 24|  1|  0|  0|  0|
|       2| Bellerin|   Hector|8/19/2017|    1| WB| 90|  0|  0|  0|  0|
|     