### Transforming the GK DIMTable 

In [48]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [49]:
spark= SparkSession.builder.getOrCreate()

In [50]:
GoalKeepers = spark.read.format('csv')\
    .option("header", "True")\
    .option("inferSchema", "True")\
    .load("/home/jovyan/work/data/goalkeepers.csv")

In [51]:
GoalKeepers.columns

['LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C']

In [65]:
GoalKeepers.createOrReplaceTempView("GK")

In [66]:
GoalKeepers= GoalKeepers.withColumn('fullname', concat_ws(" ", col('FirstName'),col('LastName')))
GoalKeepers.select("fullname").show(5, False)

+------------+
|fullname    |
+------------+
|David Ospina|
|Petr Cech   |
|Petr Cech   |
|David Ospina|
|David Ospina|
+------------+
only showing top 5 rows



In [67]:
GoalKeepers_f = spark.sql("""
    select distinct concat(firstname, " ", lastname) as fullname
    from GK

""")


In [68]:
GoalKeepers_f.columns

['fullname']

In [69]:
GK_ = spark.sql("""
    select Count(distinct fullname) from GK
""").show()

+------------------------+
|count(DISTINCT fullname)|
+------------------------+
|                       7|
+------------------------+



In [70]:
GoalKeepers_f= GoalKeepers_f.withColumn('GkID',monotonically_increasing_id()+1)


In [71]:
GoalKeepers_f.show()

+-----------------+----+
|         fullname|GkID|
+-----------------+----+
|Emiliano Martinez|   1|
|   Aaron Ramsdale|   2|
|       Bernd Leno|   3|
|        Petr Cech|   4|
|  Runar Runarsson|   5|
|      Mathew Ryan|   6|
|     David Ospina|   7|
+-----------------+----+



In [72]:
DimGoalKeepers= GoalKeepers.join(GoalKeepers_f, on ='fullname', how="inner")

In [73]:
DimGoalKeepers.show()

+------------+--------+---------+----------+-----+---+---+----+---+-----+----+-----+----+----+-------+------+------+-----+--------+---+----+
|    fullname|LastName|FirstName|      Date|Start|Pos|Min|SoTA| GA|Saves|PSxG|PKatt| PKA| PKm|PassAtt|Throws|AvgLen|GKAtt|GKAvgLen|  C|GkID|
+------------+--------+---------+----------+-----+---+---+----+---+-----+----+-----+----+----+-------+------+------+-----+--------+---+----+
|David Ospina|  Ospina|    David| 5/13/2018|    1| GK| 90|   3|  0|    3| 0.4|    0|   0|   0|     39|     8|  31.9|    9|    48.2|  0|   7|
|   Petr Cech|    Cech|     Petr|  5/9/2018|    1| GK| 90|  10|  3|    7| 3.2|    1|   1|   0|     26|     7|  34.5|   11|    66.0|  1|   4|
|   Petr Cech|    Cech|     Petr|  5/6/2018|    1| GK| 90|   2|  0|    2| 0.2|    0|   0|   0|     31|     8|  32.4|    2|    56.0|  1|   4|
|David Ospina|  Ospina|    David| 4/29/2018|    1| GK| 90|   2|  2|    0| 1.3|    0|   0|   0|     15|     4|  41.1|    5|    49.0|  0|   7|
|David Ospina

In [74]:
DimGoalKeepers.columns

['fullname',
 'LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C',
 'GkID']

In [75]:
DimGoalKeepers.write.csv('/home/jovyan/work/data/DimGoalkeeper', header=True, mode="overwrite")

### Creating the FactGK 

In [76]:
DimMatches = spark.read.format('csv')\
    .option("header", "True")\
    .option("inferSchema", "True")\
    .load("/home/jovyan/work/data/DimMatches/DimMatches.csv")

In [77]:
FactGk = DimMatches.join(DimGoalKeepers, on='Date', how='left')
FactGk.columns

['Date',
 'Season',
 'Tour',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID',
 'fullname',
 'LastName',
 'FirstName',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C',
 'GkID']

In [78]:
FactGK = FactGk.drop( 
 'Season',
 'Tour',
 'Time',
 'Opponent',
 'HoAw',
 'Stadium',
 'Coach',
 'Referee',
  'Pos',
    'C',
'fullname',
 'LastName',
 'FirstName'
 
)

In [79]:
FactGK.columns

['Date',
 'ArsenalScore',
 'OpponentScore',
 'Attendance',
 'MatchID',
 'Start',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'GkID']

In [80]:
FactGK.write.csv('/home/jovyan/work/data/FactGK', header=True, mode="overwrite")