In [0]:
from pyspark.sql.functions import *

In [0]:
# Source workbook and the Spark data source used to read it.
# Note that this is the spark-excel reader, not the built-in CSV reader.
file_location = "dbfs:/FileStore/tables/data.xlsx"
file_type = "com.crealytics.spark.excel"

# Reader options (names kept from the CSV import template this cell is based on).
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# NOTE(review): "sep" is a CSV-reader option; per the original template note,
# readers for other file types ignore it. Confirm for the Excel reader.
teams_df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(teams_df)

team,group,matches,win,draw,lose,gf,ga,gd,pts
Turkey,A,3.0,1.0,1.0,1.0,5.0,5.0,0.0,4.0
Italy,A,3.0,2.0,1.0,0.0,2.0,6.0,-4.0,7.0
Wales,A,3.0,1.0,0.0,2.0,3.0,0.0,3.0,3.0
Switzerland,A,3.0,1.0,1.0,1.0,6.0,3.0,3.0,4.0
Denmark,B,3.0,2.0,0.0,1.0,6.0,6.0,0.0,6.0
Finland,B,3.0,2.0,0.0,1.0,0.0,5.0,-5.0,6.0
Belgium,B,3.0,2.0,1.0,0.0,2.0,5.0,-3.0,7.0
Russia,B,3.0,1.0,1.0,1.0,0.0,2.0,-2.0,4.0
Netherlands,C,3.0,1.0,1.0,1.0,1.0,4.0,-3.0,4.0
Ukraine,C,3.0,1.0,0.0,2.0,6.0,1.0,5.0,3.0


In [0]:
# Register the loaded group-stage DataFrame as the temp view "group_stage"
# so that the SQL cells below can query it by name.
temp_table_name = "group_stage"
teams_df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql
-- Preview every row of the group_stage temp view.
SELECT * FROM group_stage;

team,group,matches,win,draw,lose,gf,ga,gd,pts
Turkey,A,3.0,1.0,1.0,1.0,5.0,5.0,0.0,4.0
Italy,A,3.0,2.0,1.0,0.0,2.0,6.0,-4.0,7.0
Wales,A,3.0,1.0,0.0,2.0,3.0,0.0,3.0,3.0
Switzerland,A,3.0,1.0,1.0,1.0,6.0,3.0,3.0,4.0
Denmark,B,3.0,2.0,0.0,1.0,6.0,6.0,0.0,6.0
Finland,B,3.0,2.0,0.0,1.0,0.0,5.0,-5.0,6.0
Belgium,B,3.0,2.0,1.0,0.0,2.0,5.0,-3.0,7.0
Russia,B,3.0,1.0,1.0,1.0,0.0,2.0,-2.0,4.0
Netherlands,C,3.0,1.0,1.0,1.0,1.0,4.0,-3.0,4.0
Ukraine,C,3.0,1.0,0.0,2.0,6.0,1.0,5.0,3.0


In [0]:

# Rank the teams inside each group and keep the top two (rn <= 2).
#
# Ranking order: points, then goal difference, then goals scored (gf), then wins.
# The last two keys are tie-breakers: ROW_NUMBER() assigns an arbitrary order to
# rows that compare equal, so ordering by pts and gd alone makes the qualifiers
# depend on row order whenever two teams are level on both.
# Teams that are level on all four keys are still ordered arbitrarily.
#
# `group` is backtick-quoted because it is also a SQL keyword.
# The helper column rn is kept in the result.
top_2_teams_per_group = spark.sql("""
    SELECT *
    FROM (
        SELECT *,
               ROW_NUMBER() OVER (
                   PARTITION BY `group`
                   ORDER BY pts DESC, gd DESC, gf DESC, win DESC
               ) AS rn
        FROM group_stage
    ) AS qualifier
    WHERE rn <= 2
""")
top_2_teams_per_group.show()

+--------------+-----+-------+---+----+----+---+---+----+---+---+
|         team |group|matches|win|draw|lose| gf| ga|  gd|pts| rn|
+--------------+-----+-------+---+----+----+---+---+----+---+---+
|         Italy|    A|    3.0|2.0| 1.0| 0.0|2.0|6.0|-4.0|7.0|  1|
|   Switzerland|    A|    3.0|1.0| 1.0| 1.0|6.0|3.0| 3.0|4.0|  2|
|       Belgium|    B|    3.0|2.0| 1.0| 0.0|2.0|5.0|-3.0|7.0|  1|
|       Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|  2|
|   Netherlands|    C|    3.0|1.0| 1.0| 1.0|1.0|4.0|-3.0|4.0|  1|
|       Ukraine|    C|    3.0|1.0| 0.0| 2.0|6.0|1.0| 5.0|3.0|  2|
|Checz Republic|    D|    3.0|2.0| 0.0| 1.0|3.0|0.0| 3.0|6.0|  1|
|      Scotland|    D|    3.0|2.0| 0.0| 1.0|3.0|3.0| 0.0|6.0|  2|
|      Slovakia|    E|    3.0|2.0| 1.0| 0.0|2.0|2.0| 0.0|7.0|  1|
|        Sweden|    E|    3.0|2.0| 0.0| 1.0|1.0|4.0|-3.0|6.0|  2|
|       Germany|    F|    3.0|2.0| 1.0| 0.0|5.0|5.0| 0.0|7.0|  1|
|        France|    F|    3.0|2.0| 1.0| 0.0|4.0|6.0|-2.0|7.0|  2|
+---------

In [0]:
# Pick the four best third-placed teams across all groups.
#
# Step 1: rank teams inside each group and keep the one ranked third (rn = 3).
# Step 2: order those third-placed teams against each other and keep the top 4.
#
# Both steps order by points, goal difference, goals scored (gf) and wins.
# The last two keys are tie-breakers: ordering by pts and gd alone leaves level
# teams in an arbitrary order, which makes both the rn = 3 pick and the LIMIT 4
# cut-off depend on row order. Teams level on all four keys are still arbitrary.
#
# `group` is backtick-quoted because it is also a SQL keyword.
# The helper column rn is kept in the result.
four_teams = spark.sql("""
    SELECT *
    FROM (
        SELECT *,
               ROW_NUMBER() OVER (
                   PARTITION BY `group`
                   ORDER BY pts DESC, gd DESC, gf DESC, win DESC
               ) AS rn
        FROM group_stage
    ) AS third_placed
    WHERE rn = 3
    ORDER BY pts DESC, gd DESC, gf DESC, win DESC
    LIMIT 4
""")
four_teams.show()

+-------+-----+-------+---+----+----+---+---+----+---+---+
|  team |group|matches|win|draw|lose| gf| ga|  gd|pts| rn|
+-------+-----+-------+---+----+----+---+---+----+---+---+
|England|    D|    3.0|2.0| 0.0| 1.0|2.0|5.0|-3.0|6.0|  3|
|Finland|    B|    3.0|2.0| 0.0| 1.0|0.0|5.0|-5.0|6.0|  3|
|Hungary|    F|    3.0|1.0| 1.0| 1.0|2.0|0.0| 2.0|4.0|  3|
| Turkey|    A|    3.0|1.0| 1.0| 1.0|5.0|5.0| 0.0|4.0|  3|
+-------+-----+-------+---+----+----+---+---+----+---+---+



In [0]:
# Combine the top-two teams of every group with the selected third-placed teams.
# union() matches columns by position; both inputs are "SELECT *, ... AS rn"
# over group_stage, so their columns line up.
knockout_16 =  top_2_teams_per_group.union(four_teams)
knockout_16.show()

+--------------+-----+-------+---+----+----+---+---+----+---+---+
|         team |group|matches|win|draw|lose| gf| ga|  gd|pts| rn|
+--------------+-----+-------+---+----+----+---+---+----+---+---+
|         Italy|    A|    3.0|2.0| 1.0| 0.0|2.0|6.0|-4.0|7.0|  1|
|   Switzerland|    A|    3.0|1.0| 1.0| 1.0|6.0|3.0| 3.0|4.0|  2|
|       Belgium|    B|    3.0|2.0| 1.0| 0.0|2.0|5.0|-3.0|7.0|  1|
|       Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|  2|
|   Netherlands|    C|    3.0|1.0| 1.0| 1.0|1.0|4.0|-3.0|4.0|  1|
|       Ukraine|    C|    3.0|1.0| 0.0| 2.0|6.0|1.0| 5.0|3.0|  2|
|Checz Republic|    D|    3.0|2.0| 0.0| 1.0|3.0|0.0| 3.0|6.0|  1|
|      Scotland|    D|    3.0|2.0| 0.0| 1.0|3.0|3.0| 0.0|6.0|  2|
|      Slovakia|    E|    3.0|2.0| 1.0| 0.0|2.0|2.0| 0.0|7.0|  1|
|        Sweden|    E|    3.0|2.0| 0.0| 1.0|1.0|4.0|-3.0|6.0|  2|
|       Germany|    F|    3.0|2.0| 1.0| 0.0|5.0|5.0| 0.0|7.0|  1|
|        France|    F|    3.0|2.0| 1.0| 0.0|4.0|6.0|-2.0|7.0|  2|
|       En

In [0]:
# Remove the helper rank column rn and tag every qualifier with group_stage = 'win'.
# This rebinds knockout_16, so the frame shown by the previous cell is replaced.
knockout_16 = knockout_16.drop('rn').withColumn('group_stage',lit('win')) 
knockout_16.show()

+--------------+-----+-------+---+----+----+---+---+----+---+-----------+
|         team |group|matches|win|draw|lose| gf| ga|  gd|pts|group_stage|
+--------------+-----+-------+---+----+----+---+---+----+---+-----------+
|         Italy|    A|    3.0|2.0| 1.0| 0.0|2.0|6.0|-4.0|7.0|        win|
|   Switzerland|    A|    3.0|1.0| 1.0| 1.0|6.0|3.0| 3.0|4.0|        win|
|       Belgium|    B|    3.0|2.0| 1.0| 0.0|2.0|5.0|-3.0|7.0|        win|
|       Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|        win|
|   Netherlands|    C|    3.0|1.0| 1.0| 1.0|1.0|4.0|-3.0|4.0|        win|
|       Ukraine|    C|    3.0|1.0| 0.0| 2.0|6.0|1.0| 5.0|3.0|        win|
|Checz Republic|    D|    3.0|2.0| 0.0| 1.0|3.0|0.0| 3.0|6.0|        win|
|      Scotland|    D|    3.0|2.0| 0.0| 1.0|3.0|3.0| 0.0|6.0|        win|
|      Slovakia|    E|    3.0|2.0| 1.0| 0.0|2.0|2.0| 0.0|7.0|        win|
|        Sweden|    E|    3.0|2.0| 0.0| 1.0|1.0|4.0|-3.0|6.0|        win|
|       Germany|    F|    3.0|2.0| 1.0

# Function

In [0]:
def get_round_of_8(df, table_name, column_name, seed=None):
    """Randomly advance half of the rows of a temp view to the next round.

    Despite its name the function is round-agnostic: it shuffles the rows of
    ``table_name`` with ``RAND()``, keeps the rows whose shuffled position is
    at most ``COUNT(*)/2`` (the first half, rounded down for odd counts), and
    tags the survivors with ``column_name = 'win'``.

    Args:
        df: Unused. Kept so existing calls keep working; the rows are read
            from the temp view named by ``table_name``, which must already be
            registered.
        table_name: Name of the temp view/table to draw from. It is
            interpolated into the SQL text, so pass trusted identifiers only.
        column_name: Name of the column added to the result, filled with the
            literal ``'win'``.
        seed: Optional integer seed passed to ``RAND``. With the default
            ``None`` the draw differs on every run, as before. With a seed the
            draw is intended to be repeatable across runs.
            NOTE(review): Spark documents ``rand`` as non-deterministic in the
            general case, so repeatability presumably also needs the same
            input data and partitioning. Confirm.

    Returns:
        DataFrame with the surviving rows, ordered by their shuffled position,
        without the helper column ``rn`` and with ``column_name`` added.

    Raises:
        TypeError, ValueError: If ``seed`` is given but cannot be converted
            to ``int``.
    """
    # int() guarantees that only an integer literal reaches the SQL text.
    rand_expr = "RAND()" if seed is None else f"RAND({int(seed)})"

    sql_query = f"""
    WITH ShuffledRows AS (
        SELECT *, ROW_NUMBER() OVER (ORDER BY {rand_expr}) AS rn
        FROM {table_name}
    )
    SELECT *
    FROM ShuffledRows
    WHERE rn <= (SELECT COUNT(*)/2 FROM ShuffledRows)
    ORDER BY rn
    """

    # Execute the SQL query
    result = spark.sql(sql_query)

    # Drop the helper rank column and mark the survivors as winners of this round
    result = result.drop('rn').withColumn(column_name, lit('win'))

    return result



In [0]:
# Register the qualifiers as temp view "knockout_16". get_round_of_8 reads its
# rows from the view name passed as table_name, not from its DataFrame argument.

temp_table_name = "knockout_16"
knockout_16.createOrReplaceTempView(temp_table_name)

In [0]:
# First knockout round: randomly keep half of the knockout_16 rows and
# tag them with round_of_8 = 'win'.
knockout_8 = get_round_of_8(knockout_16,"knockout_16", "round_of_8")
knockout_8.show()

+--------------+-----+-------+---+----+----+---+---+----+---+-----------+----------+
|         team |group|matches|win|draw|lose| gf| ga|  gd|pts|group_stage|round_of_8|
+--------------+-----+-------+---+----+----+---+---+----+---+-----------+----------+
|       Finland|    B|    3.0|2.0| 0.0| 1.0|0.0|5.0|-5.0|6.0|        win|       win|
|       Germany|    F|    3.0|2.0| 1.0| 0.0|5.0|5.0| 0.0|7.0|        win|       win|
|       England|    D|    3.0|2.0| 0.0| 1.0|2.0|5.0|-3.0|6.0|        win|       win|
|       Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|        win|       win|
|Checz Republic|    D|    3.0|2.0| 0.0| 1.0|3.0|0.0| 3.0|6.0|        win|       win|
|      Scotland|    D|    3.0|2.0| 0.0| 1.0|3.0|3.0| 0.0|6.0|        win|       win|
|       Ukraine|    C|    3.0|1.0| 0.0| 2.0|6.0|1.0| 5.0|3.0|        win|       win|
|        Sweden|    E|    3.0|2.0| 0.0| 1.0|1.0|4.0|-3.0|6.0|        win|       win|
+--------------+-----+-------+---+----+----+---+---+----+---+----

In [0]:
# Register the first-round winners as temp view "knockout_8" for the next draw.
temp_table_name = "knockout_8"
knockout_8.createOrReplaceTempView(temp_table_name)

In [0]:
# Quarter-finals: randomly keep half of the knockout_8 rows.
# NOTE(review): the column name "Quater-Final" looks like a typo for
# "Quarter-Final"; it is left as-is because renaming changes the output schema.
knockout_4 = get_round_of_8(knockout_8,"knockout_8", "Quater-Final")
knockout_4.show()

+--------------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+
|         team |group|matches|win|draw|lose| gf| ga|  gd|pts|group_stage|round_of_8|Quater-Final|
+--------------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+
|       Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|        win|       win|         win|
|Checz Republic|    D|    3.0|2.0| 0.0| 1.0|3.0|0.0| 3.0|6.0|        win|       win|         win|
|        Sweden|    E|    3.0|2.0| 0.0| 1.0|1.0|4.0|-3.0|6.0|        win|       win|         win|
|       Finland|    B|    3.0|2.0| 0.0| 1.0|0.0|5.0|-5.0|6.0|        win|       win|         win|
+--------------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+



In [0]:
# Register the quarter-final winners as temp view "knockout_4" for the next draw.
temp_table_name = "knockout_4"
knockout_4.createOrReplaceTempView(temp_table_name)


In [0]:
# Semi-finals: randomly keep half of the knockout_4 rows and
# tag them with Semifinal = 'win'.
knockout_2 = get_round_of_8(knockout_4,"knockout_4", "Semifinal")
knockout_2.show()


+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+
|  team |group|matches|win|draw|lose| gf| ga|  gd|pts|group_stage|round_of_8|Quater-Final|Semifinal|
+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+
|Finland|    B|    3.0|2.0| 0.0| 1.0|0.0|5.0|-5.0|6.0|        win|       win|         win|      win|
|Denmark|    B|    3.0|2.0| 0.0| 1.0|6.0|6.0| 0.0|6.0|        win|       win|         win|      win|
+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+



In [0]:
# Register the semi-final winners as temp view "knockout_2" for the final draw.
temp_table_name = "knockout_2"
knockout_2.createOrReplaceTempView(temp_table_name)


In [0]:
# Final: randomly keep half of the knockout_2 rows (one row when the view
# holds two teams) and tag it with Final = 'win'.
final = get_round_of_8(knockout_2,"knockout_2", "Final")
final.show()


+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+-----+
|  team |group|matches|win|draw|lose| gf| ga|  gd|pts|group_stage|round_of_8|Quater-Final|Semifinal|Final|
+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+-----+
|Finland|    B|    3.0|2.0| 0.0| 1.0|0.0|5.0|-5.0|6.0|        win|       win|         win|      win|  win|
+-------+-----+-------+---+----+----+---+---+----+---+-----------+----------+------------+---------+-----+

