In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType
from pyspark.sql.functions import monotonically_increasing_id, coalesce, lit, concat, split
import random


In [0]:
spark = SparkSession.builder \
    .appName("Utsha") \
    .getOrCreate()

In [0]:
team_schema = StructType([
    StructField("team_id", StringType()),
    StructField("group_name", StringType()),
    StructField("team_name", StringType()),
])

group_stage_schema = StructType([
    StructField("pos", IntegerType()),
    StructField("team_name", StringType()),
    StructField("group_name", StringType()),
    StructField("matches_played", IntegerType()),
    StructField("points", IntegerType()),
    StructField("match_won", IntegerType()),
    StructField("match_drawn", IntegerType()),
    StructField("match_loss", IntegerType()),
    StructField("goal_scored", IntegerType()),
    StructField("goal_conceded", IntegerType()),
    StructField("goal_difference", IntegerType()),
])

match_fixture_schema = StructType(
    [
        StructField("match_id", IntegerType()),
        StructField("match", StringType()),
    ]
)


match_fixture_goal_schema = StructType([
    StructField("match_fixture_id", IntegerType(), nullable=True),
    StructField("match_fixture", StringType(), nullable=True),
    StructField("group_name", StringType(), nullable=True),
    StructField("team_1_name", StringType(), nullable=True),
    StructField("team_2_name", StringType(), nullable=True),
    StructField("team_1_goal", IntegerType(), nullable=True),
    StructField("team_2_goal", IntegerType(), nullable=True),
    StructField("winner", StringType(), nullable=True)   
])


In [0]:
groups = ['Group A', 'Group B', 'Group C', 'Group D', 'Group E', 'Group F']
team_names = [
    "Albania", "Austria", "Belgium", "Bosnia", "Bulgaria", "Croatia",
    "Cyprus", "Czech Republic", "Denmark", "Estonia", "Finland", "France", "Germany",
    "Greece", "Hungary", "Iceland", "Ireland", "Italy", "Latvia", "Lithuania", 
    "Luxembourg", "Malta", "Netherlands", "Norway"
]
def assign_team_id():
    return [f"{i+1}" for i in range(24)]

def assign_group():
    group_index = 0
    group_assignments = []
    for _ in range(24):
        group_assignments.append(groups[group_index])
        group_index = (group_index + 1) % len(groups)
    return group_assignments

group_assignments = assign_group()
team_ids = assign_team_id()
data = [(team_id, group, team) for team_id, group, team in zip(team_ids, group_assignments, team_names)]
team_df = spark.createDataFrame(data, schema=team_schema)
print('team_df')
display(team_df.orderBy("group_name"))

team_df


team_id,group_name,team_name
7,Group A,Cyprus
19,Group A,Latvia
13,Group A,Germany
1,Group A,Albania
20,Group B,Lithuania
14,Group B,Greece
8,Group B,Czech Republic
2,Group B,Austria
21,Group C,Luxembourg
15,Group C,Hungary


In [0]:
def generate_ids(num_items):
    return list(range(1, num_items + 1))

In [0]:
#match fixture

match_fixtures=[]
for group_name in groups:
    teams_in_group = team_df.filter(team_df.group_name == group_name).select("team_name","group_name").collect()
    # print('teams_in_group',teams_in_group)
    matches = [(teams_in_group[i].team_name+", "+teams_in_group[i].group_name, teams_in_group[j].team_name+", "+teams_in_group[j].group_name) for i in range(len(teams_in_group)) for j in range(i+1, len(teams_in_group))]
    
    # print(len(matches))
    for match in matches:
        match_data=match[0] + ' vs ' + match[1]
        # print(match_data)
        match_fixtures.append(match_data)
    # print("len(match_fixtures)",len(match_fixtures))
    data=[(id, match) for id, match in zip(generate_ids(len(match_fixtures)), match_fixtures)]
# print("len(data)",len(data))

df_match_fixture = spark.createDataFrame(data, schema=match_fixture_schema)
display(df_match_fixture)


match_id,match
1,"Albania, Group A vs Cyprus, Group A"
2,"Albania, Group A vs Germany, Group A"
3,"Albania, Group A vs Latvia, Group A"
4,"Cyprus, Group A vs Germany, Group A"
5,"Cyprus, Group A vs Latvia, Group A"
6,"Germany, Group A vs Latvia, Group A"
7,"Austria, Group B vs Czech Republic, Group B"
8,"Austria, Group B vs Greece, Group B"
9,"Austria, Group B vs Lithuania, Group B"
10,"Czech Republic, Group B vs Greece, Group B"


In [0]:
import random
from pyspark.sql import Row

# Define the schema for df_match_fixture_goal
match_fixture_goal_schema = StructType([
    StructField("match_fixture_id", IntegerType(), nullable=True),
    StructField("match_fixture", StringType(), nullable=True),
    # StructField("group_name", StringType(), nullable=True),
    StructField("team_1_name", StringType(), nullable=True),
    StructField("team_2_name", StringType(), nullable=True),
    StructField("team_1_goal", IntegerType(), nullable=True),
    StructField("team_2_goal", IntegerType(), nullable=True),
    StructField("winner", StringType(), nullable=True)   
])

# Create an empty list to hold the rows for df_match_fixture_goal
match_fixture_goal_data = []

# Iterate through df_match_fixture DataFrame
for row in df_match_fixture.collect():
    match_id = row['match_id']
    match_fixture = row['match']
    teams = match_fixture.split(' vs ')
    team_1_name = teams[0]
    team_2_name = teams[1]
    # Generate random goals for each team
    team_1_goal = random.randint(0, 5)
    team_2_goal = random.randint(0, 5)
    
    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"
    
    # Construct a row for df_match_fixture_goal
    match_fixture_goal_row = Row(
        match_fixture_id=match_id,
        match_fixture=match_fixture,
        # group_name=group_name,  # Assuming you have 'group_name' available
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner
    )
    
    # Append the row to the list
    match_fixture_goal_data.append(match_fixture_goal_row)

# Create DataFrame df_match_fixture_goal
df_match_fixture_goal = spark.createDataFrame(match_fixture_goal_data, schema=match_fixture_goal_schema)

display(df_match_fixture_goal)


match_fixture_id,match_fixture,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
1,"Albania, Group A vs Cyprus, Group A","Albania, Group A","Cyprus, Group A",5,2,"Albania, Group A"
2,"Albania, Group A vs Germany, Group A","Albania, Group A","Germany, Group A",0,4,"Germany, Group A"
3,"Albania, Group A vs Latvia, Group A","Albania, Group A","Latvia, Group A",3,4,"Latvia, Group A"
4,"Cyprus, Group A vs Germany, Group A","Cyprus, Group A","Germany, Group A",2,5,"Germany, Group A"
5,"Cyprus, Group A vs Latvia, Group A","Cyprus, Group A","Latvia, Group A",0,0,Draw
6,"Germany, Group A vs Latvia, Group A","Germany, Group A","Latvia, Group A",5,2,"Germany, Group A"
7,"Austria, Group B vs Czech Republic, Group B","Austria, Group B","Czech Republic, Group B",0,0,Draw
8,"Austria, Group B vs Greece, Group B","Austria, Group B","Greece, Group B",0,5,"Greece, Group B"
9,"Austria, Group B vs Lithuania, Group B","Austria, Group B","Lithuania, Group B",1,0,"Austria, Group B"
10,"Czech Republic, Group B vs Greece, Group B","Czech Republic, Group B","Greece, Group B",3,3,Draw


In [0]:
from pyspark.sql.functions import col, when, count, sum

group_standing_schema = StructType([
    StructField("pos", IntegerType()),
    StructField("team_name", StringType()),
    StructField("group_name", StringType()),
    StructField("matches_played", IntegerType()),
    StructField("points", IntegerType()),
    StructField("match_won", IntegerType()),
    StructField("match_drawn", IntegerType()),
    StructField("match_loss", IntegerType()),
    StructField("goal_scored", IntegerType()),
    StructField("goal_conceded", IntegerType()),
    StructField("goal_difference", IntegerType()),
])

all_teams = df_match_fixture_goal.select(
    col("team_1_name").alias("team_name"),
    split(col("team_1_name"), ", ")[1].alias("group_name"),
    col("winner"),
    col("team_1_goal").alias("gs"),
    col("team_2_goal").alias("gc")
).union(
    df_match_fixture_goal.select(
        col("team_2_name").alias("team_name"),
        split(col("team_2_name"), ", ")[1].alias("group_name"),
        col("winner"),
        col("team_2_goal").alias("gs"),
        col("team_1_goal").alias("gc")
    )
)
display(all_teams)
group_stage_summary = all_teams.groupBy(
    "team_name", "group_name"
).agg(
    count("*").alias("matches_played"),
    sum(when(col("winner") == col("team_name"), 1).otherwise(0)).alias("match_won"),
    sum(when((col("winner") != col("team_name")) & (col("winner") != "Draw"), 1).otherwise(0)).alias("match_loss"),
    sum(when(col("winner") == "Draw", 1).otherwise(0)).alias("match_drawn"),
    sum(col("gs")).alias("goal_scored"),
    sum(col("gc")).alias("goal_conceded"),
    (sum(col("gs")) - sum(col("gc"))).alias("goal_difference"),
    sum(when(col("winner") == col("team_name"), 3).when(col("winner") == "Draw", 1).otherwise(0)).alias("points")
).orderBy(
    "group_name", col("points").desc(), col("goal_difference").desc(), col("goal_scored").desc(), col("goal_conceded").asc(), "team_name"
)

display(group_stage_summary)





team_name,group_name,winner,gs,gc
"Albania, Group A",Group A,"Albania, Group A",5,2
"Albania, Group A",Group A,"Germany, Group A",0,4
"Albania, Group A",Group A,"Latvia, Group A",3,4
"Cyprus, Group A",Group A,"Germany, Group A",2,5
"Cyprus, Group A",Group A,Draw,0,0
"Germany, Group A",Group A,"Germany, Group A",5,2
"Austria, Group B",Group B,Draw,0,0
"Austria, Group B",Group B,"Greece, Group B",0,5
"Austria, Group B",Group B,"Austria, Group B",1,0
"Czech Republic, Group B",Group B,Draw,3,3


team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points
"Germany, Group A",Group A,3,3,0,0,14,4,10,9
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4
"Albania, Group A",Group A,3,1,2,0,8,10,-2,3
"Cyprus, Group A",Group A,3,0,2,1,4,10,-6,1
"Greece, Group B",Group B,3,1,0,2,10,5,5,5
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5
"Austria, Group B",Group B,3,1,1,1,1,5,-4,4
"Lithuania, Group B",Group B,3,0,2,1,5,7,-2,1
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Define a window specification for ranking
window_spec = Window.partitionBy("group_name").orderBy(
    col("points").desc(), col("goal_difference").desc(),
    col("goal_scored").desc(), col("goal_conceded").asc(), "team_name"
)

# Add rank column using rank function
group_stage_summary_with_rank = group_stage_summary.withColumns(
  {  "rank":rank().over(window_spec),
    "position":concat(rank().over(window_spec).cast("string"), lit("-"), col("group_name"))
  }
)

display(group_stage_summary_with_rank)


team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Germany, Group A",Group A,3,3,0,0,14,4,10,9,1,1-Group A
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4,2,2-Group A
"Albania, Group A",Group A,3,1,2,0,8,10,-2,3,3,3-Group A
"Cyprus, Group A",Group A,3,0,2,1,4,10,-6,1,4,4-Group A
"Greece, Group B",Group B,3,1,0,2,10,5,5,5,1,1-Group B
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5,2,2-Group B
"Austria, Group B",Group B,3,1,1,1,1,5,-4,4,3,3-Group B
"Lithuania, Group B",Group B,3,0,2,1,5,7,-2,1,4,4-Group B
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9,1,1-Group C
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6,2,2-Group C


In [0]:
filtered_summary = group_stage_summary_with_rank.filter((col("rank") == 1) | (col("rank") == 2))

display(filtered_summary)


team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Germany, Group A",Group A,3,3,0,0,14,4,10,9,1,1-Group A
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4,2,2-Group A
"Greece, Group B",Group B,3,1,0,2,10,5,5,5,1,1-Group B
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5,2,2-Group B
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9,1,1-Group C
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6,2,2-Group C
"Estonia, Group D",Group D,3,2,1,0,8,4,4,6,1,1-Group D
"Iceland, Group D",Group D,3,1,0,2,6,4,2,5,2,2-Group D
"Bulgaria, Group E",Group E,3,2,1,0,9,8,1,6,1,1-Group E
"Netherlands, Group E",Group E,3,1,1,1,9,8,1,4,2,2-Group E


In [0]:
filtered_summary_3rd_all_team = group_stage_summary_with_rank.filter((col("rank") == 3)).orderBy(
   col("points").desc(), col("goal_difference").desc(), col("goal_scored").desc(), col("goal_conceded").asc(), "team_name")

top_4_3rd_place = filtered_summary_3rd_all_team.limit(4)
display(top_4_3rd_place)


team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Finland, Group E",Group E,3,1,1,1,8,9,-1,4,3,3-Group E
"Austria, Group B",Group B,3,1,1,1,1,5,-4,4,3,3-Group B
"Hungary, Group C",Group C,3,1,2,0,7,7,0,3,3,3-Group C
"Croatia, Group F",Group F,3,1,2,0,8,9,-1,3,3,3-Group F


In [0]:
df_goup_stage_teams = filtered_summary.union(top_4_3rd_place)
display(df_goup_stage_teams)

team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Germany, Group A",Group A,3,3,0,0,14,4,10,9,1,1-Group A
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4,2,2-Group A
"Greece, Group B",Group B,3,1,0,2,10,5,5,5,1,1-Group B
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5,2,2-Group B
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9,1,1-Group C
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6,2,2-Group C
"Estonia, Group D",Group D,3,2,1,0,8,4,4,6,1,1-Group D
"Iceland, Group D",Group D,3,1,0,2,6,4,2,5,2,2-Group D
"Bulgaria, Group E",Group E,3,2,1,0,9,8,1,6,1,1-Group E
"Netherlands, Group E",Group E,3,1,1,1,9,8,1,4,2,2-Group E


In [0]:
df_goup_stage_teams.select("team_name", "position").show()

+--------------------+---------+
|           team_name| position|
+--------------------+---------+
|    Germany, Group A|1-Group A|
|     Latvia, Group A|2-Group A|
|     Greece, Group B|1-Group B|
|Czech Republic, G...|2-Group B|
|    Belgium, Group C|1-Group C|
|    Denmark, Group C|2-Group C|
|    Estonia, Group D|1-Group D|
|    Iceland, Group D|2-Group D|
|   Bulgaria, Group E|1-Group E|
|Netherlands, Group E|2-Group E|
|      Italy, Group F|1-Group F|
|     France, Group F|2-Group F|
|    Finland, Group E|3-Group E|
|    Austria, Group B|3-Group B|
|    Hungary, Group C|3-Group C|
|    Croatia, Group F|3-Group F|
+--------------------+---------+



In [0]:
df_goup_stage_teams.select("team_name",).show()

+--------------------+
|           team_name|
+--------------------+
|    Germany, Group A|
|     Latvia, Group A|
|     Greece, Group B|
|Czech Republic, G...|
|    Belgium, Group C|
|    Denmark, Group C|
|    Estonia, Group D|
|    Iceland, Group D|
|   Bulgaria, Group E|
|Netherlands, Group E|
|      Italy, Group F|
|     France, Group F|
|    Finland, Group E|
|    Austria, Group B|
|    Hungary, Group C|
|    Croatia, Group F|
+--------------------+



In [0]:
combination_data = [
('C-1', ['Group A', 'Group B', 'Group C', 'Group D'], '1-Group B', '3-Group A'),
('C-1', ['Group A', 'Group B', 'Group C', 'Group D'], '1-Group C', '3-Group D'),
('C-1', ['Group A', 'Group B', 'Group C', 'Group D'], '1-Group E', '3-Group B'),
('C-1', ['Group A', 'Group B', 'Group C', 'Group D'], '1-Group F', '3-Group C'),

('C-2', ['Group A', 'Group B', 'Group C', 'Group E'], '1-Group B', '3-Group A'),
('C-2', ['Group A', 'Group B', 'Group C', 'Group E'], '1-Group C', '3-Group E'),
('C-2', ['Group A', 'Group B', 'Group C', 'Group E'], '1-Group E', '3-Group B'),
('C-2', ['Group A', 'Group B', 'Group C', 'Group E'], '1-Group F', '3-Group C'),

('C-3', ['Group A', 'Group B', 'Group C', 'Group F'], '1-Group B', '3-Group A'),
('C-3', ['Group A', 'Group B', 'Group C', 'Group F'], '1-Group C', '3-Group F'),
('C-3', ['Group A', 'Group B', 'Group C', 'Group F'], '1-Group E', '3-Group B'),
('C-3', ['Group A', 'Group B', 'Group C', 'Group F'], '1-Group F', '3-Group C'),

('C-4', ['Group A', 'Group B', 'Group D', 'Group E'], '1-Group B', '3-Group D'),
('C-4', ['Group A', 'Group B', 'Group D', 'Group E'], '1-Group C', '3-Group E'),
('C-4', ['Group A', 'Group B', 'Group D', 'Group E'], '1-Group E', '3-Group A'),
('C-4', ['Group A', 'Group B', 'Group D', 'Group E'], '1-Group F', '3-Group B'),

('C-5', ['Group A', 'Group B', 'Group D', 'Group F'], '1-Group B', '3-Group D'),
('C-5', ['Group A', 'Group B', 'Group D', 'Group F'], '1-Group C', '3-Group F'),
('C-5', ['Group A', 'Group B', 'Group D', 'Group F'], '1-Group E', '3-Group A'),
('C-5', ['Group A', 'Group B', 'Group D', 'Group F'], '1-Group F', '3-Group B'),

('C-6', ['Group A', 'Group B', 'Group E', 'Group F'], '1-Group B', '3-Group E'),
('C-6', ['Group A', 'Group B', 'Group E', 'Group F'], '1-Group C', '3-Group F'),
('C-6', ['Group A', 'Group B', 'Group E', 'Group F'], '1-Group E', '3-Group B'),
('C-6', ['Group A', 'Group B', 'Group E', 'Group F'], '1-Group F', '3-Group A'),

('C-7', ['Group A', 'Group C', 'Group D', 'Group E'], '1-Group B', '3-Group E'),
('C-7', ['Group A', 'Group C', 'Group D', 'Group E'], '1-Group C', '3-Group D'),
('C-7', ['Group A', 'Group C', 'Group D', 'Group E'], '1-Group E', '3-Group C'),
('C-7', ['Group A', 'Group C', 'Group D', 'Group E'], '1-Group F', '3-Group A'),

('C-8', ['Group A', 'Group C', 'Group D', 'Group F'], '1-Group B', '3-Group F'),
('C-8', ['Group A', 'Group C', 'Group D', 'Group F'], '1-Group C', '3-Group D'),
('C-8', ['Group A', 'Group C', 'Group D', 'Group F'], '1-Group E', '3-Group C'),
('C-8', ['Group A', 'Group C', 'Group D', 'Group F'], '1-Group F', '3-Group A'),

('C-9', ['Group A', 'Group C', 'Group E', 'Group F'], '1-Group B', '3-Group E'),
('C-9', ['Group A', 'Group C', 'Group E', 'Group F'], '1-Group C', '3-Group F'),
('C-9', ['Group A', 'Group C', 'Group E', 'Group F'], '1-Group E', '3-Group C'),
('C-9', ['Group A', 'Group C', 'Group E', 'Group F'], '1-Group F', '3-Group A'),

('C-10', ['Group A', 'Group D', 'Group E', 'Group F'], '1-Group B', '3-Group E'),
('C-10', ['Group A', 'Group D', 'Group E', 'Group F'], '1-Group C', '3-Group F'),
('C-10', ['Group A', 'Group D', 'Group E', 'Group F'], '1-Group E', '3-Group D'),
('C-10', ['Group A', 'Group D', 'Group E', 'Group F'], '1-Group F', '3-Group A'),

('C-11', ['Group B', 'Group C', 'Group D', 'Group E'], '1-Group B', '3-Group E'),
('C-11', ['Group B', 'Group C', 'Group D', 'Group E'], '1-Group C', '3-Group D'),
('C-11', ['Group B', 'Group C', 'Group D', 'Group E'], '1-Group E', '3-Group B'),
('C-11', ['Group B', 'Group C', 'Group D', 'Group E'], '1-Group F', '3-Group C'),

('C-12', ['Group B', 'Group C', 'Group D', 'Group F'], '1-Group B', '3-Group F'),
('C-12', ['Group B', 'Group C', 'Group D', 'Group F'], '1-Group C', '3-Group D'),
('C-12', ['Group B', 'Group C', 'Group D', 'Group F'], '1-Group E', '3-Group C'),
('C-12', ['Group B', 'Group C', 'Group D', 'Group F'], '1-Group F', '3-Group B'),

('C-13', ['Group B', 'Group C', 'Group E', 'Group F'], '1-Group B', '3-Group F'),
('C-13', ['Group B', 'Group C', 'Group E', 'Group F'], '1-Group C', '3-Group E'),
('C-13', ['Group B', 'Group C', 'Group E', 'Group F'], '1-Group E', '3-Group C'),
('C-13', ['Group B', 'Group C', 'Group E', 'Group F'], '1-Group F', '3-Group B'),

('C-14', ['Group B', 'Group D', 'Group E', 'Group F'], '1-Group B', '3-Group F'),
('C-14', ['Group B', 'Group D', 'Group E', 'Group F'], '1-Group C', '3-Group E'),
('C-14', ['Group B', 'Group D', 'Group E', 'Group F'], '1-Group E', '3-Group D'),
('C-14', ['Group B', 'Group D', 'Group E', 'Group F'], '1-Group F', '3-Group B'),

('C-15', ['Group C', 'Group D', 'Group E', 'Group F'], '1-Group B', '3-Group F'),
('C-15', ['Group C', 'Group D', 'Group E', 'Group F'], '1-Group C', '3-Group E'),
('C-15', ['Group C', 'Group D', 'Group E', 'Group F'], '1-Group E', '3-Group D'),
('C-15', ['Group C', 'Group D', 'Group E', 'Group F'], '1-Group F', '3-Group C'),
]

match_combination_schema = StructType(
    [
    StructField("combination_id", StringType()),
    StructField("combination", ArrayType(elementType=StringType())),
    StructField("team1", StringType()),
    StructField("team2", StringType()),
    ]
)

combination_df = spark.createDataFrame(combination_data, schema=match_combination_schema)
display(combination_df)

combination_id,combination,team1,team2
C-1,"List(Group A, Group B, Group C, Group D)",1-Group B,3-Group A
C-1,"List(Group A, Group B, Group C, Group D)",1-Group C,3-Group D
C-1,"List(Group A, Group B, Group C, Group D)",1-Group E,3-Group B
C-1,"List(Group A, Group B, Group C, Group D)",1-Group F,3-Group C
C-2,"List(Group A, Group B, Group C, Group E)",1-Group B,3-Group A
C-2,"List(Group A, Group B, Group C, Group E)",1-Group C,3-Group E
C-2,"List(Group A, Group B, Group C, Group E)",1-Group E,3-Group B
C-2,"List(Group A, Group B, Group C, Group E)",1-Group F,3-Group C
C-3,"List(Group A, Group B, Group C, Group F)",1-Group B,3-Group A
C-3,"List(Group A, Group B, Group C, Group F)",1-Group C,3-Group F


In [0]:
third_palce_group_combination = df_goup_stage_teams.filter(col("rank") == 3).select('group_name').orderBy("group_name").rdd.map(lambda row: row[0]).collect()
print(third_palce_group_combination)

['Group B', 'Group C', 'Group E', 'Group F']


In [0]:
from pyspark.sql.functions import array_contains

third_place_groups = third_palce_group_combination

# Filter combination_df based on the third-place group combinations
filtered_combination_df = combination_df.filter(
    array_contains(col("combination"), third_place_groups[0]) &
    array_contains(col("combination"), third_place_groups[1]) &
    array_contains(col("combination"), third_place_groups[2]) &
    array_contains(col("combination"), third_place_groups[3])
)

# Display the filtered DataFrame
display(filtered_combination_df)


combination_id,combination,team1,team2
C-13,"List(Group B, Group C, Group E, Group F)",1-Group B,3-Group F
C-13,"List(Group B, Group C, Group E, Group F)",1-Group C,3-Group E
C-13,"List(Group B, Group C, Group E, Group F)",1-Group E,3-Group C
C-13,"List(Group B, Group C, Group E, Group F)",1-Group F,3-Group B


In [0]:
reamining_combination_data = [
    ('U', [],'1-Group A', '2-Group C' ),
    ('U', [],'2-Group D', '2-Group E' ),
    ('U', [],'1-Group D', '2-Group F' ),
    ('U', [],'2-Group A', '2-Group B' )
]
reamining_combination_df= spark.createDataFrame(reamining_combination_data, schema=match_combination_schema)


In [0]:
df_knouckout_match_final_combination=filtered_combination_df.union(reamining_combination_df)
display(df_knouckout_match_final_combination)

combination_id,combination,team1,team2
C-13,"List(Group B, Group C, Group E, Group F)",1-Group B,3-Group F
C-13,"List(Group B, Group C, Group E, Group F)",1-Group C,3-Group E
C-13,"List(Group B, Group C, Group E, Group F)",1-Group E,3-Group C
C-13,"List(Group B, Group C, Group E, Group F)",1-Group F,3-Group B
U,List(),1-Group A,2-Group C
U,List(),2-Group D,2-Group E
U,List(),1-Group D,2-Group F
U,List(),2-Group A,2-Group B


In [0]:
df_knouckout_match_final_combination.show()
team_pairs = [(row[2], row[3]) for row in df_knouckout_match_final_combination.collect()]
# print(team_pairs)
def generate_match_fixture_id():
    i=df_match_fixture.count()
    j=df_knouckout_match_final_combination.count()
    # print('i',i)
    # print('j',j)
    return [i for i in range(i+1, 45)]
match_id=generate_match_fixture_id()  
# print(match_id) 
knockout_match=[]
for row in df_knouckout_match_final_combination.collect():
    team_1=row['team1']
    team_2=row['team2']
    data= team_1 + ' vs ' + team_2
    knockout_match.append(data)

data=[(id, match) for id, match in zip(match_id, knockout_match)]
# print(data)
df_knockout_match = spark.createDataFrame(data, schema=match_fixture_schema)
display(df_knockout_match)

+--------------+--------------------+---------+---------+
|combination_id|         combination|    team1|    team2|
+--------------+--------------------+---------+---------+
|          C-13|[Group B, Group C...|1-Group B|3-Group F|
|          C-13|[Group B, Group C...|1-Group C|3-Group E|
|          C-13|[Group B, Group C...|1-Group E|3-Group C|
|          C-13|[Group B, Group C...|1-Group F|3-Group B|
|             U|                  []|1-Group A|2-Group C|
|             U|                  []|2-Group D|2-Group E|
|             U|                  []|1-Group D|2-Group F|
|             U|                  []|2-Group A|2-Group B|
+--------------+--------------------+---------+---------+



match_id,match
37,1-Group B vs 3-Group F
38,1-Group C vs 3-Group E
39,1-Group E vs 3-Group C
40,1-Group F vs 3-Group B
41,1-Group A vs 2-Group C
42,2-Group D vs 2-Group E
43,1-Group D vs 2-Group F
44,2-Group A vs 2-Group B


In [0]:
display(df_goup_stage_teams)

team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Germany, Group A",Group A,3,3,0,0,14,4,10,9,1,1-Group A
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4,2,2-Group A
"Greece, Group B",Group B,3,1,0,2,10,5,5,5,1,1-Group B
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5,2,2-Group B
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9,1,1-Group C
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6,2,2-Group C
"Estonia, Group D",Group D,3,2,1,0,8,4,4,6,1,1-Group D
"Iceland, Group D",Group D,3,1,0,2,6,4,2,5,2,2-Group D
"Bulgaria, Group E",Group E,3,2,1,0,9,8,1,6,1,1-Group E
"Netherlands, Group E",Group E,3,1,1,1,9,8,1,4,2,2-Group E


In [0]:
display(df_knockout_match)

match_id,match
37,1-Group B vs 3-Group F
38,1-Group C vs 3-Group E
39,1-Group E vs 3-Group C
40,1-Group F vs 3-Group B
41,1-Group A vs 2-Group C
42,2-Group D vs 2-Group E
43,1-Group D vs 2-Group F
44,2-Group A vs 2-Group B


In [0]:
#knouckout match 
knockout_match_data=[]

for row in df_knockout_match.collect():
    match_id=row['match_id']
    match_fixture=row['match']
    teams = match_fixture.split(' vs ')
    team_1_name = teams[0]
    team_2_name=teams[1]

    # team_1_goal = random.randint(0, 5)
    # team_2_goal = random.randint(0, 5)
   
    # winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"
    while True:
    # Generate random number of goals for each team
        team_1_goal = random.randint(0, 5)
        team_2_goal = random.randint(0, 5)
        print("team_1_goal",team_1_goal)
        print("team_2_goal",team_2_goal)

        # Regenerate the goals if they are equal
        if team_1_goal != team_2_goal:
            print('here')
            break

    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"

    knockout_match_fixture_goal_row = Row(
        match_fixture_id=match_id,
        match_fixture=match_fixture,
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner
    )
    knockout_match_data.append(knockout_match_fixture_goal_row)

df_knockout_match_fixture_goals = spark.createDataFrame(knockout_match_data, schema=match_fixture_goal_schema)

display(df_knockout_match_fixture_goals)



team_1_goal 1
team_2_goal 2
here
team_1_goal 2
team_2_goal 5
here
team_1_goal 5
team_2_goal 2
here
team_1_goal 2
team_2_goal 1
here
team_1_goal 2
team_2_goal 4
here
team_1_goal 5
team_2_goal 0
here
team_1_goal 2
team_2_goal 5
here
team_1_goal 0
team_2_goal 1
here


match_fixture_id,match_fixture,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
37,1-Group B vs 3-Group F,1-Group B,3-Group F,1,2,3-Group F
38,1-Group C vs 3-Group E,1-Group C,3-Group E,2,5,3-Group E
39,1-Group E vs 3-Group C,1-Group E,3-Group C,5,2,1-Group E
40,1-Group F vs 3-Group B,1-Group F,3-Group B,2,1,1-Group F
41,1-Group A vs 2-Group C,1-Group A,2-Group C,2,4,2-Group C
42,2-Group D vs 2-Group E,2-Group D,2-Group E,5,0,2-Group D
43,1-Group D vs 2-Group F,1-Group D,2-Group F,2,5,2-Group F
44,2-Group A vs 2-Group B,2-Group A,2-Group B,0,1,2-Group B


In [0]:
df_goup_stage_teams.createOrReplaceTempView("group_stage_team")
df_knockout_match_fixture_goals.createOrReplaceTempView("knockout_match_fixture_goals")

goup_stage_query = """
SELECT m.match_fixture_id, m.match_fixture, t1.team_name AS team_1_name, t2.team_name AS team_2_name, m.team_1_goal, m.team_2_goal, t3.team_name as winner
FROM knockout_match_fixture_goals m
JOIN group_stage_team t1 ON m.team_1_name = t1.position
JOIN group_stage_team t2 ON m.team_2_name = t2.position
JOIN group_stage_team t3 ON m.winner = t3.position
"""

goup_stage_df_team_name = spark.sql(goup_stage_query)
display(goup_stage_df_team_name)

match_fixture_id,match_fixture,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
37,1-Group B vs 3-Group F,"Greece, Group B","Croatia, Group F",1,2,"Croatia, Group F"
38,1-Group C vs 3-Group E,"Belgium, Group C","Finland, Group E",2,5,"Finland, Group E"
39,1-Group E vs 3-Group C,"Bulgaria, Group E","Hungary, Group C",5,2,"Bulgaria, Group E"
40,1-Group F vs 3-Group B,"Italy, Group F","Austria, Group B",2,1,"Italy, Group F"
41,1-Group A vs 2-Group C,"Germany, Group A","Denmark, Group C",2,4,"Denmark, Group C"
42,2-Group D vs 2-Group E,"Iceland, Group D","Netherlands, Group E",5,0,"Iceland, Group D"
43,1-Group D vs 2-Group F,"Estonia, Group D","France, Group F",2,5,"France, Group F"
44,2-Group A vs 2-Group B,"Latvia, Group A","Czech Republic, Group B",0,1,"Czech Republic, Group B"


In [0]:
df_quater_final_teams= goup_stage_df_team_name.select(col("winner").alias("team_name"),col("match_fixture_id").alias("match_id"))
display(df_quater_final_teams)

team_name,match_id
"Croatia, Group F",37
"Finland, Group E",38
"Bulgaria, Group E",39
"Italy, Group F",40
"Denmark, Group C",41
"Iceland, Group D",42
"France, Group F",43
"Czech Republic, Group B",44


In [0]:
#quater final combinations

match_combination_schema = StructType([
    StructField("match_id", StringType()),
    StructField("combination_id", StringType()),
    StructField("combination", ArrayType(elementType=StringType())),
    StructField("team1", StringType()),
    StructField("team2", StringType()),
])
quater_final_combination_data = [
    ('45','Q_C', [],'37', '41' ),
    ('46','Q_C', [],'40', '42' ),
    ('47','Q_C', [],'39', '43' ),
    ('48','Q_C', [],'38', '44' )
]
quater_final_combination_data_df= spark.createDataFrame(quater_final_combination_data, schema=match_combination_schema)
display(quater_final_combination_data_df)

match_id,combination_id,combination,team1,team2
45,Q_C,List(),37,41
46,Q_C,List(),40,42
47,Q_C,List(),39,43
48,Q_C,List(),38,44


In [0]:
df_quater_final_teams.createOrReplaceTempView("quater_final_teams")
quater_final_combination_data_df.createOrReplaceTempView("quater_final_combination_data")

sql_query = """
SELECT d.match_id, d.combination_id, t1.team_name as team_1_name, t2.team_name as team_2_name
FROM quater_final_combination_data d
JOIN quater_final_teams t1 ON d.team1=t1.match_id
JOIN quater_final_teams t2 ON d.team2=t2.match_id
"""

df_quater_final_teams_name = spark.sql(sql_query)

display(df_quater_final_teams_name)



match_id,combination_id,team_1_name,team_2_name
45,Q_C,"Croatia, Group F","Denmark, Group C"
46,Q_C,"Italy, Group F","Iceland, Group D"
47,Q_C,"Bulgaria, Group E","France, Group F"
48,Q_C,"Finland, Group E","Czech Republic, Group B"


In [0]:
# quater_final_teams match result
quater_final_match_data=[]
for row in df_quater_final_teams_name.collect():
    match_id=row['match_id']
    team_1_name=row['team_1_name']
    team_2_name=row['team_2_name']
    while True:
    # Generate random number of goals for each team
        team_1_goal = random.randint(0, 5)
        team_2_goal = random.randint(0, 5)
        print("team_1_goal",team_1_goal)
        print("team_2_goal",team_2_goal)

        # Regenerate the goals if they are equal
        if team_1_goal != team_2_goal:
            print('here')
            break

    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"

    quater_final_goal_row= Row(
        match_id=match_id,
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner
    )
    quater_final_match_data.append(quater_final_goal_row)

df_quater_final_match_goals = spark.createDataFrame(quater_final_match_data)
display(df_quater_final_match_goals)


team_1_goal 5
team_2_goal 5
team_1_goal 2
team_2_goal 5
here
team_1_goal 0
team_2_goal 1
here
team_1_goal 1
team_2_goal 5
here
team_1_goal 1
team_2_goal 0
here


match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
45,"Croatia, Group F","Denmark, Group C",2,5,"Denmark, Group C"
46,"Italy, Group F","Iceland, Group D",0,1,"Iceland, Group D"
47,"Bulgaria, Group E","France, Group F",1,5,"France, Group F"
48,"Finland, Group E","Czech Republic, Group B",1,0,"Finland, Group E"


In [0]:
df_semi_final_teams= df_quater_final_match_goals.select(col("winner").alias("team_name"),col("match_id"))
display(df_semi_final_teams)

team_name,match_id
"Denmark, Group C",45
"Iceland, Group D",46
"France, Group F",47
"Finland, Group E",48


In [0]:
semi_final_match_combination_data = [
    ('49','SF_C', [],'45', '46' ),
    ('50','SF_C', [],'47', '48' ),
]
semi_final_match_combination_data_df= spark.createDataFrame(semi_final_match_combination_data, schema=match_combination_schema)
display(semi_final_match_combination_data_df)

match_id,combination_id,combination,team1,team2
49,SF_C,List(),45,46
50,SF_C,List(),47,48


In [0]:
df_semi_final_teams.createOrReplaceTempView("semi_final_teams")
semi_final_match_combination_data_df.createOrReplaceTempView("semi_final_match_combination_data")

semi_final_sql_query = """
SELECT d.match_id, d.combination_id, t1.team_name as team_1_name, t2.team_name as team_2_name
FROM semi_final_match_combination_data d
JOIN semi_final_teams t1 ON d.team1=t1.match_id
JOIN semi_final_teams t2 ON d.team2=t2.match_id
"""

df_semi_final_match_teams_name = spark.sql(semi_final_sql_query)

display(df_semi_final_match_teams_name)

match_id,combination_id,team_1_name,team_2_name
49,SF_C,"Denmark, Group C","Iceland, Group D"
50,SF_C,"France, Group F","Finland, Group E"


In [0]:
# semi final match result
semi_final_match_data=[]
# third_pos_match_data=[]
for row in df_semi_final_match_teams_name.collect():
    match_id=row['match_id']
    team_1_name=row['team_1_name']
    team_2_name=row['team_2_name']
    while True:
    # Generate random number of goals for each team
        team_1_goal = random.randint(0, 5)
        team_2_goal = random.randint(0, 5)
        print("team_1_goal",team_1_goal)
        print("team_2_goal",team_2_goal)

        # Regenerate the goals if they are equal
        if team_1_goal != team_2_goal:
            # print('here')
            break

    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"
    loser = team_1_name if team_1_goal < team_2_goal else team_2_name if team_2_goal < team_1_goal else "Draw"

    semi_final_match_goal_row= Row(
        match_id=match_id,
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner,
        loser=loser
    )
    semi_final_match_data.append(semi_final_match_goal_row)

df_semi_final_match_goals_result = spark.createDataFrame(semi_final_match_data)

display(df_semi_final_match_goals_result)



team_1_goal 5
team_2_goal 3
team_1_goal 5
team_2_goal 4


match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner,loser
49,"Denmark, Group C","Iceland, Group D",5,3,"Denmark, Group C","Iceland, Group D"
50,"France, Group F","Finland, Group E",5,4,"France, Group F","Finland, Group E"


In [0]:
df_final_teams= df_semi_final_match_goals_result.select(col("winner").alias("team_name"),col("match_id"))
display(df_final_teams)


team_name,match_id
"Denmark, Group C",49
"France, Group F",50


In [0]:
# final match
final_match_combination_data = [
    ('51','F_C', [],'49', '50' ),
]
final_match_combination_data_df= spark.createDataFrame(final_match_combination_data, schema=match_combination_schema)
display(final_match_combination_data_df)


match_id,combination_id,combination,team1,team2
51,F_C,List(),49,50


In [0]:
df_final_teams.createOrReplaceTempView("final_teams")
final_match_combination_data_df.createOrReplaceTempView("final_match_combination_data")

semi_final_sql_query = """
SELECT d.match_id,d.combination_id, t1.team_name as team_1_name, t2.team_name as team_2_name
FROM final_match_combination_data d
JOIN final_teams t1 ON d.team1=t1.match_id
JOIN final_teams t2 ON d.team2=t2.match_id
"""

df_final_match_teams_name = spark.sql(semi_final_sql_query)

display(df_final_match_teams_name)

match_id,combination_id,team_1_name,team_2_name
51,F_C,"Denmark, Group C","France, Group F"


In [0]:
# final match result
final_match_data=[]
for row in df_final_match_teams_name.collect():
    match_id=row['match_id']
    team_1_name=row['team_1_name']
    team_2_name=row['team_2_name']
    while True:
    # Generate random number of goals for each team
        team_1_goal = random.randint(0, 5)
        team_2_goal = random.randint(0, 5)
        print("team_1_goal",team_1_goal)
        print("team_2_goal",team_2_goal)

        # Regenerate the goals if they are equal
        if team_1_goal != team_2_goal:
            break

    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"

    final_match_goal_row= Row(
        match_id=match_id,
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner
    )
    final_match_data.append(final_match_goal_row)

df_final_match_goals_result = spark.createDataFrame(final_match_data)
display(df_final_match_goals_result)


team_1_goal 0
team_2_goal 4


match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
51,"Denmark, Group C","France, Group F",0,4,"France, Group F"


In [0]:
df_winner= df_final_match_goals_result.select(col("winner").alias("team_name"))
display(df_winner)

team_name
"France, Group F"


In [0]:
#3rd position final match teams
df_third_pos_teams= df_semi_final_match_goals_result.select(col("loser").alias("team_name"),col("match_id"))
display(df_third_pos_teams)

team_name,match_id
"Iceland, Group D",49
"Finland, Group E",50


In [0]:
third_pos_match_combination_data = [
    ('52','F_C', [],'49', '50' ),
]
third_pos_final_match_combination_data_df= spark.createDataFrame(third_pos_match_combination_data, schema=match_combination_schema)
display(third_pos_final_match_combination_data_df)

match_id,combination_id,combination,team1,team2
52,F_C,List(),49,50


In [0]:
#3rd position final match tema name dataframe
df_third_pos_teams.createOrReplaceTempView("final_teams")
third_pos_final_match_combination_data_df.createOrReplaceTempView("final_match_combination_data")

third_pos_final_sql_query = """
SELECT d.match_id,d.combination_id, t1.team_name as team_1_name, t2.team_name as team_2_name
FROM final_match_combination_data d
JOIN final_teams t1 ON d.team1=t1.match_id
JOIN final_teams t2 ON d.team2=t2.match_id
"""

df_third_pos_match_teams_name = spark.sql(third_pos_final_sql_query)

display(df_third_pos_match_teams_name)

match_id,combination_id,team_1_name,team_2_name
52,F_C,"Iceland, Group D","Finland, Group E"


In [0]:
# third position match result
third_pos_match_data=[]
for row in df_third_pos_match_teams_name.collect():
    match_id=row['match_id']
    team_1_name=row['team_1_name']
    team_2_name=row['team_2_name']
    while True:
    # Generate random number of goals for each team
        team_1_goal = random.randint(0, 5)
        team_2_goal = random.randint(0, 5)
        print("team_1_goal",team_1_goal)
        print("team_2_goal",team_2_goal)

        # Regenerate the goals if they are equal
        if team_1_goal != team_2_goal:
            break

    # Determine the winner
    winner = team_1_name if team_1_goal > team_2_goal else team_2_name if team_2_goal > team_1_goal else "Draw"

    third_pos_final_match_goal_row= Row(
        match_id=match_id,
        team_1_name=team_1_name,
        team_2_name=team_2_name,
        team_1_goal=team_1_goal,
        team_2_goal=team_2_goal,
        winner=winner
    )
    third_pos_match_data.append(third_pos_final_match_goal_row)

df_third_pos_final_match_goals_result = spark.createDataFrame(third_pos_match_data)
display(df_third_pos_final_match_goals_result)


team_1_goal 2
team_2_goal 4


match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
52,"Iceland, Group D","Finland, Group E",2,4,"Finland, Group E"


In [0]:
df_third_pos_winner= df_third_pos_final_match_goals_result.select(col("winner").alias("team_name"))
display(df_third_pos_winner)

team_name
"Finland, Group E"


In [0]:
display(df_goup_stage_teams)
display(df_knouckout_match_final_combination)
display(df_quater_final_teams_name) 
display(df_semi_final_match_teams_name)
display(df_final_match_teams_name)
display(df_winner)
display(df_third_pos_winner)

team_name,group_name,matches_played,match_won,match_loss,match_drawn,goal_scored,goal_conceded,goal_difference,points,rank,position
"Germany, Group A",Group A,3,3,0,0,14,4,10,9,1,1-Group A
"Latvia, Group A",Group A,3,1,1,1,6,8,-2,4,2,2-Group A
"Greece, Group B",Group B,3,1,0,2,10,5,5,5,1,1-Group B
"Czech Republic, Group B",Group B,3,1,0,2,7,6,1,5,2,2-Group B
"Belgium, Group C",Group C,3,3,0,0,10,5,5,9,1,1-Group C
"Denmark, Group C",Group C,3,2,1,0,10,5,5,6,2,2-Group C
"Estonia, Group D",Group D,3,2,1,0,8,4,4,6,1,1-Group D
"Iceland, Group D",Group D,3,1,0,2,6,4,2,5,2,2-Group D
"Bulgaria, Group E",Group E,3,2,1,0,9,8,1,6,1,1-Group E
"Netherlands, Group E",Group E,3,1,1,1,9,8,1,4,2,2-Group E


combination_id,combination,team1,team2
C-13,"List(Group B, Group C, Group E, Group F)",1-Group B,3-Group F
C-13,"List(Group B, Group C, Group E, Group F)",1-Group C,3-Group E
C-13,"List(Group B, Group C, Group E, Group F)",1-Group E,3-Group C
C-13,"List(Group B, Group C, Group E, Group F)",1-Group F,3-Group B
U,List(),1-Group A,2-Group C
U,List(),2-Group D,2-Group E
U,List(),1-Group D,2-Group F
U,List(),2-Group A,2-Group B


match_id,combination_id,team_1_name,team_2_name
45,Q_C,"Croatia, Group F","Denmark, Group C"
46,Q_C,"Italy, Group F","Iceland, Group D"
47,Q_C,"Bulgaria, Group E","France, Group F"
48,Q_C,"Finland, Group E","Czech Republic, Group B"


match_id,combination_id,team_1_name,team_2_name
49,SF_C,"Denmark, Group C","Iceland, Group D"
50,SF_C,"France, Group F","Finland, Group E"


match_id,combination_id,team_1_name,team_2_name
51,F_C,"Denmark, Group C","France, Group F"


team_name
"France, Group F"


team_name
"Finland, Group E"


In [0]:
display(df_third_pos_final_match_goals_result)


match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
52,"Iceland, Group D","Finland, Group E",2,4,"Finland, Group E"


In [0]:
display(df_semi_final_match_goals_result)

match_id,team_1_name,team_2_name,team_1_goal,team_2_goal,winner,loser
49,"Denmark, Group C","Iceland, Group D",5,3,"Denmark, Group C","Iceland, Group D"
50,"France, Group F","Finland, Group E",5,4,"France, Group F","Finland, Group E"


In [0]:
# group_stage_summary_with_rank_visiulaize.drop(columns=['Roundof16'], inplace=True)


In [0]:
import pandas as pd
from pyspark.sql import SparkSession

group_stage_summary_with_rank.createOrReplaceTempView("group_stage_summary_with_rank_temp")

ranking_group = spark.sql("SELECT * FROM group_stage_summary_with_rank_temp")

ranking_group_pandas = ranking_group.toPandas()

visualize_table = pd.DataFrame({
    'teamname': ranking_group_pandas['team_name'],
    'teamgroup': ranking_group_pandas['group_name'],
    'rankings': ranking_group_pandas['rank'],
    'points': ranking_group_pandas['points'],
    'goaldifference': ranking_group_pandas['goal_difference']
})

blank_rows = pd.DataFrame({'teamname': [''] * 1, 'teamgroup': [''] * 1, 'rankings': [''] * 1, 'points': [''] * 1, 'goaldifference': [''] * 1})

chunks = []

for i in range(0, len(visualize_table), 4):
    chunk = visualize_table.iloc[i:i+4]  # Get a chunk of 4 records
    chunks.append(chunk)  # Append the chunk to the list
    if i < len(visualize_table) - 4:
        chunks.append(blank_rows)  

visualize_table_with_blanks = pd.concat(chunks).reset_index(drop=True)
visualize_table_with_blanks = visualize_table_with_blanks.append(blank_rows, ignore_index = True)
half_length = len(visualize_table_with_blanks) // 2

visualize_table_with_blanks1 = pd.DataFrame(columns=['teamname','teamgroup','rankings','points','goaldifference'])
visualize_table_with_blanks1 = visualize_table_with_blanks1.append(blank_rows, ignore_index = True)
visualize_table_with_blanks1 = pd.concat([visualize_table_with_blanks1, visualize_table_with_blanks.iloc[:half_length]]).reset_index(drop=True)
visualize_table_with_blanks1['space'] = ''
visualize_table_with_blanks2 = pd.DataFrame(columns=['teamname','teamgroup','rankings','points','goaldifference'])
visualize_table_with_blanks2 = visualize_table_with_blanks2.append(blank_rows, ignore_index = True)
visualize_table_with_blanks2 = pd.concat([visualize_table_with_blanks2, visualize_table_with_blanks.iloc[half_length:]]).reset_index(drop=True)

display(visualize_table_with_blanks1)
display(visualize_table_with_blanks2)


  visualize_table_with_blanks = visualize_table_with_blanks.append(blank_rows, ignore_index = True)
  visualize_table_with_blanks1 = visualize_table_with_blanks1.append(blank_rows, ignore_index = True)
  visualize_table_with_blanks2 = visualize_table_with_blanks2.append(blank_rows, ignore_index = True)
  Expected bytes, got a 'int' object
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


teamname,teamgroup,rankings,points,goaldifference,space
,,,,,
"Germany, Group A",Group A,1.0,9.0,10.0,
"Latvia, Group A",Group A,2.0,4.0,-2.0,
"Albania, Group A",Group A,3.0,3.0,-2.0,
"Cyprus, Group A",Group A,4.0,1.0,-6.0,
,,,,,
"Greece, Group B",Group B,1.0,5.0,5.0,
"Czech Republic, Group B",Group B,2.0,5.0,1.0,
"Austria, Group B",Group B,3.0,4.0,-4.0,
"Lithuania, Group B",Group B,4.0,1.0,-2.0,


teamname,teamgroup,rankings,points,goaldifference
,,,,
"Estonia, Group D",Group D,1.0,6.0,4.0
"Iceland, Group D",Group D,2.0,5.0,2.0
"Malta, Group D",Group D,3.0,2.0,-2.0
"Bosnia, Group D",Group D,4.0,2.0,-4.0
,,,,
"Bulgaria, Group E",Group E,1.0,6.0,1.0
"Netherlands, Group E",Group E,2.0,4.0,1.0
"Finland, Group E",Group E,3.0,4.0,-1.0
"Ireland, Group E",Group E,4.0,2.0,-1.0


In [0]:
# display(df_quater_final_match_goals)

In [0]:
#for match visualization
from pyspark.sql.functions import col, when

display(goup_stage_df_team_name)

custom_order_ids = [37, 41, 40, 42, 39, 43, 38, 44]

# Create a DataFrame with a custom order column
sorted_df = goup_stage_df_team_name.withColumn(
    "custom_order",
    when(col("match_fixture_id") == custom_order_ids[0], 0)
)

# Add conditions for the rest of the custom_order_ids
for i, id_ in enumerate(custom_order_ids[1:], start=1):
    sorted_df = sorted_df.withColumn(
        "custom_order",
        when(col("match_fixture_id") == id_, i).otherwise(sorted_df["custom_order"])
    )

# Sort the DataFrame based on the custom_order column
sorted_df = sorted_df.orderBy("custom_order")

# Drop the temporary custom_order column
sorted_df = sorted_df.drop("custom_order")
display(sorted_df)


match_fixture_id,match_fixture,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
37,1-Group B vs 3-Group F,"Greece, Group B","Croatia, Group F",1,2,"Croatia, Group F"
38,1-Group C vs 3-Group E,"Belgium, Group C","Finland, Group E",2,5,"Finland, Group E"
39,1-Group E vs 3-Group C,"Bulgaria, Group E","Hungary, Group C",5,2,"Bulgaria, Group E"
40,1-Group F vs 3-Group B,"Italy, Group F","Austria, Group B",2,1,"Italy, Group F"
41,1-Group A vs 2-Group C,"Germany, Group A","Denmark, Group C",2,4,"Denmark, Group C"
42,2-Group D vs 2-Group E,"Iceland, Group D","Netherlands, Group E",5,0,"Iceland, Group D"
43,1-Group D vs 2-Group F,"Estonia, Group D","France, Group F",2,5,"France, Group F"
44,2-Group A vs 2-Group B,"Latvia, Group A","Czech Republic, Group B",0,1,"Czech Republic, Group B"


match_fixture_id,match_fixture,team_1_name,team_2_name,team_1_goal,team_2_goal,winner
37,1-Group B vs 3-Group F,"Greece, Group B","Croatia, Group F",1,2,"Croatia, Group F"
41,1-Group A vs 2-Group C,"Germany, Group A","Denmark, Group C",2,4,"Denmark, Group C"
40,1-Group F vs 3-Group B,"Italy, Group F","Austria, Group B",2,1,"Italy, Group F"
42,2-Group D vs 2-Group E,"Iceland, Group D","Netherlands, Group E",5,0,"Iceland, Group D"
39,1-Group E vs 3-Group C,"Bulgaria, Group E","Hungary, Group C",5,2,"Bulgaria, Group E"
43,1-Group D vs 2-Group F,"Estonia, Group D","France, Group F",2,5,"France, Group F"
38,1-Group C vs 3-Group E,"Belgium, Group C","Finland, Group E",2,5,"Finland, Group E"
44,2-Group A vs 2-Group B,"Latvia, Group A","Czech Republic, Group B",0,1,"Czech Republic, Group B"


In [0]:
sorted_df.createOrReplaceTempView("goup_stage_teams_temp")
 
round_of_16 = spark.sql("SELECT * FROM goup_stage_teams_temp")
round_of_16_pandas = round_of_16.toPandas()
visualize_table16 = pd.DataFrame(columns=['Roundof16'])
visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
for index, row in round_of_16_pandas.iterrows():
    visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
    visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)
    
    # Insert 2 blank rows after every record
    visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
    visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)

visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
split_index = len(visualize_table16) // 2 - 1
 
# Split the final DataFrame into two halves
visualize_table16pool1 = visualize_table16.iloc[:split_index].reset_index(drop=True)
visualize_table16pool2 = visualize_table16.iloc[split_index:].reset_index(drop=True)
visualize_table16pool2 = visualize_table16pool2.drop(visualize_table16pool2.tail(2).index)

df_quater_final_match_goals.createOrReplaceTempView("goup_stage_teams_temp")
 
#Quater final score
quater_final_score = spark.sql("SELECT * FROM goup_stage_teams_temp")
quater_final_score_pandas = quater_final_score.toPandas()
visualize_score16 = pd.DataFrame(columns=['quaterFinal'])
for i in range(3):
    visualize_score16 = visualize_score16.append({'quaterFinal': ''}, ignore_index=True)
for index, row in quater_final_score_pandas.iterrows():
    # Insert two records
    
    visualize_score16 = visualize_score16.append({'quaterFinal': f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
    visualize_score16 = visualize_score16.append({'quaterFinal': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)
    for i in range(6):#
        visualize_score16 = visualize_score16.append({'quaterFinal': ''}, ignore_index=True)
# visualize_score16 = visualize_score16.drop(visualize_score16.tail(3).index)
split_index = len(visualize_score16) // 2 - 1

# Split the final DataFrame into two halves
visualize_score16pool1 = visualize_score16.iloc[:split_index].reset_index(drop=True)
visualize_score16pool2 = visualize_score16.iloc[split_index:].reset_index(drop=True)
visualize_score16pool2 = visualize_score16pool2.drop(visualize_score16pool2.tail(3).index)#2
 
#semi final
df_semi_final_match_goals_result.createOrReplaceTempView("semi_final_match_goals_result_temp")
 
semi_final_score = spark.sql("SELECT * FROM semi_final_match_goals_result_temp")
semi_final_score_pandas = semi_final_score.toPandas()
visualize_score_semi = pd.DataFrame(columns=['semiFinal'])
for i in range(5):#8#7
    visualize_score_semi = visualize_score_semi.append({'semiFinal': ''}, ignore_index=True)
for index, row in semi_final_score_pandas.iterrows():
    visualize_score_semi = visualize_score_semi.append({'semiFinal':  f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
    visualize_score_semi = visualize_score_semi.append({'semiFinal': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)

    for i in range(13): 
        visualize_score_semi = visualize_score_semi.append({'semiFinal': ''}, ignore_index=True)

split_index = len(visualize_score_semi) // 2 - 1

# Split the final dataframe
visualize_score_semipool1 = visualize_score_semi.iloc[:split_index].reset_index(drop=True)

visualize_score_semipool2 = visualize_score_semi.iloc[split_index:].reset_index(drop=True)

visualize_score_semipool2 = visualize_score_semipool2.drop(visualize_score_semipool2.tail(3).index)

#final 
df_final_match_goals_result.createOrReplaceTempView("final_match_goals_result_temp")

final = spark.sql("SELECT * FROM final_match_goals_result_temp")
final_pandas = final.toPandas()

visualize_final = pd.DataFrame(columns=['Finals'])
for i in range(6):
    visualize_final = visualize_final.append({'Finals': ''}, ignore_index=True)

for index, row in final_pandas.iterrows():
    visualize_final = visualize_final.append({'Finals': f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
    visualize_final = visualize_final.append({'Finals': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)
    visualize_final = visualize_final.append({'Finals': ''}, ignore_index=True)
    visualize_final = visualize_final.append({'Finals': f"Winner: {row['winner']}"}, ignore_index=True)
    
    if (index + 1) % 2 == 0:
        visualize_final = visualize_final.append({'Finals': ''}, ignore_index=True)
        visualize_final = visualize_final.append({'Finals': ''}, ignore_index=True)
    for i in range(6):
        visualize_final = visualize_final.append({'Finals': ''}, ignore_index=True)

# display(visualize_final)

#renaming columns
visualize_table_with_blanks1.columns = [f"{col}1" for col in visualize_table_with_blanks1.columns]
visualize_table_with_blanks2.columns = [f"{col}2" for col in visualize_table_with_blanks2.columns]
 
visualize_table16pool1.columns = [f"{col}1" for col in visualize_table16pool1.columns]
visualize_table16pool2.columns = [f"{col}2" for col in visualize_table16pool2.columns]
 
visualize_score16pool1.columns = [f"{col}1" for col in visualize_score16pool1.columns]
visualize_score16pool2.columns = [f"{col}2" for col in visualize_score16pool2.columns]
 
visualize_score_semipool1.columns = [f"{col}1" for col in visualize_score_semipool1.columns]
visualize_score_semipool2.columns = [f"{col}2" for col in visualize_score_semipool2.columns]

merged_df = pd.concat([visualize_table_with_blanks1,visualize_table16pool1,visualize_score16pool1,visualize_score_semipool1,visualize_final,visualize_score_semipool2,visualize_score16pool2,visualize_table16pool2,visualize_table_with_blanks2], axis=1)
merged_df = pd.concat([
                visualize_table_with_blanks1,
                visualize_table16pool1,
                visualize_score16pool1,
                visualize_score_semipool1,
                visualize_final,
                visualize_score_semipool2,
                visualize_score16pool2,
                visualize_table16pool2,visualize_table_with_blanks2
                ], axis=1
                )

display(merged_df)


  visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_1_name']} ({row['team_1_goal']})"}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': f"{row['team_2_name']} ({row['team_2_goal']})"}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16': ''}, ignore_index=True)
  visualize_table16 = visualize_table16.append({'Roundof16'

teamname111,teamgroup111,rankings111,points111,goaldifference111,space111,Roundof161,quaterFinal1,semiFinal1,Finals,semiFinal2,quaterFinal2,Roundof162,teamname222,teamgroup222,rankings222,points222,goaldifference222
,,,,,,,,,,,,,,,,,
"Germany, Group A",Group A,1.0,9.0,10.0,,"Greece, Group B (1)",,,,,,"Bulgaria, Group E (5)","Estonia, Group D",Group D,1.0,6.0,4.0
"Latvia, Group A",Group A,2.0,4.0,-2.0,,"Croatia, Group F (2)",,,,,,"Hungary, Group C (2)","Iceland, Group D",Group D,2.0,5.0,2.0
"Albania, Group A",Group A,3.0,3.0,-2.0,,,"Croatia, Group F (2)",,,,"Bulgaria, Group E (1)",,"Malta, Group D",Group D,3.0,2.0,-2.0
"Cyprus, Group A",Group A,4.0,1.0,-6.0,,,"Denmark, Group C (5)",,,"France, Group F (5)","France, Group F (5)",,"Bosnia, Group D",Group D,4.0,2.0,-4.0
,,,,,,"Germany, Group A (2)",,"Denmark, Group C (5)",,"Finland, Group E (4)",,"Estonia, Group D (2)",,,,,
"Greece, Group B",Group B,1.0,5.0,5.0,,"Denmark, Group C (4)",,"Iceland, Group D (3)","Denmark, Group C (0)",,,"France, Group F (5)","Bulgaria, Group E",Group E,1.0,6.0,1.0
"Czech Republic, Group B",Group B,2.0,5.0,1.0,,,,,"France, Group F (4)",,,,"Netherlands, Group E",Group E,2.0,4.0,1.0
"Austria, Group B",Group B,3.0,4.0,-4.0,,,,,,,,,"Finland, Group E",Group E,3.0,4.0,-1.0
"Lithuania, Group B",Group B,4.0,1.0,-2.0,,"Italy, Group F (2)",,,"Winner: France, Group F",,,"Belgium, Group C (2)","Ireland, Group E",Group E,4.0,2.0,-1.0
