In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud 
from pandas_profiling import ProfileReport



**FIFA World Cup**
The FIFA World Cup is an international soccer tournament contested by the men's national teams of the members of Fédération Internationale de Football Association (FIFA), the sport's global governing body. The tournament has been held every four years since 1930, except in 1942 and 1946, due to World War II. 

**2022 FIFA World Cup**
The 2022 FIFA World Cup is scheduled to be the 22nd edition of the FIFA World Cup, the quadrennial international men's football championship contested by the national teams of the member associations of FIFA. It is scheduled to take place in Qatar from 21 November to 18 December 2022. This will be the first World Cup ever to be held in the Middle East and the first in November and December instead of the traditional June and July. The tournament is planned to be played in 8 venues across 5 host cities in the country.

This edition of the World Cup will feature 32 teams, the same number as in recent editions. It is expected to be the last tournament played in the 32-team format: the expansion to 48 teams, which was confirmed by the FIFA Council in 2017, takes effect from the 2026 edition.

In [None]:
# Read the CSV file in pandas
# Loads the two inputs used throughout the notebook: the per-match table (fifa_df)
# and the per-player statistics table (player_stat_df).
# The paths are absolute Kaggle input mounts, so this cell runs as-is only inside a
# Kaggle kernel with both datasets attached.
# encoding='latin1' is passed explicitly — presumably the files are not valid UTF-8; TODO confirm.
fifa_df = pd.read_csv("/kaggle/input/fifa-world-cup-2022-qatar-match-data/Fifa_WC_2022_Match_data.csv", encoding='latin1')
player_stat_df = pd.read_csv("/kaggle/input/fifa-world-cup-2022-player-data/player_stats.csv", encoding='latin1')

In [None]:
# Preview the first two rows of the match table to see which columns are available.
fifa_df.head(2)

**Information about the Dataset**

In [None]:
# Column names, dtypes and non-null counts of the match table.
fifa_df.info()

**Checking For Null Values**

In [None]:
# Number of missing values in each column of the match table.
fifa_df.isna().sum()

**Feature Creation**

In [None]:
# Derived feature: goals scored in a match by both sides combined
# (1_goals + 2_goals), stored in the new column total_match_goals.
fifa_df['total_match_goals'] = fifa_df['1_goals'].add(fifa_df['2_goals'])

In [None]:
# Check the new feature next to its inputs: first 10 rows of
# 1_goals, 2_goals and total_match_goals.
fifa_df[['1_goals', '2_goals', 'total_match_goals']].head(10)

In [None]:
# Highest scoring match: every row whose combined goals equal the maximum
fifa_df.loc[fifa_df['total_match_goals'].eq(fifa_df['total_match_goals'].max())]

**The results above show that match number 2, played on Mon 21-Nov-2022 at Khalifa International Stadium between England and Iran, was the highest-scoring match with 8 goals. The referee for the match was Raphael Claus.**

In [None]:
# Match(es) with the highest attendance: every row that ties the maximum
fifa_df.loc[fifa_df['attendance'].eq(fifa_df['attendance'].max())]

**The results above show that the three matches with the highest attendance were played at Lusail Iconic Stadium, with 88966 spectators present.**

In [None]:
# Argentina matches: rows where ARGENTINA is listed as either team ('1' or '2')
fifa_df[fifa_df[['1', '2']].eq('ARGENTINA').any(axis=1)]

In [None]:
# France matches: rows where FRANCE is listed as either team ('1' or '2')
fifa_df[fifa_df[['1', '2']].eq('FRANCE').any(axis=1)]

In [None]:
# Number of matches played at each venue, most-used venue first.
# The original note says "till QF" — TODO confirm whether the dataset now
# covers matches beyond the quarter-finals.
fifa_df.loc[:, 'venue'].value_counts()

**The above results show the number of games played at each venue; the highest is at Lusail Iconic Stadium.  
Below is the graph for the same.**

In [None]:
# Bar graph Venue v/s No of Matches Played at Venue (using plotly)
# Count the matches per venue once and reuse the result for both columns,
# instead of recomputing value_counts() for every use.
venue_counts = fifa_df['venue'].value_counts()

df = pd.DataFrame({'Venue': venue_counts.index,
                   'Matches': venue_counts.values})

fig = px.bar(df,
             x='Venue',
             y='Matches',
             color='Venue',
             title='Venue v/s No of Matches Played at Venue'
            )
fig.show()

In [None]:
# Total attendance at each venue over all matches played there, largest first.
# Select 'attendance' before aggregating: summing the whole frame first also
# aggregates every other column, which is wasted work and can fail on
# non-numeric columns in recent pandas versions.
fifa_df.groupby('venue')['attendance'].sum().sort_values(ascending=False)

In [None]:
# Bar graph Venue v/s attendance at Venue (using seaborn)
# Aggregate only the 'attendance' column, once, and reuse it for labels and heights.
venue_attendance = fifa_df.groupby('venue')['attendance'].sum()

# Shorten the tick labels by dropping the word "Stadium".
# str.strip('Stadium') would remove any of the characters S/t/a/d/i/u/m from both
# ends (not the word itself), so the word is removed explicitly and the leftover
# whitespace trimmed.
labels = [name.replace('Stadium', '').strip() for name in venue_attendance.index]

df = pd.DataFrame({'venue': labels, 'attendance': venue_attendance.values})
plt.figure(figsize=(15, 8))
splot = sns.barplot(x="venue", y="attendance", data=df)
plt.xlabel("Venue", size=16)
plt.ylabel("Attendance", size=16)
plt.title('Venue v/s Total Attendance of all matches played')
# Annotate every bar with its attendance total.
plt.bar_label(splot.containers[0], size=16)
plt.show()

In [None]:
# Bar graph venue v/s attendance at Venue (using plotly)
# Aggregate only the 'attendance' column, once. groupby() already returns the
# venues in sorted order, so no extra sort_index() is needed.
# reset_index() turns the result into a frame with columns 'venue' and 'attendance'.
df_1 = fifa_df.groupby('venue')['attendance'].sum().reset_index()

fig = px.bar(df_1, x='venue', y='attendance', color='attendance', title='Venue v/s Total Attendance of all matches played')
fig.update_layout(title_text='Venue v/s Total Attendance of all matches played', template='plotly_dark')
fig.show()

In [None]:
# Venue with the largest total attendance over all matches played there.
# venue_df holds one row per venue (columns: venue, total_attendance), largest first.
venue_df = (
    fifa_df.groupby(['venue'])['attendance']
    .sum()
    .sort_values(ascending=False)
    .rename('total_attendance')
    .reset_index()
)
venue_df.loc[venue_df['total_attendance'].eq(venue_df['total_attendance'].max())]



In [None]:
# Plotting the word cloud for the teams.
# The text is the space-joined content of column '1' (first-listed team of each match).
team_text = " ".join(fifa_df['1'])
wordcloud = WordCloud(background_color='white', width=1920, height=1080).generate(team_text)

plt.subplots(figsize=(25, 15))
plt.imshow(wordcloud)
plt.axis('off')  # hide the axes; only the image is of interest
plt.show()

In [None]:
# All teams, in alphabetical order.
# Collect names from both team columns ('1' and '2') so a team listed in only one
# of them is not missed, and sort last: value_counts() reorders by frequency,
# which would undo an earlier sort.
" ".join(sorted(pd.concat([fifa_df['1'], fifa_df['2']]).dropna().unique()))

In [None]:
# Total games played and goals scored by the teams.
# A team can be listed as either side of a match ('1' or '2'), so the per-side
# tallies are combined by team name (index-aligned) rather than by array position.
# fill_value=0 covers a team that never appears on one of the two sides; positional
# addition would silently misalign or raise in that case.
matches_played = (
    fifa_df['1'].value_counts()
    .add(fifa_df['2'].value_counts(), fill_value=0)
    .sort_index()
)
goals_scored = (
    fifa_df.groupby('1')['1_goals'].sum()
    .add(fifa_df.groupby('2')['2_goals'].sum(), fill_value=0)
)

# One row per team, alphabetical by team name, default integer index.
team_df = pd.DataFrame({
    'teams': matches_played.index,
    # match counts are whole numbers; cast back in case the fill produced floats
    'total_matches': matches_played.to_numpy().astype(int),
    'total_goals': goals_scored.reindex(matches_played.index).to_numpy(),
})
team_df.sort_values(by='total_goals', ascending=False)




In [None]:
# Bar graph Team v/s Goals Scored (using plotly); bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='total_goals',
    color='total_matches',
    title='Teams v/s Goals Scored',
)
# update_layout sets the final title text, the theme and the figure width.
fig.update_layout(
    title_text='Teams v/s Goals Scored',
    template='plotly_white',
    width=1000,
)
fig.show()

In [None]:
# Highest goal scoring team(s): every row that ties the maximum total_goals
team_df.loc[team_df['total_goals'].eq(team_df['total_goals'].max())]

**From above we can say that France is the highest goal scoring team**

In [None]:
# Top 3 highest goal scoring teams
team_df.sort_values('total_goals', ascending=False).iloc[:3]

In [None]:
# Lowest goal scoring teams: every row that ties the minimum total_goals
team_df.loc[team_df['total_goals'].eq(team_df['total_goals'].min())]

**From above we can say that BELGIUM, DENMARK, QATAR, TUNISIA and WALES are the lowest goal-scoring teams; they scored only 1 goal each.**

In [None]:
# Number of matches played and passes completed by each team.
# Sum the passes per team on each side of the fixture and combine them by team
# name, then look every team up explicitly so the values line up with
# team_df['teams'] regardless of row order (positional addition relied on both
# groupings having identical teams in identical order).
# NOTE: 'compeletd' is the spelling this notebook uses for the dataset columns;
# do not "correct" it without checking the CSV header.
passes_completed = (
    fifa_df.groupby('1')['1_passes_compeletd'].sum()
    .add(fifa_df.groupby('2')['2_passes_compeletd'].sum(), fill_value=0)
)
team_df['total_pass_completed'] = passes_completed.reindex(team_df['teams']).to_numpy()

# sort on total_pass_completed
team_df.loc[:, ['teams', 'total_matches', 'total_pass_completed']].sort_values(by='total_pass_completed', ascending=False)




In [None]:
# Bar graph Team v/s pass completed (method 1 using plotly); bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='total_pass_completed',
    color='total_matches',
    title='Teams v/s Pass Completed',
)
# update_layout sets the final title text, the theme and the figure width.
fig.update_layout(
    title_text='Teams v/s Pass Completed',
    template='plotly',
    width=800,
)
fig.show()

In [None]:
# Team(s) with the highest number of completed passes
team_df.loc[
    team_df['total_pass_completed'].eq(team_df['total_pass_completed'].max()),
    ['teams', 'total_matches', 'total_pass_completed'],
]

In [None]:
# Top 3 teams by number of completed passes
team_df[['teams', 'total_matches', 'total_pass_completed']].sort_values(
    'total_pass_completed', ascending=False
).iloc[:3]

In [None]:
# Team(s) with the lowest number of completed passes
team_df.loc[
    team_df['total_pass_completed'].eq(team_df['total_pass_completed'].min()),
    ['teams', 'total_matches', 'total_pass_completed'],
]

In [None]:
# Number of matches played and average possession per match for each team.
# Possession is summed per team on each side of the fixture and combined by team
# name, then matched to team_df['teams'] explicitly instead of relying on both
# groupings sharing the same positional order.
possession_total = (
    fifa_df.groupby('1')['1_poss'].sum()
    .add(fifa_df.groupby('2')['2_poss'].sum(), fill_value=0)
)
# Average = summed possession / matches played, rounded to 2 decimals.
team_df['avg_possession'] = round(
    possession_total.reindex(team_df['teams']).to_numpy() / team_df['total_matches'], 2
)

# sort on avg_possession
team_df.loc[:, ['teams', 'total_matches', 'avg_possession']].sort_values(by='avg_possession', ascending=False)

In [None]:
# Bar graph Teams v/s Avg Possession; bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='avg_possession',
    color='total_matches',
    title='Teams v/s Avg Possession',
)
# update_layout sets the final title text and the theme.
fig.update_layout(
    title_text='Teams v/s Avg Possession ',
    template='ggplot2',
)
fig.show()

In [None]:
# Team(s) with the highest average possession
team_df.loc[
    team_df['avg_possession'].eq(team_df['avg_possession'].max()),
    ['teams', 'total_matches', 'avg_possession'],
]

In [None]:
# Top 3 teams by average possession
team_df[['teams', 'total_matches', 'avg_possession']].sort_values(
    'avg_possession', ascending=False
).iloc[:3]

In [None]:
# Team(s) with the lowest average possession
team_df.loc[
    team_df['avg_possession'].eq(team_df['avg_possession'].min()),
    ['teams', 'total_matches', 'avg_possession'],
]

In [None]:
# Expected Goals (xG) per match for each team.
# xG is summed per team on each side of the fixture and combined by team name,
# then matched to team_df['teams'] explicitly instead of relying on both
# groupings sharing the same positional order.
xg_total = (
    fifa_df.groupby('1')['1_xg'].sum()
    .add(fifa_df.groupby('2')['2_xg'].sum(), fill_value=0)
)
# Average = summed xG / matches played, rounded to 2 decimals.
team_df['avg_xg'] = round(
    xg_total.reindex(team_df['teams']).to_numpy() / team_df['total_matches'], 2
)

# sort on avg_xg
team_df.loc[:, ['teams', 'total_matches', 'avg_xg']].sort_values(by='avg_xg', ascending=False)

In [None]:
# Bar graph Teams v/s Expected Goals (xG); bars are coloured by matches played.
# The chart title previously read "Exected"; the spelling is corrected here
# because it is shown to the reader on the figure.
fig = px.bar(team_df, x='teams', y='avg_xg', color='total_matches', title='Teams v/s Expected Goals(xG) ')
# update_layout sets the final title text and the theme.
fig.update_layout(title_text='Teams v/s Expected Goals(xG)  ', template='seaborn')
fig.show()

In [None]:
# Team(s) with the highest average Expected Goals (xG)
team_df.loc[
    team_df['avg_xg'].eq(team_df['avg_xg'].max()),
    ['teams', 'total_matches', 'avg_xg'],
]

In [None]:
# Top 3 teams by average Expected Goals (xG)
team_df[['teams', 'total_matches', 'avg_xg']].sort_values(
    'avg_xg', ascending=False
).iloc[:3]

In [None]:
# Team(s) with the lowest average Expected Goals (xG)
team_df.loc[
    team_df['avg_xg'].eq(team_df['avg_xg'].min()),
    ['teams', 'total_matches', 'avg_xg'],
]

In [None]:
# Snapshot of team_df with the columns added so far (first 32 rows;
# rows are in alphabetical order of team name).
team_df.iloc[:32]

In [None]:
# Yellow cards received by each team.
# Cards are summed per team on each side of the fixture and combined by team
# name, then matched to team_df['teams'] explicitly instead of relying on both
# groupings sharing the same positional order.
yellow_cards = (
    fifa_df.groupby('1')['1_yellow_cards'].sum()
    .add(fifa_df.groupby('2')['2_yellow_cards'].sum(), fill_value=0)
)
team_df['total_yellow_cards'] = yellow_cards.reindex(team_df['teams']).to_numpy()

# sort on total_yellow_cards
team_df.loc[:, ['teams', 'total_matches', 'total_yellow_cards']].sort_values(by='total_yellow_cards', ascending=False)

In [None]:
# Bar graph Teams v/s Yellow Cards; bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='total_yellow_cards',
    color='total_matches',
    title='Teams v/s Yellow Cards ',
)
# update_layout sets the final title text and the theme.
fig.update_layout(
    title_text='Teams v/s Yellow Cards  ',
    template='simple_white',
)
fig.show()

In [None]:
# Team(s) with the most yellow cards
team_df.loc[
    team_df['total_yellow_cards'].eq(team_df['total_yellow_cards'].max()),
    ['teams', 'total_matches', 'total_yellow_cards'],
]

In [None]:
# Top 3 teams by number of yellow cards
team_df[['teams', 'total_matches', 'total_yellow_cards']].sort_values(
    'total_yellow_cards', ascending=False
).iloc[:3]

In [None]:
# Team(s) with the fewest yellow cards
team_df.loc[
    team_df['total_yellow_cards'].eq(team_df['total_yellow_cards'].min()),
    ['teams', 'total_matches', 'total_yellow_cards'],
]

In [None]:
# Teams with red cards.
# Cards are summed per team on each side of the fixture and combined by team
# name, then matched to team_df['teams'] explicitly instead of relying on both
# groupings sharing the same positional order.
red_cards = (
    fifa_df.groupby('1')['1_red_cards'].sum()
    .add(fifa_df.groupby('2')['2_red_cards'].sum(), fill_value=0)
)
team_df['total_red_cards'] = red_cards.reindex(team_df['teams']).to_numpy()

# Show only the teams that received at least one red card, most cards first.
team_df.loc[team_df['total_red_cards'] != 0, ['teams', 'total_matches', 'total_red_cards']].sort_values(by='total_red_cards', ascending=False)

In [None]:
# Total games played and goals conceded by the teams.
# Conceded goals are summed per team on each side of the fixture and combined by
# team name, then matched to team_df['teams'] explicitly instead of relying on
# both groupings sharing the same positional order.
goals_conceded = (
    fifa_df.groupby('1')['1_conceded'].sum()
    .add(fifa_df.groupby('2')['2_conceded'].sum(), fill_value=0)
)
team_df['total_goals_conceded'] = goals_conceded.reindex(team_df['teams']).to_numpy()

# sort on goals conceded
team_df.loc[:, ['teams', 'total_matches', 'total_goals_conceded']].sort_values(by='total_goals_conceded', ascending=False)


In [None]:
# Bar graph Teams v/s Total Goals Conceded; bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='total_goals_conceded',
    color='total_matches',
    title='Teams v/s Total Goals Conceded ',
)
# update_layout sets the final title text and the theme.
fig.update_layout(
    title_text='Teams v/s Total Goals Conceded  ',
    template='none',
)
fig.show()

In [None]:
# Team(s) that conceded the most goals
team_df.loc[
    team_df['total_goals_conceded'].eq(team_df['total_goals_conceded'].max()),
    ['teams', 'total_matches', 'total_goals_conceded'],
]

In [None]:
# Team(s) that conceded the fewest goals
team_df.loc[
    team_df['total_goals_conceded'].eq(team_df['total_goals_conceded'].min()),
    ['teams', 'total_matches', 'total_goals_conceded'],
]

In [None]:
# Top 3 teams by goals conceded
team_df[['teams', 'total_matches', 'total_goals_conceded']].sort_values(
    'total_goals_conceded', ascending=False
).iloc[:3]

In [None]:
# Teams with own goals.
# Own goals are summed per team on each side of the fixture and combined by team
# name, then matched to team_df['teams'] explicitly instead of relying on both
# groupings sharing the same positional order.
own_goals = (
    fifa_df.groupby('1')['1_own_goal'].sum()
    .add(fifa_df.groupby('2')['2_own_goal'].sum(), fill_value=0)
)
team_df['total_own_goals'] = own_goals.reindex(team_df['teams']).to_numpy()

# Show only the teams with at least one own goal, sorted on total_own_goals.
team_df.loc[team_df['total_own_goals'] != 0, ['teams', 'total_matches', 'total_own_goals']].sort_values(by='total_own_goals', ascending=False)


In [None]:
# Goals conceded by each team excluding its own goals:
# goals_by_opponent = total_goals_conceded - total_own_goals
team_df['goals_by_opponent'] = team_df['total_goals_conceded'].sub(team_df['total_own_goals'])

# Show the inputs next to the result, sorted on goals_by_opponent (largest first).
team_df[
    ['teams', 'total_matches', 'total_goals_conceded', 'total_own_goals', 'goals_by_opponent']
].sort_values('goals_by_opponent', ascending=False)


In [None]:
# Bar graph Teams v/s Total Goals Conceded (excluding Own Goal); bars are coloured by matches played.
fig = px.bar(
    team_df,
    x='teams',
    y='goals_by_opponent',
    color='total_matches',
    title='Teams v/s Total Goals Conceded(excluding Own Goal) ',
)
# update_layout sets the final title text and the theme.
fig.update_layout(
    title_text='Teams v/s Total Goals Conceded(excluding Own Goal)  ',
    template='plotly_white',
)
fig.show()

# **Player Statistics**

In [None]:
# Preview the first rows of the player statistics table.
player_stat_df.head()

In [None]:
# Column names, dtypes and non-null counts of the player statistics table.
player_stat_df.info()

In [None]:
# Top Goal Scorer of WC 2022 | Golden Boot Award
# Every player whose goal tally ties the maximum, with a reduced set of columns.
player_stat_df.loc[
    player_stat_df['goals'].eq(player_stat_df['goals'].max()),
    ['player', 'team', 'birth_year', 'club', 'games', 'assists', 'goals'],
]

In [None]:
# Top 5 Goal Scorers of WC 2022
player_stat_df[
    ['player', 'team', 'birth_year', 'club', 'games', 'assists', 'goals']
].sort_values('goals', ascending=False).iloc[:5]

In [None]:

# Player(s) with the highest value in the 'xg' column (all columns shown)
player_stat_df.loc[player_stat_df['xg'].eq(player_stat_df['xg'].max())]

In [None]:
# Top Goal Assist of WC 2022
# Every player whose assist count ties the maximum, ordered by goals (highest first).
player_stat_df.loc[
    player_stat_df['assists'].eq(player_stat_df['assists'].max()),
    ['player', 'team', 'birth_year', 'club', 'games', 'assists', 'goals'],
].sort_values('goals', ascending=False)

In [None]:
# Build a profiling report over the aggregated per-team table (team_df)
# and display it as the cell output.
profile = ProfileReport(team_df)
profile

In [None]:
# Save the profiling report as fifa.html (path is relative to the working directory).
profile.to_file(output_file='fifa.html')