## Charts to display in flask frontend

### 0. Imports

In [35]:
import os

import sqlite3

import pandas as pd
from pandas.io.sql import DatabaseError

import numpy as np

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from scipy import stats

### 1. Setup and load

In [36]:
# Path to database
db_path = '../football.db'

if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database '{db_path}' not found. Run db_setup.py first.")

conn = sqlite3.connect(db_path)

try:
    df = pd.read_sql_query("SELECT * FROM matches", conn)
    print(f"✅ Loaded {len(df)} rows from 'matches' table")
except DatabaseError as e:
    df = pd.DataFrame()
    print(f"❌ Error loading data: {e}")
finally:
    conn.close()

✅ Loaded 1508 rows from 'matches' table


### 2. Data preprocessing

In [37]:
# Enhanced data cleaning for analysis
df_clean = df.copy()

print(df_clean.isnull().sum()[df_clean.isnull().sum() > 0])

# Remove rows with missing critical information (match identifiers and results)
critical_cols = ['HomeTeam', 'AwayTeam', 'FTR', 'Date']
df_clean = df_clean.dropna(subset=critical_cols)

# Handle missing values in numerical columns
numerical_cols = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
                  'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR']

# Fill missing numerical values with 0 (assuming missing means no shots, fouls, cards, etc.)
for col in numerical_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna(0)

# Clean string columns
string_cols = ['HomeTeam', 'AwayTeam', 'FTR', 'HTR']
for col in string_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].str.strip()  # Remove whitespace
        # Fill any remaining missing string values with 'Unknown'
        df_clean[col] = df_clean[col].fillna('Unknown')

# Handle any other columns that might have missing values
remaining_missing = df_clean.isnull().sum()
if remaining_missing.sum() > 0:
    print(f"⚠️ Remaining missing values found:")
    print(remaining_missing[remaining_missing > 0])
    
    # Drop columns that are mostly missing (>50% missing)
    threshold = len(df_clean) * 0.5
    cols_to_drop = remaining_missing[remaining_missing > threshold].index.tolist()
    if cols_to_drop:
        print(f"Dropping columns with >50% missing values: {cols_to_drop}")
        df_clean = df_clean.drop(columns=cols_to_drop)
    
    # For remaining columns, fill with appropriate defaults
    for col in df_clean.columns:
        if df_clean[col].isnull().sum() > 0:
            if df_clean[col].dtype in ['object', 'string']:
                df_clean[col] = df_clean[col].fillna('Unknown')
            else:
                df_clean[col] = df_clean[col].fillna(0)

# Final check - ensure no missing values remain
assert df_clean.isnull().sum().sum() == 0, "Missing values still exist!"

# Update the main dataframe
df = df_clean

print(f"✅ Data cleaned successfully!")
print(f"✅ Final data shape: {df.shape}")
print(f"✅ No missing values remaining: {df.isnull().sum().sum() == 0}")


HTHG          4
HTAG          4
HTR           4
HS            5
AS            5
HST           5
AST           5
HF            5
AF            5
HC            5
AC            5
HY            4
AY            4
HR            4
AR            4
B365H         1
B365D         1
B365A         1
BWH           3
BWD           3
BWA           3
PSH           4
PSD           4
PSA           4
WHH         349
WHD         349
WHA         349
B365>2.5      1
B365<2.5      1
P>2.5         4
P<2.5         4
B365AHH       4
B365AHA       4
PAHH          4
PAHA          4
BWCH          8
BWCD          8
BWCA          8
WHCH        355
WHCD        355
WHCA        355
dtype: int64
⚠️ Remaining missing values found:
B365H         1
B365D         1
B365A         1
BWH           3
BWD           3
BWA           3
PSH           4
PSD           4
PSA           4
WHH         349
WHD         349
WHA         349
B365>2.5      1
B365<2.5      1
P>2.5         4
P<2.5         4
B365AHH       4
B365AHA       4
PAHH    

### 3. Extra Stats

In [38]:
# Add derived columns
df['TotalGoals'] = df['FTHG'] + df['FTAG']
df['GoalDifference'] = df['FTHG'] - df['FTAG']
df['IsDraw'] = df['FTR'] == 'D'

# Points earned by Home Team
def home_points(row):
    if row['FTR'] == 'H':
        return 3
    elif row['FTR'] == 'D':
        return 1
    else:
        return 0

df['HomePoints'] = df.apply(home_points, axis=1)

# Convert Date column
df['Date'] = pd.to_datetime(df['Date'])


### 4. Charts

In [39]:
# Group by date to show average goals scored by home teams per matchday
daily_goals = df.groupby('Date')['FTHG'].mean().reset_index()


# Calculate time gaps between rows
daily_goals['PrevDate'] = daily_goals['Date'].shift(1)
daily_goals['Gap'] = (daily_goals['Date'] - daily_goals['PrevDate']).dt.days

# Insert NaN for big gaps
daily_goals['FTHG_clean'] = np.where(daily_goals['Gap'] > 30, np.nan, daily_goals['FTHG'])

# Plot the cleaned series
fig = px.line(
    daily_goals,
    x='Date',
    y='FTHG_clean',
    title='Avg Home Goals Over Time (With Season Gaps)',
    labels={'FTHG_clean': 'Avg Home Goals'}
)
fig.show()


In [40]:
# Compute top 10 home teams
home_goals = df.groupby('HomeTeam')['FTHG'].mean().reset_index()
top_home = home_goals.sort_values(by='FTHG', ascending=False).head(10)

# Compute top 10 away teams
away_goals = df.groupby('AwayTeam')['FTAG'].mean().reset_index()
top_away = away_goals.sort_values(by='FTAG', ascending=False).head(10)

# Create the figure
fig = go.Figure()

# Add home team bar chart
fig.add_trace(go.Bar(
    x=top_home['HomeTeam'],
    y=top_home['FTHG'],
    name='Home Teams',
    visible=True
))

# Add away team bar chart
fig.add_trace(go.Bar(
    x=top_away['AwayTeam'],
    y=top_away['FTAG'],
    name='Away Teams',
    visible=False
))

# Add dropdown to toggle visibility
fig.update_layout(
    title='Top 10 Teams by Average Goals',
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=[
                dict(label='Top Home Teams',
                     method='update',
                     args=[{'visible': [True, False]},
                           {'title': 'Top 10 Home Teams by Avg Goals',
                            'yaxis': {'title': 'Avg Home Goals'}}]),
                dict(label='Top Away Teams',
                     method='update',
                     args=[{'visible': [False, True]},
                           {'title': 'Top 10 Away Teams by Avg Goals',
                            'yaxis': {'title': 'Avg Away Goals'}}]),
            ],
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        ),
    ]
)

fig.update_layout(height=500)

fig.show()


In [41]:
# Calculate win/loss/draw statistics for all teams
team_stats = []

# Get all unique teams
all_teams = set(df['HomeTeam'].unique()) | set(df['AwayTeam'].unique())

for team in all_teams:
    # Home matches
    home_matches = df[df['HomeTeam'] == team]
    home_wins = len(home_matches[home_matches['FTR'] == 'H'])
    home_draws = len(home_matches[home_matches['FTR'] == 'D'])
    home_losses = len(home_matches[home_matches['FTR'] == 'A'])
    
    # Away matches
    away_matches = df[df['AwayTeam'] == team]
    away_wins = len(away_matches[away_matches['FTR'] == 'A'])
    away_draws = len(away_matches[away_matches['FTR'] == 'D'])
    away_losses = len(away_matches[away_matches['FTR'] == 'H'])
    
    # Total stats
    total_wins = home_wins + away_wins
    total_draws = home_draws + away_draws
    total_losses = home_losses + away_losses
    total_matches = total_wins + total_draws + total_losses
    
    team_stats.append({
        'Team': team,
        'Wins': total_wins,
        'Draws': total_draws,
        'Losses': total_losses,
        'Total': total_matches,
        'Win_Pct': total_wins / total_matches * 100 if total_matches > 0 else 0
    })

# Convert to DataFrame and sort by win percentage
stats_df = pd.DataFrame(team_stats)
stats_df = stats_df.sort_values('Win_Pct', ascending=True)

# Create stacked horizontal bar chart
fig_wld = px.bar(
    stats_df,
    x=['Wins', 'Draws', 'Losses'],
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Record by Team',
    color_discrete_map={
        'Wins': '#2E8B57',     # Sea Green
        'Draws': '#FFD700',    # Gold
        'Losses': '#DC143C'    # Crimson
    },
    labels={'value': 'Number of Matches', 'variable': 'Result'}
)

fig_wld.update_layout(
    height=600,
    xaxis_title='Number of Matches',
    yaxis_title='Teams',
    legend_title='Match Result'
)

fig_wld.show()

In [42]:
# Extract season information from dates
df['Season'] = df['Date'].dt.year.astype(str) + '/' + (df['Date'].dt.year + 1).astype(str).str[-2:]

# Get unique seasons for dropdown options
seasons = ['All Seasons'] + sorted(df['Season'].unique().tolist())

# Calculate win/loss/draw statistics as percentages with season filter
def calculate_team_stats_by_season(selected_season):
    if selected_season == 'All Seasons':
        filtered_df = df
    else:
        filtered_df = df[df['Season'] == selected_season]
    
    team_stats_pct = []
    all_teams = set(filtered_df['HomeTeam'].unique()) | set(filtered_df['AwayTeam'].unique())
    
    for team in all_teams:
        # Home matches
        home_matches = filtered_df[filtered_df['HomeTeam'] == team]
        home_wins = len(home_matches[home_matches['FTR'] == 'H'])
        home_draws = len(home_matches[home_matches['FTR'] == 'D'])
        home_losses = len(home_matches[home_matches['FTR'] == 'A'])
        
        # Away matches
        away_matches = filtered_df[filtered_df['AwayTeam'] == team]
        away_wins = len(away_matches[awayMatches['FTR'] == 'A'])
        away_draws = len(away_matches[away_matches['FTR'] == 'D'])
        away_losses = len(away_matches[away_matches['FTR'] == 'H'])
        
        # Total stats
        total_wins = home_wins + away_wins
        total_draws = home_draws + away_draws
        total_losses = home_losses + away_losses
        total_matches = total_wins + total_draws + total_losses
        
        if total_matches > 0:  # Only include teams with matches in the season
            # Calculate percentages
            win_pct = (total_wins / total_matches * 100)
            draw_pct = (total_draws / total_matches * 100)
            loss_pct = (total_losses / total_matches * 100)
            
            team_stats_pct.append({
                'Team': team,
                'Win_Pct': win_pct,
                'Draw_Pct': draw_pct,
                'Loss_Pct': loss_pct,
                'Total': total_matches
            })
    
    return pd.DataFrame(team_stats_pct).sort_values('Win_Pct', ascending=True)

# Create initial data for all seasons
initial_stats = calculate_team_stats_by_season('All Seasons')

# Create the figure with initial data
fig_pct = px.bar(
    initial_stats,
    x=['Win_Pct', 'Draw_Pct', 'Loss_Pct'],
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Percentage by Team - All Seasons',
    color_discrete_map={
        'Win_Pct': '#2E8B57',     # Sea Green
        'Draw_Pct': '#FFD700',    # Gold
        'Loss_Pct': '#DC143C'     # Crimson
    },
    labels={'value': 'Percentage (%)', 'variable': 'Result'}
)

# Add dropdown menu for season selection
buttons = []
for season in seasons:
    season_stats = calculate_team_stats_by_season(season)
    
    # Create button for each season
    button = dict(
        label=season,
        method='restyle',
        args=[{
            'x': [season_stats['Win_Pct'].tolist(), 
                  season_stats['Draw_Pct'].tolist(), 
                  season_stats['Loss_Pct'].tolist()],
            'y': [season_stats['Team'].tolist()] * 3
        }],
        args2=[{
            'title': f'Win/Draw/Loss Percentage by Team - {season}'
        }]
    )
    buttons.append(button)

# Add the dropdown to the layout
fig_pct.update_layout(
    height=600,
    xaxis_title='Percentage (%)',
    yaxis_title='Teams',
    legend_title='Match Result',
    xaxis=dict(range=[0, 100]),  # Set x-axis range to 0-100%
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=buttons,
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )
    ]
)

fig_pct.show()

NameError: name 'awayMatches' is not defined

In [None]:
# Create a scatter plot showing relationship between shots and goals with sized markers
# First, count occurrences of each (HS, FTHG) combination
df_counts = df.groupby(['HS', 'FTHG', 'FTR']).size().reset_index(name='count')

fig_shots_goals = px.scatter(
    df_counts, 
    x='HS', 
    y='FTHG',
    color='FTR',
    size='count',  # Size dots based on count of overlapping points
    title='Home Team: Shots vs Goals Scored (Dot size = frequency)',
    labels={
        'HS': 'Home Shots',
        'FTHG': 'Home Goals',
        'FTR': 'Match Result',
        'count': 'Number of matches'
    },
    color_discrete_map={
        'H': '#2E8B57',  # Green for home wins
        'D': '#FFD700',  # Gold for draws
        'A': '#DC143C'   # Red for home losses
    },
    size_max=20,  # Maximum dot size
    hover_data=['count']
)

fig_shots_goals.update_layout(
    height=500,
    showlegend=True
)

fig_shots_goals.show()


In [None]:
# First, create the correlation matrix for numerical columns
numerical_cols = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST', 
                  'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 
                  'TotalGoals', 'GoalDifference', 'HomePoints']

correlation_matrix = df[numerical_cols].corr()

# Create a dictionary to explain abbreviations
abbreviation_explanations = {
    'FTHG': 'Full Time Home Goals',
    'FTAG': 'Full Time Away Goals', 
    'HTHG': 'Half Time Home Goals',
    'HTAG': 'Half Time Away Goals',
    'HS': 'Home Shots',
    'AS': 'Away Shots',
    'HST': 'Home Shots on Target',
    'AST': 'Away Shots on Target',
    'HF': 'Home Fouls',
    'AF': 'Away Fouls',
    'HC': 'Home Corners',
    'AC': 'Away Corners',
    'HY': 'Home Yellow Cards',
    'AY': 'Away Yellow Cards',
    'HR': 'Home Red Cards',
    'AR': 'Away Red Cards',
    'TotalGoals': 'Total Goals in Match',
    'GoalDifference': 'Home Goals - Away Goals',
    'HomePoints': 'Points Earned by Home Team'
}

# Extract correlations with HomePoints
homepoints_corr = correlation_matrix['HomePoints'].drop('HomePoints')
homepoints_corr_sorted = homepoints_corr.abs().sort_values(ascending=False)

# Get top 10 most correlated features
top_correlations = homepoints_corr_sorted.head(10)
top_corr_values = homepoints_corr[top_correlations.index]

# Create hover text with explanations
hover_text = [f"{abbreviation_explanations.get(feature, feature)}<br>Correlation: {corr:.3f}" 
              for feature, corr in top_corr_values.items()]

# Create bar chart with enhanced hover information
fig_homepoints = px.bar(
    x=top_corr_values.values,
    y=top_corr_values.index,
    orientation='h',
    title='Top 10 Features Most Correlated with Home Points',
    labels={'x': 'Correlation Coefficient', 'y': 'Features'},
    color=top_corr_values.values,
    color_continuous_scale='RdBu_r',
    color_continuous_midpoint=0,
    hover_name=[abbreviation_explanations.get(feature, feature) for feature in top_corr_values.index]
)

# Update hover template for better formatting
fig_homepoints.update_traces(
    text=[f'{val:.3f}' for val in top_corr_values.values],
    textposition='outside',
    hovertemplate='<b>%{hovertext}</b><br>' +
                  'Correlation: %{x:.3f}<br>' +
                  '<extra></extra>',
    hovertext=hover_text
)

fig_homepoints.update_layout(
    height=500,
    showlegend=False,
    coloraxis_showscale=True,
    coloraxis_colorbar_title="Correlation"
)

fig_homepoints.show()


In [None]:
# Create a comprehensive cards vs wins analysis
team_cards_wins = []

for team in df['HomeTeam'].unique():
    # Home matches
    home_matches = df[df['HomeTeam'] == team]
    home_wins = len(home_matches[home_matches['FTR'] == 'H'])
    home_yellow = home_matches['HY'].sum()
    home_red = home_matches['HR'].sum()
    home_total = len(home_matches)
    
    # Away matches  
    away_matches = df[df['AwayTeam'] == team]
    away_wins = len(away_matches[away_matches['FTR'] == 'A'])
    away_yellow = away_matches['AY'].sum()
    away_red = away_matches['AR'].sum()
    away_total = len(away_matches)
    
    # Combined stats
    total_wins = home_wins + away_wins
    total_matches = home_total + away_total
    total_yellow = home_yellow + away_yellow
    total_red = home_red + away_red
    total_cards = total_yellow + (total_red * 2)  # Weight red cards more heavily
    
    if total_matches > 20:  # Only include teams with sufficient matches
        team_cards_wins.append({
            'Team': team,
            'Win_Rate': (total_wins / total_matches) * 100,
            'Cards_Per_Match': total_cards / total_matches,
            'Yellow_Per_Match': total_yellow / total_matches,
            'Red_Per_Match': total_red / total_matches,
            'Total_Matches': total_matches,
            'Discipline_Score': (total_yellow * 1 + total_red * 5) / total_matches  # Discipline penalty score
        })

cards_df = pd.DataFrame(team_cards_wins)

# Create the main bubble chart
fig_cards = px.scatter(
    cards_df,
    x='Cards_Per_Match',
    y='Win_Rate',
    size='Total_Matches',
    color='Discipline_Score',
    hover_name='Team',
    title='🟨🟥 The Card Penalty Effect: How Discipline Affects Team Success',
    labels={
        'Cards_Per_Match': 'Total Cards per Match (Yellow + 2×Red)',
        'Win_Rate': 'Win Rate (%)',
        'Discipline_Score': 'Discipline Penalty Score',
        'Total_Matches': 'Total Matches'
    },
    color_continuous_scale='Reds',
    size_max=25
)

# Customize the layout
fig_cards.update_layout(
    height=600,
    showlegend=True,
    plot_bgcolor='rgba(240,240,240,0.8)',
    font=dict(size=12),
    title_font_size=16,
    annotations=[
        dict(
            x=0.02, y=0.98,
            xref='paper', yref='paper',
            text='💡 Bubble size = Total matches played<br>🔴 Color intensity = Disciplinary issues',
            showarrow=False,
            font=dict(size=10),
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='gray',
            borderwidth=1
        )
    ]
)

# Add quadrant lines for better interpretation
mean_cards = cards_df['Cards_Per_Match'].mean()
mean_wins = cards_df['Win_Rate'].mean()

fig_cards.add_hline(y=mean_wins, line_dash="dot", line_color="gray", 
                   annotation_text=f"Avg Win Rate: {mean_wins:.1f}%")
fig_cards.add_vline(x=mean_cards, line_dash="dot", line_color="gray",
                   annotation_text=f"Avg Cards: {mean_cards:.1f}")

# Update hover template for better information
fig_cards.update_traces(
    hovertemplate='<b>%{hovertext}</b><br>' +
                  'Win Rate: %{y:.1f}%<br>' +
                  'Cards per Match: %{x:.2f}<br>' +
                  'Total Matches: %{marker.size}<br>' +
                  'Discipline Score: %{marker.color:.2f}<br>' +
                  '<extra></extra>',
    selector=dict(mode='markers')
)

fig_cards.show()

In [None]:
# Create a comprehensive home vs away performance comparison
performance_stats = []

for team in df['HomeTeam'].unique():
    # Home matches
    home_matches = df[df['HomeTeam'] == team]
    home_wins = len(home_matches[home_matches['FTR'] == 'H'])
    home_draws = len(home_matches[home_matches['FTR'] == 'D'])
    home_losses = len(home_matches[home_matches['FTR'] == 'A'])
    home_total = len(home_matches)
    home_goals_scored = home_matches['FTHG'].sum()
    home_goals_conceded = home_matches['FTAG'].sum()
    
    # Away matches
    away_matches = df[df['AwayTeam'] == team]
    away_wins = len(away_matches[away_matches['FTR'] == 'A'])
    away_draws = len(away_matches[away_matches['FTR'] == 'D'])
    away_losses = len(away_matches[awayMatches['FTR'] == 'H'])
    away_total = len(away_matches)
    away_goals_scored = away_matches['FTAG'].sum()
    away_goals_conceded = away_matches['FTHG'].sum()
    
    if home_total >= 20 and away_total >= 20:  # Only teams with sufficient matches
        performance_stats.append({
            'Team': team,
            'Home_Win_Rate': (home_wins / home_total) * 100,
            'Away_Win_Rate': (away_wins / away_total) * 100,
            'Home_Goals_Per_Game': home_goals_scored / home_total,
            'Away_Goals_Per_Game': away_goals_scored / away_total,
            'Home_Goals_Against_Per_Game': home_goals_conceded / home_total,
            'Away_Goals_Against_Per_Game': away_goals_conceded / away_total,
            'Home_Advantage': ((home_wins / home_total) - (away_wins / away_total)) * 100,
            'Goal_Difference_Home': (home_goals_scored - home_goals_conceded) / home_total,
            'Goal_Difference_Away': (away_goals_scored - away_goals_conceded) / away_total,
            'Home_Matches': home_total,
            'Away_Matches': away_total
        })

perf_df = pd.DataFrame(performance_stats)

# Create subplots for comprehensive comparison

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Win Rate: Home vs Away', 'Goals Scored: Home vs Away', 
                   'Home Advantage by Team', 'Goal Difference: Home vs Away'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# 1. Win Rate Comparison - Scatter plot
fig.add_trace(
    px.scatter(perf_df, x='Away_Win_Rate', y='Home_Win_Rate', 
               hover_name='Team', size='Home_Matches',
               color='Home_Advantage',
               color_continuous_scale='RdYlGn').data[0],
    row=1, col=1
)

# Add diagonal line for reference (equal performance)
fig.add_shape(
    type="line", x0=0, y0=0, x1=70, y1=70,
    line=dict(color="gray", dash="dash"),
    row=1, col=1
)

# 2. Goals Scored Comparison
fig.add_trace(
    px.scatter(perf_df, x='Away_Goals_Per_Game', y='Home_Goals_Per_Game',
               hover_name='Team', size='Home_Matches',
               color='Home_Advantage',
               color_continuous_scale='RdYlGn').data[0],
    row=1, col=2
)

# Add diagonal line for reference
fig.add_shape(
    type="line", x0=0.5, y0=0.5, x1=2.5, y1=2.5,
    line=dict(color="gray", dash="dash"),
    row=1, col=2
)

# 3. Home Advantage Bar Chart
perf_sorted = perf_df.sort_values('Home_Advantage', ascending=True)
fig.add_trace(
    px.bar(perf_sorted, x='Home_Advantage', y='Team', 
           orientation='h', color='Home_Advantage',
           color_continuous_scale='RdYlGn').data[0],
    row=2, col=1
)

# 4. Goal Difference Comparison
fig.add_trace(
    px.scatter(perf_df, x='Goal_Difference_Away', y='Goal_Difference_Home',
               hover_name='Team', size='Home_Matches',
               color='Home_Advantage',
               color_continuous_scale='RdYlGn').data[0],
    row=2, col=2
)

# Add reference lines
fig.add_shape(
    type="line", x0=-2, y0=-2, x1=2, y1=2,
    line=dict(color="gray", dash="dash"),
    row=2, col=2
)
fig.add_hline(y=0, line_dash="dot", line_color="gray", row=2, col=2)
fig.add_vline(x=0, line_dash="dot", line_color="gray", row=2, col=2)

# Update layout
fig.update_layout(
    height=800,
    title_text="🏠 Home vs Away Performance Analysis",
    title_font_size=16,
    showlegend=False
)

# Update axes labels
fig.update_xaxes(title_text="Away Win Rate (%)", row=1, col=1)
fig.update_yaxes(title_text="Home Win Rate (%)", row=1, col=1)
fig.update_xaxes(title_text="Away Goals per Game", row=1, col=2)
fig.update_yaxes(title_text="Home Goals per Game", row=1, col=2)
fig.update_xaxes(title_text="Home Advantage (%)", row=2, col=1)
fig.update_yaxes(title_text="Teams", row=2, col=1)
fig.update_xaxes(title_text="Away Goal Difference", row=2, col=2)
fig.update_yaxes(title_text="Home Goal Difference", row=2, col=2)

fig.show()

### 5. Advanced Analytics - Seasonal Trends

In [43]:
# Analyze league competitiveness over seasons
seasonal_stats = []

for season in df['Season'].unique():
    season_data = df[df['Season'] == season]
    
    # Calculate various metrics
    avg_goals = season_data['TotalGoals'].mean()
    goal_variance = season_data['TotalGoals'].var()
    draw_rate = (season_data['FTR'] == 'D').mean() * 100
    
    # Home advantage strength
    home_wins = (season_data['FTR'] == 'H').sum()
    total_matches = len(season_data)
    home_advantage = (home_wins / total_matches) * 100
    
    # League competitiveness (lower std of points = more competitive)
    team_points = {}
    for team in season_data['HomeTeam'].unique():
        home_points = season_data[season_data['HomeTeam'] == team]['HomePoints'].sum()
        away_points = season_data[season_data['AwayTeam'] == team].apply(
            lambda x: 3 if x['FTR'] == 'A' else (1 if x['FTR'] == 'D' else 0), axis=1).sum()
        team_points[team] = home_points + away_points
    
    if team_points:
        competitiveness = 100 - (np.std(list(team_points.values())) / np.mean(list(team_points.values())) * 100)
    else:
        competitiveness = 0
    
    seasonal_stats.append({
        'Season': season,
        'Avg_Goals': avg_goals,
        'Goal_Variance': goal_variance,
        'Draw_Rate': draw_rate,
        'Home_Advantage': home_advantage,
        'Competitiveness': competitiveness,
        'Total_Matches': total_matches
    })

seasonal_df = pd.DataFrame(seasonal_stats).sort_values('Season')

# Create multi-metric seasonal analysis
fig_seasonal = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Average Goals per Match', 'Draw Rate (%)', 
                   'Home Advantage (%)', 'League Competitiveness'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# Goals per match trend
fig_seasonal.add_trace(
    go.Scatter(x=seasonal_df['Season'], y=seasonal_df['Avg_Goals'],
               mode='lines+markers', name='Avg Goals',
               line=dict(color='blue', width=3)),
    row=1, col=1
)

# Draw rate trend
fig_seasonal.add_trace(
    go.Scatter(x=seasonal_df['Season'], y=seasonal_df['Draw_Rate'],
               mode='lines+markers', name='Draw Rate',
               line=dict(color='orange', width=3)),
    row=1, col=2
)

# Home advantage trend
fig_seasonal.add_trace(
    go.Scatter(x=seasonal_df['Season'], y=seasonal_df['Home_Advantage'],
               mode='lines+markers', name='Home Advantage',
               line=dict(color='green', width=3)),
    row=2, col=1
)

# Competitiveness trend
fig_seasonal.add_trace(
    go.Scatter(x=seasonal_df['Season'], y=seasonal_df['Competitiveness'],
               mode='lines+markers', name='Competitiveness',
               line=dict(color='red', width=3)),
    row=2, col=2
)

fig_seasonal.update_layout(
    height=600,
    title_text="⚽ League Evolution: How Football Has Changed Over Time",
    showlegend=False
)

fig_seasonal.update_xaxes(tickangle=45)
fig_seasonal.show()

### 6. Goal Scoring Patterns

In [44]:
# Analyze goal scoring distributions and patterns
from scipy import stats

# Create goal distribution analysis
fig_goals = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Home Goals Distribution', 'Away Goals Distribution', 
                   'Total Goals Distribution', 'Goal Difference Distribution'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}],
           [{"secondary_y": False}, {"secondary_y": False}]]
)

# Home goals histogram
fig_goals.add_trace(
    go.Histogram(x=df['FTHG'], nbinsx=8, name='Home Goals',
                marker_color='lightblue', opacity=0.7),
    row=1, col=1
)

# Away goals histogram  
fig_goals.add_trace(
    go.Histogram(x=df['FTAG'], nbinsx=8, name='Away Goals',
                marker_color='lightcoral', opacity=0.7),
    row=1, col=2
)

# Total goals histogram
fig_goals.add_trace(
    go.Histogram(x=df['TotalGoals'], nbinsx=10, name='Total Goals',
                marker_color='lightgreen', opacity=0.7),
    row=2, col=1
)

# Goal difference histogram
fig_goals.add_trace(
    go.Histogram(x=df['GoalDifference'], nbinsx=15, name='Goal Difference',
                marker_color='gold', opacity=0.7),
    row=2, col=2
)

# Add statistical annotations
home_mean = df['FTHG'].mean()
away_mean = df['FTAG'].mean()
total_mean = df['TotalGoals'].mean()

fig_goals.add_vline(x=home_mean, line_dash="dash", line_color="red", 
                   annotation_text=f"μ={home_mean:.2f}", row=1, col=1)
fig_goals.add_vline(x=away_mean, line_dash="dash", line_color="red",
                   annotation_text=f"μ={away_mean:.2f}", row=1, col=2)
fig_goals.add_vline(x=total_mean, line_dash="dash", line_color="red",
                   annotation_text=f"μ={total_mean:.2f}", row=2, col=1)
fig_goals.add_vline(x=0, line_dash="dash", line_color="red",
                   annotation_text="Even", row=2, col=2)

fig_goals.update_layout(
    height=600,
    title_text="🥅 Goal Scoring Patterns Analysis",
    showlegend=False
)

fig_goals.show()

# Print some interesting stats
print(f"📊 GOAL SCORING INSIGHTS:")
print(f"• Average goals per match: {df['TotalGoals'].mean():.2f}")
print(f"• Home advantage: {df['FTHG'].mean() - df['FTAG'].mean():.2f} goals")
print(f"• Most common score: {df['TotalGoals'].mode().iloc[0]} goals")
print(f"• High-scoring matches (4+ goals): {(df['TotalGoals'] >= 4).mean()*100:.1f}%")
print(f"• Low-scoring matches (0-1 goals): {(df['TotalGoals'] <= 1).mean()*100:.1f}%")

📊 GOAL SCORING INSIGHTS:
• Average goals per match: 2.92
• Home advantage: 0.27 goals
• Most common score: 2 goals
• High-scoring matches (4+ goals): 34.0%
• Low-scoring matches (0-1 goals): 20.6%


### 7. Team Style Analysis - Defensive vs Offensive

In [45]:
# Classify teams by playing style
team_styles = []

for team in df['HomeTeam'].unique():
    # Home matches
    home_matches = df[df['HomeTeam'] == team]
    home_goals_scored = home_matches['FTHG'].sum()
    home_goals_conceded = home_matches['FTAG'].sum()
    home_matches_count = len(home_matches)
    
    # Away matches
    away_matches = df[df['AwayTeam'] == team]
    away_goals_scored = away_matches['FTAG'].sum()
    away_goals_conceded = away_matches['FTHG'].sum()
    away_matches_count = len(away_matches)
    
    total_matches = home_matches_count + away_matches_count
    
    if total_matches >= 30:  # Minimum matches for reliable analysis
        # Calculate averages
        goals_scored_per_game = (home_goals_scored + away_goals_scored) / total_matches
        goals_conceded_per_game = (home_goals_conceded + away_goals_conceded) / total_matches
        
        # Calculate style metrics
        attacking_power = goals_scored_per_game
        defensive_power = 3.0 - goals_conceded_per_game  # Higher is better defense
        
        # Calculate win rate for bubble size
        home_wins = len(home_matches[home_matches['FTR'] == 'H'])
        away_wins = len(away_matches[away_matches['FTR'] == 'A'])
        win_rate = (home_wins + away_wins) / total_matches * 100
        
        # Classify style
        if attacking_power > 1.3 and defensive_power > 1.7:
            style = "Balanced Strong"
        elif attacking_power > 1.4:
            style = "Attacking"
        elif defensive_power > 2.0:
            style = "Defensive"
        elif attacking_power < 1.0 and defensive_power < 1.5:
            style = "Struggling"
        else:
            style = "Balanced"
        
        team_styles.append({
            'Team': team,
            'Goals_Scored_Per_Game': goals_scored_per_game,
            'Goals_Conceded_Per_Game': goals_conceded_per_game,
            'Attacking_Power': attacking_power,
            'Defensive_Power': defensive_power,
            'Win_Rate': win_rate,
            'Total_Matches': total_matches,
            'Style': style
        })

style_df = pd.DataFrame(team_styles)

# Create team style scatter plot
fig_style = px.scatter(
    style_df,
    x='Attacking_Power',
    y='Defensive_Power',
    color='Style',
    size='Win_Rate',
    hover_name='Team',
    title='⚔️ Team Playing Styles: Attack vs Defense',
    labels={
        'Attacking_Power': 'Attacking Power (Goals/Game)',
        'Defensive_Power': 'Defensive Power (3 - Goals Conceded/Game)',
        'Win_Rate': 'Win Rate (%)'
    },
    color_discrete_map={
        'Attacking': '#FF4444',
        'Defensive': '#4444FF', 
        'Balanced': '#44FF44',
        'Balanced Strong': '#FFD700',
        'Struggling': '#888888'
    },
    size_max=30
)

# Add quadrant lines
fig_style.add_hline(y=style_df['Defensive_Power'].mean(), line_dash="dot", 
                   line_color="gray", annotation_text="Avg Defense")
fig_style.add_vline(x=style_df['Attacking_Power'].mean(), line_dash="dot", 
                   line_color="gray", annotation_text="Avg Attack")

# Add style zone annotations
fig_style.add_annotation(x=0.7, y=2.3, text="🛡️ Defensive<br>Teams", 
                        showarrow=False, bgcolor="rgba(68,68,255,0.1)")
fig_style.add_annotation(x=1.8, y=1.2, text="⚔️ Attacking<br>Teams", 
                        showarrow=False, bgcolor="rgba(255,68,68,0.1)")
fig_style.add_annotation(x=1.6, y=2.2, text="👑 Elite<br>Teams", 
                        showarrow=False, bgcolor="rgba(255,215,0,0.1)")

fig_style.update_layout(
    height=600,
    showlegend=True,
    legend=dict(title="Playing Style")
)

fig_style.show()

# Print style summary
print("🎯 TEAM STYLE BREAKDOWN:")
for style in style_df['Style'].unique():
    count = len(style_df[style_df['Style'] == style])
    avg_wins = style_df[style_df['Style'] == style]['Win_Rate'].mean()
    print(f"• {style}: {count} teams (avg {avg_wins:.1f}% win rate)")

🎯 TEAM STYLE BREAKDOWN:
• Balanced: 11 teams (avg 28.5% win rate)
• Balanced Strong: 5 teams (avg 54.5% win rate)
• Attacking: 5 teams (avg 36.5% win rate)
• Struggling: 1 teams (avg 19.1% win rate)


### 8. Upset Analysis - David vs Goliath

In [46]:
# Calculate team strength based on overall performance
team_strength = {}

for team in df['HomeTeam'].unique():
    home_matches = df[df['HomeTeam'] == team]
    away_matches = df[df['AwayTeam'] == team]
    
    # Calculate total points
    home_points = home_matches['HomePoints'].sum()
    away_points = away_matches.apply(
        lambda x: 3 if x['FTR'] == 'A' else (1 if x['FTR'] == 'D' else 0), axis=1).sum()
    
    total_matches = len(home_matches) + len(away_matches)
    if total_matches > 0:
        points_per_game = (home_points + away_points) / total_matches
        team_strength[team] = points_per_game

# Find upsets (weak team beats strong team)
upsets = []
for _, match in df.iterrows():
    home_team = match['HomeTeam']
    away_team = match['AwayTeam']
    
    if home_team in team_strength and away_team in team_strength:
        home_strength = team_strength[home_team]
        away_strength = team_strength[away_team]
        strength_diff = abs(home_strength - away_strength)
        
        # Define upset conditions
        is_upset = False
        upset_type = ""
        
        if match['FTR'] == 'H' and away_strength > home_strength + 0.5:
            is_upset = True
            upset_type = "Home Upset"
        elif match['FTR'] == 'A' and home_strength > away_strength + 0.5:
            is_upset = True
            upset_type = "Away Upset"
        
        upsets.append({
            'Date': match['Date'],
            'HomeTeam': home_team,
            'AwayTeam': away_team,
            'Score': f"{match['FTHG']}-{match['FTAG']}",
            'Result': match['FTR'],
            'Home_Strength': home_strength,
            'Away_Strength': away_strength,
            'Strength_Difference': strength_diff,
            'Is_Upset': is_upset,
            'Upset_Type': upset_type,
            'Upset_Magnitude': strength_diff if is_upset else 0
        })

upset_df = pd.DataFrame(upsets)
major_upsets = upset_df[upset_df['Is_Upset'] == True].sort_values('Upset_Magnitude', ascending=False)

# Create upset timeline
fig_upsets = px.scatter(
    major_upsets.head(50),  # Top 50 biggest upsets
    x='Date',
    y='Upset_Magnitude',
    color='Upset_Type',
    size='Upset_Magnitude',
    hover_data=['HomeTeam', 'AwayTeam', 'Score'],
    title='🎯 Major Upsets Timeline - When David Beat Goliath',
    labels={
        'Upset_Magnitude': 'Upset Magnitude (Strength Difference)',
        'Date': 'Date'
    },
    color_discrete_map={
        'Home Upset': '#2E8B57',
        'Away Upset': '#DC143C'
    }
)

fig_upsets.update_layout(
    height=500,
    showlegend=True
)

fig_upsets.show()

# Create upset frequency by team
upset_victims = major_upsets['HomeTeam'].value_counts() + major_upsets['AwayTeam'].value_counts()
upset_causers = []

for team in df['HomeTeam'].unique():
    home_upsets = len(major_upsets[(major_upsets['HomeTeam'] == team) & (major_upsets['Result'] == 'H')])
    away_upsets = len(major_upsets[(major_upsets['AwayTeam'] == team) & (major_upsets['Result'] == 'A')])
    total_upsets = home_upsets + away_upsets
    
    upsets_suffered = len(major_upsets[(major_upsets['HomeTeam'] == team) & (major_upsets['Result'] == 'A')]) + len(major_upsets[(major_upsets['AwayTeam'] == team) & (major_upsets['Result'] == 'H')])
    
    if total_upsets > 0 or upsets_suffered > 0:
        upset_causers.append({
            'Team': team,
            'Upsets_Caused': total_upsets,
            'Upsets_Suffered': upsets_suffered,
            'Net_Upsets': total_upsets - upsets_suffered,
            'Team_Strength': team_strength.get(team, 0)
        })

upset_teams_df = pd.DataFrame(upset_causers).sort_values('Net_Upsets', ascending=False)

# Top upset causers and victims
fig_upset_teams = px.bar(
    upset_teams_df.head(15),
    x='Net_Upsets',
    y='Team',
    orientation='h',
    color='Net_Upsets',
    color_continuous_scale='RdYlGn',
    title='🎭 Giant Killers vs Upset Victims',
    labels={'Net_Upsets': 'Net Upsets (Caused - Suffered)'}
)

fig_upset_teams.update_layout(height=500)
fig_upset_teams.show()

print(f"📈 UPSET ANALYSIS:")
print(f"• Total major upsets identified: {len(major_upsets)}")
print(f"• Biggest upset magnitude: {major_upsets['Upset_Magnitude'].max():.2f}")
print(f"• Most upset-prone team: {upset_teams_df.tail(1)['Team'].iloc[0]} ({upset_teams_df.tail(1)['Upsets_Suffered'].iloc[0]} suffered)")
print(f"• Best giant killer: {upset_teams_df.head(1)['Team'].iloc[0]} ({upset_teams_df.head(1)['Upsets_Caused'].iloc[0]} caused)")

📈 UPSET ANALYSIS:
• Total major upsets identified: 79
• Biggest upset magnitude: 1.18
• Most upset-prone team: Club Brugge (14 suffered)
• Best giant killer: Kortrijk (12 caused)


### 9. Match Intensity Heatmap

In [47]:
# Create match intensity analysis based on cards, fouls, and goals
df['Match_Intensity'] = (
    df['HY'] + df['AY'] +           # Yellow cards
    (df['HR'] + df['AR']) * 3 +     # Red cards (weighted more)
    df['HF'] + df['AF'] +           # Fouls
    df['TotalGoals'] * 2            # Goals (exciting factor)
) / 2  # Normalize

# Create intensity by day of week and month
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.day_name()

# Create intensity heatmap
intensity_pivot = df.groupby(['Month', 'DayOfWeek'])['Match_Intensity'].mean().reset_index()
intensity_matrix = intensity_pivot.pivot(index='DayOfWeek', columns='Month', values='Match_Intensity')

# Reorder days of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
intensity_matrix = intensity_matrix.reindex(day_order)

fig_intensity = px.imshow(
    intensity_matrix,
    labels=dict(x="Month", y="Day of Week", color="Match Intensity"),
    title="🔥 Match Intensity Heatmap - When Football Gets Heated",
    color_continuous_scale='Reds',
    aspect="auto"
)

fig_intensity.update_layout(height=400)
fig_intensity.show()

# Season timing analysis
df['Days_From_Season_Start'] = df.groupby('Season')['Date'].transform(
    lambda x: (x - x.min()).dt.days
)

# Divide into season phases
df['Season_Phase'] = pd.cut(
    df['Days_From_Season_Start'], 
    bins=[0, 60, 120, 180, 365], 
    labels=['Early Season', 'Mid Season', 'Late Season', 'Season End'],
    include_lowest=True
)

# Analyze performance by season phase
phase_stats = df.groupby('Season_Phase').agg({
    'TotalGoals': 'mean',
    'Match_Intensity': 'mean',
    'HomePoints': 'mean',
    'HY': 'mean',
    'AY': 'mean'
}).round(2)

print("📅 SEASONAL TIMING EFFECTS:")
print(phase_stats)

📅 SEASONAL TIMING EFFECTS:
              TotalGoals  Match_Intensity  HomePoints    HY    AY
Season_Phase                                                     
Early Season        2.77            17.21        1.53  1.61  2.06
Mid Season          3.14            16.99        1.53  1.69  1.91
Late Season         2.38            18.19        1.79  1.72  1.98
Season End          2.98            17.28        1.53  1.92  2.26




