## Charts to display in the Flask frontend

### 0. Imports

In [2]:
import os
import sqlite3

import numpy as np
import pandas as pd
from pandas.io.sql import DatabaseError

import plotly.express as px
import plotly.graph_objects as go

from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### 1. Setup and load

In [3]:
# Location of the SQLite database, relative to this notebook
db_path = '../football.db'

# Fail fast, with a hint on how to create the database, when the file is missing
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database '{db_path}' not found. Run db_setup.py first.")

conn = sqlite3.connect(db_path)

try:
    # Pull the complete 'matches' table into memory
    df = pd.read_sql_query("SELECT * FROM matches", conn)
except DatabaseError as e:
    # Keep `df` defined (as an empty frame) and report the problem
    df = pd.DataFrame()
    print(f"❌ Error loading data: {e}")
else:
    print(f"✅ Loaded {len(df)} rows from 'matches' table")
finally:
    # The connection is released whether or not the query succeeded
    conn.close()

✅ Loaded 1508 rows from 'matches' table


### 2. Data preprocessing

In [4]:
# Basic cleaning: keep only matches where both teams and the final result are known
df = df.dropna(subset=['HomeTeam', 'AwayTeam', 'FTR'])

# Feature columns, grouped by how they are preprocessed
categorical_features = ['HomeTeam', 'AwayTeam']
numerical_features = ['HS', 'AS']

# Feature matrix (team names first, then shot counts) and target (full-time result)
X = df[categorical_features + numerical_features]
y = df['FTR']

# Numeric columns: fill gaps with the column mean, then standardize
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with a 'missing' label, then one-hot encode;
# categories unseen during fitting are ignored at transform time
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Hold out 20% of the matches for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("✅ Data preprocessing pipeline with imputers is ready.")


✅ Data preprocessing pipeline with imputers is ready.


### 3. Extra Stats

In [5]:
# Derived per-match columns built from the full-time score and result
df['TotalGoals'] = df['FTHG'].add(df['FTAG'])        # goals scored by both sides
df['GoalDifference'] = df['FTHG'].sub(df['FTAG'])    # home goals minus away goals
df['IsDraw'] = df['FTR'].eq('D')                     # True when the result code is 'D'

# Points earned by Home Team
def home_points(row):
    """Return the points the home side earns for one match.

    `row` is any mapping-like object exposing the result code under 'FTR':
    'H' yields 3 points, 'D' yields 1, and every other value yields 0.
    """
    result = row['FTR']
    if result == 'H':
        return 3
    return 1 if result == 'D' else 0

# Points earned by the home side, computed on the whole column at once instead
# of calling a Python function for every row. The mapping mirrors home_points():
# 'H' -> 3, 'D' -> 1, anything else -> 0. Unlike df.apply(..., axis=1), this
# also yields a proper (empty) Series when the frame has no rows.
df['HomePoints'] = df['FTR'].map({'H': 3, 'D': 1}).fillna(0).astype('int64')

# Convert the Date column to datetime so the .dt accessors and date arithmetic work.
# NOTE(review): no explicit format/dayfirst is given, so parsing relies on pandas'
# inference — confirm it matches how dates are stored in the database.
df['Date'] = pd.to_datetime(df['Date'])


### 4. Charts

In [6]:
# Average home goals for every match date (one row per date)
daily_goals = df.groupby('Date', as_index=False)['FTHG'].mean()

# Number of days elapsed since the previous match date
daily_goals['PrevDate'] = daily_goals['Date'].shift()
daily_goals['Gap'] = daily_goals['Date'].diff().dt.days

# Break the line across long pauses: the value on the first date after a gap of
# more than 30 days is blanked out (that point is replaced, no row is inserted)
after_long_break = daily_goals['Gap'] > 30
daily_goals['FTHG_clean'] = daily_goals['FTHG'].mask(after_long_break)

# Line chart of the gap-aware series
fig = px.line(
    data_frame=daily_goals,
    x='Date',
    y='FTHG_clean',
    labels={'FTHG_clean': 'Avg Home Goals'},
    title='Avg Home Goals Over Time (With Season Gaps)',
)
fig.show()


In [7]:
# Top 10 teams by average goals scored, viewed either as home side or as away side.
# (plotly.graph_objects is imported as `go` in the imports cell at the top.)

# Average goals scored at home, best 10 teams
home_goals = df.groupby('HomeTeam')['FTHG'].mean().reset_index()
top_home = home_goals.sort_values(by='FTHG', ascending=False).head(10)

# Average goals scored away, best 10 teams
away_goals = df.groupby('AwayTeam')['FTAG'].mean().reset_index()
top_away = away_goals.sort_values(by='FTAG', ascending=False).head(10)

# Titles / axis labels for the two views; the figure starts on the home view,
# so its initial title and y-axis label must match the first dropdown entry
home_title, home_axis = 'Top 10 Home Teams by Avg Goals', 'Avg Home Goals'
away_title, away_axis = 'Top 10 Away Teams by Avg Goals', 'Avg Away Goals'

fig = go.Figure()

# Trace 0: home teams (shown initially)
fig.add_trace(go.Bar(
    x=top_home['HomeTeam'],
    y=top_home['FTHG'],
    name='Home Teams',
    visible=True
))

# Trace 1: away teams (hidden until selected in the dropdown)
fig.add_trace(go.Bar(
    x=top_away['AwayTeam'],
    y=top_away['FTAG'],
    name='Away Teams',
    visible=False
))

# Dropdown that swaps which trace is visible and updates the texts.
# The layout part of each button addresses 'title.text' / 'yaxis.title.text'
# directly, so only the text changes and no other title/axis setting is replaced.
fig.update_layout(
    title=dict(text=home_title),
    yaxis=dict(title=dict(text=home_axis)),
    height=500,
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=[
                dict(label='Top Home Teams',
                     method='update',
                     args=[{'visible': [True, False]},
                           {'title.text': home_title,
                            'yaxis.title.text': home_axis}]),
                dict(label='Top Away Teams',
                     method='update',
                     args=[{'visible': [False, True]},
                           {'title.text': away_title,
                            'yaxis.title.text': away_axis}]),
            ],
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        ),
    ]
)

fig.show()


In [9]:
# Win/draw/loss record for every team, home and away matches combined.
# Results are counted once per (team, result) pair with a groupby instead of
# re-filtering the whole frame for each team.
RESULT_CODES = ['H', 'D', 'A']

# Rows = team, columns = result code, values = number of matches
home_record = df.groupby(['HomeTeam', 'FTR']).size().unstack(fill_value=0)
away_record = df.groupby(['AwayTeam', 'FTR']).size().unstack(fill_value=0)

# Every team that appears as home or away side, in sorted (reproducible) order
all_teams = home_record.index.union(away_record.index)

# Align both tables on the same teams and result columns; missing combinations count as 0
home_record = home_record.reindex(index=all_teams, columns=RESULT_CODES, fill_value=0)
away_record = away_record.reindex(index=all_teams, columns=RESULT_CODES, fill_value=0)

# A home side wins on 'H' and loses on 'A'; for the away side it is the reverse
stats_df = pd.DataFrame({
    'Team': all_teams.to_numpy(),
    'Wins': home_record['H'].to_numpy() + away_record['A'].to_numpy(),
    'Draws': home_record['D'].to_numpy() + away_record['D'].to_numpy(),
    'Losses': home_record['A'].to_numpy() + away_record['H'].to_numpy(),
})
stats_df['Total'] = stats_df['Wins'] + stats_df['Draws'] + stats_df['Losses']

# Win percentage; teams without any counted match get 0 instead of a 0/0 NaN
stats_df['Win_Pct'] = (stats_df['Wins'] / stats_df['Total'] * 100).where(stats_df['Total'] > 0, 0)

# Ascending sort so the best team ends up at the top of the horizontal chart;
# a stable sort keeps tied teams in alphabetical order on every run
stats_df = stats_df.sort_values('Win_Pct', ascending=True, kind='stable')

# Stacked horizontal bar chart of the absolute counts
fig_wld = px.bar(
    stats_df,
    x=['Wins', 'Draws', 'Losses'],
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Record by Team',
    color_discrete_map={
        'Wins': '#2E8B57',     # Sea Green
        'Draws': '#FFD700',    # Gold
        'Losses': '#DC143C'    # Crimson
    },
    labels={'value': 'Number of Matches', 'variable': 'Result'}
)

fig_wld.update_layout(
    height=600,
    xaxis_title='Number of Matches',
    yaxis_title='Teams',
    legend_title='Match Result'
)

fig_wld.show()

In [10]:
# Derive a season label such as '2020/21' from each match date.
# A season spans two calendar years, so the calendar year alone is not enough:
# a match played in January 2021 belongs to 2020/21, not 2021/22.
# NOTE(review): assumes seasons kick off in August or later — adjust
# SEASON_START_MONTH if the league in the database uses a different calendar.
SEASON_START_MONTH = 8

# Matches before the start month belong to the season that began the previous year
season_start_year = df['Date'].dt.year - (df['Date'].dt.month < SEASON_START_MONTH).astype(int)
df['Season'] = season_start_year.astype(str) + '/' + (season_start_year + 1).astype(str).str[-2:]

# Dropdown options: the aggregate view first, then each season in chronological order
seasons = ['All Seasons'] + sorted(df['Season'].unique().tolist())

# Calculate win/loss/draw statistics as percentages with season filter
def calculate_team_stats_by_season(selected_season, data=None):
    """Return each team's win/draw/loss percentages for one season.

    Parameters
    ----------
    selected_season : str
        A value of the 'Season' column, or 'All Seasons' to use every match.
    data : pandas.DataFrame, optional
        Matches to analyse, with 'HomeTeam', 'AwayTeam' and 'FTR' columns
        (plus 'Season' when a specific season is requested). Defaults to the
        notebook-level `df`.

    Returns
    -------
    pandas.DataFrame
        Columns 'Team', 'Win_Pct', 'Draw_Pct', 'Loss_Pct', 'Total', one row per
        team with at least one counted match, sorted by 'Win_Pct' ascending
        (ties stay in alphabetical order). Empty, with the same columns, when
        no match fits the selection.
    """
    columns = ['Team', 'Win_Pct', 'Draw_Pct', 'Loss_Pct', 'Total']

    matches = df if data is None else data
    if selected_season != 'All Seasons':
        matches = matches[matches['Season'] == selected_season]

    # Nothing to count: return an empty, correctly shaped frame instead of failing
    if matches.empty:
        return pd.DataFrame(columns=columns)

    # Rows = team, columns = result code, values = number of matches
    result_codes = ['H', 'D', 'A']
    home = matches.groupby(['HomeTeam', 'FTR']).size().unstack(fill_value=0)
    away = matches.groupby(['AwayTeam', 'FTR']).size().unstack(fill_value=0)

    # Sorted union of all teams so the output order is reproducible
    teams = home.index.union(away.index)
    home = home.reindex(index=teams, columns=result_codes, fill_value=0)
    away = away.reindex(index=teams, columns=result_codes, fill_value=0)

    # A home side wins on 'H' and loses on 'A'; for the away side it is the reverse
    wins = home['H'].to_numpy() + away['A'].to_numpy()
    draws = home['D'].to_numpy() + away['D'].to_numpy()
    losses = home['A'].to_numpy() + away['H'].to_numpy()
    total = wins + draws + losses

    # Only include teams with matches in the season (also avoids dividing by zero)
    played = total > 0
    team_stats_pct = pd.DataFrame({
        'Team': teams.to_numpy()[played],
        'Win_Pct': wins[played] / total[played] * 100,
        'Draw_Pct': draws[played] / total[played] * 100,
        'Loss_Pct': losses[played] / total[played] * 100,
        'Total': total[played],
    }, columns=columns)

    return team_stats_pct.sort_values('Win_Pct', ascending=True, kind='stable')

# Stats for every dropdown entry, computed once per season ('All Seasons' included)
stats_by_season = {season: calculate_team_stats_by_season(season) for season in seasons}

# The figure starts on the aggregate view
initial_stats = stats_by_season['All Seasons']

# Stacked horizontal bars; one trace each for win, draw and loss percentage
fig_pct = px.bar(
    initial_stats,
    x=['Win_Pct', 'Draw_Pct', 'Loss_Pct'],
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Percentage by Team - All Seasons',
    color_discrete_map={
        'Win_Pct': '#2E8B57',     # Sea Green
        'Draw_Pct': '#FFD700',    # Gold
        'Loss_Pct': '#DC143C'     # Crimson
    },
    labels={'value': 'Percentage (%)', 'variable': 'Result'}
)

# One dropdown button per season.
# method='update' takes [data changes, layout changes]: the first dict swaps the
# x/y data of the three traces, the second one updates the figure title.
# (`args2` is only used when an already-active button is clicked again, so a
# title placed there never changes on a normal season switch.)
buttons = [
    dict(
        label=season,
        method='update',
        args=[
            {
                'x': [season_stats['Win_Pct'].tolist(),
                      season_stats['Draw_Pct'].tolist(),
                      season_stats['Loss_Pct'].tolist()],
                'y': [season_stats['Team'].tolist()] * 3
            },
            {
                'title.text': f'Win/Draw/Loss Percentage by Team - {season}'
            }
        ]
    )
    for season, season_stats in stats_by_season.items()
]

# Axis titles, fixed 0-100% range and the season dropdown
fig_pct.update_layout(
    height=600,
    xaxis=dict(title=dict(text='Percentage (%)'), range=[0, 100]),
    yaxis_title='Teams',
    legend_title='Match Result',
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=buttons,
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )
    ]
)

fig_pct.show()

In [20]:
# Home shots vs. home goals, one bubble per (shots, goals, result) combination.
# Bubble size encodes how many matches share that combination.
df_counts = (
    df.groupby(['HS', 'FTHG', 'FTR'])
      .size()
      .reset_index(name='count')
)

# Display names for the plotted columns
axis_labels = {
    'HS': 'Home Shots',
    'FTHG': 'Home Goals',
    'FTR': 'Match Result',
    'count': 'Number of matches'
}

# One colour per result code, from the home side's point of view
result_colors = {
    'H': '#2E8B57',  # Green for home wins
    'D': '#FFD700',  # Gold for draws
    'A': '#DC143C'   # Red for home losses
}

fig_shots_goals = px.scatter(
    data_frame=df_counts,
    x='HS',
    y='FTHG',
    color='FTR',
    size='count',        # bubble area follows the number of matches
    size_max=20,         # cap on the bubble size
    hover_data=['count'],
    labels=axis_labels,
    color_discrete_map=result_colors,
    title='Home Team: Shots vs Goals Scored (Dot size = frequency)'
)

fig_shots_goals.update_layout(height=500, showlegend=True)

fig_shots_goals.show()


In [26]:
# Pairwise correlations between the numeric match statistics
numerical_cols = ['FTHG', 'FTAG', 'HTHG', 'HTAG', 'HS', 'AS', 'HST', 'AST',
                  'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
                  'TotalGoals', 'GoalDifference', 'HomePoints']

correlation_matrix = df[numerical_cols].corr()

# Human-readable names for the column abbreviations (used in the hover box)
abbreviation_explanations = {
    'FTHG': 'Full Time Home Goals',
    'FTAG': 'Full Time Away Goals',
    'HTHG': 'Half Time Home Goals',
    'HTAG': 'Half Time Away Goals',
    'HS': 'Home Shots',
    'AS': 'Away Shots',
    'HST': 'Home Shots on Target',
    'AST': 'Away Shots on Target',
    'HF': 'Home Fouls',
    'AF': 'Away Fouls',
    'HC': 'Home Corners',
    'AC': 'Away Corners',
    'HY': 'Home Yellow Cards',
    'AY': 'Away Yellow Cards',
    'HR': 'Home Red Cards',
    'AR': 'Away Red Cards',
    'TotalGoals': 'Total Goals in Match',
    'GoalDifference': 'Home Goals - Away Goals',
    'HomePoints': 'Points Earned by Home Team'
}

# Correlation of every other column with HomePoints (self-correlation removed)
homepoints_corr = correlation_matrix['HomePoints'].drop('HomePoints')

# Rank by absolute strength, but keep the signed value for plotting
homepoints_corr_sorted = homepoints_corr.abs().sort_values(ascending=False)
top_correlations = homepoints_corr_sorted.head(10)
top_corr_values = homepoints_corr[top_correlations.index]

# Full feature names, shown as the bold first line of each hover box
feature_names = [abbreviation_explanations.get(feature, feature)
                 for feature in top_corr_values.index]

# Horizontal bars coloured on a diverging scale centred at 0
fig_homepoints = px.bar(
    x=top_corr_values.values,
    y=top_corr_values.index,
    orientation='h',
    title='Top 10 Features Most Correlated with Home Points',
    labels={'x': 'Correlation Coefficient', 'y': 'Features'},
    color=top_corr_values.values,
    color_continuous_scale='RdBu_r',
    color_continuous_midpoint=0,
    hover_name=feature_names
)

# Value labels next to the bars plus a compact hover box.
# %{hovertext} is the feature name supplied through hover_name above; the
# correlation is printed once by the template itself (the hover text is not
# overridden with a string that already contains the correlation).
fig_homepoints.update_traces(
    text=[f'{val:.3f}' for val in top_corr_values.values],
    textposition='outside',
    hovertemplate='<b>%{hovertext}</b><br>' +
                  'Correlation: %{x:.3f}<br>' +
                  '<extra></extra>'
)

fig_homepoints.update_layout(
    height=500,
    showlegend=False,
    coloraxis_showscale=True,
    coloraxis_colorbar_title="Correlation"
)

fig_homepoints.show()


In [30]:
# Per-team discipline and success figures, home and away matches combined
team_cards_wins = []

for team in df['HomeTeam'].unique():
    home_matches = df[df['HomeTeam'] == team]
    away_matches = df[df['AwayTeam'] == team]

    # Only include teams with sufficient matches (more than 20)
    total_matches = len(home_matches) + len(away_matches)
    if total_matches <= 20:
        continue

    # Wins: result 'H' when playing at home, 'A' when playing away
    total_wins = home_matches['FTR'].eq('H').sum() + away_matches['FTR'].eq('A').sum()

    # Cards received by this team in either role
    total_yellow = home_matches['HY'].sum() + away_matches['AY'].sum()
    total_red = home_matches['HR'].sum() + away_matches['AR'].sum()

    # Weighted card count: a red card counts twice as much as a yellow
    total_cards = total_yellow + (total_red * 2)

    team_cards_wins.append({
        'Team': team,
        'Win_Rate': (total_wins / total_matches) * 100,
        'Cards_Per_Match': total_cards / total_matches,
        'Yellow_Per_Match': total_yellow / total_matches,
        'Red_Per_Match': total_red / total_matches,
        'Total_Matches': total_matches,
        # Discipline penalty score: yellow weighted 1, red weighted 5
        'Discipline_Score': (total_yellow * 1 + total_red * 5) / total_matches
    })

cards_df = pd.DataFrame(team_cards_wins)

# Bubble chart: weighted cards per match (x) against win rate (y);
# bubble size = matches played, colour = discipline penalty score
fig_cards = px.scatter(
    cards_df,
    x='Cards_Per_Match',
    y='Win_Rate',
    size='Total_Matches',
    color='Discipline_Score',
    hover_name='Team',
    title='🟨🟥 The Card Penalty Effect: How Discipline Affects Team Success',
    labels={
        'Cards_Per_Match': 'Total Cards per Match (Yellow + 2×Red)',
        'Win_Rate': 'Win Rate (%)',
        'Discipline_Score': 'Discipline Penalty Score',
        'Total_Matches': 'Total Matches'
    },
    color_continuous_scale='Reds',
    size_max=25
)

# Least-squares trend line of win rate against cards per match
# (`stats` is scipy.stats, imported in the imports cell at the top)
trend = stats.linregress(cards_df['Cards_Per_Match'], cards_df['Win_Rate'])
slope, intercept, r_value = trend.slope, trend.intercept, trend.rvalue
line_x = np.linspace(cards_df['Cards_Per_Match'].min(), cards_df['Cards_Per_Match'].max(), 100)
line_y = slope * line_x + intercept

# The hover reports R² (the squared correlation coefficient) and labels it as
# such; the trace is kept out of the legend so no blank legend entry is drawn
fig_cards.add_trace(go.Scatter(
    x=line_x,
    y=line_y,
    mode='lines',
    name='Trend line',
    showlegend=False,
    line=dict(color='darkblue', width=2, dash='dash'),
    hovertemplate='Trend line (%{text})<extra></extra>',
    text=[f'R² = {r_value**2:.3f}'] * len(line_x)
))

# Layout: background, fonts and an explanatory note in the top-left corner
fig_cards.update_layout(
    height=600,
    showlegend=True,
    plot_bgcolor='rgba(240,240,240,0.8)',
    font=dict(size=12),
    title_font_size=16,
    annotations=[
        dict(
            x=0.02, y=0.98,
            xref='paper', yref='paper',
            text='💡 Bubble size = Total matches played<br>🔴 Color intensity = Disciplinary issues',
            showarrow=False,
            font=dict(size=10),
            bgcolor='rgba(255,255,255,0.8)',
            bordercolor='gray',
            borderwidth=1
        )
    ]
)

# Quadrant guides at the average win rate and average cards per match
mean_cards = cards_df['Cards_Per_Match'].mean()
mean_wins = cards_df['Win_Rate'].mean()

fig_cards.add_hline(y=mean_wins, line_dash="dot", line_color="gray",
                   annotation_text=f"Avg Win Rate: {mean_wins:.1f}%")
fig_cards.add_vline(x=mean_cards, line_dash="dot", line_color="gray",
                   annotation_text=f"Avg Cards: {mean_cards:.1f}")

# Hover box for the team bubbles only (the selector skips the 'lines' trend trace)
fig_cards.update_traces(
    hovertemplate='<b>%{hovertext}</b><br>' +
                  'Win Rate: %{y:.1f}%<br>' +
                  'Cards per Match: %{x:.2f}<br>' +
                  'Total Matches: %{marker.size}<br>' +
                  'Discipline Score: %{marker.color:.2f}<br>' +
                  '<extra></extra>',
    selector=dict(mode='markers')
)

fig_cards.show()