## Charts to display in the Flask frontend

### 0. Imports

In [None]:
import os

import sqlite3

import pandas as pd
from pandas.io.sql import DatabaseError

import numpy as np

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### 1. Setup and load

In [14]:
# Location of the SQLite database, relative to this notebook
db_path = '../football.db'

# Stop early with an actionable message when the database file is absent
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database '{db_path}' not found. Run db_setup.py first.")

conn = sqlite3.connect(db_path)

try:
    # Pull the whole 'matches' table into memory
    df = pd.read_sql_query("SELECT * FROM matches", conn)
except DatabaseError as e:
    # Fall back to an empty frame so that `df` is always defined
    df = pd.DataFrame()
    print(f"❌ Error loading data: {e}")
else:
    print(f"✅ Loaded {len(df)} rows from 'matches' table")
finally:
    # The connection is released whether or not the query succeeded
    conn.close()

✅ Loaded 1508 rows from 'matches' table


### 2. Data preprocessing

In [15]:
# Basic cleaning: keep only rows where both teams and the target are present
df = df.dropna(subset=['HomeTeam', 'AwayTeam', 'FTR'])

# Feature matrix and target (FTR = full-time result)
X = df[['HomeTeam', 'AwayTeam', 'HS', 'AS']]
y = df['FTR']

# Column groups routed to the two sub-pipelines below
categorical_features = ['HomeTeam', 'AwayTeam']
numerical_features = ['HS', 'AS']

# Numeric columns: fill gaps with the column mean, then standardise
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with a 'missing' label, then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Combine both sub-pipelines into a single column-wise transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("✅ Data preprocessing pipeline with imputers is ready.")


✅ Data preprocessing pipeline with imputers is ready.


### 3. Extra Stats

In [16]:
# Per-match derived columns: combined goals, home-minus-away margin, draw flag
df['TotalGoals'] = df['FTHG'].add(df['FTAG'])
df['GoalDifference'] = df['FTHG'].sub(df['FTAG'])
df['IsDraw'] = df['FTR'].eq('D')

def home_points(row):
    """Return the points earned by the home team for one match.

    `row` is any mapping-like object exposing an 'FTR' entry. An 'FTR' of
    'H' yields 3, 'D' yields 1 and any other value yields 0.
    """
    result = row['FTR']
    if result == 'H':
        return 3
    if result == 'D':
        return 1
    return 0

# Home-team points, computed in one vectorised pass over FTR instead of a
# row-by-row apply. Scoring matches home_points: 'H' -> 3, 'D' -> 1 and
# every other value -> 0 (unmapped entries become NaN, then 0).
df['HomePoints'] = df['FTR'].map({'H': 3, 'D': 1}).fillna(0).astype('int64')

# Convert Date to datetime64 so it can be grouped and used with the .dt accessor
# NOTE(review): no explicit format/dayfirst is given, so pandas infers the
# layout — confirm the stored date strings are unambiguous (e.g. ISO 8601)
df['Date'] = pd.to_datetime(df['Date'])


### 4. Charts

In [17]:
# Average goals scored by home teams on each match date
daily_goals = df.groupby('Date')['FTHG'].mean().reset_index()

# Days elapsed since the previous match date (NaN for the very first date)
daily_goals['PrevDate'] = daily_goals['Date'].shift(1)
daily_goals['Gap'] = (daily_goals['Date'] - daily_goals['PrevDate']).dt.days

# A pause longer than this many days is treated as a break between seasons
season_gap_days = 30
season_starts = daily_goals['Gap'] > season_gap_days

# Break the line across each long pause by adding a synthetic NaN point one
# day after the last match before the pause. Overwriting the first matchday
# after the pause with NaN would hide a real observation from the chart.
gap_marker_dates = pd.DatetimeIndex(
    daily_goals.loc[season_starts, 'PrevDate'] + pd.Timedelta(days=1)
)
goals_by_date = daily_goals.set_index('Date')['FTHG']
plot_data = (
    goals_by_date
    .reindex(goals_by_date.index.union(gap_marker_dates))  # marker dates get NaN
    .rename('FTHG_clean')
    .rename_axis('Date')
    .reset_index()
)

# Plot the series; the NaN markers leave a visible gap between seasons
fig = px.line(
    plot_data,
    x='Date',
    y='FTHG_clean',
    title='Avg Home Goals Over Time (With Season Gaps)',
    labels={'FTHG_clean': 'Avg Home Goals'}
)
fig.show()


In [19]:
import plotly.graph_objects as go

# Mean goals scored per team at home, ranked, keeping the ten highest
home_goals = df.groupby('HomeTeam')['FTHG'].mean().reset_index()
top_home = home_goals.sort_values(by='FTHG', ascending=False).head(10)

# Same ranking for goals scored away from home
away_goals = df.groupby('AwayTeam')['FTAG'].mean().reset_index()
top_away = away_goals.sort_values(by='FTAG', ascending=False).head(10)

# Both bar traces live in one figure; only the home trace is shown initially
fig = go.Figure(data=[
    go.Bar(
        x=top_home['HomeTeam'],
        y=top_home['FTHG'],
        name='Home Teams',
        visible=True,
    ),
    go.Bar(
        x=top_away['AwayTeam'],
        y=top_away['FTAG'],
        name='Away Teams',
        visible=False,
    ),
])

# One dropdown entry per trace: (label, trace visibility, chart title, y-axis title)
toggle_specs = (
    ('Top Home Teams', [True, False], 'Top 10 Home Teams by Avg Goals', 'Avg Home Goals'),
    ('Top Away Teams', [False, True], 'Top 10 Away Teams by Avg Goals', 'Avg Away Goals'),
)
toggle_buttons = [
    dict(
        label=button_label,
        method='update',
        args=[
            {'visible': visibility},
            {'title': chart_title, 'yaxis': {'title': axis_title}},
        ],
    )
    for button_label, visibility, chart_title, axis_title in toggle_specs
]

# Title, size and the dropdown that switches between the two traces
fig.update_layout(
    title='Top 10 Teams by Average Goals',
    height=500,
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=toggle_buttons,
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top",
        ),
    ],
)

fig.show()


In [21]:
# Build the node table for a three-level sunburst:
# team -> Home/Away -> Win/Draw/Loss
sunburst_data = []

# Every team that appears as either the home or the away side
all_teams = set(df['HomeTeam'].unique()) | set(df['AwayTeam'].unique())

# For each venue: the column holding the team name, and the FTR code that
# counts as a win / draw / loss from that side's point of view
venues = (
    ('Home', 'HomeTeam', {'Win': 'H', 'Draw': 'D', 'Loss': 'A'}),
    ('Away', 'AwayTeam', {'Win': 'A', 'Draw': 'D', 'Loss': 'H'}),
)

# Sorted iteration keeps the node order reproducible between runs
for team in sorted(all_teams, key=str):
    team_total = 0

    for venue, team_column, outcome_codes in venues:
        venue_matches = df[df[team_column] == team]
        result_counts = venue_matches['FTR'].value_counts()
        venue_id = f'{team}_{venue}'

        # Leaf nodes: one per outcome that actually occurred
        for outcome, code in outcome_codes.items():
            count = int(result_counts.get(code, 0))
            if count > 0:
                sunburst_data.append({
                    'ids': f'{venue_id}_{outcome}',
                    'labels': outcome,
                    'parents': venue_id,
                    'values': count,
                })

        # Home/Away node, sized by every match played at that venue
        sunburst_data.append({
            'ids': venue_id,
            'labels': venue,
            'parents': team,
            'values': len(venue_matches),
        })
        team_total += len(venue_matches)

    # Team root node, sized by all of the team's matches
    sunburst_data.append({'ids': team, 'labels': team, 'parents': '', 'values': team_total})

# Convert to DataFrame for easier handling
sunburst_df = pd.DataFrame(sunburst_data)

# Each parent value above already includes its children, so the chart must
# read them as totals. Under the default 'remainder' mode the parent value is
# added on top of the children's sum and the leaves no longer fill their
# parent sector.
fig_sunburst = px.sunburst(
    sunburst_df,
    ids='ids',
    names='labels',
    parents='parents',
    values='values',
    branchvalues='total',
    title='Team Performance Breakdown: Home/Away Results'
)

fig_sunburst.update_layout(
    font_size=10,
    height=700
)

fig_sunburst.show()

In [None]:
# Overall win/draw/loss record for every team, home and away combined
team_stats = []

# Every team that appears as either the home or the away side
all_teams = set(df['HomeTeam'].unique()) | set(df['AwayTeam'].unique())

for team in all_teams:
    # Full-time results of the matches this team played at home and away
    ftr_at_home = df.loc[df['HomeTeam'] == team, 'FTR']
    ftr_on_road = df.loc[df['AwayTeam'] == team, 'FTR']

    # 'H' is a win at home but a loss away; 'A' is the reverse
    wins = int((ftr_at_home == 'H').sum()) + int((ftr_on_road == 'A').sum())
    draws = int((ftr_at_home == 'D').sum()) + int((ftr_on_road == 'D').sum())
    losses = int((ftr_at_home == 'A').sum()) + int((ftr_on_road == 'H').sum())
    played = wins + draws + losses

    # Avoid dividing by zero for a team with no counted results
    if played > 0:
        win_pct = wins / played * 100
    else:
        win_pct = 0

    team_stats.append({
        'Team': team,
        'Wins': wins,
        'Draws': draws,
        'Losses': losses,
        'Total': played,
        'Win_Pct': win_pct,
    })

# Ascending order puts the best win rate at the top of a horizontal bar chart
stats_df = pd.DataFrame(team_stats).sort_values('Win_Pct', ascending=True)

# Stacked horizontal bars: one bar per team, one segment per result type
result_colors = {
    'Wins': '#2E8B57',     # Sea Green
    'Draws': '#FFD700',    # Gold
    'Losses': '#DC143C',   # Crimson
}
fig_wld = px.bar(
    stats_df,
    x=['Wins', 'Draws', 'Losses'],
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Record by Team',
    color_discrete_map=result_colors,
    labels={'value': 'Number of Matches', 'variable': 'Result'},
)

fig_wld.update_layout(
    height=600,
    xaxis_title='Number of Matches',
    yaxis_title='Teams',
    legend_title='Match Result',
)

fig_wld.show()

In [32]:
# Label every match with its season, e.g. '2019/20'.
# A season spans two calendar years, so the label cannot be taken from the
# match year alone: matches played before the season's first month belong to
# the season that began in the previous calendar year.
# NOTE(review): August is assumed as the first month of a season — confirm
# this against the competition(s) stored in the database.
season_start_month = 8
match_year = df['Date'].dt.year
season_start_year = match_year.where(df['Date'].dt.month >= season_start_month, match_year - 1)
df['Season'] = season_start_year.astype(str) + '/' + (season_start_year + 1).astype(str).str[-2:]

# Get unique seasons for dropdown options
seasons = ['All Seasons'] + sorted(df['Season'].unique().tolist())

# Calculate win/loss/draw statistics as percentages with season filter
def calculate_team_stats_by_season(selected_season):
    """Return win/draw/loss percentages per team for one season.

    Reads the module-level `df`, which must provide the columns 'HomeTeam',
    'AwayTeam', 'FTR' and 'Season'.

    Args:
        selected_season: A value of the 'Season' column, or the string
            'All Seasons' to use every row of `df`.

    Returns:
        A DataFrame with the columns 'Team', 'Win_Pct', 'Draw_Pct',
        'Loss_Pct' and 'Total', sorted by ascending 'Win_Pct'. Teams without
        any counted result are left out. When no row matches the season the
        frame is empty but still has those columns.
    """
    columns = ['Team', 'Win_Pct', 'Draw_Pct', 'Loss_Pct', 'Total']

    if selected_season == 'All Seasons':
        filtered_df = df
    else:
        filtered_df = df[df['Season'] == selected_season]

    all_teams = set(filtered_df['HomeTeam'].unique()) | set(filtered_df['AwayTeam'].unique())

    team_stats_pct = []
    # Sorted iteration plus the stable sort below makes the order of teams
    # with equal win rates reproducible (set order varies between runs)
    for team in sorted(all_teams, key=str):
        home_results = filtered_df.loc[filtered_df['HomeTeam'] == team, 'FTR']
        away_results = filtered_df.loc[filtered_df['AwayTeam'] == team, 'FTR']

        # 'H' is a win at home but a loss away; 'A' is the reverse
        total_wins = int((home_results == 'H').sum() + (away_results == 'A').sum())
        total_draws = int((home_results == 'D').sum() + (away_results == 'D').sum())
        total_losses = int((home_results == 'A').sum() + (away_results == 'H').sum())
        total_matches = total_wins + total_draws + total_losses

        # Only include teams with matches in the season
        if total_matches == 0:
            continue

        team_stats_pct.append({
            'Team': team,
            'Win_Pct': total_wins / total_matches * 100,
            'Draw_Pct': total_draws / total_matches * 100,
            'Loss_Pct': total_losses / total_matches * 100,
            'Total': total_matches,
        })

    # Explicit columns keep sort_values from raising KeyError on an empty result
    stats = pd.DataFrame(team_stats_pct, columns=columns)
    return stats.sort_values('Win_Pct', ascending=True, kind='stable')

# Percentage columns, in the order of the three stacked bar traces
pct_columns = ['Win_Pct', 'Draw_Pct', 'Loss_Pct']

# Create initial data for all seasons
initial_stats = calculate_team_stats_by_season('All Seasons')

# Create the figure with initial data (wide form: one trace per percentage column)
fig_pct = px.bar(
    initial_stats,
    x=pct_columns,
    y='Team',
    orientation='h',
    title='Win/Draw/Loss Percentage by Team - All Seasons',
    color_discrete_map={
        'Win_Pct': '#2E8B57',     # Sea Green
        'Draw_Pct': '#FFD700',    # Gold
        'Loss_Pct': '#DC143C'     # Crimson
    },
    labels={'value': 'Percentage (%)', 'variable': 'Result'}
)

# One dropdown button per season. Each button swaps the data of all three
# traces AND the chart title, which needs method='update' (trace changes
# first, layout changes second). 'restyle' can only touch traces, and `args2`
# is reserved for the second state of a toggle button, so a title placed
# there never follows the selected season.
buttons = []
for season in seasons:
    season_stats = calculate_team_stats_by_season(season)
    season_teams = season_stats['Team'].tolist()

    buttons.append(dict(
        label=season,
        method='update',
        args=[
            {
                'x': [season_stats[column].tolist() for column in pct_columns],
                'y': [season_teams] * len(pct_columns),
            },
            {'title.text': f'Win/Draw/Loss Percentage by Team - {season}'},
        ],
    ))

# Axis titles, fixed 0-100% range and the season dropdown
fig_pct.update_layout(
    height=600,
    xaxis=dict(title='Percentage (%)', range=[0, 100]),
    yaxis_title='Teams',
    legend_title='Match Result',
    updatemenus=[
        dict(
            type='dropdown',
            direction='down',
            buttons=buttons,
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.15,
            yanchor="top"
        )
    ]
)

fig_pct.show()