We are running sqlite3 through Python to query our SQLite database. Because of the size of the database and the constraints of running queries this way, we break the work into multiple queries and write several CSV files. Once the CSVs are created, we use Python to assemble the training set from them.

All 11 Home Players Individually

In [1]:
import sqlite3
import csv

# Export one CSV per home-player slot for every match with league_id 21518.
# Each CSV holds the match identifiers, both team ids, the player's name and
# the player's most recent overall_rating / potential dated on or before the
# match date.
conn = sqlite3.connect('database.sqlite')
try:
    cursor = conn.cursor()

    # All 11 slots. The loop previously started at 4, which left
    # Home_Player_1..3.csv unwritten; re-running a slot is safe because the
    # view is rebuilt and the CSV is overwritten.
    for i in range(1, 12):
        view_name = f"PlayerV9_{i}"

        # Dynamic view name and player references in the SQL query
        create_view_query = f"""
CREATE VIEW {view_name} AS
WITH MatchDates AS (
    SELECT 
        m.id,
        m.match_api_id,
        m.date AS match_date,
        m.home_player_{i},
        m.home_team_api_id,
        m.away_team_api_id
    FROM Match m
    WHERE m.league_id = 21518
)
SELECT 
    md.id,
    md.match_api_id,
    md.match_date,
    md.home_team_api_id,
    md.away_team_api_id,
    p.player_name AS home_player_{i}_name,
    (
        SELECT pa.overall_rating
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS home_player_{i}_rating,
    (
        SELECT pa.potential
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS home_player_{i}_potential
FROM 
    MatchDates md
LEFT JOIN Player p ON md.home_player_{i} = p.player_api_id;
"""

        # Really "create or replace": with CREATE VIEW IF NOT EXISTS an older
        # view of the same name would be reused silently even after the SQL
        # above had been edited.
        cursor.execute(f"DROP VIEW IF EXISTS {view_name}")
        cursor.execute(create_view_query)
        conn.commit()  # Commit the view creation to the database

        # Query the view to fetch data
        cursor.execute(f"SELECT * FROM {view_name}")

        # Column headers are available as soon as the SELECT has been executed
        headers = [description[0] for description in cursor.description]

        # Write data to a CSV file for the current player
        with open(f'Home_Player_{i}.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)  # Write the headers
            # Stream rows straight from the cursor instead of holding the
            # whole result set in memory first.
            writer.writerows(cursor)
finally:
    # Close the connection even if a query or file write fails
    conn.close()


Next 11 Away Players Independently

In [1]:
import sqlite3
import csv

# Export one CSV per away-player slot for every match with league_id 21518.
# Each CSV holds the match identifiers, both team ids, the player's name and
# the player's most recent overall_rating / potential dated on or before the
# match date.
conn = sqlite3.connect('database.sqlite')
try:
    cursor = conn.cursor()

    # All 11 slots. The loop previously started at 10, which left
    # Away_Player_1..9.csv unwritten; re-running a slot is safe because the
    # view is rebuilt and the CSV is overwritten.
    for i in range(1, 12):
        view_name = f"PlayerV13_{i}"

        # Dynamic view name and player references in the SQL query
        create_view_query = f"""
CREATE VIEW {view_name} AS
WITH MatchDates AS (
    SELECT 
        m.id,
        m.match_api_id,
        m.date AS match_date,
        m.away_player_{i},
        m.home_team_api_id,
        m.away_team_api_id
    FROM Match m
    WHERE m.league_id = 21518
)
SELECT 
    md.id,
    md.match_api_id,
    md.match_date,
    md.home_team_api_id,
    md.away_team_api_id,
    p.player_name AS away_player_{i}_name,
    (
        SELECT pa.overall_rating
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS away_player_{i}_rating,
    (
        SELECT pa.potential
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS away_player_{i}_potential
FROM 
    MatchDates md
LEFT JOIN Player p ON md.away_player_{i} = p.player_api_id;
"""

        # Really "create or replace": with CREATE VIEW IF NOT EXISTS an older
        # view of the same name would be reused silently even after the SQL
        # above had been edited.
        cursor.execute(f"DROP VIEW IF EXISTS {view_name}")
        cursor.execute(create_view_query)
        conn.commit()  # Commit the view creation to the database

        # Query the view to fetch data
        cursor.execute(f"SELECT * FROM {view_name}")

        # Column headers are available as soon as the SELECT has been executed
        headers = [description[0] for description in cursor.description]

        # Write data to a CSV file for the current player
        with open(f'Away_Player_{i}.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)  # Write the headers
            # Stream rows straight from the cursor instead of holding the
            # whole result set in memory first.
            writer.writerows(cursor)
finally:
    # Close the connection even if a query or file write fails
    conn.close()


Once all of the individual csvs are created I then want to combine them to start a new csv that contains all of the applicable information.

In [None]:
import sqlite3
import csv

# Export one CSV per home-player slot, restricted to matches in which team
# 8633 is either the home or the away side. Each CSV holds the match
# identifiers, the player's name and the player's most recent
# overall_rating / potential dated on or before the match date.
#
# NOTE(review): the output names (Home_Player_{i}.csv) are the same ones the
# league-wide home-player export writes, so whichever cell runs last wins --
# confirm that overwriting is intended.
conn = sqlite3.connect('database.sqlite')
try:
    cursor = conn.cursor()

    for i in range(1, 12):  # Home player slots 1 to 11
        view_name = f"PlayerV6_{i}"

        # Dynamic view name and player references in the SQL query
        create_view_query = f"""
CREATE VIEW {view_name} AS
WITH MatchDates AS (
    SELECT 
        m.id,
        m.match_api_id,
        m.date AS match_date,
        m.home_player_{i},
        m.home_team_api_id,
        m.away_team_api_id
    FROM Match m
    WHERE m.home_team_api_id = 8633 OR m.away_team_api_id = 8633
)
SELECT 
    md.id,
    md.match_api_id,
    md.match_date,
    p.player_name AS home_player_{i}_name,
    (
        SELECT pa.overall_rating
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS home_player_{i}_rating,
    (
        SELECT pa.potential
        FROM Player_Attributes pa
        WHERE pa.player_fifa_api_id = p.player_fifa_api_id
        AND pa.date <= md.match_date
        ORDER BY pa.date DESC
        LIMIT 1
    ) AS home_player_{i}_potential
FROM 
    MatchDates md
LEFT JOIN Player p ON md.home_player_{i} = p.player_api_id;
"""

        # Really "create or replace": with CREATE VIEW IF NOT EXISTS an older
        # view of the same name would be reused silently even after the SQL
        # above had been edited.
        cursor.execute(f"DROP VIEW IF EXISTS {view_name}")
        cursor.execute(create_view_query)
        conn.commit()  # Commit the view creation to the database

        # Query the view to fetch data
        cursor.execute(f"SELECT * FROM {view_name}")

        # Column headers are available as soon as the SELECT has been executed
        headers = [description[0] for description in cursor.description]

        # Write data to a CSV file for the current player
        with open(f'Home_Player_{i}.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(headers)  # Write the headers
            # Stream rows straight from the cursor instead of holding the
            # whole result set in memory first.
            writer.writerows(cursor)
finally:
    # Close the connection even if a query or file write fails
    conn.close()


In [3]:
import pandas as pd

# Read each player's CSV file into its own DataFrame
dfs = [pd.read_csv(f'Home_Player_{i}.csv') for i in range(1, 12)]

# Every per-player file repeats the match-level columns (id, match_api_id,
# match_date, ...). Join on all columns shared by every file so each of them
# appears exactly once in the result; joining on only match_api_id and
# match_date left a trail of suffixed duplicates (id_1, id_2, id_3, ...).
key_columns = [
    col for col in dfs[0].columns
    if all(col in other.columns for other in dfs[1:])
]

# Fold the remaining DataFrames into the first one, one player at a time
merged_df = dfs[0]

for i, df in enumerate(dfs[1:], start=2):
    merged_df = pd.merge(
        merged_df,
        df,
        on=key_columns,
        how='outer',
        # Only used if two files still share a non-key column name
        suffixes=(f'_{i-1}', f'_{i}')
    )

# Save the merged DataFrame
merged_df.to_csv('all_players_combined.csv', index=False)

I now have all of the players' names, ratings, and potential combined in one file. Next I need to find the remaining match information and add it.

In [None]:
import sqlite3
import csv

def export_matches_with_team_names(team_api_id=8633, db_path='database.sqlite',
                                   csv_file='matches_with_team_names.csv'):
    """Export match rows, with both teams' long names, to a CSV file.

    Selects the basic match columns, the home/away goals and the bookmaker
    odds columns from ``Match``, joined twice to ``Team`` to resolve the home
    and away team names. The XML event columns (goal, shoton, shotoff,
    foulcommit, card, cross, corner, possession) are not exported.

    Args:
        team_api_id: Keep only matches in which this team is the home or the
            away side. Pass ``None`` to export every match.
        db_path: Path of the SQLite database file.
        csv_file: Path of the CSV file to write (overwritten if it exists).

    Raises:
        sqlite3.Error: If the database cannot be queried.
    """
    # SQL query with team name joins
    query = """
    SELECT 
        m.id,
        m.country_id,
        m.league_id,
        m.season,
        m.stage,
        m.date,
        m.match_api_id,
        m.home_team_api_id,
        home_team.team_long_name AS home_team_name,
        m.away_team_api_id,
        away_team.team_long_name AS away_team_name,
        m.home_team_goal,
        m.away_team_goal,
        m.B365H,
        m.B365D, m.B365A, m.BWH, m.BWD, m.BWA, m.IWH, m.IWD, m.IWA,
        m.LBH, m.LBD, m.LBA, m.PSH, m.PSD, m.PSA, m.WHH, m.WHD, m.WHA,
        m.SJH, m.SJD, m.SJA, m.VCH, m.VCD, m.VCA, m.GBH, m.GBD, m.GBA,
        m.BSH, m.BSD, m.BSA
    FROM 
        Match AS m
    LEFT JOIN Team AS home_team ON m.home_team_api_id = home_team.team_api_id
    LEFT JOIN Team AS away_team ON m.away_team_api_id = away_team.team_api_id
    """

    # The query used to end in a bare WHERE, which SQLite rejects. The filter
    # is now explicit and bound as a parameter rather than formatted in.
    params = ()
    if team_api_id is not None:
        query += "WHERE m.home_team_api_id = ? OR m.away_team_api_id = ?"
        params = (team_api_id, team_api_id)

    # Connect to the SQLite database; close it even if the query fails
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(query, params)

        # Get all rows and column names
        rows = cursor.fetchall()
        column_names = [description[0] for description in cursor.description]
    finally:
        conn.close()

    # Write to CSV
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(column_names)  # Write header
        writer.writerows(rows)  # Write all rows

    print(f"Successfully exported {len(rows)} matches to {csv_file}")


if __name__ == '__main__':
    export_matches_with_team_names()

Successfully exported 304 matches to matches_with_team_names.csv


I am having trouble with the XML columns and am unsure how necessary they will be, so for now I will not use them.

In [7]:
import csv
import xml.etree.ElementTree as ET
from collections import defaultdict
import pandas as pd

def parse_xml_columns(row):
    """Tally per-player event counts from the XML event columns of one match.

    Args:
        row: A mapping-like object with a ``get(key, default)`` method (a dict
            or a pandas Series). The columns ``goal``, ``shoton``,
            ``shotoff``, ``foulcommit``, ``card``, ``cross`` and ``corner``
            are read; each is expected to hold an XML fragment whose
            ``<value>`` children describe single events.

    Returns:
        A dict mapping player id (the text of ``<player1>``, or of
        ``<player2>`` for assists) to a dict of counters: goals, assists,
        yellow_cards, red_cards, shots_on, shots_off, fouls_committed,
        crosses, corners. Cells that are missing, empty, not strings (for
        example float NaN from pandas), the string ``nan``, or not
        well-formed XML contribute nothing.
    """
    stat_names = (
        'goals', 'assists', 'yellow_cards', 'red_cards', 'shots_on',
        'shots_off', 'fouls_committed', 'crosses', 'corners',
    )
    # Event tags that simply add one to a single counter for <player1>
    counter_for_tag = {
        'shoton': 'shots_on',
        'shotoff': 'shots_off',
        'foulcommit': 'fouls_committed',
        'cross': 'crosses',
        'corner': 'corners',
    }
    # Only the card types 'y' and 'r' are counted; any other value is ignored
    counter_for_card = {'y': 'yellow_cards', 'r': 'red_cards'}

    player_stats = defaultdict(lambda: dict.fromkeys(stat_names, 0))

    xml_columns = ['goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner']

    for col in xml_columns:
        xml_data = row.get(col, '')
        # pandas represents an empty cell as float NaN, which is truthy and
        # has no .lower(); treat anything that is not a non-empty string as
        # "no data" instead of crashing.
        if not isinstance(xml_data, str) or not xml_data or xml_data.lower() == 'nan':
            continue

        try:
            # Wrap in a synthetic root so a cell with several top-level
            # elements still parses as one document.
            root = ET.fromstring(f"<root>{xml_data}</root>")
        except ET.ParseError:
            continue

        for event in root:
            for value in event.findall('value'):
                player1 = value.find('player1')
                player_id = player1.text if player1 is not None else None

                if event.tag == 'goal':
                    if player_id:
                        player_stats[player_id]['goals'] += 1
                    player2 = value.find('player2')
                    # An empty <player2/> has no text; skip it rather than
                    # creating a bogus player keyed by None.
                    if player2 is not None and player2.text:
                        player_stats[player2.text]['assists'] += 1

                elif event.tag == 'card':
                    card_type = value.find('card_type')
                    if card_type is not None and player_id:
                        counter = counter_for_card.get(card_type.text)
                        if counter is not None:
                            player_stats[player_id][counter] += 1

                elif event.tag in counter_for_tag and player_id:
                    player_stats[player_id][counter_for_tag[event.tag]] += 1

    return dict(player_stats)

def process_csv(input_file, output_file):
    """
    Expand a match CSV into one output row per (match, player) pair.

    Each input row is handed to parse_xml_columns; every player it reports
    yields one output row made of the match's basic fields, the player id and
    that player's counters. Matches for which no player is reported produce
    no output rows. The result is written to output_file without an index.
    """
    matches = pd.read_csv(input_file)

    # Basic (non-XML) match fields, in output order. The two team-name fields
    # fall back to '' when the column is absent; the rest must be present.
    field_order = (
        'id', 'match_api_id', 'date',
        'home_team_api_id', 'home_team_name',
        'away_team_api_id', 'away_team_name',
        'home_team_goal', 'away_team_goal',
    )
    optional_fields = ('home_team_name', 'away_team_name')

    records = []
    for _, match in matches.iterrows():
        base = {}
        for field in field_order:
            if field in optional_fields:
                base[field] = match.get(field, '')
            else:
                base[field] = match[field]

        # One record per player, carrying a copy of the match fields
        for player_id, counters in parse_xml_columns(match).items():
            records.append({**base, 'player_id': player_id, **counters})

    pd.DataFrame(records).to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

# Run the parser on the exported match file.
# NOTE: if the input CSV has none of the XML event columns, parse_xml_columns
# reports no players and the output file contains no data rows.
input_csv = 'matches_with_team_names.csv'
output_csv = 'matches_with_player_stats.csv'
process_csv(input_csv, output_csv)

Processed data saved to matches_with_player_stats.csv


Now I want to combine the two csv files so that I finally have a working dataset before I will then perform data augmentation to make the data more usable.

In [2]:
## First want to reduce the betting columns into one average as we do not need to have for every sportsbook. 
import pandas as pd

def odds_to_probability(odds):
    """Convert decimal odds to an implied probability (1 / odds).

    Returns None when the odds are missing (None / NaN) or not positive;
    zero odds used to raise ZeroDivisionError.
    """
    if pd.isna(odds):
        return None
    decimal_odds = float(odds)
    if decimal_odds <= 0:
        return None
    return 1 / decimal_odds

def process_betting_odds(input_file, output_file=None):
    """
    Replace the per-bookmaker odds columns with three average probabilities.

    For every bookmaker the home/draw/away decimal odds are converted to
    implied probabilities (1 / odds) and rescaled so the three sum to 1.
    The rescaled values are then averaged across bookmakers, ignoring
    bookmakers that have no usable quote for a given match, and stored as
    avg_home_prob, avg_draw_prob and avg_away_prob. The original odds columns
    are dropped.

    A bookmaker is skipped for a match when any of its three odds is missing
    or not positive. Matches with no usable bookmaker get NaN averages.

    Args:
        input_file: CSV to read; must contain all 30 odds columns.
        output_file: CSV to write. If None, input_file is overwritten.

    Raises:
        KeyError: If an odds column is missing from the input file.
    """
    df = pd.read_csv(input_file)

    # List of all bookmaker columns (home, draw, away)
    bookmakers = [
        ('B365', ['B365H', 'B365D', 'B365A']),
        ('BW', ['BWH', 'BWD', 'BWA']),
        ('IW', ['IWH', 'IWD', 'IWA']),
        ('LB', ['LBH', 'LBD', 'LBA']),
        ('PS', ['PSH', 'PSD', 'PSA']),
        ('WH', ['WHH', 'WHD', 'WHA']),
        ('SJ', ['SJH', 'SJD', 'SJA']),
        ('VC', ['VCH', 'VCD', 'VCA']),
        ('GB', ['GBH', 'GBD', 'GBA']),
        ('BS', ['BSH', 'BSD', 'BSA'])
    ]

    # Normalised probabilities per outcome, one Series per bookmaker
    normalised = {'home': [], 'draw': [], 'away': []}

    for _, cols in bookmakers:
        odds = df[cols].astype(float)
        # Vectorised 1 / odds. Non-positive odds become NaN instead of
        # dividing by zero, and a bookmaker with no quotes at all stays a
        # float column of NaN (the element-wise version produced an object
        # column of None there, which could not be added).
        implied = 1.0 / odds.where(odds > 0)

        # Normalise to sum to 1 (account for bookmaker overround). skipna=False
        # keeps the total NaN whenever one of the three quotes is unusable.
        total = implied.sum(axis=1, skipna=False)
        shares = implied.div(total, axis=0)

        for outcome, col in zip(normalised, cols):
            normalised[outcome].append(shares[col])

    # Calculate average probabilities across all bookmakers
    for outcome, per_bookmaker in normalised.items():
        df[f'avg_{outcome}_prob'] = pd.concat(per_bookmaker, axis=1).mean(axis=1)

    # Drop all the original betting columns
    original_odds_cols = [col for _, cols in bookmakers for col in cols]
    df.drop(columns=original_odds_cols, inplace=True)

    # Save to file (overwrite if no output_file specified)
    save_path = output_file if output_file else input_file
    df.to_csv(save_path, index=False)
    print(f"Processed data saved to {save_path}")
    print("New columns: avg_home_prob, avg_draw_prob, avg_away_prob")

# Overwrites the original file in place.
# NOTE: the odds columns are dropped from the saved file, so running this cell
# a second time on the same file fails with a KeyError; re-export the matches
# first, or use the two-argument form below to keep the original intact.
process_betting_odds('matches_with_team_names.csv')

# Alternative usage (creates new file):
# process_betting_odds('matches_with_team_names.csv', 'matches_with_probabilities.csv')

Processed data saved to matches_with_team_names.csv
New columns: avg_home_prob, avg_draw_prob, avg_away_prob


Now I will create a merged output that is almost ready for use as a training set.

In [3]:
def combine_csv_files(matches_file, players_file, output_file):
    """
    Join the match CSV and the player CSV on 'id' and write the result.

    Only ids present in both files are kept (inner join). The output has all
    columns of the matches file first, followed by the player columns.
    Player-file columns that would duplicate match information are left out:
    'match_api_id', 'match_date' and any column whose name already exists in
    the matches file. Previously a missing 'match_api_id'/'match_date' column,
    or any other shared column name, made the function fail with a KeyError.

    Args:
        matches_file: CSV with one row per match; must contain 'id'.
        players_file: CSV with the per-match player columns; must contain 'id'.
        output_file: Path of the combined CSV to write.
    """
    # Read both CSV files
    matches_df = pd.read_csv(matches_file)
    players_df = pd.read_csv(players_file)

    # Player columns worth keeping: everything the matches file does not
    # already provide, minus the join key and the duplicate match identifiers.
    already_present = set(matches_df.columns) | {'id', 'match_api_id', 'match_date'}
    players_columns = [col for col in players_df.columns if col not in already_present]

    # Merge the dataframes on 'id'
    combined_df = pd.merge(
        matches_df,
        players_df[['id'] + players_columns],
        on='id',
        how='inner'  # Only keep rows that exist in both files
    )

    # Reorder columns to have matches columns first
    final_columns = matches_df.columns.tolist() + players_columns
    combined_df = combined_df[final_columns]

    # Save to CSV
    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")
    print(f"Total rows: {len(combined_df)}")
    print(f"Columns: {', '.join(combined_df.columns)}")

# Build the combined dataset from the match file and the player file.
# NOTE(review): no cell visible in this notebook writes 'combined_players.csv'
# (the merge cell saves 'all_players_combined.csv') -- confirm which file is
# meant to be used here.
combine_csv_files(
    'matches_with_team_names.csv',
    'combined_players.csv',
    'final_combined_data.csv'
)

Combined data saved to final_combined_data.csv
Total rows: 304
Columns: id, country_id, league_id, season, stage, date, match_api_id, home_team_api_id, home_team_name, away_team_api_id, away_team_name, home_team_goal, away_team_goal, avg_home_prob, avg_draw_prob, avg_away_prob, away_player_1_name, away_player_1_rating, away_player_1_potential, away_player_2_name, away_player_2_rating, away_player_2_potential, away_player_3_name, away_player_3_rating, away_player_3_potential, away_player_4_name, away_player_4_rating, away_player_4_potential, away_player_5_name, away_player_5_rating, away_player_5_potential, away_player_6_name, away_player_6_rating, away_player_6_potential, away_player_7_name, away_player_7_rating, away_player_7_potential, away_player_8_name, away_player_8_rating, away_player_8_potential, away_player_9_name, away_player_9_rating, away_player_9_potential, away_player_10_name, away_player_10_rating, away_player_10_potential, away_player_11_name, away_player_11_rating, away

Trying to add each team's win/draw/loss record going into each game.

In [9]:
import sqlite3
import csv

# Export, for every team and match with league_id 21518, the team's
# win/draw/loss record within the season *before* that match was played.
# Each match contributes two rows: one for the home team, one for the away team.
conn = sqlite3.connect('database.sqlite')
try:
    cursor = conn.cursor()
    query = """
WITH MatchResults AS (
    SELECT
        match_api_id,
        date,
        season,
        home_team_api_id AS team_id,
        home_team_goal,
        away_team_goal,
        CASE
            WHEN home_team_goal > away_team_goal THEN 'win'
            WHEN home_team_goal = away_team_goal THEN 'draw'
            ELSE 'loss'
        END as result
    FROM Match
    WHERE league_id = 21518
    UNION ALL
    SELECT
        match_api_id,
        date,
        season,
        away_team_api_id AS team_id,
        away_team_goal AS home_team_goal,
        home_team_goal AS away_team_goal,
        CASE
            WHEN away_team_goal > home_team_goal THEN 'win'
            WHEN away_team_goal = home_team_goal THEN 'draw'
            ELSE 'loss'
        END as result
    FROM Match
    WHERE league_id = 21518
),
CumulativeRecords AS (
    SELECT
        match_api_id,
        date,
        season,
        team_id,
        result,
        -- The frame stops one row before the current match, so it is empty for
        -- a team's first match of a season and SUM() then yields NULL;
        -- COALESCE turns that into the intended 0-0-0 record.
        COALESCE(SUM(CASE WHEN result = 'win' THEN 1 ELSE 0 END) OVER (PARTITION BY team_id, season ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) as wins,
        COALESCE(SUM(CASE WHEN result = 'draw' THEN 1 ELSE 0 END) OVER (PARTITION BY team_id, season ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) as draws,
        COALESCE(SUM(CASE WHEN result = 'loss' THEN 1 ELSE 0 END) OVER (PARTITION BY team_id, season ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), 0) as losses
    FROM MatchResults
)
SELECT match_api_id, date, season, team_id, wins, draws, losses
FROM CumulativeRecords
ORDER BY season, date, team_id;
"""
    # Execute the query
    cursor.execute(query)

    # Get all rows and column names
    rows = cursor.fetchall()
    column_names = [description[0] for description in cursor.description]
finally:
    # Close the connection even if the query fails
    conn.close()

# Write to CSV
csv_file = 'team_records.csv'
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(column_names)  # Write header
    writer.writerows(rows)  # Write all rows

# The count printed here is the number of team-match rows (two per match)
print(f"Successfully exported {len(rows)} matches to {csv_file}")

Successfully exported 6080 matches to team_records.csv


Next I want to rearrange the data so that each row is organised from Real Madrid's perspective (Real vs. the other team) rather than home vs. away.

In [12]:
import pandas as pd

def get_player_info(row, prefix, player_num):
    """Return [name, rating, potential] for one player slot of a row.

    The values are looked up under the keys
    '<prefix>_player_<player_num>_name', '..._rating' and '..._potential'.
    """
    stem = f"{prefix}_player_{player_num}"
    return [row[f"{stem}_{field}"] for field in ('name', 'rating', 'potential')]

def rearrange_row(row, real_api_id):
    """Re-express one match row as "team real_api_id" versus "the other team".

    The result is a pandas Series holding, in order: a home/away indicator
    (1 when real_api_id is the row's home team, otherwise 0), real_api_id,
    the other team's id and name, the goals of the chosen team and of the
    other team, the chosen team's win probability, the draw probability, the
    other team's win probability, then name/rating/potential for the chosen
    team's players 1-11 followed by the other team's players 1-11.

    Any row whose home team is not real_api_id is treated as an away match.
    """
    if row['home_team_api_id'] == real_api_id:
        indicator = 1
        real_side, other_side = 'home', 'away'
        real_prob_key, other_prob_key = 'avg_home_prob', 'avg_away_prob'
    else:
        indicator = 0
        real_side, other_side = 'away', 'home'
        real_prob_key, other_prob_key = 'avg_away_prob', 'avg_home_prob'

    values = [
        indicator,
        real_api_id,
        row[f"{other_side}_team_api_id"],
        row[f"{other_side}_team_name"],
        row[f"{real_side}_team_goal"],
        row[f"{other_side}_team_goal"],
        row[real_prob_key],
        row['avg_draw_prob'],
        row[other_prob_key],
    ]

    # Chosen team's eleven players first, then the other team's eleven
    for side in (real_side, other_side):
        for slot in range(1, 12):
            values.extend(get_player_info(row, side, slot))

    return pd.Series(values)

# Team id the rows are reorganised around (passed to rearrange_row)
real_api_id = 8633

# Load the combined match/player data
df = pd.read_csv('final_combined_data.csv')

# Output column names, in the same order rearrange_row emits its values:
# nine match-level columns, then three columns per player slot for the
# chosen team, then the same for the other team.
new_columns = [
    'home_away_indicator', 'real_api_id', 'other_team_id', 'other_team_name',
    'real_score', 'other_score', 'real_winprob', 'tie_prob', 'other_winprob',
]
for side in ('real', 'other'):
    new_columns += [
        f"{side}_player_{i}_{attr}"
        for i in range(1, 12)
        for attr in ['name', 'rating', 'potential']
    ]

# Rearrange every row, then attach the column names
rearranged_df = df.apply(rearrange_row, axis=1, args=(real_api_id,))
rearranged_df.columns = new_columns

# Save the modified DataFrame
rearranged_df.to_csv('rearranged_matches.csv', index=False)
