In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the raw game-by-game results.
df = pd.read_csv('afl_game_by_game_results_1965_2022.csv')

# Per-column count of missing values. This is not the last expression of the cell,
# so the notebook does not render its result.
df.isna().sum()

# Preview one raw stat column; nothing is replaced or filled in this cell.
df.loc[:, 'behinds']

0         21-8
1         16-9
2         25-9
3        11-13
4          8-8
         ...  
18521     8-10
18522     15-4
18523     12-8
18524      8-7
18525    11-13
Name: behinds, Length: 18526, dtype: object

In [4]:
# Names of the stat columns that the splitting step below processes.
# The spellings are kept exactly as written because they are used as column labels.
stats_columns = [
    'kicks',
    'marks',
    'handballs',
    'disposals',
    'goals',
    'behinds',
    'hit_outs',
    'tackles',
    'rebound_50s',
    'inside_50s',
    'clearances',
    'clangers',
    'freekicks_for',
    'freekicks_agains',
    'brownlow_votes',
    'contested_possesions',
    'uncontested_possesions',
    'contested_marks',
    'marks_inside_50',
    'one_percenters',
    'bounces',
    'goal_assist',
]




In [5]:
# Split every "<team1>-<team2>" stat string into two numeric per-team columns.
for col in stats_columns:
    # A single anchored regex both validates and splits the value. \A...\Z requires the
    # whole cell to be "<digits>-<digits>"; anything else (including a missing value)
    # yields NaN in both captures. Unlike str.split(expand=True), str.extract always
    # returns one column per capture group, so the two-column assignment below cannot
    # fail on a stat column that contains no well-formed values at all.
    team_values = df[col].str.extract(r'\A(\d+)-(\d+)\Z')

    # Convert the captured digit strings to numbers; unmatched rows stay NaN.
    df[f'{col}_team1'] = pd.to_numeric(team_values[0], errors='coerce')
    df[f'{col}_team2'] = pd.to_numeric(team_values[1], errors='coerce')

# Total score for each team: goals * 6 + behinds
df['score_team1'] = df['goals_team1'] * 6 + df['behinds_team1']
df['score_team2'] = df['goals_team2'] * 6 + df['behinds_team2']

# The combined string columns are superseded by the per-team columns created above.
df = df.drop(columns=stats_columns)


In [6]:
# Build an order-independent match key: (year, round, team_a, team_b), with the two
# team names sorted so that rows listing the same pair of teams in either order share
# one key.
# The key is a flat tuple of scalars. Nesting the sorted list inside the tuple would
# make the key unhashable, which pandas' hash-based duplicate detection can reject
# with "TypeError: unhashable type: 'list'".
df['match_id'] = df.apply(
    lambda row: (row['year'], row['round'], *sorted([row['team'], row['Opponent']])),
    axis=1,
)

# Keep only the first row encountered for each match key.
df = df.drop_duplicates(subset='match_id', keep='first')


In [7]:
# Drop the columns that are no longer needed, order the rows, and strip stray
# whitespace from the column labels.
# Written as one chain that rebinds `df` instead of mutating the de-duplicated frame
# in place, so every step yields a new frame.
# NOTE(review): the 'Unnamed: *' columns are presumably row indexes written by earlier
# CSV exports — confirm against the source file.
df = (
    df
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'match_id'])
    .sort_values(by=['year', 'round', 'team'])
    .rename(columns=str.strip)
)

In [8]:
# Write the cleaned table to disk.
# NOTE(review): the row index is written as the first CSV column (index=True is the
# to_csv default, spelled out here). Reading the file back without index_col will
# surface it as an "Unnamed: 0" column — confirm that this is intended.
df.to_csv('cleanish.csv', index=True)