In [None]:
import pandas as pd
import numpy as np
# Display option: never truncate rows when a DataFrame is rendered.
pd.options.display.max_rows = None

In [None]:
# Load the raw CSV export into `df`.
# The columns keyed 5, 7 and 9 in the dtype mapping are forced to str
# (presumably these keys are column positions -- verify against the file).
raw_csv_path = '../data/raw_2012_2024.csv'
forced_str_dtypes = {5: str, 7: str, 9: str}
df = pd.read_csv(raw_csv_path, dtype=forced_str_dtypes)

In [None]:
# Surfer prefixes: column-name prefixes shared by the per-competitor columns
# (the surfer plus, presumably, opponent 1 and opponent 2).
sufer_prefixs = [f'{role}_' for role in ('surfer', 'op1', 'op2')]

In [None]:
# Drop Surf Ranch heats: remove every row whose event_slug contains 'surf-ranch'.
# na=False makes a missing event_slug count as "not Surf Ranch", so the mask is
# strictly boolean; without it a NaN slug yields a NaN in the mask, and both the
# `~` negation and the boolean indexing below fail.
mask_surf_ranch = df['event_slug'].str.contains('surf-ranch', na=False)
df = df[~mask_surf_ranch]

In [None]:
# Remove the heat-level columns listed below from the frame.
drop_heat = ['heat_round', 'heat_slug', 'event_slug', 'surfer_heat_place']
df = df.drop(columns=drop_heat)

In [None]:
# Drop Surfer Values
# The same three fields are removed for every prefix in sufer_prefixs.
drop_surfer = ['dob', 'heat_total', 'slug']

# Build the full list of prefixed column names first and drop them in a single
# call: DataFrame.drop returns a new frame each time, so dropping one column per
# loop iteration copied the whole frame once per removed column. A single call
# is also all-or-nothing if one of the columns is missing.
surfer_cols_to_drop = [prefix + col for prefix in sufer_prefixs for col in drop_surfer]
df = df.drop(columns=surfer_cols_to_drop)

In [None]:
# One-hot encode surfer_count: the values are cast to str, expanded into
# indicator columns prefixed 'surfer_count', and the original column is
# replaced by those indicator columns (appended at the end of the frame).
df_sc_dummies = pd.get_dummies(df['surfer_count'].astype(str), prefix='surfer_count')

df = df.drop(columns='surfer_count').join(df_sc_dummies)

In [None]:
# Cast the surfer_won flag (presumably boolean in the raw data) to int.
df = df.astype({'surfer_won': int})

In [None]:
# Convert Start Date
# Parse event_s_date into datetimes, then derive the calendar month with the
# vectorised .dt accessor instead of calling a Python lambda once per row.
df['event_s_date'] = pd.to_datetime(df['event_s_date'])
df['month'] = df['event_s_date'].dt.month

In [None]:
# Add <role>_comp_hc for the surfer and both op* roles:
# 1 when that role's country equals the event country, otherwise 0.
for role in ('surfer', 'op1', 'op2'):
    competes_at_home = df[f'{role}_country'].eq(df['event_country'])
    df[f'{role}_comp_hc'] = competes_at_home.astype(int)

In [None]:
# Add <role>_frontside for the surfer and both op* roles:
# 1 when the stance/wave pairing is REGULAR + RIGHT or GOOFY + LEFT, otherwise 0.
wave_goes_right = df['wave_dir'] == 'RIGHT'
wave_goes_left = df['wave_dir'] == 'LEFT'

for role in ('surfer', 'op1', 'op2'):
    stance = df[f'{role}_stance']
    regular_on_right = (stance == 'REGULAR') & wave_goes_right
    goofy_on_left = (stance == 'GOOFY') & wave_goes_left
    df[f'{role}_frontside'] = (regular_on_right | goofy_on_left).astype(int)

In [None]:
# Remove the heat-condition and date columns listed below from the frame.
drop_temp = ['heat_duration', 'wave_range', 'heat_date', 'wind_conditions', 'avg_wave_height']
df = df.drop(columns=drop_temp)

In [None]:
# Sort columns
def custom_sort_key(col):
    """Return a sort key that groups column names by their role prefix.

    Columns without a recognised prefix rank first (0), followed by
    'surfer_' (1), 'op1_' (2) and 'op2_' (3). The column name itself is
    the second tuple element, so each group is ordered alphabetically.

    Args:
        col: Column name (str).

    Returns:
        A ``(rank, col)`` tuple suitable as a ``key`` for ``sorted``.
    """
    for rank, prefix in enumerate(('surfer_', 'op1_', 'op2_'), start=1):
        if col.startswith(prefix):
            return (rank, col)
    # No known prefix: these columns come before every prefixed group.
    return (0, col)

# Reorder the frame's columns using custom_sort_key as the ordering key.
sorted_columns = list(df.columns)
sorted_columns.sort(key=custom_sort_key)
df = df.loc[:, sorted_columns]