In [206]:
import pandas as pd
import re

In [207]:
# Country codes; each code names one scraped results CSV and one cleaned output CSV.
countries = [
    "ARG", "AUT", "BRA", "CHN", "DNK", "FIN", "IRL", "JPN",
    "MEX", "NOR", "POL", "ROU", "RUS", "SWE", "SWZ", "USA",
]

# Season labels built as two-digit start year followed by two-digit end year,
# newest first: '2324', '2223', ..., '1415'.
seasons = [f"{start:02d}{start + 1:02d}" for start in range(23, 13, -1)]


In [208]:
# Path lists filled in by the following cells: match-history CSVs and fixtures CSVs.
matches_files, fixtures_files = [], []

In [209]:
# Queue one scraped results file per country code.
matches_files.extend(f'data/scraped/other/{country}.csv' for country in countries)

In [210]:
# Single fixtures file covering all of the leagues handled in this notebook.
fixtures_files.append('data/fixtures/new_league_fixtures.csv')

In [211]:
def format_season(season):
    """Collapse a 'YYYY/YYYY' season label into a four-character code.

    A label containing '/' is split on it, and the last two characters of the
    first two pieces are joined (e.g. '2023/2024' -> '2324'). A label without
    '/' is returned unchanged.
    """
    # Single-year labels need no conversion.
    if '/' not in season:
        return season

    pieces = season.split('/')
    start_year, end_year = pieces[0], pieces[1]
    return start_year[-2:] + end_year[-2:]

In [212]:
# Maps the value found in a file's 'Country' column to the country code stored
# in 'Div'. Every code in `countries` needs an entry here: rows whose country
# name is missing from this dict get a NaN 'Div' and are left out of every
# per-country export.
country_mapping = {
    'Argentina': 'ARG',
    'Austria': 'AUT',
    'Brazil': 'BRA',
    'China': 'CHN',
    'Denmark': 'DNK',
    'Finland': 'FIN',
    'Ireland': 'IRL',
    'Japan': 'JPN',
    # Previously missing although "MEX" is in `countries`.
    # Assumes the MEX file labels its rows 'Mexico' -- confirm against the data.
    'Mexico': 'MEX',
    'Norway': 'NOR',
    'Poland': 'POL',
    'Romania': 'ROU',
    'Russia': 'RUS',
    'Sweden': 'SWE',
    'Switzerland': 'SWZ',
    'USA': 'USA'
}

In [213]:
def load_data(files):
    """Read several CSV files and stack them into one DataFrame.

    For each file:
      * the CSV is read as UTF-8, falling back to ISO-8859-1 when decoding fails;
      * a 'Div' column is added by mapping 'Country' through `country_mapping`;
      * 'Season' is normalised with `format_season`, or set to `seasons[0]`
        when the file has no 'Season' column.

    Files that are missing or fail to load are reported with `print` and
    skipped, so one bad file does not abort the whole load.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to read.

    Returns
    -------
    pandas.DataFrame
        All successfully loaded rows with a fresh 0..n-1 index, or an empty
        DataFrame when nothing could be loaded.
    """
    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies every earlier row.
    frames = []

    for file in files:
        try:
            # Try to read with default utf-8 encoding
            try:
                df_temp = pd.read_csv(file, encoding='utf-8')
            except UnicodeDecodeError:
                # If utf-8 decoding fails, try reading with ISO-8859-1
                df_temp = pd.read_csv(file, encoding='ISO-8859-1')

            # Map the country name to the country code, and add it as a new column 'Div'
            df_temp['Div'] = df_temp['Country'].map(country_mapping)

            # Surface country names that have no code; their rows would
            # otherwise carry a NaN 'Div' without any indication.
            unmapped = df_temp.loc[df_temp['Div'].isna(), 'Country'].dropna().unique().tolist()
            if unmapped:
                print(f"Warning: {file} contains countries missing from country_mapping: {unmapped}")

            # Check if 'Season' column exists and convert it
            if 'Season' in df_temp.columns:
                df_temp['Season'] = df_temp['Season'].astype(str).apply(format_season)
            else:
                df_temp['Season'] = seasons[0]

            frames.append(df_temp)
        except FileNotFoundError:
            print(f'Error: {file} not found')
        except Exception as e:
            # Deliberate best-effort: report the problem and move on to the next file.
            print(f"An error occurred while loading {file}: {e}")

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, ignore_index=True)


In [214]:
def clean_data(df):
    """Return a cleaned copy of `df` with a fixed set and order of columns.

    Steps:
      * rename 'Home'/'Away'/'HG'/'AG'/'Res' to
        'HomeTeam'/'AwayTeam'/'FTHG'/'FTAG'/'FTR';
      * parse 'Date' as DD/MM/YYYY and write it back as a string in that same
        format (a value that does not match the format raises);
      * add 'FTHG', 'FTAG' and 'FTR' filled with None when they are absent;
      * keep only the output columns, in the order listed below.

    The input frame is not modified, and the function can be applied to its
    own output. Columns outside the output list (e.g. 'Country', 'League',
    odds columns) are discarded by the final selection, so they do not have to
    be present in the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least 'Div', 'Season', 'Date', 'Time', the
        home/away team columns and 'AvgH', 'AvgD', 'AvgA'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns Div, Season, Date, Time, HomeTeam, AwayTeam,
        FTHG, FTAG, FTR, AvgH, AvgD, AvgA.

    Raises
    ------
    KeyError
        If 'Date' or any required output column is missing.
    """
    output_columns = [
        'Div', 'Season', 'Date', 'Time', 'HomeTeam', 'AwayTeam',
        'FTHG', 'FTAG', 'FTR', 'AvgH', 'AvgD', 'AvgA',
    ]

    # rename() returns a new frame, so the caller's DataFrame stays untouched.
    cleaned = df.rename(columns={
        'Home': 'HomeTeam',
        'Away': 'AwayTeam',
        'HG': 'FTHG',
        'AG': 'FTAG',
        'Res': 'FTR',
    })

    # Round-trip through datetime to validate the DD/MM/YYYY format while
    # keeping the column as a string.
    parsed_dates = pd.to_datetime(cleaned['Date'], format='%d/%m/%Y')
    cleaned['Date'] = parsed_dates.dt.strftime('%d/%m/%Y')

    # Create FTHG, FTAG and FTR columns if they don't exist
    for column in ('FTHG', 'FTAG', 'FTR'):
        if column not in cleaned.columns:
            cleaned[column] = None

    # Selecting the output columns both orders them and discards the rest.
    return cleaned[output_columns]

In [215]:
# Build one frame from the match files and one from the fixtures files.
df, df_fixtures = load_data(matches_files), load_data(fixtures_files)

In [216]:
# Run both frames through the same cleaning step.
df = df.pipe(clean_data)
df_fixtures = df_fixtures.pipe(clean_data)

In [217]:
# Row counts of the cleaned match and fixture frames.
df.shape[0], df_fixtures.shape[0]

(51981, 125)

In [218]:
# Write one cleaned CSV per country code, holding only the rows whose 'Div'
# matches that code.
for country in countries:
    belongs_to_country = df['Div'].eq(country)
    df_country = df.loc[belongs_to_country]
    df_country.to_csv(f'data/cleaned/{country}.csv', index=False)

In [219]:
# Save the cleaned fixtures as a single CSV, without the index column.
df_fixtures.to_csv('data/fixtures/fixtures_world.csv', index=False)

In [220]:
# Audible "notebook finished" signal. winsound is only available on Windows,
# so the import is guarded to keep the notebook runnable on other platforms,
# where the beep is simply skipped.
try:
    import winsound
except ImportError:
    winsound = None

frequency = 400  # Beep pitch in hertz
duration = 200  # Beep length in milliseconds
if winsound is not None:
    winsound.Beep(frequency, duration)