In [148]:
import os
import re

import pandas as pd

In [149]:
countries = [
    "ARG", "AUT", "BRA", "CHN",
    "DNK", "FIN", "IRL", "JPN",
    "MEX", "NOR", "POL", "ROU",
    "RUS", "SWE", "SWZ", "USA",
]

matches_files = []

In [150]:
for country in countries:    
    matches_files.append('data/scraped/other/%s.csv' % (country))
    continue

In [151]:
def format_season(season):
    # Check if the season contains a '/'
    if '/' in season:
        # Split the string on '/' and take the last two digits of each year
        parts = season.split('/')
        new_season = parts[0][-2:] + parts[1][-2:]
    else:
        # If it's just a single year, use the string as is
        new_season = season

    return new_season

In [152]:
def load_data(files):
    df = pd.DataFrame()

    for file in files:
        try:
            # Try to read with default utf-8 encoding
            try:
                df_temp = pd.read_csv(file, encoding='utf-8')
            except UnicodeDecodeError:
                # If utf-8 decoding fails, try reading with ISO-8859-1
                df_temp = pd.read_csv(file, encoding='ISO-8859-1')

            # Extract the country name from the file path
            div = file.split('/')[-1].split('.')[0]
            df_temp['Div'] = div

            # Check if 'Season' column exists and convert it
            if 'Season' in df_temp.columns:
                df_temp['Season'] = df_temp['Season'].astype(str).apply(format_season)
            else:
                print(f"No 'Season' column in {file}")

            df = pd.concat([df, df_temp], ignore_index=True)
        except FileNotFoundError:
            print(f'Error: {file} not found')
        except Exception as e:
            print(f"An error occurred while loading {file}: {e}")

    return df


In [153]:
# Load data into DataFrames
df = load_data(matches_files)

In [154]:
# Rename the necessary columns as per the mapping
df.rename(columns={
    'Home': 'HomeTeam',
    'Away': 'AwayTeam',
    'HG': 'FTHG',
    'AG': 'FTAG',
    'Res': 'FTR',
    'AvgH': 'AvgH',
    'AvgD': 'AvgD',
    'AvgA': 'AvgA'
}, inplace=True)

# Convert 'Date' from string to datetime
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

# Convert 'Date' to a String in the format 'DD/MM/YYYY'
df['Date'] = df['Date'].dt.strftime('%d/%m/%Y')

# Drop the columns that are not needed
df.drop(columns=['Country', 'League', 'PH', 'PD', 'PA', 'MaxH', 'MaxD', 'MaxA'], inplace=True)

# Change the order of the columns
df = df[['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'AvgH', 'AvgD', 'AvgA']]

In [155]:
print(df.head())

   Div        Date   Time         HomeTeam           AwayTeam  FTHG  FTAG FTR  \
0  ARG  03/08/2012  23:00  Arsenal Sarandi  Union de Santa Fe   1.0   0.0   H   
1  ARG  04/08/2012  01:10  Velez Sarsfield     Argentinos Jrs   3.0   0.0   H   
2  ARG  04/08/2012  18:10      Racing Club       Atl. Rafaela   1.0   1.0   D   
3  ARG  04/08/2012  20:10   Colon Santa FE              Lanus   1.0   0.0   H   
4  ARG  04/08/2012  22:15          Quilmes       Boca Juniors   3.0   0.0   H   

   AvgH  AvgD  AvgA  
0  1.76  3.30  4.74  
1  1.97  3.12  3.96  
2  1.91  3.22  4.05  
3  2.39  2.99  3.04  
4  3.07  2.99  2.38  


In [156]:
# Sanity check: number of rows loaded per 'Div' label, most frequent first
df['Div'].value_counts()

Div
ARG    5090
USA    4879
BRA    4588
MEX    3953
JPN    3852
ROU    3513
POL    3425
SWE    2942
NOR    2933
RUS    2876
DNK    2531
CHN    2383
IRL    2258
AUT    2227
FIN    2217
SWZ    2187
Name: count, dtype: int64

In [157]:
# Split the df into separate DataFrames for each country and save them to CSV
for country in countries:
    df_country = df[df['Div'] == country]
    df_country.to_csv(f'data/cleaned/{country}.csv', index=False)