In [1]:
import os
import pandas as pd

# %matplotlib inline

# Season tags to combine; each one is expected to have a matching
# "<season>_featured.csv" file inside `featured_folder`.
seasons = ["20_21", "21_22", "22_23", "23_24"]

# Folder where the 'featured' CSVs for these seasons are stored
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable project/data root so the notebook runs on other machines.
featured_folder = r"C:\Users\mbaki\Desktop\Proje\data\processed\featured"

# Final folder to store the combined CSV
final_folder = r"C:\Users\mbaki\Desktop\Proje\data\final"
# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(final_folder, exist_ok=True)

# Path for the final combined CSV
final_csv_path = os.path.join(final_folder, "all_seasons_final.csv")


In [2]:
def create_multiclass_outcome(row):
    """
    Creates a 3-class label based on the match outcome:
      - "H" (Home Win)
      - "D" (Draw)
      - "A" (Away Win)

    A row whose score is missing (NaN/None in either goals field) gets no
    label at all instead of being silently classified as a draw.

    Parameters:
    - row (Series): A row from the DataFrame containing 'Home Goals' and 'Away Goals'.
      Any mapping that supports row["Home Goals"] / row["Away Goals"] works.

    Returns:
    - str or None: "H", "D", or "A" based on the match result, or None when
      either goals value is missing.
    """
    home_goals = row["Home Goals"]
    away_goals = row["Away Goals"]

    # NaN compares False with everything, so without this guard a match with
    # no recorded score would fall through both comparisons and become "D".
    if pd.isna(home_goals) or pd.isna(away_goals):
        return None

    if home_goals > away_goals:
        return "H"
    if home_goals < away_goals:
        return "A"
    return "D"


In [3]:
all_dfs = []  # One cleaned DataFrame per season, concatenated in a later cell

# Suffixed columns (e.g. left over from a merge) and the plain name each one
# should get back. Loop-invariant, so defined once up front.
rename_map = {
    "Season_x": "Season",
    "Match Date_x": "Match Date"
    # Add more mappings if there are other duplicates like "Season_y" -> "Season", etc.
}

# Columns that are not needed and are dropped when present.
cols_to_drop = ["Season_y", "Match Date_y"]  # Add other columns if they exist

for season in seasons:
    featured_file = os.path.join(featured_folder, f"{season}_featured.csv")
    if not os.path.exists(featured_file):
        print(f"{featured_file} does not exist, skipping this season.")
        continue

    print(f"--- Loading data for season {season}: {featured_file} ---")
    df = pd.read_csv(featured_file)

    # Add a 3-class (HomeWin/Draw/AwayWin) outcome column
    df["MatchOutcome"] = df.apply(create_multiclass_outcome, axis=1)

    # Rename a suffixed column only when its plain name is still free.
    # Renaming onto a name that already exists would leave two columns with
    # the same label, which then leak into the combined frame and the CSV.
    safe_renames = {}
    for suffixed_name, plain_name in rename_map.items():
        if suffixed_name not in df.columns:
            continue
        if plain_name in df.columns:
            print(
                f"Column '{plain_name}' already exists in season {season}; "
                f"keeping '{suffixed_name}' under its own name."
            )
            continue
        safe_renames[suffixed_name] = plain_name
    df = df.rename(columns=safe_renames)

    # Drop the unneeded columns that actually exist in this season's file
    existing_drops = [c for c in cols_to_drop if c in df.columns]
    df = df.drop(columns=existing_drops)

    # Guarantee the target column is last: if the CSV already carried a
    # "MatchOutcome" column, the assignment above overwrote it in place, so
    # pop it and add it back at the end.
    outcome_col = df.pop("MatchOutcome")
    df["MatchOutcome"] = outcome_col

    # Append the processed DataFrame to the list
    all_dfs.append(df)


--- Loading data for season 20_21: C:\Users\mbaki\Desktop\Proje\data\processed\featured\20_21_featured.csv ---
--- Loading data for season 21_22: C:\Users\mbaki\Desktop\Proje\data\processed\featured\21_22_featured.csv ---
--- Loading data for season 22_23: C:\Users\mbaki\Desktop\Proje\data\processed\featured\22_23_featured.csv ---
--- Loading data for season 23_24: C:\Users\mbaki\Desktop\Proje\data\processed\featured\23_24_featured.csv ---


In [4]:
if all_dfs:
    # Stack the per-season frames into a single table with a fresh row index
    df_all = pd.concat(all_dfs, ignore_index=True, sort=False)
    print(f"Total {len(df_all)} rows have been concatenated.")

    # Identification and result columns that should lead the table, in this order
    desired_cols = [
        "Season",
        "Week",
        "Match Date",
        "Home Team",
        "Away Team",
        "Home Goals",
        "Away Goals",
        "MatchOutcome",
    ]
    # Leading columns are limited to those actually present; every remaining
    # column follows in its existing order.
    leading_cols = [name for name in desired_cols if name in df_all.columns]
    trailing_cols = [name for name in df_all.columns if name not in leading_cols]
    df_all = df_all[leading_cols + trailing_cols]

    # Persist the combined table (row index is not written)
    df_all.to_csv(final_csv_path, index=False)
    print(f"Final combined data for all seasons has been saved to: {final_csv_path}")

    # Show the last 5 rows as a quick sanity check
    display(df_all.tail(5))
else:
    # Nothing was loaded in the previous cell, so there is nothing to write
    print("No data loaded, final CSV cannot be created.")


Total 2387 rows have been concatenated.
Final combined data for all seasons has been saved to: C:\Users\mbaki\Desktop\Proje\data\final\all_seasons_final.csv


Unnamed: 0,Season,Season.1,Week,Match Date,Match Date.1,Home Team,Away Team,Home Goals,Away Goals,MatchOutcome,...,Away_GoalsScored_Last5,Away_GoalsScored_Last10,Away_Points_Last5,Away_Points_Last10,Away_AvgAge_Last5,Away_AvgAge_Last10,Away_AvgValue_Last5,Away_AvgValue_Last10,Away_AvgRating_Last5,Away_AvgRating_Last10
2382,23/24,23/24,Round 1,13/08/23,2023-08-13,siva,sams,1,1,D,...,1.0,1.0,1.0,1.0,29.454545,29.454545,124715900.0,124715900.0,6.854545,6.854545
2383,23/24,23/24,Round 1,13/08/23,2023-08-13,adan,rize,2,1,H,...,1.0,1.0,0.0,0.0,26.090909,26.090909,121245100.0,121245100.0,6.815455,6.815455
2384,23/24,23/24,Round 1,13/08/23,2023-08-13,fene,gazi,2,1,H,...,1.0,1.0,0.0,0.0,29.818182,29.818182,25906820.0,25906820.0,6.787273,6.787273
2385,23/24,23/24,Round 1,14/08/23,2023-08-14,alan,başa,2,0,H,...,0.0,0.0,0.0,0.0,28.6875,28.6875,187951100.0,187951100.0,6.875455,6.875455
2386,23/24,23/24,Round 1,14/08/23,2023-08-14,fati,beşi,0,1,A,...,1.0,1.0,3.0,3.0,29.818182,29.818182,396413600.0,396413600.0,6.978182,6.978182
