In [1]:
import os
import pandas as pd
import numpy as np

# %matplotlib inline

# Example: List of 4 seasons
seasons = ["20_21", "21_22", "22_23", "23_24"]

# Input folder: Previously merged/cleaned and saved CSVs (e.g., X_processed.csv) are here
processed_folder = r"C:\Users\mbaki\Desktop\Proje\data\processed\all_season"

# Output folder: Feature-Engineered CSVs (X_featured.csv) will be saved here after running this code
feature_folder = r"C:\Users\mbaki\Desktop\Proje\data\featured\all_season"



In [2]:
def create_features_for_season(df, add_rolling_form=True):
    """
    Creates feature-engineered data for a given season.

    Parameters:
    - df (DataFrame): Your previously "processed" dataset.
                      (Columns like Home_Player_X_TeamPlayer_Age, MarketValue, Rating, etc., are present).
    - add_rolling_form (bool): If True, calculates data for the last 5 and 10 matches (goals, points, etc.).

    This function performs the following on the dataframe:
      1. Home & Away team-based (Total / Average) Age, MarketValue, Rating
      2. Home advantage
      3. (Optional) League position / point difference
      4. (Optional) Form data (Points, goals, average rating/value in the last 5/10 matches, etc.)
      5. Optionally drops the original 66 "Home_Player_X_TeamPlayer_*" and "Away_Player_X_TeamPlayer_*" columns.

    Contains a comprehensive example code; you can uncomment lines as needed based on requirements.
    """

    # ---------- A) Home & Away team-based (average / total) ----------
    MAX_PLAYERS = 11

    home_age_cols = [f"Home_Player_{i}_TeamPlayer_Age" for i in range(1, MAX_PLAYERS + 1)]
    home_mv_cols = [f"Home_Player_{i}_TeamPlayer_MarketValue" for i in range(1, MAX_PLAYERS + 1)]
    home_rating_cols = [f"Home_Player_{i}_TeamPlayer_Rating" for i in range(1, MAX_PLAYERS + 1)]

    away_age_cols = [f"Away_Player_{i}_TeamPlayer_Age" for i in range(1, MAX_PLAYERS + 1)]
    away_mv_cols = [f"Away_Player_{i}_TeamPlayer_MarketValue" for i in range(1, MAX_PLAYERS + 1)]
    away_rating_cols = [f"Away_Player_{i}_TeamPlayer_Rating" for i in range(1, MAX_PLAYERS + 1)]

    # Home Team
    df["Home_AvgAge"] = df[home_age_cols].mean(axis=1)
    df["Home_SumValue"] = df[home_mv_cols].sum(axis=1)
    df["Home_AvgValue"] = df[home_mv_cols].mean(axis=1)
    df["Home_AvgRating"] = df[home_rating_cols].mean(axis=1)

    # Away Team
    df["Away_AvgAge"] = df[away_age_cols].mean(axis=1)
    df["Away_SumValue"] = df[away_mv_cols].sum(axis=1)
    df["Away_AvgValue"] = df[away_mv_cols].mean(axis=1)
    df["Away_AvgRating"] = df[away_rating_cols].mean(axis=1)

    # Difference Columns (optional)
    df["Age_Diff"] = df["Home_AvgAge"] - df["Away_AvgAge"]
    df["Value_Diff"] = df["Home_SumValue"] - df["Away_SumValue"]
    df["Rating_Diff"] = df["Home_AvgRating"] - df["Away_AvgRating"]

    # Home Advantage (simple)
    df["Home_Advantage"] = 1


    if add_rolling_form:
        # 1) Transform to long format: Home rows + Away rows
        #    If you want to keep team average values like "AvgAge", "AvgValue", "AvgRating" in a single column,
        #    you can name them as "df_long_home['AvgAge']" etc.

        df_long_home = df[[
            "Season", "Week", "Match Date",
            "Home Team", "Home Goals", "Away Goals",
            "Home_AvgAge", "Home_AvgValue", "Home_AvgRating"
        ]].copy()

        df_long_home["Team"] = df_long_home["Home Team"]
        df_long_home["GoalsScored"] = df_long_home["Home Goals"]
        df_long_home["GoalsConceded"] = df_long_home["Away Goals"]

        # Home match result
        def get_home_result(row):
            if row["Home Goals"] > row["Away Goals"]:
                return "Win"
            elif row["Home Goals"] < row["Away Goals"]:
                return "Lose"
            else:
                return "Draw"

        df_long_home["Result"] = df_long_home.apply(get_home_result, axis=1)

        # Rename average columns for easier rolling operations
        df_long_home["AvgAge"] = df_long_home["Home_AvgAge"]
        df_long_home["AvgValue"] = df_long_home["Home_AvgValue"]
        df_long_home["AvgRating"] = df_long_home["Home_AvgRating"]

        # Away rows
        df_long_away = df[[
            "Season", "Week", "Match Date",
            "Away Team", "Home Goals", "Away Goals",
            "Away_AvgAge", "Away_AvgValue", "Away_AvgRating"
        ]].copy()

        df_long_away["Team"] = df_long_away["Away Team"]
        df_long_away["GoalsScored"] = df_long_away["Away Goals"]
        df_long_away["GoalsConceded"] = df_long_away["Home Goals"]

        def get_away_result(row):
            if row["Away Goals"] > row["Home Goals"]:
                return "Win"
            elif row["Away Goals"] < row["Home Goals"]:
                return "Lose"
            else:
                return "Draw"

        df_long_away["Result"] = df_long_away.apply(get_away_result, axis=1)

        df_long_away["AvgAge"] = df_long_away["Away_AvgAge"]
        df_long_away["AvgValue"] = df_long_away["Away_AvgValue"]
        df_long_away["AvgRating"] = df_long_away["Away_AvgRating"]

        # Combine home and away data
        df_long = pd.concat([df_long_home, df_long_away], ignore_index=True)

        # Points column
        def result_to_points(res):
            if res == "Win":
                return 3
            elif res == "Draw":
                return 1
            else:
                return 0

        df_long["Points"] = df_long["Result"].apply(result_to_points)

        # 2) Convert to datetime and sort (oldest -> newest)
        df_long["Match Date"] = pd.to_datetime(df_long["Match Date"], dayfirst=True, errors="coerce")
        # If the format is month-day-year, set dayfirst=False. Adjust according to your format.
        df_long.sort_values(by=["Team", "Match Date"], ascending=[True, True], inplace=True)

        # 3) Rolling statistics for the last 5 / 10 matches
        df_long["GoalsScored_Last5"] = df_long.groupby("Team")["GoalsScored"].rolling(window=5,
                                                                                         min_periods=1).sum().reset_index(
            level=0, drop=True)
        df_long["GoalsScored_Last10"] = df_long.groupby("Team")["GoalsScored"].rolling(window=10,
                                                                                          min_periods=1).sum().reset_index(
            level=0, drop=True)

        df_long["Points_Last5"] = df_long.groupby("Team")["Points"].rolling(window=5, min_periods=1).sum().reset_index(
            level=0, drop=True)
        df_long["Points_Last10"] = df_long.groupby("Team")["Points"].rolling(window=10,
                                                                              min_periods=1).sum().reset_index(level=0,
                                                                                                               drop=True)

        # Example: Average "AvgAge" in the last 5 matches
        df_long["AvgAge_Last5"] = df_long.groupby("Team")["AvgAge"].rolling(window=5, min_periods=1).mean().reset_index(
            level=0, drop=True)
        df_long["AvgAge_Last10"] = df_long.groupby("Team")["AvgAge"].rolling(window=10,
                                                                              min_periods=1).mean().reset_index(level=0,
                                                                                                                drop=True)

        # Similarly for Market Value and Rating
        df_long["AvgValue_Last5"] = df_long.groupby("Team")["AvgValue"].rolling(window=5,
                                                                                  min_periods=1).mean().reset_index(
            level=0, drop=True)
        df_long["AvgValue_Last10"] = df_long.groupby("Team")["AvgValue"].rolling(window=10,
                                                                                   min_periods=1).mean().reset_index(
            level=0, drop=True)

        df_long["AvgRating_Last5"] = df_long.groupby("Team")["AvgRating"].rolling(window=5,
                                                                                    min_periods=1).mean().reset_index(
            level=0, drop=True)
        df_long["AvgRating_Last10"] = df_long.groupby("Team")["AvgRating"].rolling(window=10,
                                                                                     min_periods=1).mean().reset_index(
            level=0, drop=True)


        # Home Form Data
        df_home_form = df_long[[
            "Team", "Match Date", "Season", "Week",
            "GoalsScored_Last5", "GoalsScored_Last10",
            "Points_Last5", "Points_Last10",
            "AvgAge_Last5", "AvgAge_Last10",
            "AvgValue_Last5", "AvgValue_Last10",
            "AvgRating_Last5", "AvgRating_Last10"
        ]].copy()

        df_home_form.rename(columns={
            "GoalsScored_Last5": "Home_GoalsScored_Last5",
            "GoalsScored_Last10": "Home_GoalsScored_Last10",
            "Points_Last5": "Home_Points_Last5",
            "Points_Last10": "Home_Points_Last10",
            "AvgAge_Last5": "Home_AvgAge_Last5",
            "AvgAge_Last10": "Home_AvgAge_Last10",
            "AvgValue_Last5": "Home_AvgValue_Last5",
            "AvgValue_Last10": "Home_AvgValue_Last10",
            "AvgRating_Last5": "Home_AvgRating_Last5",
            "AvgRating_Last10": "Home_AvgRating_Last10",
        }, inplace=True)

        # Merge (Week + Home Team vs. df_home_form["Week"] + df_home_form["Team"])
        df = df.merge(
            df_home_form,
            left_on=["Week", "Home Team"],
            right_on=["Week", "Team"],
            how="left"
        )
        df.drop(columns="Team", inplace=True)

        # Away Form Data
        df_away_form = df_long[[
            "Team", "Match Date", "Season", "Week",
            "GoalsScored_Last5", "GoalsScored_Last10",
            "Points_Last5", "Points_Last10",
            "AvgAge_Last5", "AvgAge_Last10",
            "AvgValue_Last5", "AvgValue_Last10",
            "AvgRating_Last5", "AvgRating_Last10"
        ]].copy()

        df_away_form.rename(columns={
            "GoalsScored_Last5": "Away_GoalsScored_Last5",
            "GoalsScored_Last10": "Away_GoalsScored_Last10",
            "Points_Last5": "Away_Points_Last5",
            "Points_Last10": "Away_Points_Last10",
            "AvgAge_Last5": "Away_AvgAge_Last5",
            "AvgAge_Last10": "Away_AvgAge_Last10",
            "AvgValue_Last5": "Away_AvgValue_Last5",
            "AvgValue_Last10": "Away_AvgValue_Last10",
            "AvgRating_Last5": "Away_AvgRating_Last5",
            "AvgRating_Last10": "Away_AvgRating_Last10",
        }, inplace=True)

        # Merge (Week + Away Team vs. df_away_form["Week"] + df_away_form["Team"])
        df = df.merge(
            df_away_form,
            left_on=["Week", "Away Team"],
            right_on=["Week", "Team"],
            how="left"
        )
        df.drop(columns="Team", inplace=True)

        # Now, df contains columns like Home_GoalsScored_Last5, Away_Points_Last10, etc.

    # ---------- D) Dropping the original 66 columns (optional) ----------
    all_home_cols = home_age_cols + home_mv_cols + home_rating_cols
    all_away_cols = away_age_cols + away_mv_cols + away_rating_cols
    df.drop(columns=(all_home_cols + all_away_cols), inplace=True)

    return df



In [3]:
# Create the feature folder if it doesn't exist
if not os.path.exists(feature_folder):
    os.makedirs(feature_folder)

# Process each season
for season in seasons:
    processed_file = os.path.join(processed_folder, f"{season}_processed.csv")
    if not os.path.exists(processed_file):
        print(f"{processed_file} not found, skipping this season.")
        continue

    print(f"\n--- Starting feature engineering for season {season} ---")
    df_proc = pd.read_csv(processed_file)

    # add_rolling_form=True => adds features for the last 5/10 matches
    df_featured = create_features_for_season(df_proc, add_rolling_form=True)

    featured_file = os.path.join(feature_folder, f"{season}_featured.csv")
    df_featured.to_csv(featured_file, index=False)
    print(f"Feature engineering for season {season} completed -> Saved: {featured_file}")



--- Starting feature engineering for season 20_21 ---
Feature engineering for season 20_21 completed -> Saved: C:\Users\mbaki\Desktop\Proje\data\featured\all_season\20_21_featured.csv

--- Starting feature engineering for season 21_22 ---
Feature engineering for season 21_22 completed -> Saved: C:\Users\mbaki\Desktop\Proje\data\featured\all_season\21_22_featured.csv

--- Starting feature engineering for season 22_23 ---
Feature engineering for season 22_23 completed -> Saved: C:\Users\mbaki\Desktop\Proje\data\featured\all_season\22_23_featured.csv

--- Starting feature engineering for season 23_24 ---
Feature engineering for season 23_24 completed -> Saved: C:\Users\mbaki\Desktop\Proje\data\featured\all_season\23_24_featured.csv


  df_long["Match Date"] = pd.to_datetime(df_long["Match Date"], dayfirst=True, errors="coerce")
  df_long["Match Date"] = pd.to_datetime(df_long["Match Date"], dayfirst=True, errors="coerce")
  df_long["Match Date"] = pd.to_datetime(df_long["Match Date"], dayfirst=True, errors="coerce")
  df_long["Match Date"] = pd.to_datetime(df_long["Match Date"], dayfirst=True, errors="coerce")


In [4]:
# Display sample data from a specific season
sample_season = "23_24"
sample_feature_path = os.path.join(feature_folder, f"{sample_season}_featured.csv")
if os.path.exists(sample_feature_path):
    df_check = pd.read_csv(sample_feature_path)
    print(f"\nSample rows from the {sample_season} season:")
    display(df_check.head(10))  # Displays the first 10 rows in Jupyter
else:
    print(f"{sample_feature_path} does not exist.")



Sample rows from the 23_24 season:


Unnamed: 0,Season_x,Week,Match Date_x,Home Team,Away Team,Home Goals,Away Goals,Home Performance,Away Performance,Home Formation,...,Away_GoalsScored_Last5,Away_GoalsScored_Last10,Away_Points_Last5,Away_Points_Last10,Away_AvgAge_Last5,Away_AvgAge_Last10,Away_AvgValue_Last5,Away_AvgValue_Last10,Away_AvgRating_Last5,Away_AvgRating_Last10
0,23/24,Round 38,24/05/24,alan,anta,1,1,7.03,6.78,4-2-3-1,...,6.0,11.0,4.0,11.0,29.349091,29.614545,121714400.0,128175100.0,6.902727,6.925364
1,23/24,Round 38,25/05/24,fati,sams,3,1,7.01,6.74,4-2-3-1,...,5.0,9.0,4.0,10.0,27.963636,28.445455,137591800.0,137967000.0,6.819818,6.831364
2,23/24,Round 38,25/05/24,siva,kays,2,1,6.94,6.78,4-1-4-1,...,7.0,9.0,5.0,7.0,28.726599,28.83771,108295800.0,105626600.0,6.786727,6.801
3,23/24,Round 38,26/05/24,adan,başa,2,6,6.35,7.2,5-4-1,...,13.0,21.0,12.0,22.0,28.828409,28.888068,201710200.0,199129000.0,7.012727,6.999455
4,23/24,Round 38,26/05/24,fene,i̇st,6,0,7.54,6.35,4-4-2,...,1.0,6.0,0.0,3.0,26.417508,26.845118,404066000.0,452212500.0,6.726,6.738727
5,23/24,Round 38,26/05/24,hata,rize,2,0,7.14,6.64,4-1-4-1,...,5.0,17.0,2.0,11.0,26.301818,26.270909,139460600.0,160054100.0,6.909636,6.923418
6,23/24,Round 38,26/05/24,kası,beşi,2,1,7.25,6.79,4-1-4-1,...,8.0,13.0,5.0,10.0,28.05754,28.262059,367160500.0,439595100.0,6.922164,6.955182
7,23/24,Round 38,26/05/24,kony,gala,1,3,6.77,7.14,4-2-3-1,...,15.0,34.0,12.0,27.0,29.909091,29.827273,900000000.0,915818200.0,7.231455,7.227636
8,23/24,Round 38,26/05/24,pend,gazi,0,1,6.88,6.91,4-3-3,...,13.0,19.0,10.0,16.0,30.290909,30.118182,63963640.0,72138640.0,6.918,6.931727
9,23/24,Round 38,26/05/24,trab,anka,4,2,7.08,6.61,4-2-3-1,...,6.0,13.0,3.0,10.0,28.254545,28.318182,109939100.0,112469500.0,6.904727,6.900091
