In [2]:
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient

# Connect to your Azure ML workspace (config.json is automatically used)
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Load the Gold Data Asset
data_asset = ml_client.data.get(
    name="playeranalytics_DataAssest",
    version="1"
)

data_asset


Found the config file in: /config.json


Data({'path': 'azureml://subscriptions/a00dcbea-fd05-4973-82dc-120208b60116/resourcegroups/rg-60300294/workspaces/playeranalyticsml/datastores/playeranalyticsdatastore/paths/gold/', 'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'playeranalytics_DataAssest', 'description': None, 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/a00dcbea-fd05-4973-82dc-120208b60116/resourceGroups/rg-60300294/providers/Microsoft.MachineLearningServices/workspaces/playeranalyticsml/data/playeranalytics_DataAssest/versions/1', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/player-ci/code/Users/60302712', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7c7d8f486ce0>, 'serialize': <msrest.serialization.Serializer object at 0x7c7d8f487a60>, 'version': '1', 'latest_ve

In [4]:
import pandas as pd
import os

gold_uri = data_asset.path
gold_uri


'azureml://subscriptions/a00dcbea-fd05-4973-82dc-120208b60116/resourcegroups/rg-60300294/workspaces/playeranalyticsml/datastores/playeranalyticsdatastore/paths/gold/'

In [5]:
features_path = os.path.join(gold_uri, "player_season_features")
value_features_path = os.path.join(gold_uri, "player_season_value_features")

features_df = pd.read_parquet(features_path)
value_features_df = pd.read_parquet(value_features_path)

features_df.head(), value_features_df.head()


  mlflow.mismatch._check_version_mismatch()


(   player_id  season         player_name  club_id  matches_played  \
 0      92567    2012     Andriy Bogdanov      338              21   
 1      89222    2012  Yaroslav Rakitskyi      660              35   
 2       3182    2012         Ashley Cole      631              48   
 3      45596    2012      Márcio Mossoró     1075              39   
 4      49069    2012      Género Zeefuik      202              30   
 
    total_minutes  total_goals  total_assists  yellow_cards  red_cards  ...  \
 0           1336            3              2             4          0  ...   
 1           3063            1              4             6          1  ...   
 2           4220            1              5             7          0  ...   
 3           2706            6             10             8          0  ...   
 4           2340            5              3             3          0  ...   
 
    is_under_21  is_over_30  market_value_millions log_market_value  \
 0        False       False    

In [15]:
merge_keys = ["player_id", "season"]

full_df = features_df.merge(
    value_features_df,
    on=merge_keys,
    how="inner",
    suffixes=("_x", "_y")
)

print("Merged shape:", full_df.shape)
full_df.head()


Merged shape: (83827, 57)


Unnamed: 0,player_id,season,player_name_x,club_id_x,matches_played_x,total_minutes_x,total_goals_x,total_assists_x,yellow_cards_x,red_cards_x,...,assists_per90_y,cards_per90_y,name_y,position_y,sub_position_y,date_of_birth_y,current_club_id_y,age_at_season_start_y,club_name_y,season_market_value_eur_y
0,92567,2012,Andriy Bogdanov,338,21,1336,3,2,4,0,...,0.134731,0.269461,Andriy Bogdanov,Midfield,Central Midfield,1990-01-21,48332,22.0,Futbolniy Klub Dynamo Kyiv,2000000
1,89222,2012,Yaroslav Rakitskyi,660,35,3063,1,4,6,1,...,0.117532,0.205681,Yaroslav Rakitskyi,Defender,Centre-Back,1989-08-03,6992,23.0,FC Shakhtar Donetsk,12000000
2,3182,2012,Ashley Cole,631,48,4220,1,5,7,0,...,0.106635,0.149289,Ashley Cole,Defender,Left-Back,1980-12-20,12,32.0,Chelsea Football Club,8000000
3,45596,2012,Márcio Mossoró,1075,39,2706,6,10,8,0,...,0.332594,0.266075,Márcio Mossoró,Midfield,Attacking Midfield,1983-07-04,1467,29.0,Sporting Clube de Braga,5500000
4,49069,2012,Género Zeefuik,202,30,2340,5,3,3,0,...,0.115385,0.115385,Género Zeefuik,Attack,Centre-Forward,1990-04-05,202,22.0,Football Club Groningen,1000000


In [16]:
# Rename the label column
if "season_market_value_eur_y" in full_df.columns:
    full_df = full_df.rename(columns={"season_market_value_eur_y": "season_market_value_eur"})

# Remove the X version if it exists
if "season_market_value_eur_x" in full_df.columns:
    full_df = full_df.drop(columns=["season_market_value_eur_x"])

# Remove ALL other _y duplicates
cols_to_drop = [c for c in full_df.columns if c.endswith("_y")]
full_df = full_df.drop(columns=cols_to_drop)

# Remove the _x suffix from remaining columns
full_df.columns = [c.replace("_x", "") for c in full_df.columns]

# Final check
full_df.columns


Index(['player_id', 'season', 'player_name', 'club_id', 'matches_played',
       'total_minutes', 'total_goals', 'total_assists', 'yellow_cards',
       'red_cards', 'goals_per90', 'assists_per90', 'cards_per90', 'name',
       'position', 'sub_position', 'date_of_birth', 'current_club_id',
       'age_at_season_start', 'club_name', 'goal_contribution_per90',
       'goals_per_match', 'assists_per_match', 'cards_per_match',
       'minutes_per_match', 'discipline_index', 'age_squared', 'is_under_21',
       'is_over_30', 'market_value_millions', 'log_market_value',
       'position_index', 'sub_position_index', 'position_ohe',
       'sub_position_ohe', 'numeric_vector', 'numeric_scaled',
       'season_market_value_eur'],
      dtype='object')

In [17]:
label_col = "season_market_value_eur"

# Remove missing labels if any
full_df = full_df.dropna(subset=[label_col])

y = full_df[label_col]
X = full_df.drop(columns=[label_col])

X.shape, y.shape


((83827, 37), (83827,))

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

X_train.shape, X_test.shape


((67061, 37), (16766, 37))

In [21]:
# Select only numeric columns for modeling
numeric_X = X.select_dtypes(include=["number"])

numeric_X.shape


(83827, 24)

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    numeric_X,
    y,
    test_size=0.2,
    random_state=42
)

X_train.shape, X_test.shape


((67061, 24), (16766, 24))

In [23]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

rmse, r2


(136739.35867308444, 0.9998303424959103)

In [24]:
import joblib
import os
from azure.ai.ml.entities import Model

os.makedirs("outputs", exist_ok=True)
model_path = "outputs/player_value_model.pkl"

# Save the model locally in the compute instance
joblib.dump(model, model_path)

# Register in Azure ML workspace
registered_model = ml_client.models.create_or_update(
    Model(
        name="player_value_model",
        path=model_path,
        description="Random Forest regression model predicting season market value (EUR) for football players"
    )
)

registered_model


Uploading player_value_model.pkl (< 1 MB): 100%|██████████| 7.87M/7.87M [00:00<00:00, 32.6MB/s]




Model({'job_name': None, 'intellectual_property': None, 'system_metadata': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'player_value_model', 'description': 'Random Forest regression model predicting season market value (EUR) for football players', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/a00dcbea-fd05-4973-82dc-120208b60116/resourceGroups/rg-60300294/providers/Microsoft.MachineLearningServices/workspaces/playeranalyticsml/models/player_value_model/versions/1', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/player-ci/code/Users/60302712', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7c7cf55c4850>, 'serialize': <msrest.serialization.Serializer object at 0x7c7cf55c47f0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/a00dcbea-fd05-4973-82dc-120208b60116/resourceGroups/rg-60300294/workspaces/playeran