In [14]:
import pandas as pd
# Load the raw training records; every later cell derives its frames from `df`.
df = pd.read_json(path_or_buf="../data/pp_train.json")

In [15]:
from sklearn.model_selection import train_test_split

# Working copy of the full frame carrying a placeholder "target" column.
df_own_split = df.assign(target="0")

# Split on distinct player ids rather than on rows, so that all rows of a
# given player land on the same side. Requires a 'player_id' column in df.
unique_ids = df["player_id"].unique()
train_ids, test_ids = train_test_split(
    unique_ids,
    test_size=0.15,
    random_state=1,
)

# Row masks selecting the players assigned to each side.
in_train = df["player_id"].isin(train_ids)
in_test = df["player_id"].isin(test_ids)

# `.assign` returns a new frame, so `df` itself is left untouched.
train_df = df.loc[in_train].assign(target="0")
test_df = df.loc[in_test].assign(target="0")

In [16]:
# Distinct player ids present on each side of the split.
train_ids_set = set(train_df["player_id"].unique())
test_ids_set = set(test_df["player_id"].unique())

# Ids exclusive to one side; if the split is player-disjoint, these equal
# the full id sets of the respective side.
only_in_train = train_ids_set.difference(test_ids_set)
only_in_test = test_ids_set.difference(train_ids_set)

print("Player IDs only in train:", len(only_in_train))
print("Player IDs only in test:", len(only_in_test))

Player IDs only in train: 2890
Player IDs only in test: 511


In [17]:
def print_value_overlap(train, test, skip=("target",)):
    """Print, per column, how many distinct values occur on only one side.

    For every column of ``train`` that is not listed in ``skip``, the
    distinct non-null values of ``train[col]`` and ``test[col]`` are compared
    and one table row is printed with the number of values found only in
    ``train`` and the number found only in ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to compare. ``test`` must contain every compared column of
        ``train``; a missing column raises ``KeyError``.
    skip : collection of column labels, default ``("target",)``
        Columns left out of the report.

    Returns
    -------
    None
        The table is written to stdout.
    """
    print(f"{'Feature':<30} {'Only in Train':<15} {'Only in Test'}")
    print("-" * 60)
    for col in train.columns:
        if col in skip:
            continue
        # Nulls are dropped first so that missing values never count as a
        # "value seen on only one side".
        train_vals = set(train[col].dropna().unique())
        test_vals = set(test[col].dropna().unique())
        n_only_train = len(train_vals - test_vals)
        n_only_test = len(test_vals - train_vals)
        print(f"{col:<30} {n_only_train:<15} {n_only_test}")


# The counters live inside the helper, so this cell no longer rebinds the
# notebook-level `only_in_train` / `only_in_test` id sets to integers.
print_value_overlap(train_df, test_df)

Feature                        Only in Train   Only in Test
------------------------------------------------------------
injury_category                0               0
market_value_category          0               0
age                            5               0
coach_id                       6               0
player_id                      2890            511
club_id                        1               0
league_id                      0               0
season_id                      0               0
injury                         100             8
last_transfer_fee              276             26
first_name                     1196            170
last_name                      2494            427
pseudonym                      106             22
position                       0               0
foot                           0               0
citizenship                    54              1
height                         6               0
club                           1      

In [20]:
# Load the pre-processed test split and compare it, column by column, against
# `train_df` from the in-notebook player-level split.
# NOTE(review): the reference side is the in-notebook `train_df`, not a train
# file from ../data/tm_minimal - confirm that this is the intended baseline.
df_test_proc = pd.read_json("../data/tm_minimal/test_split_no_new_categories.json")

print(f"{'Feature':<30} {'Only in Train':<15} {'Only in Test'}")
print("-" * 60)
for col in train_df.columns:
    if col == "target":
        continue
    # Nulls are dropped so missing values never count as one-sided values.
    train_vals = set(train_df[col].dropna().unique())
    test_vals = set(df_test_proc[col].dropna().unique())
    # Dedicated counter names: `only_in_train` / `only_in_test` hold the
    # player-id sets computed earlier in the notebook, and reusing those names
    # for integer counts here would silently overwrite them.
    n_only_train = len(train_vals - test_vals)
    n_only_test = len(test_vals - train_vals)
    print(f"{col:<30} {n_only_train:<15} {n_only_test}")

# NOTE(review): the recorded run still reports values only in test for
# `last_transfer_fee`; presumably that column is treated as numeric rather
# than categorical by the filtering step - confirm.

Feature                        Only in Train   Only in Test
------------------------------------------------------------
injury_category                0               0
market_value_category          0               0
age                            5               0
coach_id                       6               0
player_id                      2457            0
club_id                        1               0
league_id                      0               0
season_id                      0               0
injury                         95              0
last_transfer_fee              276             26
first_name                     1087            0
last_name                      2142            0
pseudonym                      106             0
position                       0               0
foot                           0               0
citizenship                    54              0
height                         6               0
club                           1             