# Podejście klasyczne ML - zbudowanie abetki

In [1]:
import pandas as pd
import json
from pyspark.sql import functions as F
from pyspark.sql.functions import col


In [2]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder \
    .appName("xT_preprocessing") \
    .config("spark.driver.memory", "8g") \
    .getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [3]:
# Load data
df_raw = spark.read.parquet('data/all_events_Italy_2015_2016.parquet')

# Basic info
print(f"Total events: {df_raw.count():,}")
print(f"Columns: {len(df_raw.columns)}")

Total events: 1,353,739
Columns: 116


In [4]:
final_columns = [
    # === CORE ===
    'id', 'match_id', 'possession', 'possession_team_id', 'team_id',
    'minute', 'second', 'type', 'duration',
    'location',
    'player_id', 'under_pressure', 'play_pattern',
    
    # === PASS ===
    'pass_end_location', 'pass_length', 'pass_angle',
    'pass_height', 'pass_body_part', 'pass_type',
    'pass_cross', 'pass_switch', 'pass_through_ball',
    'pass_shot_assist',
    
    # === SHOT ===
    'shot_outcome',
    
    # === CARRY ===
    'carry_end_location'
]

In [5]:
df = df_raw.select(final_columns)
print(df.count())
print(len(df.columns))

1353739
25


In [6]:
# Filter movement events only
movement_types = ['Pass', 'Carry', 'Shot']

df = df.filter(
    df['type'].isin(movement_types)
)

df = df.filter(
    (F.col("type") != "Shot") | 
    (F.col("shot_type").isNull()) |
    (F.col("shot_type") != "Penalty")
)

print(df.count())
print(len(df.columns))

# Show distribution
print("\nEvent type distribution:")
df.groupBy('type').count().orderBy('count', ascending=False).show()

664364
25

Event type distribution:
+-----+------+
| type| count|
+-----+------+
| Pass|373667|
|Carry|280820|
| Shot|  9877|
+-----+------+



In [7]:
from pyspark.sql import functions as F

# PASS features - tylko dla Pass events
print("=== PASS FEATURES (% of Pass events) ===")
total_passes = df.filter(F.col('type') == 'Pass').count()

pass_features = ['pass_cross', 'pass_switch', 'pass_through_ball', 'pass_shot_assist',
                 'pass_height', 'pass_body_part', 'pass_type']

for col in pass_features:
    print(f"\n--- {col} ---")
    df.filter(F.col('type') == 'Pass') \
        .groupBy(col) \
        .agg(F.count('*').alias('count')) \
        .withColumn('percent', F.round(F.col('count') / total_passes * 100, 2)) \
        .orderBy(F.desc('count')) \
        .show(truncate=False)

=== PASS FEATURES (% of Pass events) ===

--- pass_cross ---
+----------+------+-------+
|pass_cross|count |percent|
+----------+------+-------+
|NULL      |364015|97.42  |
|true      |9652  |2.58   |
+----------+------+-------+


--- pass_switch ---
+-----------+------+-------+
|pass_switch|count |percent|
+-----------+------+-------+
|NULL       |362460|97.0   |
|true       |11207 |3.0    |
+-----------+------+-------+


--- pass_through_ball ---
+-----------------+------+-------+
|pass_through_ball|count |percent|
+-----------------+------+-------+
|NULL             |372085|99.58  |
|true             |1582  |0.42   |
+-----------------+------+-------+


--- pass_shot_assist ---
+----------------+------+-------+
|pass_shot_assist|count |percent|
+----------------+------+-------+
|NULL            |367140|98.25  |
|true            |6527  |1.75   |
+----------------+------+-------+


--- pass_height ---
+-----------+------+-------+
|pass_height|count |percent|
+-----------+------+------

In [8]:
from pyspark.sql import functions as F


# 1. Usuń kolumny
df = df.drop('pass_cross', 'pass_switch', 'pass_through_ball', 'pass_type')

# 2. Pass body part → 3 kategorie (Right Foot, Left Foot, Other)
df = df.withColumn('pass_body_part',
    F.when(F.col('pass_body_part') == 'Right Foot', 'Right Foot')
     .when(F.col('pass_body_part') == 'Left Foot', 'Left Foot')
     .when(F.col('pass_body_part').isNotNull(), 'Other')  # Head, Keeper Arm, etc.
     .otherwise(None))  # NULL dla non-Pass

# 3. Shot outcome → Goal/No Goal
df = df.withColumn('shot_outcome',
    F.when(F.col('shot_outcome') == 'Goal', 'Goal')
     .when(F.col('shot_outcome').isNotNull(), 'No Goal')
     .otherwise(None))


# 6. Pass shot assist - null, No, Yes
df = df.withColumn('pass_shot_assist',
    F.when(F.col('type') == 'Pass', 
           F.when(F.col('pass_shot_assist') == 'true', 'Yes').otherwise('No'))
     .otherwise(None))

# Weryfikacja
print("=== PASS BODY PART ===")
df.filter(F.col('type') == 'Pass').groupBy('pass_body_part').count().show()

print("=== SHOT OUTCOME ===")
df.filter(F.col('type') == 'Shot').groupBy('shot_outcome').count().show()

=== PASS BODY PART ===
+--------------+------+
|pass_body_part| count|
+--------------+------+
|    Right Foot|219909|
|          NULL| 26631|
|     Left Foot|103728|
|         Other| 23399|
+--------------+------+

=== SHOT OUTCOME ===
+------------+-----+
|shot_outcome|count|
+------------+-----+
|        Goal|  858|
|     No Goal| 9019|
+------------+-----+



In [9]:
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

# Extract X, Y z location
df = df.withColumn('start_x', F.col('location')[0].cast(FloatType())) \
       .withColumn('start_y', F.col('location')[1].cast(FloatType()))

# Przelicz start_x i start_y jeśli team != possession_team
df = df.withColumn('start_x',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 120 - F.col('start_x'))
         .otherwise(F.col('start_x')),
        1
    ))
df = df.withColumn('start_y',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 80 - F.col('start_y'))
         .otherwise(F.col('start_y')),
        1
    ))

# Extract end_x, end_y z pass_end_location
df = df.withColumn('pass_end_x', F.col('pass_end_location')[0].cast(FloatType())) \
       .withColumn('pass_end_y', F.col('pass_end_location')[1].cast(FloatType()))

# Przelicz pass_end_x i pass_end_y jeśli team != possession_team
df = df.withColumn('pass_end_x',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 120 - F.col('pass_end_x'))
         .otherwise(F.col('pass_end_x')),
        1
    ))
df = df.withColumn('pass_end_y',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 80 - F.col('pass_end_y'))
         .otherwise(F.col('pass_end_y')),
        1
    ))

# Extract end_x, end_y z carry_end_location
df = df.withColumn('carry_end_x', F.col('carry_end_location')[0].cast(FloatType())) \
       .withColumn('carry_end_y', F.col('carry_end_location')[1].cast(FloatType()))

# Przelicz carry_end_x i carry_end_y jeśli team != possession_team
df = df.withColumn('carry_end_x',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 120 - F.col('carry_end_x'))
         .otherwise(F.col('carry_end_x')),
        1
    ))
df = df.withColumn('carry_end_y',
    F.round(
        F.when(F.col('team_id') != F.col('possession_team_id'), 80 - F.col('carry_end_y'))
         .otherwise(F.col('carry_end_y')),
        1
    ))

# Utwórz unified end_x, end_y (pass lub carry)
df = df.withColumn('end_x',
    F.when(F.col('type') == 'Pass', F.col('pass_end_x'))
     .when(F.col('type') == 'Carry', F.col('carry_end_x'))
     .otherwise(None))
df = df.withColumn('end_y',
    F.when(F.col('type') == 'Pass', F.col('pass_end_y'))
     .when(F.col('type') == 'Carry', F.col('carry_end_y'))
     .otherwise(None))

# Drop oryginalne i pomocnicze
df = df.drop('location', 'pass_end_location', 'carry_end_location',
             'pass_end_x', 'pass_end_y', 'carry_end_x', 'carry_end_y')

In [10]:
from pyspark.sql.functions import col

df.select("type", "team_id", "possession_team_id", "start_x", "start_y", "end_x", "end_y").filter((col('match_id')==3879863) &(col('possession')==2)).orderBy("index").show()

+-----+-------+------------------+-------+-------+-----+-----+
| type|team_id|possession_team_id|start_x|start_y|end_x|end_y|
+-----+-------+------------------+-------+-------+-----+-----+
| Pass|    228|               228|   61.0|   40.1| 58.7| 42.1|
|Carry|    228|               228|   58.7|   42.1| 57.6| 42.1|
| Pass|    228|               228|   57.6|   42.1| 35.5| 46.4|
|Carry|    228|               228|   35.5|   46.4| 37.3| 46.2|
| Pass|    228|               228|   37.3|   46.2| 78.1| 10.2|
| Pass|    230|               228|   78.0|   10.1| 67.3| 40.3|
| Pass|    230|               228|   66.7|   40.3| 66.0| 46.5|
|Carry|    228|               228|   66.1|   46.6| 70.4| 60.9|
|Carry|    228|               228|   70.4|   60.9| 70.4| 61.6|
|Carry|    228|               228|   71.1|   75.7| 76.0| 75.7|
| Pass|    228|               228|   76.0|   75.7| 84.7| 71.2|
|Carry|    228|               228|   84.7|   71.2| 81.5| 71.0|
| Pass|    228|               228|   81.5|   71.0| 65.5

In [11]:
df = df.withColumn('time_seconds', 
                   F.col('minute') * 60 + F.col('second'))

In [12]:
df.columns

['id',
 'match_id',
 'possession',
 'possession_team_id',
 'team_id',
 'minute',
 'second',
 'type',
 'duration',
 'player_id',
 'under_pressure',
 'play_pattern',
 'pass_length',
 'pass_angle',
 'pass_height',
 'pass_body_part',
 'pass_shot_assist',
 'shot_outcome',
 'start_x',
 'start_y',
 'end_x',
 'end_y',
 'time_seconds']

In [13]:
# Sprawdź rozkład goli
print("=== ROZKŁAD GOLI (event-level) ===")
df.groupBy('shot_outcome').count() \
    .withColumn('percent', F.round(F.col('count') / df.count() * 100, 2)) \
    .show()

=== ROZKŁAD GOLI (event-level) ===
+------------+------+-------+
|shot_outcome| count|percent|
+------------+------+-------+
|        Goal|   858|   0.13|
|        NULL|654487|  98.51|
|     No Goal|  9019|   1.36|
+------------+------+-------+



In [15]:
from pyspark.sql import Window

# Definiuj okno dla akcji (match_id + possession)
window_action = Window.partitionBy('match_id', 'possession')

# Stwórz action_result: 1 jeśli w akcji jest Shot z Goal, 0 w przeciwnym
df = df.withColumn('action_result',
    F.max(
        F.when(
            (F.col('type') == 'Shot') & (F.col('shot_outcome') == 'Goal'),
            1
        ).otherwise(0)
    ).over(window_action)
)

# Weryfikacja
print("=== ROZKŁAD action_result ===")
df.groupBy('action_result').count() \
    .withColumn('percent', F.round(F.col('count') / df.count() * 100, 2)) \
    .orderBy('action_result') \
    .show()

=== ROZKŁAD action_result ===
+-------------+------+-------+
|action_result| count|percent|
+-------------+------+-------+
|            0|654173|  98.47|
|            1| 10191|   1.53|
+-------------+------+-------+



In [23]:
from pyspark.sql import functions as F

# Krok 1: Usuń strzały z dataframe'a
print("=== PRZED USUNIĘCIEM STRZAŁÓW ===")
print(f"Liczba wierszy: {df.count()}")
df.groupBy('type').count().orderBy(F.desc('count')).show(10)

df = df.filter(~F.col('type').isin(['Shot', 'Carry']))

print("\n=== PO USUNIĘCIU STRZAŁÓW ===")
print(f"Liczba wierszy: {df.count()}")

# Krok 2: Sprawdź aktualny rozkład action_result
print("\n=== ROZKŁAD action_result PRZED UNDERSAMPLING ===")
df.groupBy('action_result').count() \
    .withColumn('percent', F.round(F.col('count') / df.count() * 100, 2)) \
    .orderBy('action_result') \
    .show()

# Krok 3: Undersampling - action_result==1 ma stanowić 5%
# Jeśli action_result==1 to 5%, to action_result==0 to 95%
# Stosunek: 0.95 / 0.05 = 19, czyli na każdy gol mamy 19 nie-goli

# Parametr: docelowy procent goli
target_goal_percentage = 0.05  # 5%

df_goals = df.filter(F.col('action_result') == 1)
df_non_goals = df.filter(F.col('action_result') == 0)

count_goals = df_goals.count()
count_non_goals = df_non_goals.count()

print(f"\nLiczba akcji z golem: {count_goals}")
print(f"Liczba akcji bez gola: {count_non_goals}")

# Oblicz ile nie-goli potrzebujemy na podstawie target_goal_percentage
# Jeśli gole mają być X%, to: goals / (goals + non_goals) = X
# Przekształcając: non_goals = goals * (1 - X) / X
target_non_goals = int(count_goals * (1 - target_goal_percentage) / target_goal_percentage)

print(f"Docelowa liczba akcji bez gola (dla {target_goal_percentage*100}% goli): {target_non_goals}")

# Fraction dla sampla
fraction = target_non_goals / count_non_goals

print(f"Fraction do sampla: {fraction:.4f}")

# Sample nie-goli
df_non_goals_sampled = df_non_goals.sample(withReplacement=False, fraction=fraction, seed=42)

# Połącz z powrotem
df = df_goals.union(df_non_goals_sampled)

# Weryfikacja
print("\n=== ROZKŁAD action_result PO UNDERSAMPLING ===")
df.groupBy('action_result').count() \
    .withColumn('percent', F.round(F.col('count') / df.count() * 100, 2)) \
    .orderBy('action_result') \
    .show()

print(f"\nFinalna liczba wierszy: {df.count()}")

=== PRZED USUNIĘCIEM STRZAŁÓW ===
Liczba wierszy: 184445
+-----+------+
| type| count|
+-----+------+
| Pass|104932|
|Carry| 79513|
+-----+------+


=== PO USUNIĘCIU STRZAŁÓW ===
Liczba wierszy: 104932

=== ROZKŁAD action_result PRZED UNDERSAMPLING ===
+-------------+-----+-------+
|action_result|count|percent|
+-------------+-----+-------+
|            0|99926|  95.23|
|            1| 5006|   4.77|
+-------------+-----+-------+


Liczba akcji z golem: 5006
Liczba akcji bez gola: 99926
Docelowa liczba akcji bez gola (dla 5.0% goli): 95113
Fraction do sampla: 0.9518

=== ROZKŁAD action_result PO UNDERSAMPLING ===
+-------------+-----+-------+
|action_result|count|percent|
+-------------+-----+-------+
|            0|95152|   95.0|
|            1| 5006|    5.0|
+-------------+-----+-------+


Finalna liczba wierszy: 100158


In [24]:
from pyspark.sql import functions as F

# Stwórz zmienną speed jako pass_length / duration
df = df.withColumn('speed', 
    F.when(
        (F.col('duration').isNotNull()) & (F.col('duration') != 0),
        F.round(F.col('pass_length') / F.col('duration'), 2)
    ).otherwise(None)
)

# Weryfikacja
print("=== STATYSTYKI SPEED ===")
df.select('speed').describe().show()

print("\n=== PRZYKŁADOWE WARTOŚCI ===")
df.select('type', 'pass_length', 'duration', 'speed') \
    .filter(F.col('speed').isNotNull()) \
    .show(20)

print("\n=== LICZBA NULL W SPEED ===")
df.select(
    F.count('*').alias('total'),
    F.count('speed').alias('speed_not_null'),
    (F.count('*') - F.count('speed')).alias('speed_null')
).show()

=== STATYSTYKI SPEED ===
+-------+------------------+
|summary|             speed|
+-------+------------------+
|  count|            100084|
|   mean|14.475953598976863|
| stddev| 39.71921723317651|
|    min|               0.0|
|    max|           9503.61|
+-------+------------------+


=== PRZYKŁADOWE WARTOŚCI ===
+----+-----------+--------+-----+
|type|pass_length|duration|speed|
+----+-----------+--------+-----+
|Pass|  23.494041|1.506628|15.59|
|Pass|   9.767292|2.148314| 4.55|
|Pass|   13.82787|1.573594| 8.79|
|Pass|  25.150944|2.228945|11.28|
|Pass|  2.7294688|0.159632| 17.1|
|Pass|       37.4|1.764948|21.19|
|Pass|  21.140009|1.941812|10.89|
|Pass|  10.575916| 1.31439| 8.05|
|Pass|   17.10117|1.521391|11.24|
|Pass|  10.270346| 1.10426|  9.3|
|Pass|  18.866373|1.008285|18.71|
|Pass|  12.414508| 0.90852|13.66|
|Pass|   30.73272|2.623931|11.71|
|Pass|  13.200378| 1.13906|11.59|
|Pass|     10.002|0.998102|10.02|
|Pass|  26.239094|1.231181|21.31|
|Pass|  47.420353|2.168334|21.87|
|Pa

In [25]:
df.columns

['id',
 'match_id',
 'possession',
 'possession_team_id',
 'team_id',
 'minute',
 'second',
 'type',
 'duration',
 'player_id',
 'under_pressure',
 'play_pattern',
 'pass_length',
 'pass_angle',
 'pass_height',
 'pass_body_part',
 'pass_shot_assist',
 'shot_outcome',
 'start_x',
 'start_y',
 'end_x',
 'end_y',
 'time_seconds',
 'action_result',
 'speed']

In [26]:
from pyspark.sql import functions as F

# 2. Konwertuj do pandas
df_pd = df.toPandas()
print(f"\nPandas rows: {len(df_pd)}")
print(f"Pandas columns: {len(df_pd.columns)}")

# 3. Zdefiniuj kolumny do usunięcia z X
exclude_cols = [
    'match_id', 'possession', 'id',
 'possession_team_id',
 'team_id',
 'minute',
 'second',
 'player_id',
'time_seconds',
    'action_result',
    'shot_outcome'
]

# 4. X i y
feature_cols = [col for col in df_pd.columns if col not in exclude_cols]
X = df_pd[feature_cols].copy()
y = df_pd['action_result']

print("\n=== X FEATURES ===")
print(f"Number of features: {len(feature_cols)}")
print(feature_cols)

print("\n=== y TARGET ===")
print(y.value_counts())
print(f"Goal %: {y.mean()*100:.2f}%")

print("\n=== X DTYPES ===")
print(X.dtypes)


Pandas rows: 100282
Pandas columns: 25

=== X FEATURES ===
Number of features: 14
['type', 'duration', 'under_pressure', 'play_pattern', 'pass_length', 'pass_angle', 'pass_height', 'pass_body_part', 'pass_shot_assist', 'start_x', 'start_y', 'end_x', 'end_y', 'speed']

=== y TARGET ===
action_result
0    95276
1     5006
Name: count, dtype: int64
Goal %: 4.99%

=== X DTYPES ===
type                 object
duration            float64
under_pressure       object
play_pattern         object
pass_length         float64
pass_angle          float64
pass_height          object
pass_body_part       object
pass_shot_assist     object
start_x             float32
start_y             float32
end_x               float32
end_y               float32
speed               float64
dtype: object


In [19]:
# pip install xgboost

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import pandas as pd

# 1. Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train size: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test size: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")
print(f"Train goal %: {y_train.mean()*100:.2f}%")
print(f"Test goal %: {y_test.mean()*100:.2f}%")

# 2. Label encode categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
label_encoders = {}

X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

for col in categorical_cols:
    le = LabelEncoder()
    # Fit na train, transform na train i test
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))
    X_test_encoded[col] = le.transform(X_test[col].astype(str))
    label_encoders[col] = le

print(f"\nEncoded {len(categorical_cols)} categorical columns")

# 3. Handle NaN w numeric columns
X_train_encoded = X_train_encoded.fillna(-999)
X_test_encoded = X_test_encoded.fillna(-999)

# 4. Basic XGBoost
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),  # Handle imbalance
    random_state=42,
    eval_metric='logloss',
    importance_type='gain'
)

print("\n=== TRAINING ===")
model.fit(X_train_encoded, y_train)

# 5. Predictions
y_pred = model.predict(X_test_encoded)
y_pred_proba = model.predict_proba(X_test_encoded)[:, 1]

# 6. Evaluation
print("\n=== CLASSIFICATION REPORT ===")
print(classification_report(y_test, y_pred))

print("\n=== CONFUSION MATRIX ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== ROC AUC ===")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

# 7. Feature importance
print("\n=== TOP 15 FEATURE IMPORTANCE ===")
feature_importance = pd.DataFrame({
    'feature': X_train_encoded.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance)

Train size: 80225 (80.0%)
Test size: 20057 (20.0%)
Train goal %: 4.99%
Test goal %: 4.99%

Encoded 6 categorical columns

=== TRAINING ===

=== CLASSIFICATION REPORT ===
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     19056
           1       0.08      0.50      0.14      1001

    accuracy                           0.69     20057
   macro avg       0.52      0.60      0.48     20057
weighted avg       0.92      0.69      0.78     20057


=== CONFUSION MATRIX ===
[[13433  5623]
 [  504   497]]

=== ROC AUC ===
ROC AUC Score: 0.6473

=== TOP 15 FEATURE IMPORTANCE ===
             feature  importance
6        pass_height    0.133404
9            start_x    0.108970
11             end_x    0.101835
3       play_pattern    0.094187
12             end_y    0.076733
8   pass_shot_assist    0.074136
1           duration    0.071157
10           start_y    0.069923
5         pass_angle    0.058283
13             speed    0.058211
4        

In [21]:
# 1. Sprawdź korelację features z targetem
correlations = X_train_encoded.corrwith(y_train).abs().sort_values(ascending=False)
print("Top 10 correlations with target:")
print(correlations)

Top 10 correlations with target:
end_x               0.080268
start_x             0.079393
pass_length         0.011389
pass_shot_assist    0.011366
type                0.011143
pass_angle          0.011140
play_pattern        0.010633
end_y               0.009854
start_y             0.009821
duration            0.009257
pass_height         0.005207
pass_body_part      0.002827
under_pressure      0.001071
dtype: float64


In [22]:
# 1. Czy shot_technique i shot_body_part to leakage?
# Te zmienne są tylko dla Shot events, więc jeśli type != Shot, to NULL
print("=== SHOT FEATURES vs TYPE ===")
print(f"Non-null shot_technique: {X_train['shot_technique'].notna().sum()}")
print(f"Non-null shot_body_part: {X_train['shot_body_part'].notna().sum()}")
print(f"Type == Shot: {(X_train['type'] == 'Shot').sum()}")

# 2. Czy end_x, end_y są tylko dla Pass/Carry?
print("\n=== END LOCATION vs TYPE ===")
print(f"Non-null end_x: {X_train['end_x'].notna().sum()}")
print(f"Type Pass or Carry: {X_train['type'].isin(['Pass', 'Carry']).sum()}")

# 3. Sprawdź korelację shot_technique z targetem
print("\n=== SHOT_TECHNIQUE (encoded) vs TARGET ===")
print(pd.crosstab(y_train, X_train_encoded['shot_technique']))

=== SHOT FEATURES vs TYPE ===


KeyError: 'shot_technique'