# Codecup Competition Series 7: Data Science | Part 3

## Author: Arash Hajian nezhad
---

#### Imports


In [None]:
import numpy as np
import pandas as pd
import optuna

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool, cv

#### Data loading

In [None]:
# Read the training set, then show the distinct raw outcome labels and a preview of the rows.
TRAIN_CSV_PATH = 'data/tsubasa/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

outcome_labels = df['outcome'].unique()
print(outcome_labels)

preview = df.head()
print(preview)

#### Encode outcome column values:
- 0 -> (not goal)
- 1 -> (goal)

In [None]:
# Collapse the raw outcome labels into a binary target: 1 = goal, 0 = not goal.
goal_labels = ['گُل', 'گُل به خودی']
no_goal_labels = [
    'مهار توسط دروازه بان',
    'موقعیت از دست رفته',
    'برخورد به دفاع',
    'برخورد به تیردروازه',
]
replace_dict = {
    **dict.fromkeys(goal_labels, 1),
    **dict.fromkeys(no_goal_labels, 0),
}
df.replace({'outcome': replace_dict}, inplace=True)
df.head()

#### Feature Engineering: Creating spatial categories based on (x, y)

In [None]:
def engineer_shooting_position(df: pd.DataFrame, offset_from_end: int) -> pd.DataFrame:
    """Add a categorical ``shootingPosition`` column derived from ``x`` and ``y``.

    Each row is assigned the label of the first zone (in the order listed
    below) whose condition it satisfies. Rows matching no zone get the
    string ``'0'``.

    The frame is modified in place and also returned.

    Args:
        df: frame containing numeric ``x`` and ``y`` columns.
        offset_from_end: where to place the new column, counted from the end
            of the frame *while the new column is still the last one*:
            ``1`` leaves it last, ``2`` puts it right before the column that
            was last on entry (e.g. the label column).

    Returns:
        The same ``df`` object, with ``shootingPosition`` inserted.
    """
    x, y = df['x'], df['y']

    # (label, condition) pairs; np.select picks the FIRST matching condition.
    zones = [
        # The penalty spot lies inside the pos_3 rectangle, so it has to be
        # tested before it; otherwise 'pos_5' could never be assigned.
        ('pos_5', (x == 11) & (y == 0)),
        ('pos_1', (x <= 5.5) & (y >= 0) & (y <= 5.5)),
        ('pos_2', (x <= 5.5) & (y < 0) & (y >= -5.5)),
        ('pos_3', (x > 5.5) & (x <= 16.5) & (y >= 0) & (y <= 20.15)),
        ('pos_4', (x > 5.5) & (x <= 16.5) & (y < 0) & (y >= -20.15)),
        ('pos_6', (x >= 0) & (x <= 27) & (y >= 0)),
        ('pos_7', (x >= 0) & (x <= 27) & (y < 0)),
        ('pos_8', (x > 27) & (y >= 0)),
        ('pos_9', (x > 27) & (y < 0)),
    ]
    positions = [label for label, _ in zones]
    conditions = [condition for _, condition in zones]

    # An explicit string default keeps the result a pure string column; the
    # implicit integer default (0) cannot be combined with string choices on
    # newer NumPy versions. '0' matches what the integer default used to
    # become after string conversion.
    df['shootingPosition'] = np.select(conditions, positions, default='0')

    # move the column before the label column (for catboost categorical column specification)
    # The target index is computed while 'shootingPosition' is still counted
    # among the columns, i.e. before it is popped.
    insert_at = len(df.columns) - offset_from_end
    df.insert(insert_at, 'shootingPosition', df.pop('shootingPosition'))

    return df

# An offset of 2 places the new column second-to-last, i.e. immediately before
# the column that is currently last (expected to be the label).
df = engineer_shooting_position(df, offset_from_end=2)

#### Drop the id columns and the _now_ redundant (x, y), minute and second columns

In [None]:
# Remove the identifier columns, the timing columns and the raw coordinates in place.
columns_to_discard = ['matchId', 'playerId', 'second', 'x', 'y', 'minute']
df.drop(columns=columns_to_discard, inplace=True)

#### Fill `nan` values inside `interferenceOnShooter` column

In [None]:
# Replace missing `interferenceOnShooter` values with a placeholder category.
# The result is assigned back to the column: calling `fillna(..., inplace=True)`
# on `df[...]` operates on a temporary selection, which pandas warns about and
# which no longer updates `df` once Copy-on-Write is active.
df['interferenceOnShooter'] = df['interferenceOnShooter'].fillna('نامشخص')

#### Specifying categorical indices for use in CatBoost

In [None]:
# Positional indices of the feature columns that are handed to CatBoost as categorical
# (passed as `cat_features` when the data pools are built).
# NOTE(review): these positions are hard-coded — they presumably refer to the column order
# of the feature frame after the drops and the `shootingPosition` insertion; confirm against
# the actual columns whenever the preprocessing changes.
categorical_features_indices = (0, 1, 4, 5)

#### Label separation & Data splitting for training & validation

In [None]:
# Separate the features from the `outcome` label.
X = df.drop('outcome', axis=1)
y = df['outcome']

# Hold out 15% of the rows for validation. The seed is fixed so the split
# (and therefore the validation scores) is the same on every run.
SPLIT_SEED = 42
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.85, shuffle=True, random_state=SPLIT_SEED
)

#### `CatBoost` model hyperparameter tuning using `Optuna`

In [None]:
def optuna_objective(trial):
    """Score one Optuna trial by cross-validated CatBoost accuracy.

    Samples a set of CatBoost hyperparameters from ``trial``, runs
    ``catboost.cv`` on the module-level ``X`` / ``y`` data (using
    ``categorical_features_indices`` for the categorical columns) and reports
    the best mean test accuracy found in the CV results.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The maximum of the ``test-Accuracy-mean`` column of the CV results.
    """
    hyperparameters = {
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 2, 50, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),  # can't use with GPU training / CPU only
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),

        # constant variables
        'iterations': 500,
        'verbose': False,
        'eval_metric': 'Accuracy',
        'use_best_model': True,
        # `objective` is an alias of `loss_function`, so only one of them is set
        # (same key as the final cross-validation settings use).
        'loss_function': 'Logloss',
        # NOTE: 'task_type': 'GPU' is intentionally left out — can't use with cross-validation
    }

    # Some parameters are only valid for a specific bootstrap type.
    if hyperparameters['bootstrap_type'] == 'Bayesian':
        hyperparameters['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif hyperparameters['bootstrap_type'] == 'Bernoulli':
        hyperparameters['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    # The parameter dict is passed to `cv` directly; no estimator instance is
    # needed just to read the same parameters back.
    cv_data = cv(
        Pool(X, y, cat_features=categorical_features_indices),
        hyperparameters,
        logging_level='Silent',
        plot=False,
    )

    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    return best_accuracy  # as we use optuna's `maximize` direction


# Run the hyperparameter search; the objective returns an accuracy, hence `maximize`.
N_TRIALS = 2
study = optuna.create_study(direction='maximize')
study.optimize(func=optuna_objective, n_trials=N_TRIALS)

#### Getting optimization results

In [None]:
# Count only the trials that ran to completion: `study.trials` also contains
# trials in other states (e.g. failed or pruned), which would overstate the
# number reported as "successfully finished".
completed_trials = study.get_trials(
    deepcopy=False,
    states=(optuna.trial.TrialState.COMPLETE,),
)
print('Number of Trials successfully finished: ', len(completed_trials))

# Best trial according to the study direction (highest CV accuracy).
best_trial = study.best_trial
print('Best Trial\'s score:', best_trial.value)

# Tuned hyperparameters of the best trial; reused to build the final model.
best_hyperparams = best_trial.params
print('Best hyperparameters:', best_hyperparams)

#### Cross-Validation using the best hyperparameters obtained from `Optuna`

In [None]:
# Add the fixed (non-tuned) settings on top of the tuned hyperparameters.
fixed_settings = dict(
    iterations=500,
    verbose=False,
    eval_metric='Accuracy',
    use_best_model=True,
    loss_function='Logloss',
)
best_hyperparams.update(fixed_settings)

# The classifier built here is the one that gets fitted afterwards.
model = CatBoostClassifier(**best_hyperparams)

# Cross-validate the chosen configuration on the complete labelled data.
full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, model.get_params(), plot=False)

#### Creating data pools and Training the model

In [None]:
# Both pools share the same categorical column specification.
pool_options = {'cat_features': categorical_features_indices}
train_pool = Pool(X_train, y_train, **pool_options)
validation_pool = Pool(X_validation, y_validation, **pool_options)

# Fit on the training pool, evaluating on the held-out validation pool.
model.fit(train_pool, eval_set=validation_pool, plot=False)

#### Generating output for the competition

In [None]:
# Load test (competition) dataset
comp_df = pd.read_csv('data/tsubasa/test.csv')

# Rearrange columns for the same representation as the train dataset
# NOTE(review): both moved columns are dropped again below, so this only
# affects the intermediate layout — confirm whether it is still needed.
comp_df.insert(8, 'minute', comp_df.pop('minute'))
comp_df.insert(8, 'second', comp_df.pop('second'))

# Apply same feature engineering as the training phase
# (offset 1: there is no label column here, so the new column stays last)
comp_df = engineer_shooting_position(comp_df, 1)

# drop unwanted columns
comp_df.drop(['x', 'y', 'second', 'minute'], axis=1, inplace=True)

# Fill `nan` values inside `interferenceOnShooter` column.
# Assigned back rather than filled in place on the selected column, which
# would not update `comp_df` once pandas Copy-on-Write is active.
comp_df['interferenceOnShooter'] = comp_df['interferenceOnShooter'].fillna('نامشخص')

# Get prediction: keep the second probability column (index 1) of each row
prediction_probs = model.predict_proba(comp_df)
prediction = pd.DataFrame({'prediction': prediction_probs[:, 1]})

# Saving the output
prediction.to_csv('question_3fe_output.csv', index=False)