In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

!pip install feature_engine 2>/dev/null 1>&2
!pip install fastparquet 2>/dev/null 1>&2

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random

from sklearn.preprocessing import StandardScaler
from feature_engine.wrappers import SklearnTransformerWrapper as SKWrapper
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from keras import optimizers
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, BatchNormalization

from matplotlib import pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading data**

In [None]:
INPUT = '../input/tabular-playground-series-oct-2022/'

# Column -> dtype lookups read from the helper CSVs. They are built here but
# the read_csv calls below do not pass them (the dtype= variant is disabled).
df_train_dtypes = pd.read_csv(INPUT + 'train_dtypes.csv')
df_test_dtypes = pd.read_csv(INPUT + 'test_dtypes.csv')
train_dtypes = dict(zip(df_train_dtypes.column, df_train_dtypes.dtype))
test_dtypes = dict(zip(df_test_dtypes.column, df_test_dtypes.dtype))

# Each training chunk is read from CSV, written to a gzip-compressed parquet
# file in the working directory, and then loaded back from that parquet file.
num = 5
train_list = []
for i in range(num):
    parquet_path = f'train_{i}.parquet.gzip'
    df = pd.read_csv(INPUT + f'train_{i}.csv')
    df.to_parquet(parquet_path, compression='gzip')
    print('Done with File', i)
    train_list.append(pd.read_parquet(parquet_path))

# Same CSV -> parquet -> DataFrame round trip for the test set.
dft = pd.read_csv(INPUT + 'test.csv')
dft.to_parquet('test.parquet.gzip', compression='gzip')
print('Done with File test')
df_test = pd.read_parquet('test.parquet.gzip')

# Submission template.
df_sample = pd.read_csv(INPUT + 'sample_submission.csv')

**Preprocessing data**

Thanks to @Jose Cáliz for feature engineering ideas!

In [None]:
# Keep a random subset of games from every training chunk.
# GAMES_PER_CHUNK: upper bound on the number of distinct game_num values kept.
# SAMPLE_SEED: base seed so that "Restart & Run All" selects the same games.
GAMES_PER_CHUNK = 150
SAMPLE_SEED = 42

for i in range(num):
    chunk = train_list[i]
    print(chunk.shape)
    game_ids = list(chunk.game_num.unique())
    # A dedicated, per-chunk seeded generator keeps the selection reproducible
    # without touching the global `random` state.
    rng = random.Random(SAMPLE_SEED + i)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of games actually present.
    games = rng.sample(game_ids, min(GAMES_PER_CHUNK, len(game_ids)))
    # .copy() makes the subset an independent frame, so later column
    # assignments do not operate on a view of the full chunk.
    train_list[i] = chunk[chunk.game_num.isin(games)].copy()
    print(train_list[i].shape)


In [None]:
# Build a single 3-class label per row:
#   0 -> neither team flag set, 1 -> team A flag set, 2 -> team B flag set
# (the team B indicator is remapped from 1 to 2 before the two are summed).
for i in range(num):
    chunk = train_list[i]
    chunk['label'] = (
        chunk.team_A_scoring_within_10sec
        + chunk.team_B_scoring_within_10sec.replace(1, 2)
    )
    # A bare expression inside a loop is discarded by the notebook, so the
    # class proportions are printed explicitly to make them visible.
    print(chunk.label.value_counts(normalize=True).to_frame(name='label proportion'))

In [None]:
def _ball_distance_to(frame, goal_y):
    """Return the Euclidean distance from the ball position to the point (0, goal_y)."""
    return np.sqrt(frame.ball_pos_x ** 2 + (frame.ball_pos_y - goal_y) ** 2)


# The "goal A" column measures the distance to (0, -100) and the "goal B"
# column the distance to (0, 100); both are added to every training chunk
# and to the test frame.
for i in range(num):
    train_list[i]['ball_distance_to_goal_A'] = _ball_distance_to(train_list[i], -100)
    train_list[i]['ball_distance_to_goal_B'] = _ball_distance_to(train_list[i], 100)

df_test['ball_distance_to_goal_A'] = _ball_distance_to(df_test, -100)
df_test['ball_distance_to_goal_B'] = _ball_distance_to(df_test, 100)

In [None]:
# for i in range(num):
#     train_list[i] = train_list[i].dropna()

In [None]:
# Reduced feature list; only referenced by the disabled column-subset variant.
columns_to_keep = [
    'ball_pos_y', 'ball_pos_x', 'ball_vel_y',
    'p0_pos_y', 'p1_pos_y', 'p2_pos_y',
    'p3_pos_y', 'p4_pos_y', 'p5_pos_y', 
    'ball_distance_to_goal_A', 'ball_distance_to_goal_B'
]

# Values the 'label' column can take, and the target column name for each.
label_classes = [0, 1, 2]
target_columns = ['nobody_scores', 'team_A_scores', 'team_b_scores']

# Identifier and outcome columns that must not be fed to the model.
non_feature_columns = [
    'game_num', 'event_id', 'event_time',
    'player_scoring_next', 'team_scoring_next',
    'team_A_scoring_within_10sec', 'team_B_scoring_within_10sec',
    'label',
]

target = []
train = []
for i in range(num):
    # get_dummies only creates columns for classes present in this chunk.
    # Reindexing guarantees all three columns exist (zero-filled when a class
    # is absent), so the renaming below can never fail on a column mismatch.
    # Casting to float32 avoids the pandas-version-dependent uint8/bool dtype.
    one_hot = (
        pd.get_dummies(train_list[i]['label'])
        .reindex(columns=label_classes, fill_value=0)
        .astype('float32')
    )
    one_hot.columns = target_columns
    target.append(one_hot)
    train.append(train_list[i].drop(non_feature_columns, axis=1))

test = df_test.drop(['id'], axis=1)
# The frames are later converted to plain arrays, so features are matched by
# position: force the test columns into the training column order (raises
# KeyError early if a training feature is missing from the test set).
test = test[train[0].columns]

In [None]:
# Replace every missing value with 0 in all training chunks and in the test
# frame. Disabled alternative: fillna(train[i].median()) / fillna(test.median()).
for idx, frame in enumerate(train[:num]):
    train[idx] = frame.fillna(0)

test = test.fillna(0)


In [None]:
# Display the test feature frame after missing values were replaced with 0.
test

In [None]:
# Standardise all feature columns. The wrapper applies StandardScaler to the
# listed columns and is fitted on the first training chunk only; the same
# fitted scaler is then applied to every training chunk and to the test frame.
feature_names = train[0].columns.tolist()
scaler = SKWrapper(StandardScaler(), variables=feature_names)
scaler.fit(train[0])

for idx in range(num):
    scaled_chunk = scaler.transform(train[idx])
    train[idx] = scaled_chunk

test = scaler.transform(test)

In [None]:
# X_train, X_valid, y_train, y_valid = train_test_split(train[0], target[0], test_size = 0.2, shuffle=True)

**Model definition and training**

In [None]:
def _hidden_block(units, activation, dropout_rate=None):
    """Return [Dense, BatchNormalization] plus a Dropout layer when a rate is given."""
    block = [Dense(units, activation=activation), BatchNormalization()]
    if dropout_rate is not None:
        block.append(Dropout(dropout_rate))
    return block


# Fully connected network: 256 -> 128 -> 64 -> 32 hidden units, each followed
# by batch normalisation (and dropout 0.1 on the first three), ending in a
# 3-way softmax. A wider 512-unit first block is left out, as in the original.
modelnn = Sequential(
    _hidden_block(256, "leaky_relu", 0.1)
    + _hidden_block(128, "leaky_relu", 0.1)
    + _hidden_block(64, "relu", 0.1)
    + _hidden_block(32, "relu")
    + [Dense(3, activation="softmax")]
)

opt = keras.optimizers.Adam(learning_rate=0.0001)
loss_fn = tf.keras.losses.CategoricalCrossentropy()
modelnn.compile(optimizer=opt, loss=loss_fn)

# Early-stopping callback made available to the fit calls further down.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=30,
)

In [None]:
# history = modelnn.fit(X_train, y_train, 
#                          validation_data=(X_valid, y_valid), 
#                          batch_size = 1024, 
#                          epochs=40, 
#                          callbacks=[early_stopping])

In [None]:
# plt.figure(figsize=(8,8),dpi=200)
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model train vs validation loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train','validation'], loc='upper right')
# plt.show()

In [None]:
# history = modelnn.fit(train[i], target[i], 
# #                               validation_data=(train[num-1], target[num-1]), 
#                               batch_size = 512, 
#                               epochs=8, 
#                               class_weight = {
#                                         0: 0.5,
#                                         1: 2,
#                                         2: 2
#                                         },
#                               callbacks=[early_stopping])

In [None]:
# Train incrementally: N_ROUNDS passes over the data, each pass fitting one
# epoch on every training chunk in turn.
N_ROUNDS = 8
# Per-class loss weights: class 0 is down-weighted, classes 1 and 2 up-weighted.
CLASS_WEIGHT = {0: 0.5, 1: 2, 2: 2}

# Training loss of every single-epoch fit call, in execution order.
loss_log = []
for j in range(N_ROUNDS):
    for i in range(num):
        # The EarlyStopping callback is intentionally not passed here: it
        # monitors `val_loss` by default, no validation data is supplied, and
        # each call runs a single epoch, so it could never stop training or
        # restore weights - it would only emit a warning on every call.
        history = modelnn.fit(
            train[i], target[i],
            batch_size=512,
            epochs=1,
            class_weight=CLASS_WEIGHT,
        )
        loss_log.append(history.history['loss'][-1])


**Making predictions**

In [None]:
# Run the network on the test features and wrap the three softmax outputs in
# a DataFrame that shares the test frame's index.
prediction_columns = ['nobody_scores', 'team_A_scoring_within_10sec', 'team_B_scoring_within_10sec']
raw_predictions = modelnn.predict(test)
preds = pd.DataFrame(raw_predictions, index=test.index, columns=prediction_columns)
# preds = np.round(preds).astype(int)

In [None]:
# Display the predicted class probabilities (softmax outputs) for the test rows.
preds

In [None]:
# Copy the two team-scoring probability columns into the submission template.
submission_columns = ['team_A_scoring_within_10sec', 'team_B_scoring_within_10sec']

# The assignment below aligns rows by index; a length or index mismatch would
# silently produce NaN in the submission, so fail loudly instead.
assert len(preds) == len(df_sample), "prediction count does not match the submission template"
df_sample[submission_columns] = preds[submission_columns]
assert not df_sample[submission_columns].isna().any().any(), "submission contains missing predictions"

In [None]:
# Write the filled-in submission template to the working directory, without the index column.
df_sample.to_csv(path_or_buf='submission.csv', index=False)