In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Sequential, layers, Input
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory,
# then print one path per line (same traversal order as os.walk).
input_paths = [
    os.path.join(current_dir, file_name)
    for current_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dcic-ia-2021-proyecto/sample.csv
/kaggle/input/dcic-ia-2021-proyecto/train.csv
/kaggle/input/dcic-ia-2021-proyecto/test.csv


In [2]:


# Load the training dataset.
df_training = pd.read_csv('../input/dcic-ia-2021-proyecto/train.csv')

# Fill the missing dribbling values with the mean of their own column.
mean_home_dribbling = df_training['home_team_buildUpPlayDribbling'].mean()
mean_away_dribbling = df_training['away_team_buildUpPlayDribbling'].mean()
df_training = df_training.fillna({
    'home_team_buildUpPlayDribbling': mean_home_dribbling,
    'away_team_buildUpPlayDribbling': mean_away_dribbling,
})

# Drop the columns that are not used as features.
cols_eliminar = ["id", "country", "league", "season", "home_team", "away_team"]
df_training = df_training.drop(columns=cols_eliminar)

# Nominal categories: each one is expanded into dummy (one-hot) columns.
nominal_columns = [
    'home_team_buildUpPlayPositioningClass',
    'away_team_buildUpPlayPositioningClass',
    'home_team_chanceCreationPositioningClass',
    'away_team_chanceCreationPositioningClass',
    'home_team_defenceDefenderLineClass',
    'away_team_defenceDefenderLineClass',
]
# Prefix of the dummy columns generated for each nominal column (same order).
dummy_prefixes = [
    'home_buildUpPlay',
    'away_buildUpPlay',
    'home_chanceCreation',
    'away_chanceCreation',
    'home_defence',
    'away_defence',
]
for nominal_column, dummy_prefix in zip(nominal_columns, dummy_prefixes):
    dummies = pd.get_dummies(df_training[nominal_column], prefix=dummy_prefix)
    df_training = df_training.join(dummies)
# The original nominal columns are redundant once the dummies exist.
df_training = df_training.drop(columns=nominal_columns)

# Ordinal categories: each label is mapped to an integer rank.
speed_class_map = {'Slow': 0, 'Balanced': 1, 'Fast': 2}
shooting_class_map = {'Little': 0, 'Normal': 1, 'Lots': 2}
width_class_map = {'Narrow': 0, 'Normal': 1, 'Wide': 2}

ordinal_maps = {
    'buildUpPlaySpeedClass': speed_class_map,
    'chanceCreationShootingClass': shooting_class_map,
    'defenceTeamWidthClass': width_class_map,
}
for class_suffix, class_map in ordinal_maps.items():
    for side in ('home', 'away'):
        ordinal_column = f'{side}_team_{class_suffix}'
        df_training[ordinal_column] = df_training[ordinal_column].map(class_map)

In [3]:
# Pull the target column out into its own array before removing it from the features.
y = df_training['match_result'].to_numpy()
separator = "-" * 30
print("Etiquetas: {}".format(y))
print(separator)

# From here on df_training holds only feature columns.
df_training = df_training.drop(columns=['match_result'])

# One-hot encode the labels; fit_transform learns the categories and returns the encoded data.
enc = OneHotEncoder(handle_unknown='ignore')
y_column = y.reshape(-1, 1)
y_enc = enc.fit_transform(y_column).toarray()

print("Etiquetas (one-hot encoding):\n{}".format(y_enc))
print(separator)

# Show which one-hot vector the encoder assigned to each label.
print("Mapeo del encoder:")
known_labels = enc.categories_[0]
known_labels_encoded = enc.transform(known_labels.reshape(-1, 1)).toarray()
for label, one_hot_row in zip(known_labels.tolist(), known_labels_encoded):
    print("Label: {} ==> Codificación One-Hot ==> {}".format(label, one_hot_row))

Etiquetas: [ 1 -1 -1 ...  1  1  0]
------------------------------
Etiquetas (one-hot encoding):
[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]]
------------------------------
Mapeo del encoder:
Label: -1 ==> Codificación One-Hot ==> [1. 0. 0.]
Label: 0 ==> Codificación One-Hot ==> [0. 1. 0.]
Label: 1 ==> Codificación One-Hot ==> [0. 0. 1.]


In [4]:
# Fraction of the examples held out for validation.
test_size = 0.15

# Fixed random_state so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    df_training,
    y_enc,
    test_size=test_size,
    random_state=10,
)

# Report (rows, columns) of each partition.
train_report = "Datos de entrenamiento:\n\tCantidad de ejemplos (filas): {}\n\tCantidad de atributos (columnas): {}\n"
val_report = "Datos de validación:\n\tCantidad de ejemplos (filas): {}\n\tCantidad de atributos (columnas): {}"
print(train_report.format(*X_train.shape))
print(val_report.format(*X_val.shape))

Datos de entrenamiento:
	Cantidad de ejemplos (filas): 2306
	Cantidad de atributos (columnas): 34

Datos de validación:
	Cantidad de ejemplos (filas): 408
	Cantidad de atributos (columnas): 34


In [5]:
# Neural network
# num_features: number of attribute columns in X_train / X_val.
# num_outputs: number of one-hot columns in y_train, i.e. one per match_result class.
num_features, num_outputs = X_train.shape[1], y_train.shape[1]

def define_model(hidden_units=40, learning_rate=0.0001):
    """Build and compile the feed-forward classifier.

    The network has one hidden ReLU layer followed by a softmax output layer.
    Input and output widths are read from the module-level ``num_features``
    and ``num_outputs``.

    Args:
        hidden_units: Number of neurons in the hidden dense layer.
        learning_rate: Learning rate given to the Adam optimizer.

    Returns:
        A compiled, untrained ``Sequential`` model.
    """
    model = Sequential()

    # Input layer. The shape must be a tuple: ``(num_features)`` is just an
    # int, so the trailing comma is required.
    model.add(Input(shape=(num_features,)))

    # Hidden layer.
    model.add(layers.Dense(hidden_units, activation="relu"))

    # Output layer: one probability per class.
    model.add(layers.Dense(num_outputs, activation='softmax'))

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    # Labels are one-hot encoded, hence categorical cross-entropy.
    # ``metrics`` is passed as a list, which is the documented form.
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

print("¡Estructura de la red definida!\n")

# Instantiate the (still untrained) network and print its layer-by-layer description.
model = define_model()
model.summary()

¡Estructura de la red definida!

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 40)                1400      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 123       
Total params: 1,523
Trainable params: 1,523
Non-trainable params: 0
_________________________________________________________________



User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

In [6]:
# Training hyperparameters.
epochs = 5
batch_size = 2
verbose = 1
# Held-out data evaluated at the end of every epoch.
val_data = (X_val, y_val)

# Keyword arguments avoid depending on the positional order of fit().
# validation_data was previously built but never passed, so the history had
# no validation loss/accuracy to monitor over-fitting with.
history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=verbose,
    validation_data=val_data,
)

2022-04-08 14:19:47.493070: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# Loss and accuracy on the validation split (displayed as the cell output).
model.evaluate(x=X_val, y=y_val, batch_size=batch_size, verbose=verbose)



[1.343936562538147, 0.44117647409439087]

In [8]:
# Class probabilities predicted for the validation set.
predictions = model.predict(X_val, batch_size=batch_size, verbose=verbose)

# Display names for the classes, in the encoder's column order.
lista_match_result = ['-1', '0', '1']

# Convert one-hot rows / probability rows into class indices, then report per-class metrics.
true_class_idx = y_val.argmax(axis=1)
predicted_class_idx = predictions.argmax(axis=1)
print(classification_report(true_class_idx, predicted_class_idx, target_names=lista_match_result))

              precision    recall  f1-score   support

          -1       0.50      0.22      0.31       117
           0       0.32      0.38      0.35       113
           1       0.50      0.62      0.56       178

    accuracy                           0.44       408
   macro avg       0.44      0.41      0.40       408
weighted avg       0.45      0.44      0.43       408



In [9]:
# Predict on the test set with the trained neural network and write the submission file.

df = pd.read_csv('../input/dcic-ia-2021-proyecto/test.csv')

# Keep the example ids for the submission and remove them from the features.
ids = df['id']
df = df.drop(columns=['id'])

# Fill missing values with the TRAINING means, so the test features go through
# the same transformation the model was trained on. df_training's columns were
# already imputed with their own mean, which leaves that mean unchanged.
for dribbling_column in ('home_team_buildUpPlayDribbling', 'away_team_buildUpPlayDribbling'):
    df[dribbling_column] = df[dribbling_column].fillna(df_training[dribbling_column].mean())

# Map the ordinal categories to the same integer ranks used for training.
ordinal_maps = {
    'buildUpPlaySpeedClass': speed_class_map,
    'chanceCreationShootingClass': shooting_class_map,
    'defenceTeamWidthClass': width_class_map,
}
for class_suffix, class_map in ordinal_maps.items():
    for side in ('home', 'away'):
        ordinal_column = f'{side}_team_{class_suffix}'
        df[ordinal_column] = df[ordinal_column].map(class_map)

# Expand the nominal categories into dummy columns, using the training prefixes.
dummy_prefix_by_column = {
    'home_team_buildUpPlayPositioningClass': 'home_buildUpPlay',
    'away_team_buildUpPlayPositioningClass': 'away_buildUpPlay',
    'home_team_chanceCreationPositioningClass': 'home_chanceCreation',
    'away_team_chanceCreationPositioningClass': 'away_chanceCreation',
    'home_team_defenceDefenderLineClass': 'home_defence',
    'away_team_defenceDefenderLineClass': 'away_defence',
}
for nominal_column, dummy_prefix in dummy_prefix_by_column.items():
    df = df.join(pd.get_dummies(df[nominal_column], prefix=dummy_prefix))

# Drop the irrelevant columns and the nominal columns made obsolete by the dummies.
cols_eliminar = ["country", "league", "season", "home_team", "away_team"]
df = df.drop(columns=cols_eliminar)
df = df.drop(columns=nominal_columns)

# Align the test features with the training features: same columns, same order.
# A dummy column absent from the test data is created filled with 0; a column
# the model never saw is discarded.
df = df.reindex(columns=X_train.columns, fill_value=0)

# Predict class probabilities and take the most likely class index per row.
predictions = model.predict(df)
predictions = np.argmax(predictions, axis=1)

# Translate class indices back to the original labels through the fitted
# encoder instead of assuming the labels are exactly index - 1.
fixed_predictions = enc.categories_[0][predictions]

# Build the submission table.
submission = pd.DataFrame({
    'id': ids,
    'match_result': fixed_predictions
})
# Save it as a CSV file.
submission.to_csv('my_submission.csv', index=False)

# Show the first rows of the submission.
submission.head()

Unnamed: 0,id,match_result
0,2416,1
1,96,1
2,2123,1
3,2038,0
4,2696,0
