# Model Training:

In [1]:
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
df_train = pd.read_csv('train.csv')

Keeping Goal keeper speed.

In [3]:
columnt_to_drop = [ 
                   'goalkeeping_reflexes', 
                   'goalkeeping_positioning', 
                   'goalkeeping_kicking',
                   'goalkeeping_handling',
                   'goalkeeping_diving',
                   'value_eur',
                   'wage_eur',
                   'birthday_date',
                   'height_cm',
                   'weight_kg',
                   'club_name',
                   'league_name',
                   'league_level',
                   'club_jersey_number',
                   'club_loaned_from',
                   'club_joined',
                   'club_contract_valid_until',
                   'nation_jersey_number',
                   'release_clause_eur',
                   'real_face',
                   'id',
                   'short_name',
                   'overall',
                   'potential',
                   'nationality_name',
                   'body_type',
                   'international_reputation',
                   'player_tags',
                   'player_traits',
                   'work_rate',
                   'weak_foot',
                   'skill_moves'
                   ]

df_train = df_train.drop(columnt_to_drop, axis=1)

In [4]:
df_train.loc[df_train['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0 
df_train.loc[df_train['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

In [5]:
# One-Hot Encoding para 'preferred_foot'
df_train = pd.get_dummies(df_train, columns=['preferred_foot'], drop_first=True)

label_encoder = LabelEncoder()
df_train['position_encoded'] = label_encoder.fit_transform(df_train['position'])

# Label Encoding para 'position'
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df_train['position_encoded'] = label_encoder.fit_transform(df_train['position'])

# Incorporar la regla de negocio: Asignar etiqueta GK donde `goalkeeping_speed` != 0
gk_label = label_encoder.transform(['GK'])[0]  # Identificar la etiqueta numérica de GK
df_train.loc[df_train['goalkeeping_speed'] != 0, 'position_encoded'] = gk_label

# Separar características (X) y variable objetivo (y)
X = df_train.drop(columns=['position', 'position_encoded'])
y = df_train['position_encoded']

# Configurar validación cruzada
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Configuración inicial de LightGBM
params = {
    'objective': 'multiclass',
    'num_class': y.nunique(),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Almacenar los resultados de F1-Score para cada pliegue
f1_scores = []

# Validación cruzada
for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Crear el dataset para LightGBM
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Entrenar el modelo
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=1000,
    )
    
    # Predicciones en el conjunto de validación
    y_val_pred = model.predict(X_val)
    y_val_pred_labels = np.argmax(y_val_pred, axis=1)
    
    # Reinserción de la regla de negocio en predicciones
    # Asegurar que donde `goalkeeping_speed != 0`, la etiqueta predicha sea GK
    y_val_pred_labels[X_val['goalkeeping_speed'] != 0] = gk_label

    # Calcular el F1-Score para este pliegue
    f1 = f1_score(y_val, y_val_pred_labels, average='macro')
    f1_scores.append(f1)

# Calcular el promedio de F1-Score
mean_f1_score = np.mean(f1_scores)

# Mostrar resultados
print("F1-Score por pliegue:", f1_scores)
print("F1-Score promedio:", mean_f1_score)

F1-Score por pliegue: [np.float64(0.2457880727947966), np.float64(0.2575432665824241), np.float64(0.25436913751851525), np.float64(0.2479349752661415), np.float64(0.25865621570497205)]
F1-Score promedio: 0.25285833357336995


# Model Prediction

In [6]:
df_test = pd.read_csv('test.csv')
original_df_test = df_test.copy()

In [7]:
columnt_to_drop = [ 
                   'goalkeeping_reflexes', 
                   'goalkeeping_positioning', 
                   'goalkeeping_kicking',
                   'goalkeeping_handling',
                   'goalkeeping_diving',
                   'value_eur',
                   'wage_eur',
                   'birthday_date',
                   'height_cm',
                   'weight_kg',
                   'club_name',
                   'league_name',
                   'league_level',
                   'club_jersey_number',
                   'club_loaned_from',
                   'club_joined',
                   'club_contract_valid_until',
                   'nation_jersey_number',
                   'release_clause_eur',
                   'real_face',
                   'id',
                   'short_name',
                   'overall',
                   'potential',
                   'nationality_name',
                   'body_type',
                   'international_reputation',
                   'player_tags',
                   'player_traits',
                   'work_rate',
                   'weak_foot',
                   'skill_moves'
                   ]

df_test = df_test.drop(columnt_to_drop, axis=1)

In [8]:
df_test.loc[df_test['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0 
df_test.loc[df_test['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

In [9]:
# One-Hot Encoding para 'preferred_foot'
df_test = pd.get_dummies(df_test, columns=['preferred_foot'], drop_first=True)


In [10]:
    # Predicciones en el conjunto de validación
    y_val_pred_kaggle = model.predict(df_test)
    y_val_pred_labels_kaggle = np.argmax(y_val_pred_kaggle, axis=1)
    
    # Reinserción de la regla de negocio en predicciones
    # Asegurar que donde `goalkeeping_speed != 0`, la etiqueta predicha sea GK
    y_val_pred_labels_kaggle[df_test['goalkeeping_speed'] != 0] = gk_label
    y_val_pred_labels_kaggle_decoded = label_encoder.inverse_transform(y_val_pred_labels_kaggle)


In [11]:
# Crear el DataFrame final con 'id' y las predicciones
# Supongamos que original_df_test contiene la columna 'id'
df_resultado = pd.DataFrame({
    'id': original_df_test['id'],  # Recuperar la columna 'id' del DataFrame original
    'prediccion': y_val_pred_labels_kaggle_decoded  # Predicciones decodificadas
})

# Mostrar el DataFrame final
print(df_resultado)

# (Opcional) Exportar a CSV
df_resultado.to_csv('lightGBM_Pred.csv', index=False)

         id prediccion
0    215562         RB
1    248311        LCB
2    223933         ST
3    232546         LM
4    189217         RB
..      ...        ...
762  205601         LW
763  223752        RCB
764  192450         ST
765  192366        LCB
766  232228         ST

[767 rows x 2 columns]
