# Model Training:

In [1]:
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datetime import datetime


In [2]:
# Cargar el archivo CSV
file_path = 'train.csv'  # Reemplaza con la ruta de tu archivo
df_train = pd.read_csv(file_path)

# Crear la columna 'age' a partir de 'birthday_date'
df_train['birthday_date'] = pd.to_datetime(df_train['birthday_date'], errors='coerce')
today = datetime.today()
df_train['age'] = df_train['birthday_date'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Imputaciones en 'goalkeeping_speed'
df_train.loc[df_train['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0
df_train.loc[df_train['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

# Codificar 'preferred_foot'
label_encoder_foot = LabelEncoder()
df_train['preferred_foot_encoded'] = label_encoder_foot.fit_transform(df_train['preferred_foot'])

# Codificar la variable 'position'
label_encoder_position = LabelEncoder()
df_train['position_encoded'] = label_encoder_position.fit_transform(df_train['position'])
gk_label = label_encoder_position.transform(['GK'])[0]  # Identificar la etiqueta numérica de GK
df_train.loc[df_train['goalkeeping_speed'] != 0, 'position_encoded'] = gk_label

# Eliminar columnas no necesarias
columns_to_drop = [
    'short_name', 'value_eur', 'wage_eur', 'birthday_date', 
    'club_name', 'league_name', 'league_level', 
    'club_jersey_number', 'club_loaned_from', 'club_joined', 
    'club_contract_valid_until', 'nationality_name', 'nation_jersey_number',
    'real_face', 'player_tags', 'player_traits', 
    'goalkeeping_handling', 'goalkeeping_kicking', 
    'goalkeeping_positioning', 'goalkeeping_reflexes', 
    'international_reputation', 'goalkeeping_diving', 
    'position', 'release_clause_eur', 'id', 'work_rate', 'preferred_foot'
]
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')

# Simplificar la columna 'body_type' y crear dummies
df_train['body_type'] = df_train['body_type'].astype(str).apply(lambda x: x.split()[0])
body_type_dummies = pd.get_dummies(df_train['body_type'], prefix='body_type')
df_train = pd.concat([df_train, body_type_dummies], axis=1)
df_train = df_train.drop(columns=['body_type'], errors='ignore')


In [3]:
df_train

Unnamed: 0,overall,potential,height_cm,weight_kg,weak_foot,skill_moves,pace,shooting,passing,dribbling,...,defending_standing_tackle,defending_sliding_tackle,goalkeeping_speed,age,preferred_foot_encoded,position_encoded,body_type_Lean,body_type_Normal,body_type_Stocky,body_type_Unique
0,71,71,176,73,5,3,70.0,52.0,60.0,70.0,...,66,65,0.0,34,1,5,False,True,False,False
1,65,71,183,73,3,2,65.0,38.0,58.0,60.0,...,61,58,0.0,28,1,8,True,False,False,False
2,65,77,178,69,3,3,79.0,35.0,58.0,66.0,...,58,59,0.0,23,0,13,False,True,False,False
3,72,72,188,81,3,3,64.0,74.0,51.0,68.0,...,22,19,0.0,32,1,11,True,False,False,False
4,65,65,179,74,2,2,74.0,53.0,59.0,53.0,...,61,58,0.0,34,0,5,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6890,59,66,180,65,3,2,67.0,35.0,44.0,54.0,...,57,58,0.0,26,1,15,False,True,False,False
6891,83,83,183,80,4,4,72.0,81.0,77.0,81.0,...,79,75,0.0,36,1,7,False,True,False,False
6892,70,80,177,73,3,2,77.0,61.0,67.0,71.0,...,67,68,0.0,25,0,13,False,True,False,False
6893,71,71,176,70,3,2,73.0,37.0,63.0,65.0,...,72,70,0.0,35,0,6,True,False,False,False


In [4]:
# Separar características (X) y variable objetivo (y)
X = df_train.drop(columns=['position_encoded'])
y = df_train['position_encoded']

# Configurar validación cruzada
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Configuración inicial de LightGBM
params = {
    'objective': 'multiclass',
    'num_class': y.nunique(),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
}

# Almacenar los resultados de F1-Score para cada pliegue
f1_scores = []

# Validación cruzada
for train_index, val_index in kf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Crear el dataset para LightGBM
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Entrenar el modelo
    model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, val_data],
        num_boost_round=1000,
    )
    
    # Predicciones en el conjunto de validación
    y_val_pred = model.predict(X_val)
    y_val_pred_labels = np.argmax(y_val_pred, axis=1)
    
    # Reinserción de la regla de negocio en predicciones
    # Asegurar que donde `goalkeeping_speed != 0`, la etiqueta predicha sea GK
    y_val_pred_labels[X_val['goalkeeping_speed'] != 0] = gk_label
    
    # Calcular el F1-Score para este pliegue
    f1 = f1_score(y_val, y_val_pred_labels, average='macro')
    f1_scores.append(f1)

# Calcular el promedio de F1-Score
mean_f1_score = np.mean(f1_scores)

# Mostrar resultados
print("F1-Score por pliegue:", f1_scores)
print("F1-Score promedio:", mean_f1_score)

F1-Score por pliegue: [np.float64(0.24701921405713897), np.float64(0.2660115472944195), np.float64(0.25164781417609444), np.float64(0.2608256994011784), np.float64(0.24510462997184165)]
F1-Score promedio: 0.25412178098013455


# Model Prediction

In [5]:
df_test = pd.read_csv('test.csv')
original_df_test = df_test.copy()

In [6]:
# Crear la columna 'age' a partir de 'birthday_date'
df_test['birthday_date'] = pd.to_datetime(df_test['birthday_date'], errors='coerce')
today = datetime.today()
df_test['age'] = df_test['birthday_date'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Imputaciones en 'goalkeeping_speed'
df_test.loc[df_test['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0
df_test.loc[df_test['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

# Codificar 'preferred_foot'
df_test['preferred_foot_encoded'] = label_encoder_foot.fit_transform(df_test['preferred_foot'])

# Eliminar columnas no necesarias
columns_to_drop = [
    'short_name', 'value_eur', 'wage_eur', 'birthday_date', 
    'club_name', 'league_name', 'league_level', 
    'club_jersey_number', 'club_loaned_from', 'club_joined', 
    'club_contract_valid_until', 'nationality_name', 'nation_jersey_number',
    'real_face', 'player_tags', 'player_traits', 
    'goalkeeping_handling', 'goalkeeping_kicking', 
    'goalkeeping_positioning', 'goalkeeping_reflexes', 
    'international_reputation', 'goalkeeping_diving', 
    'position', 'release_clause_eur', 'id', 'work_rate', 'preferred_foot'
]
df_test = df_test.drop(columns=columns_to_drop, errors='ignore')

# Simplificar la columna 'body_type' y crear dummies
df_test['body_type'] = df_test['body_type'].astype(str).apply(lambda x: x.split()[0])
body_type_dummies = pd.get_dummies(df_test['body_type'], prefix='body_type')
df_test = pd.concat([df_test, body_type_dummies], axis=1)
df_test = df_test.drop(columns=['body_type'], errors='ignore')

In [7]:
    # Predicciones en el conjunto de validación
    y_val_pred_kaggle = model.predict(df_test)
    y_val_pred_labels_kaggle = np.argmax(y_val_pred_kaggle, axis=1)
    
    # Reinserción de la regla de negocio en predicciones
    # Asegurar que donde `goalkeeping_speed != 0`, la etiqueta predicha sea GK
    y_val_pred_labels_kaggle[df_test['goalkeeping_speed'] != 0] = gk_label
    y_val_pred_labels_kaggle_decoded = label_encoder_position.inverse_transform(y_val_pred_labels_kaggle)


In [8]:
# Crear el DataFrame final con 'id' y las predicciones
# Supongamos que original_df_test contiene la columna 'id'
df_resultado = pd.DataFrame({
    'id': original_df_test['id'],  # Recuperar la columna 'id' del DataFrame original
    'prediccion': y_val_pred_labels_kaggle_decoded  # Predicciones decodificadas
})

# Mostrar el DataFrame final
print(df_resultado)

# (Opcional) Exportar a CSV
df_resultado.to_csv('lightGBM_Pred3.csv', index=False)

         id prediccion
0    215562         RB
1    248311        LCB
2    223933         ST
3    232546         LM
4    189217         RB
..      ...        ...
762  205601         LM
763  223752        RCB
764  192450         ST
765  192366        LCB
766  232228         ST

[767 rows x 2 columns]
