# Model Training:

In [1]:
import lightgbm as lgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
from datetime import datetime

# Convert 'birthday_date' to datetime format
df_train['birthday_date'] = pd.to_datetime(df_train['birthday_date'], errors='coerce')

# Calculate age
current_year = datetime.now().year
df_train['age'] = current_year - df_train['birthday_date'].dt.year


In [4]:
columnt_to_drop = [ 
                   'id',
                   'short_name',
                   'value_eur',
                   'wage_eur',
                   'birthday_date',
                   'club_name',
                   'league_name',
                   'league_level',
                   'club_jersey_number',
                   'club_loaned_from',
                   'club_joined',
                   'club_contract_valid_until',
                   'nationality_name',
                   'nation_jersey_number',
                   'international_reputation',
                   'work_rate',
                   'body_type',
                   'real_face',
                   'release_clause_eur',
                   'player_tags',
                   'player_traits',
                   'goalkeeping_handling',
                   'goalkeeping_kicking',
                   'goalkeeping_positioning',
                   'goalkeeping_reflexes',
                   ]

df_train = df_train.drop(columnt_to_drop, axis=1)

In [5]:
df_train.loc[df_train['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0 
df_train.loc[df_train['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

In [6]:
# Import necessary libraries
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

# One-Hot Encoding para 'preferred_foot'
df_train = pd.get_dummies(df_train, columns=['preferred_foot'], drop_first=True)

# Label Encoding para 'position'
label_encoder = LabelEncoder()
df_train['position_encoded'] = label_encoder.fit_transform(df_train['position'])

# Incorporar la regla de negocio: Asignar etiqueta GK donde `goalkeeping_speed` != 0
gk_label = label_encoder.transform(['GK'])[0]  # Identificar la etiqueta numérica de GK
df_train.loc[df_train['goalkeeping_speed'] != 0, 'position_encoded'] = gk_label

# Separar características (X) y variable objetivo (y)
X = df_train.drop(columns=['position', 'position_encoded'])
y = df_train['position_encoded']

# Configuración de búsqueda de hiperparámetros para LightGBM
param_grid = {
    'num_leaves': [15, 31, 63, 127],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'feature_fraction': [0.6, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0],
    'bagging_freq': [0, 5, 10],
    'max_depth': [10, 20, 30, -1],
    'lambda_l1': [0, 0.1, 0.5, 1.0],
    'lambda_l2': [0, 0.1, 0.5, 1.0]
}

# Configuración inicial de LightGBM
base_params = {
    'objective': 'multiclass',
    'num_class': y.nunique(),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'verbose': -1
}

# Configurar RandomizedSearchCV
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

lgb_estimator = lgb.LGBMClassifier(**base_params, n_jobs=-1)

random_search = RandomizedSearchCV(
    estimator=lgb_estimator,
    param_distributions=param_grid,
    n_iter=50,  # Número de combinaciones a evaluar
    scoring='f1_macro',  # Métrica para optimizar
    cv=kf,  # Validación cruzada estratificada
    verbose=1,  # Mostrar progreso
    random_state=42,
    n_jobs=-1  # Usar todos los núcleos disponibles
)

# Realizar la búsqueda de hiperparámetros
random_search.fit(X, y)

# Mejor combinación de hiperparámetros y F1-Score asociado
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Mejores hiperparámetros:", best_params)
print("Mejor F1-Score:", best_score)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Mejores hiperparámetros: {'num_leaves': 127, 'max_depth': -1, 'learning_rate': 0.2, 'lambda_l2': 0, 'lambda_l1': 1.0, 'feature_fraction': 0.9, 'bagging_freq': 5, 'bagging_fraction': 1.0}
Mejor F1-Score: 0.26428502852409197


In [7]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score

# Separar características (X) y variable objetivo (y)
X = df_train.drop(columns=['position', 'position_encoded'])
y = df_train['position_encoded']

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Configurar los mejores hiperparámetros
best_params = {
    'objective': 'multiclass',
    'num_class': y.nunique(),
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 127,
    'max_depth': -1,
    'learning_rate': 0.2,
    'lambda_l1': 1.0,
    'lambda_l2': 0,
    'feature_fraction': 0.9,
    'bagging_fraction': 1.0,
    'bagging_freq': 5,
    'verbose': -1
}

# Crear datasets para LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Entrenar el modelo
model = lgb.train(
    best_params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=1000,
)

# Realizar predicciones en el conjunto de prueba
y_pred = model.predict(X_test)
y_pred_labels = y_pred.argmax(axis=1)  # Obtener la clase con mayor probabilidad

# Evaluar el modelo
f1 = f1_score(y_test, y_pred_labels, average='macro')
print(f"F1-Score en el conjunto de prueba: {f1:.3f}")

# Mostrar reporte de clasificación
report = classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_)
print(report)

F1-Score en el conjunto de prueba: 0.258
              precision    recall  f1-score   support

         CAM       0.38      0.36      0.37        56
          CB       0.00      0.00      0.00        41
         CDM       0.24      0.11      0.15        35
          CM       0.12      0.06      0.08        18
          GK       1.00      1.00      1.00       122
          LB       0.58      0.70      0.63        83
         LCB       0.41      0.43      0.42       102
         LCM       0.20      0.22      0.21        82
         LDM       0.14      0.12      0.13        41
          LF       0.00      0.00      0.00         6
          LM       0.26      0.23      0.25        69
          LS       0.10      0.07      0.08        44
          LW       0.04      0.04      0.04        27
         LWB       0.07      0.05      0.06        20
          RB       0.59      0.80      0.68        96
         RCB       0.44      0.55      0.49       121
         RCM       0.27      0.26      0

# Model Prediction

In [8]:
df_test = pd.read_csv('test.csv')
original_df_test = df_test.copy()

In [9]:
# Convert 'birthday_date' to datetime format
df_test['birthday_date'] = pd.to_datetime(df_test['birthday_date'], errors='coerce')

# Calculate age
current_year = datetime.now().year
df_test['age'] = current_year - df_test['birthday_date'].dt.year

In [10]:
columnt_to_drop = [ 
                   'id',
                   'short_name',
                   'value_eur',
                   'wage_eur',
                   'birthday_date',
                   'club_name',
                   'league_name',
                   'league_level',
                   'club_jersey_number',
                   'club_loaned_from',
                   'club_joined',
                   'club_contract_valid_until',
                   'nationality_name',
                   'nation_jersey_number',
                   'international_reputation',
                   'work_rate',
                   'body_type',
                   'real_face',
                   'release_clause_eur',
                   'player_tags',
                   'player_traits',
                   'goalkeeping_handling',
                   'goalkeeping_kicking',
                   'goalkeeping_positioning',
                   'goalkeeping_reflexes',
                   ]

df_test = df_test.drop(columnt_to_drop, axis=1)

In [11]:
df_test.loc[df_test['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0 
df_test.loc[df_test['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

In [12]:
# One-Hot Encoding para 'preferred_foot'
df_test = pd.get_dummies(df_test, columns=['preferred_foot'], drop_first=True)


In [13]:
# Predicciones en el conjunto de validación
y_val_pred_kaggle = model.predict(df_test)
y_val_pred_labels_kaggle = np.argmax(y_val_pred_kaggle, axis=1)
    
# Reinserción de la regla de negocio en predicciones
# Asegurar que donde `goalkeeping_speed != 0`, la etiqueta predicha sea GK
y_val_pred_labels_kaggle[df_test['goalkeeping_speed'] != 0] = gk_label
y_val_pred_labels_kaggle_decoded = label_encoder.inverse_transform(y_val_pred_labels_kaggle)


In [14]:
# Crear el DataFrame final con 'id' y las predicciones
# Supongamos que original_df_test contiene la columna 'id'
df_resultado = pd.DataFrame({
    'id': original_df_test['id'],  # Recuperar la columna 'id' del DataFrame original
    'prediccion': y_val_pred_labels_kaggle_decoded  # Predicciones decodificadas
})

# Mostrar el DataFrame final
print(df_resultado)

# (Opcional) Exportar a CSV
df_resultado.to_csv('lightGBM_Pred4.csv', index=False)

         id prediccion
0    215562         RB
1    248311        LCB
2    223933         ST
3    232546        CAM
4    189217         LM
..      ...        ...
762  205601         LW
763  223752        LCB
764  192450         ST
765  192366        RCB
766  232228         RS

[767 rows x 2 columns]
