# Model Training:

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from datetime import datetime
import numpy as np

# Load the dataset
file_path = 'train.csv'  # Replace with your file path
df_train = pd.read_csv(file_path)

# Step 1: Create the 'age' column from 'birthday_date'
df_train['birthday_date'] = pd.to_datetime(df_train['birthday_date'], errors='coerce')
today = datetime.today()
df_train['age'] = df_train['birthday_date'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Step 2: Handle missing values in 'goalkeeping_speed' and adjust related features
df_train.loc[df_train['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0
df_train.loc[df_train['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

# Step 3: Encode the target variable 'position'
label_encoder_position = LabelEncoder()
df_train['position_encoded'] = label_encoder_position.fit_transform(df_train['position'])

# Step 5: Encode 'preferred_foot'
label_encoder_foot = LabelEncoder()
df_train['preferred_foot_encoded'] = label_encoder_foot.fit_transform(df_train['preferred_foot'])

# Step 4: Drop unnecessary columns
columns_to_drop = [
    'short_name', 'value_eur', 'wage_eur', 'birthday_date', 
    'club_name', 'league_name', 'league_level', 
    'club_jersey_number', 'club_loaned_from', 'club_joined', 
    'club_contract_valid_until', 'nationality_name', 'nation_jersey_number',
    'real_face', 'player_tags', 'player_traits', 
    'goalkeeping_handling', 'goalkeeping_kicking', 
    'goalkeeping_positioning', 'goalkeeping_reflexes', 
    'international_reputation', 'goalkeeping_diving', 
    'position', 'release_clause_eur', 'id', 'work_rate','preferred_foot'
]
df_train = df_train.drop(columns=columns_to_drop, errors='ignore')

# Step 6: Simplify the 'body_type' column and create dummy variables
df_train['body_type'] = df_train['body_type'].astype(str).apply(lambda x: x.split()[0])
body_type_dummies = pd.get_dummies(df_train['body_type'], prefix='body_type')
df_train = pd.concat([df_train, body_type_dummies], axis=1)
df_train = df_train.drop(columns=['body_type'], errors='ignore')

# Step 7: Select features based on correlation
selected_features = [
    'attacking_finishing', 'shooting', 'mentality_positioning', 'pace', 
    'attacking_volleys', 'mentality_penalties', 'dribbling', 'skill_dribbling', 
    'skill_ball_control', 'movement_sprint_speed', 'power_long_shots', 
    'movement_acceleration', 'physic', 'attacking_heading_accuracy', 
    'skill_moves', 'passing', 'skill_curve', 'movement_agility', 
    'power_stamina', 'power_shot_power', 'attacking_crossing', 
    'attacking_short_passing', 'goalkeeping_speed'
]

# Step 8: Prepare X_train and y_train
X = df_train[selected_features]
y = df_train['position_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Configure the Randomized Search
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=50,  # Number of settings sampled
    scoring='f1_weighted',  # Optimize for F1-Score
    cv=5,  # 5-fold cross-validation
    verbose=1,  # Show progress
    random_state=42,
    n_jobs=-1  # Use all cores
)

# Step 9: Perform the Randomized Search
random_search.fit(X, y)

# Resultados del ajuste
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Mejores hiperparámetros:", best_params)
print("Mejor F1-Score promedio (validación cruzada):", best_score)

# Entrenar el modelo final con los mejores parámetros encontrados
final_rf_model = RandomForestClassifier(
    **best_params,  # Usar los mejores hiperparámetros
    class_weight='balanced',
    random_state=42
)

# Entrenar con todo el conjunto de datos
final_rf_model.fit(X, y)

# Best parameters and corresponding score
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1-Score: {best_score:.3f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits




Mejores hiperparámetros: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}
Mejor F1-Score promedio (validación cruzada): 0.3242412282798508
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 20}
Best F1-Score: 0.324


# Model Prediction

In [2]:
df_test = pd.read_csv('test.csv')
original_df_test = pd.read_csv('test.csv')

In [None]:
# Cargar el conjunto de prueba
file_path_test = 'test.csv'  # Reemplaza con la ruta de tu archivo de prueba
df_test = pd.read_csv(file_path_test)

# Aplicar el mismo preprocesamiento al conjunto de prueba
# Crear la columna 'age'
df_test['birthday_date'] = pd.to_datetime(df_test['birthday_date'], errors='coerce')
today = datetime.today()
df_test['age'] = df_test['birthday_date'].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

# Rellenar valores faltantes y ajustar características relacionadas
df_test.loc[df_test['goalkeeping_speed'].isna(), ['goalkeeping_speed']] = 0
df_test.loc[df_test['goalkeeping_speed'] != 0, ['pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']] = 0

# Crear dummies para 'body_type' en el conjunto de prueba
df_test['body_type'] = df_test['body_type'].astype(str).apply(lambda x: x.split()[0])
body_type_dummies_test = pd.get_dummies(df_test['body_type'], prefix='body_type')
df_test = pd.concat([df_test, body_type_dummies_test], axis=1)

# Asegurarse de que las columnas del conjunto de prueba coincidan con las del entrenamiento
# Reindexar el conjunto de prueba para que tenga las mismas columnas que el entrenamiento
df_test_aligned = df_test.reindex(columns=X.columns, fill_value=0)

# Realizar predicciones con el modelo entrenado
y_pred_test = final_rf_model.predict(df_test_aligned)

# Decodificar las etiquetas predichas
y_pred_decoded = label_encoder_position.inverse_transform(y_pred_test)


Predicciones realizadas y guardadas en 'predicted_positions.csv'.


In [5]:
# Crear el DataFrame final con 'id' y las predicciones
# Supongamos que original_df_test contiene la columna 'id'
df_resultado = pd.DataFrame({
    'id': original_df_test['id'],  # Recuperar la columna 'id' del DataFrame original
    'prediccion': y_pred_decoded  # Predicciones decodificadas
})

# Mostrar el DataFrame final
print(df_resultado)

# (Opcional) Exportar a CSV
df_resultado.to_csv('random_forest_Pred2.csv', index=False)

         id prediccion
0    215562         LB
1    248311        LCB
2    223933         ST
3    232546        CAM
4    189217         RM
..      ...        ...
762  205601         LM
763  223752        LCB
764  192450         ST
765  192366        LCB
766  232228         ST

[767 rows x 2 columns]
