# Projeto Final de Machine Learning

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [2]:
# Read the two source tables: player attributes (carac) and player identity data (names).
carac, names = (
    pd.read_csv(csv_path)
    for csv_path in ("caracteristicas_jogadores.csv", "nomes_jogadores.csv")
)

In [3]:
# Both columns start with the year (yyyy-mm-dd...); keep only the text before the first "-".
# The vectorised .str accessor leaves missing values as NaN instead of raising
# AttributeError the way a Python-level x.split() does on a float NaN.
names["birthday"] = names["birthday"].str.split("-", n=1).str[0]
carac["date"] = carac["date"].str.split("-", n=1).str[0]

'finishing','heading_accuracy','short_passing','volleys','dribbling','curve','free_kick_accuracy','long_passing','ball_control','acceleration','sprint_speed','agility','reactions','balance','shot_power','jumping','stamina','strength','long_shots','aggression','interceptions','positioning','vision','penalties','marking','standing_tackle','sliding_tackle','gk_diving','gk_handling','gk_kicking','gk_positioning','gk_reflexes'

In [4]:
# Columns taken from each source table
carac_columns = [
    'player_fifa_api_id', 'date', 'overall_rating',
    'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
]
names_columns = ['player_fifa_api_id', 'player_name', 'birthday', 'height', 'weight']

# Left-join the identity columns onto every attribute row, then discard rows
# that have a missing value in any column.
df_geral = (
    carac[carac_columns]
    .merge(names[names_columns], on='player_fifa_api_id', how='left')
    .dropna()
)

# Age = year of the attribute record minus year of birth
df_geral["age"] = df_geral["date"].astype(int) - df_geral["birthday"].astype(int)


In [5]:
# Drop every row whose work rates are not one of low / medium / high.
# Both work-rate columns are checked (previously only the defensive one was),
# and .copy() makes the result an independent frame so later column
# assignments do not write into a slice of the unfiltered data.
valid_work_rates = ['low', 'medium', 'high']
df_geral = df_geral[
    df_geral['attacking_work_rate'].isin(valid_work_rates)
    & df_geral['defensive_work_rate'].isin(valid_work_rates)
].copy()

df_geral.attacking_work_rate.value_counts()

attacking_work_rate
medium    124871
high       42751
low         8539
Name: count, dtype: int64

In [6]:
# Row count per defensive work-rate level
df_geral["defensive_work_rate"].value_counts()

defensive_work_rate
medium    130764
high       26972
low        18425
Name: count, dtype: int64

In [7]:
# Convert both work-rate columns to the pandas category dtype.
# One DataFrame.astype call returns a new frame, so no column is assigned on a
# filtered frame (which can trigger SettingWithCopyWarning).
df_geral = df_geral.astype({
    'attacking_work_rate': 'category',
    'defensive_work_rate': 'category',
})


In [8]:
# Print every column name of df_geral
print(df_geral.columns)

Index(['player_fifa_api_id', 'date', 'overall_rating', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes', 'player_name', 'birthday', 'height', 'weight', 'age'],
      dtype='object')


In [9]:
# Columns kept for modelling, grouped by kind. The concatenation below keeps the
# target first and the two categorical work-rate columns last.
skill_columns = [
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle',
]
goalkeeper_columns = ['gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes']
physical_columns = ['height', 'weight', 'age']
work_rate_columns = ['attacking_work_rate', 'defensive_work_rate']

forecast = df_geral[
    ['overall_rating']
    + skill_columns
    + goalkeeper_columns
    + physical_columns
    + work_rate_columns
]

In [10]:
# Preview the first five rows of the modelling frame
forecast.head(n=5)

Unnamed: 0,overall_rating,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes,height,weight,age,attacking_work_rate,defensive_work_rate
0,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0,182.88,187.0,24,medium,medium
1,67.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0,182.88,187.0,23,medium,medium
2,62.0,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,63.0,41.0,45.0,54.0,48.0,65.0,66.0,69.0,6.0,11.0,10.0,8.0,8.0,182.88,187.0,23,medium,medium
3,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0,182.88,187.0,23,medium,medium
4,61.0,48.0,43.0,70.0,60.0,43.0,50.0,44.0,38.0,63.0,48.0,60.0,64.0,59.0,46.0,65.0,54.0,58.0,54.0,76.0,34.0,62.0,40.0,44.0,53.0,47.0,62.0,63.0,66.0,5.0,10.0,9.0,7.0,7.0,182.88,187.0,15,medium,medium


# Modelos

### Criando as variaveis X e Y

In [11]:
# Separate the regression target from the predictor columns
target_column = 'overall_rating'
y = forecast[target_column]                 # target variable
X = forecast.drop(columns=[target_column])  # every column except the target

### Dividindo colunas numericas e colunas categoricas

In [12]:
# The two work-rate columns are categorical; every other predictor is numeric.
categorical_columns = ['attacking_work_rate', 'defensive_work_rate']
numerical_columns = X.columns.drop(categorical_columns).tolist()

### Preprocessamento dos dados

In [13]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones, dropping the first level of each.
numeric_step = ("num", StandardScaler(), numerical_columns)
categorical_step = ("categ", OneHotEncoder(drop='first'), categorical_columns)

preprocessing = ColumnTransformer(transformers=[numeric_step, categorical_step])

### Dividindo em train, validation e test

In [14]:
# First split: 60% of the rows for training, the remaining 40% held out (test_size=0.4)
# NOTE(review): forecast.head() shows rows whose features are identical apart from age,
# so a row-level random split may place near-duplicate rows in different sets and make
# validation/test scores optimistic — consider a group-aware split; TODO confirm.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split: halve the held-out 40% into validation and test (20% of all rows each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report the size of each set
print(f"Tamanho do conjunto de treino: {len(X_train)}")
print(f"Tamanho do conjunto de validação: {len(X_val)}")
print(f"Tamanho do conjunto de teste: {len(X_test)}")

Tamanho do conjunto de treino: 105696
Tamanho do conjunto de validação: 35232
Tamanho do conjunto de teste: 35233


# Função para avaliar o modelo

In [15]:
def calculate_performance(actuals, forecasts, method):
    """Print a report of regression error metrics for one model.

    Parameters
    ----------
    actuals : array-like
        Observed target values.
    forecasts : array-like
        Predicted values, one per observation in ``actuals``. Both inputs are
        flattened, so an ``(n, 1)`` column of predictions pairs with ``(n,)``
        actuals.
    method : str
        Label printed on the first line of the report.

    Returns
    -------
    None
        The metrics are only printed.

    Raises
    ------
    ValueError
        If the two inputs do not hold the same number of values.
    """
    # Flatten both inputs so (n,) and (n, 1) shapes pair element-wise instead of
    # broadcasting into an (n, n) matrix.
    actuals = np.asarray(actuals, dtype=float).ravel()
    forecasts = np.asarray(forecasts, dtype=float).ravel()
    if actuals.shape != forecasts.shape:
        raise ValueError(
            f"actuals and forecasts must have the same number of values, "
            f"got {actuals.size} and {forecasts.size}"
        )

    errors = actuals - forecasts
    abs_errors = np.abs(errors)

    ### Service Level-Adjusted Mean Absolute Error
    # Under-forecasts (actual > forecast) count twice as much as over-forecasts.
    sla_mae = np.mean(np.where(errors > 0, 2 * abs_errors, abs_errors))

    ### Mean Absolute Error
    mae = np.mean(abs_errors)

    ### Mean Squared Error
    mse = np.mean(np.square(errors))

    ### Root Mean Squared Error
    rmse = np.sqrt(mse)

    ### Mean Absolute Percentage Error
    # A percentage error is undefined where the actual value is 0, so those
    # observations are left out instead of turning the whole metric into inf/nan.
    nonzero = actuals != 0
    if nonzero.any():
        mape = np.mean(abs_errors[nonzero] / np.abs(actuals[nonzero]))
    else:
        mape = np.nan

    ### Weighted Mean Absolute Percentage Error (WMAPE)
    total_actuals = np.sum(np.abs(actuals))
    wmape = np.sum(abs_errors) / total_actuals if total_actuals != 0 else np.nan

    print("Method: ", method)
    print("Weighted Mean Absolute Percentage Error (WMAPE): ", wmape)
    print("Service Level-Adjusted Mean Absolute Error: ", sla_mae)
    print("Mean Absolute Error: ", mae)
    print("Root Mean Squared Error: ", rmse)
    print("Mean Absolute Percentage Error: ", mape)
    print("---------------------------")

# Exemplo de chamada
# calculate_performance(test_labels, normalized_predictions, 'Random Forest')

# Testando 4 modelos + hiperparametros

In [16]:
def _with_preprocessing(model):
    """Return a pipeline that feeds a fresh copy of `preprocessing` into `model`.

    Each pipeline gets its own clone of the ColumnTransformer. A Pipeline fits the
    step objects it is given, so sharing one instance would let a later refit of
    any pipeline overwrite the preprocessing state used by all the others.
    """
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessing)),
        ('model', model),
    ])


# One pipeline per candidate model
pipeline_rf = _with_preprocessing(RandomForestRegressor(random_state=42))

# Random Forest with adjusted hyperparameters
pipeline_rf_2 = _with_preprocessing(
    RandomForestRegressor(random_state=42, n_estimators=200, max_depth=15)
)

pipeline_gb = _with_preprocessing(GradientBoostingRegressor(random_state=42))

pipeline_xgb = _with_preprocessing(
    XGBRegressor(random_state=42, objective='reg:squarederror')
)

# XGBoost with adjusted hyperparameters
pipeline_xgb_2 = _with_preprocessing(
    XGBRegressor(random_state=42, objective='reg:squarederror', n_estimators=150, learning_rate=0.05)
)

# Stacking regressor: the three default pipelines are the base estimators and a
# random forest combines their predictions.
stacking_regressor = StackingRegressor(
    estimators=[
        ('random_forest', pipeline_rf),
        ('gradient_boosting', pipeline_gb),
        ('xgboost', pipeline_xgb)
    ],
    final_estimator=RandomForestRegressor(random_state=42)  # meta-model
)

In [None]:

# Fit every candidate on the training split and report its validation metrics
print("Performance dos modelos individuais:")

candidates = [
    ("Random Forest", pipeline_rf),
    ("Random Forest 2", pipeline_rf_2),
    ("Gradient Boosting", pipeline_gb),
    ("XGBoost", pipeline_xgb),
    ("XGBoost 2", pipeline_xgb_2),
    ("StackingRegressor", stacking_regressor),
]

for name, pipeline in candidates:
    # Train on the training split only
    pipeline.fit(X_train, y_train)

    # Score on the validation split
    y_pred_val = pipeline.predict(X_val)
    calculate_performance(y_val, y_pred_val, name)


Performance dos modelos individuais:
Method:  Random Forest
Weighted Mean Absolute Percentage Error (WMAPE):  0.008585312540923542
Service Level-Adjusted Mean Absolute Error:  0.8986056352875793
Mean Absolute Error:  0.5898379225028365
Root Mean Squared Error:  1.0116159525266697
Mean Absolute Percentage Error:  0.008847479087986792
---------------------------


## O modelo escolhido foi: Random Forest

### Avaliando o modelo nos dados de treino para analisar o overfitting

In [None]:
# Predict on the training split itself; comparing these metrics with the
# validation metrics shows how much the model overfits.
y_pred_train = pipeline_rf.predict(X_train)

# Use an explicit label instead of the loop variable `name`, which still holds
# the last model's name ("StackingRegressor") after the comparison loop.
calculate_performance(y_train, y_pred_train, "Random Forest (treino)")

Method:  StackingRegressor
Weighted Mean Absolute Percentage Error (WMAPE):  0.0032605434742227534
Service Level-Adjusted Mean Absolute Error:  0.33981808199020486
Mean Absolute Error:  0.22393340766085165
Root Mean Squared Error:  0.3925779078035761
Mean Absolute Percentage Error:  0.0033708017583576177
---------------------------


# Juntando train e validate e testando com o test

In [None]:
# Stack the training and validation splits row-wise into one larger training set
X_train_val, y_train_val = (
    pd.concat(parts, axis=0)
    for parts in ([X_train, X_val], [y_train, y_val])
)

print(f"Tamanho do novo conjunto de treino: {len(X_train_val)}")
print(f"Tamanho do conjunto de teste: {len(X_test)}")

Tamanho do novo conjunto de treino: 140928
Tamanho do conjunto de teste: 35233


In [None]:
# Refit the chosen pipeline on train + validation (fit returns the pipeline
# itself), then predict the test split.
y_pred_test = pipeline_rf.fit(X_train_val, y_train_val).predict(X_test)

# Report the metrics on the test split
calculate_performance(y_test, y_pred_test, "modelo escolhido")

Method:  modelo escolhido
Weighted Mean Absolute Percentage Error (WMAPE):  0.007387112484111441
Service Level-Adjusted Mean Absolute Error:  0.7704873172559505
Mean Absolute Error:  0.5075818271612558
Root Mean Squared Error:  0.919455452599429
Mean Absolute Percentage Error:  0.007628767207554462
---------------------------


# Treinando um modelo com todos os dados

In [None]:
# Stack all three splits (train + validation + test) back into one data set
X_full, y_full = (
    pd.concat(parts, axis=0)
    for parts in ([X_train, X_val, X_test], [y_train, y_val, y_test])
)

print(f"Tamanho do conjunto completo: {len(X_full)}")

Tamanho do conjunto completo: 176161


In [None]:

# Refit the chosen model (pipeline_rf) on the complete data set
pipeline_rf.fit(X_full, y_full)

# File name under which the fitted pipeline is saved
rf_model_filename = "pipeline_rf_model.pkl"



NameError: name 'joblib' is not defined

In [None]:
# Write the fitted pipeline (preprocessing + model) to disk, then confirm the file name
joblib.dump(pipeline_rf, rf_model_filename)

print("Modelo 'pipeline_rf' salvo como {}.".format(rf_model_filename))

Modelo 'pipeline_rf' salvo como pipeline_rf_model.pkl.
