# Projeto Final de Machine Learning

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)

In [3]:
# Read the two input CSV files: player names first, then player characteristics.
names = pd.read_csv("nomes_jogadores.csv")
carac = pd.read_csv("caracteristicas_jogadores.csv")

In [4]:
# Both columns hold date strings that start with the year followed by "-";
# keep only that leading year part (still as text).
def _year_part(date_text):
    """Return the portion of ``date_text`` that precedes its first "-"."""
    return date_text.split("-")[0]


carac["date"] = carac["date"].apply(_year_part)
names["birthday"] = names["birthday"].apply(_year_part)

'finishing','heading_accuracy','short_passing','volleys','dribbling','curve','free_kick_accuracy','long_passing','ball_control','acceleration','sprint_speed','agility','reactions','balance','shot_power','jumping','stamina','strength','long_shots','aggression','interceptions','positioning','vision','penalties','marking','standing_tackle','sliding_tackle','gk_diving','gk_handling','gk_kicking','gk_positioning','gk_reflexes'

In [5]:
# Pick the wanted columns from each DataFrame, left-join them on the player id,
# and discard every row that has a missing value.
carac_columns = [
    'player_fifa_api_id', 'date', 'overall_rating',
    'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
]
names_columns = ['player_fifa_api_id', 'player_name', 'birthday', 'height', 'weight']

df_geral = (
    carac[carac_columns]
    .merge(names[names_columns], on='player_fifa_api_id', how='left')
    .dropna()
)

# Age is the difference between the two year values (both columns are cast to int first).
df_geral = df_geral.assign(
    age=df_geral["date"].astype(int) - df_geral["birthday"].astype(int)
)


In [6]:
# Keep only the rows whose attacking AND defensive work rates are one of the
# three valid labels (previously only the defensive column was checked).
# .copy() yields an independent frame, so later column assignments on df_geral
# do not write to a slice of the unfiltered data.
valid_work_rates = ['low', 'medium', 'high']
df_geral = df_geral[
    df_geral['attacking_work_rate'].isin(valid_work_rates)
    & df_geral['defensive_work_rate'].isin(valid_work_rates)
].copy()

df_geral.attacking_work_rate.value_counts()

attacking_work_rate
medium    124871
high       42751
low         8539
Name: count, dtype: int64

In [7]:
# Count how many rows carry each defensive work-rate label.
df_geral['defensive_work_rate'].value_counts()

defensive_work_rate
medium    130764
high       26972
low        18425
Name: count, dtype: int64

In [8]:
# Convert both work-rate columns to the pandas 'category' dtype.
df_geral = df_geral.astype({
    'attacking_work_rate': 'category',
    'defensive_work_rate': 'category',
})


In [10]:
# Print every column label of df_geral
print(df_geral.columns)

Index(['player_fifa_api_id', 'date', 'overall_rating', 'attacking_work_rate',
       'defensive_work_rate', 'crossing', 'finishing', 'heading_accuracy',
       'short_passing', 'volleys', 'dribbling', 'curve', 'free_kick_accuracy',
       'long_passing', 'ball_control', 'acceleration', 'sprint_speed',
       'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina',
       'strength', 'long_shots', 'aggression', 'interceptions', 'positioning',
       'vision', 'penalties', 'marking', 'standing_tackle', 'sliding_tackle',
       'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
       'gk_reflexes', 'player_name', 'birthday', 'height', 'weight', 'age'],
      dtype='object')


In [11]:
# Columns used for modelling: the target (overall_rating) first, then the
# numeric attributes, and finally the two work-rate columns.
forecast_columns = [
    'overall_rating',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'free_kick_accuracy', 'long_passing', 'ball_control',
    'acceleration', 'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle',
    'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning', 'gk_reflexes',
    'height', 'weight', 'age',
    'attacking_work_rate', 'defensive_work_rate',
]
forecast = df_geral.loc[:, forecast_columns]

In [1]:
# `shape` is a tuple attribute, not a method, so it must not be called.
# Print it first; the trailing expression is what the cell renders as a table.
print(forecast.shape)
forecast.head()

NameError: name 'forecast' is not defined

# Modelos

In [13]:
# Split the modelling frame into the target (y) and the feature matrix (X).
target_column = 'overall_rating'
y = forecast[target_column]                 # target variable
X = forecast.drop(columns=[target_column])  # every column except the target

In [14]:
# The two work-rate columns are categorical; every other feature is treated as numerical.
categorical_columns = ['attacking_work_rate', 'defensive_work_rate']
numerical_columns = [col for col in X.columns if col not in categorical_columns]

In [15]:
# Build the ColumnTransformer: standard-scale the numerical columns and
# one-hot encode the categorical ones (dropping the first level of each).
numeric_step = ("num", StandardScaler(), numerical_columns)
categorical_step = ("categ", OneHotEncoder(drop='first'), categorical_columns)

preprocessing = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [16]:
# Split the data into train and test sets: 20% held out, fixed seed for a repeatable split.
# NOTE(review): the split is purely random over rows. If df_geral holds several rows per
# player_fifa_api_id, the same player can land in both train and test, which would make the
# test metrics optimistic — confirm whether a group-aware split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Função para avaliar o modelo

In [None]:
def calculate_performance(actuals, forecasts, method):
    """Print a report of error metrics comparing ``forecasts`` with ``actuals``.

    Parameters
    ----------
    actuals : array-like
        Observed target values.
    forecasts : array-like
        Predicted values; combined element-wise with ``actuals``.
    method : str
        Label printed at the top of the report to identify the model.

    Returns
    -------
    None
        The metrics are only written to standard output.

    Notes
    -----
    MAPE divides element-wise by ``actuals``, so a zero in ``actuals`` makes it
    infinite or NaN.
    """
    actuals, forecasts = np.asarray(actuals), np.asarray(forecasts)

    # Residuals are computed once and shared by every metric below.
    errors = actuals - forecasts
    abs_errors = np.abs(errors)

    ### Service Level-Adjusted Mean Absolute Error
    # Positive residuals (actual above forecast) count double. np.where keeps
    # this vectorised and valid for inputs of any dimensionality.
    sla_mae = np.mean(np.where(errors > 0, 2 * abs_errors, abs_errors))

    ### Mean Absolute Error
    mae = np.mean(abs_errors)

    ### Mean Squared Error
    mse = np.mean(np.square(errors))

    ### Root Mean Squared Error
    rmse = np.sqrt(mse)

    ### Mean Absolute Percentage Error
    mape = np.mean(np.abs(np.divide(errors, actuals)))

    ### Weighted Mean Absolute Percentage Error (WMAPE)
    wmape = np.sum(abs_errors) / np.sum(np.abs(actuals))

    print("Method: ", method)
    print("Weighted Mean Absolute Percentage Error (WMAPE): ", wmape)
    print("Service Level-Adjusted Mean Absolute Error: ", sla_mae)
    print("Mean Absolute Error: ", mae)
    print("Root Mean Squared Error: ", rmse)
    print("Mean Absolute Percentage Error: ", mape)
    print("---------------------------")

# Exemplo de chamada
# calculate_performance(test_labels, normalized_predictions, 'Random Forest')

# Testando os 3 modelos

In [18]:
# One pipeline per candidate regressor; each runs the `preprocessing`
# transformer first and then the model.
pipeline_rf = Pipeline([
    ('preprocessor', preprocessing),
    ('model', RandomForestRegressor(random_state=42)),
])

pipeline_gb = Pipeline([
    ('preprocessor', preprocessing),
    ('model', GradientBoostingRegressor(random_state=42)),
])

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessing),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42)),
])

In [19]:

# Fit each pipeline on the training split, predict on the test split and
# print its metrics, in the order Random Forest, Gradient Boosting, XGBoost.
print("Performance dos modelos individuais:")
named_pipelines = {
    "Random Forest": pipeline_rf,
    "Gradient Boosting": pipeline_gb,
    "XGBoost": pipeline_xgb,
}
for name, pipeline in named_pipelines.items():
    # fit() returns the fitted pipeline, so prediction can be chained onto it
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)
    calculate_performance(y_test, y_pred, name)


Performance dos modelos individuais:
Method:  Random Forest
Weighted Mean Absolute Percentage Error (WMAPE):  0.0073519082647571085
Service Level-Adjusted Mean Absolute Error:  0.771869943746544
Mean Absolute Error:  0.5048582302379595
Root Mean Squared Error:  0.9050176124532725
Mean Absolute Percentage Error:  0.00756744835428615
Stockout Percentage:  0.0038360902159025483
Overstock Percentage:  0.00362091560219257
---------------------------
Method:  Gradient Boosting
Weighted Mean Absolute Percentage Error (WMAPE):  0.01884396331918904
Service Level-Adjusted Mean Absolute Error:  1.9479347364277197
Mean Absolute Error:  1.2940218551964089
Root Mean Squared Error:  1.8001452707935242
Mean Absolute Percentage Error:  0.01965442235455145
Stockout Percentage:  0.00940330903746612
Overstock Percentage:  0.00971202509109673
---------------------------
Method:  XGBoost
Weighted Mean Absolute Percentage Error (WMAPE):  0.012562339370027504
Service Level-Adjusted Mean Absolute Error:  1.297

# Stacking Regressor dos 3 modelos

In [20]:
# Assemble the stacking ensemble: the three pipelines are the base estimators
# and a random forest is the meta-model that combines their predictions.
base_estimators = [
    ('random_forest', pipeline_rf),
    ('gradient_boosting', pipeline_gb),
    ('xgboost', pipeline_xgb),
]
stacking_regressor = StackingRegressor(
    estimators=base_estimators,
    final_estimator=RandomForestRegressor(random_state=42),  # meta-model
)

In [21]:
# Train the stacking regressor on the training split and predict the test split
# (fit() returns the fitted estimator, so predict() is chained onto it).
y_pred_stacking = stacking_regressor.fit(X_train, y_train).predict(X_test)

# Report the ensemble's metrics with the same function used for the base models.
calculate_performance(y_test, y_pred_stacking, 'stacking_regressor')

Method:  stacking_regressor
Weighted Mean Absolute Percentage Error (WMAPE):  0.00813471269356284
Service Level-Adjusted Mean Absolute Error:  0.8450095948967888
Mean Absolute Error:  0.5586136967532038
Root Mean Squared Error:  0.9195815350487261
Mean Absolute Percentage Error:  0.008379348783371663
Stockout Percentage:  0.004213371297469998
Overstock Percentage:  0.004058469631116947
---------------------------


# Salvando Resultados

In [22]:
# Pair the true test targets with the stacking predictions and renumber rows from 0.
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_stacking})
results_df = results_df.reset_index(drop=True)

# Persist the comparison table as CSV (without the index column).
results_df.to_csv('stacking_regressor_results.csv', index=False)

results_df.head()

Unnamed: 0,Actual,Predicted
0,72.0,75.25
1,59.0,59.09
2,73.0,73.17
3,69.0,69.5
4,53.0,50.06


# Importância de cada modelo

In [23]:
# The meta-model's "features" are labelled with the names of the base estimators.
# NOTE(review): assumes the meta-model receives one input column per base
# estimator, in the order they were given to the ensemble — confirm.
meta_feature_names = [estimator_name for estimator_name, _ in stacking_regressor.estimators]

# Importance the fitted final estimator assigns to each of its inputs.
feature_importances_meta = stacking_regressor.final_estimator_.feature_importances_

# Tabulate the importances, most important first.
meta_importance_df = pd.DataFrame(
    {'Feature': meta_feature_names, 'Importance': feature_importances_meta}
).sort_values('Importance', ascending=False)

# Save the meta-model importances to CSV.
meta_importance_df.to_csv('meta_feature_importances_stacking.csv', index=False)

# Show the meta-model importances.
print("Importâncias das Features no Modelo Meta:")
meta_importance_df.head()

Importâncias das Features no Modelo Meta:


Unnamed: 0,Feature,Importance
0,random_forest,0.98446
2,xgboost,0.008714
1,gradient_boosting,0.006827
