In [1]:
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GroupShuffleSplit


In [2]:
# Load the first worksheet (sheet_name=0) of the workbook.
meio_posicao = pd.read_excel(
    "fim_posicao_queen.xlsx",
    sheet_name=0,
)
# Wrap the loaded data in the DataFrame the following cells index into.
df = pd.DataFrame(data=meio_posicao)

In [3]:
# ==== 2. Split features and target ====
# Target: the "vencedora" column.
y = df["vencedora"]
# Features: the four columns the classifier is trained on.
X = df.loc[:, ["bom", "ruim", "media", "idade"]]

In [4]:
# Group-aware hold-out split: rows sharing a 'tempfranquia' value are kept in
# the same partition, so no group contributes to both training and test data.
groups = df['tempfranquia']
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the first split produced by the generator is the only one.
train_idx, test_idx = next(gss.split(X, y, groups=groups))

# split() yields positional indices, hence .iloc rather than .loc.
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]


In [5]:
# Oversample with SMOTE using the training partition only; X_test / y_test
# are never passed to the sampler.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X=X_train, y=y_train)


In [6]:
# Random forest (200 trees, depth capped at 6, fixed seed) trained on the
# SMOTE-resampled training data. The fit call is left as the cell's last
# expression so its return value remains the cell output.
rf = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
rf.fit(X=X_res, y=y_res)

In [7]:
# Predict on the held-out test partition, then print the confusion matrix
# followed by the per-class precision / recall / f1 report.
y_pred = rf.predict(X_test)

matriz = confusion_matrix(y_test, y_pred)
print("Matriz de confusão:\n", matriz)

relatorio = classification_report(y_test, y_pred)
print("\nRelatório de classificação:\n", relatorio)


Matriz de confusão:
 [[30 10]
 [ 9  4]]

Relatório de classificação:
               precision    recall  f1-score   support

           0       0.77      0.75      0.76        40
           1       0.29      0.31      0.30        13

    accuracy                           0.64        53
   macro avg       0.53      0.53      0.53        53
weighted avg       0.65      0.64      0.65        53



In [8]:
# Pair each feature column with the fitted forest's feature_importances_
# value and list them from most to least important.
importances = pd.DataFrame(
    {"feature": X.columns, "importancia": rf.feature_importances_}
)
importances = importances.sort_values(by="importancia", ascending=False)

print("\nImportância das variáveis:\n", importances)


Importância das variáveis:
   feature  importancia
2   media     0.430796
1    ruim     0.277740
0     bom     0.203732
3   idade     0.087731


In [9]:
# Class probabilities for every row of the test partition.
probs = rf.predict_proba(X_test)

# One row per test queen: her name, both class probabilities and the true
# label, ordered by descending "prob_vencedora".
# Column 0 / column 1 of `probs` are read as classes 0 / 1, matching the
# labels shown in the classification report.
resultados = pd.DataFrame(
    {
        "queen": df.loc[X_test.index, "queen"],
        "prob_nao_vencedora": probs[:, 0],
        "prob_vencedora": probs[:, 1],
        "real": y_test.values,
    }
)
resultados = resultados.sort_values(by="prob_vencedora", ascending=False)

print(resultados)


                     queen  prob_nao_vencedora  prob_vencedora  real
221              Eva Blunt            0.019458        0.980542     0
189         Marina Summers            0.058400        0.941600     0
208                Alvilda            0.076375        0.923625     1
96           Scarlett BoBo            0.142011        0.857989     0
237             Suzie Toot            0.171427        0.828573     0
225            Pandora Nox            0.172726        0.827274     1
216       Cristian Peralta            0.215224        0.784776     1
143          Carmen Farala            0.251045        0.748955     1
235              Lexi Love            0.367143        0.632857     0
209               La Veuve            0.401589        0.598411     0
95               Rita Baga            0.405793        0.594207     0
218                Matraka            0.421095        0.578905     0
184       Ruby On The Nail            0.421856        0.578144     0
217              Gala Varo        

In [10]:
# Export the ranked predictions to Excel (row index omitted).
# to_excel does not create missing parent folders and raises OSError when the
# target directory is absent, so make sure './previsoes' exists first.
saida = Path('./previsoes/fim_forest_previsoes_vencedoras.xlsx')
saida.parent.mkdir(parents=True, exist_ok=True)
resultados.to_excel(saida, index=False)
