In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GroupKFold 


In [3]:
queen_por_pontuacao= pd.read_excel('queen_por_pontuacao.xlsx', sheet_name=0)
queen_por_posicao= pd.read_excel('queen_por_pontuacao.xlsx', sheet_name=0)

In [26]:
cols = ["id","queen","ep","bom","ruim","media","colocacao","idade","tempfranquia","vencedora"]
df = pd.DataFrame(queen_por_pontuacao, columns=cols)
num_cols = ["ep","bom","ruim","media","colocacao","idade","tempfranquia","vencedora"]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(float)


In [27]:
feature_cols = ["ep","idade", "bom","ruim","media"]  # features por episódio
groups = df.groupby(["tempfranquia","queen"])
sequences = []
labels = []
meta = []
for (season, queen), g in groups:
    g_sorted = g.sort_values("ep")
    seq = g_sorted[feature_cols].values.astype(float)
    sequences.append(seq)
    labels.append(int(g_sorted["vencedora"].max()))  # 1 se venceu a temporada, 0 caso contrário
    meta.append({"season": int(season), "queen": queen, "n_eps": len(g_sorted)})

print("Total de sequências (queen x temporada):", len(sequences))
print("Distribuição de labels:", pd.Series(labels).value_counts().to_dict())
print("Exemplo meta[0]:", meta[0])
print("Exemplo seq[0]:\n", sequences[0])

Total de sequências (queen x temporada): 629
Distribuição de labels: {0: 575, 1: 54}
Exemplo meta[0]: {'season': 101, 'queen': 'Akashia', 'n_eps': 3}
Exemplo seq[0]:
 [[ 1. 22.  0.  5. -2.]
 [ 2. 22.  0. 10. -4.]
 [ 3. 22.  0. 16. -7.]]


In [29]:
groups.head() 

Unnamed: 0,id,queen,ep,bom,ruim,media,colocacao,idade,tempfranquia,vencedora
0,1,BeBe Zahara Benet,1.0,1.0,1.0,0.0,1.0,28.0,101.0,1.0
1,1,BeBe Zahara Benet,2.0,2.0,2.0,0.0,1.0,28.0,101.0,1.0
2,1,BeBe Zahara Benet,3.0,8.0,2.0,2.0,1.0,28.0,101.0,1.0
3,1,BeBe Zahara Benet,4.0,12.0,2.0,3.0,1.0,28.0,101.0,1.0
4,1,BeBe Zahara Benet,5.0,12.0,7.0,1.0,1.0,28.0,101.0,1.0
...,...,...,...,...,...,...,...,...,...,...
4025,629,Kara Might,1.0,4.0,0.0,1.0,9.0,22.0,307.0,0.0
4026,629,Kara Might,2.0,5.0,1.0,1.0,9.0,22.0,307.0,0.0
4027,629,Kara Might,3.0,5.0,7.0,-2.0,9.0,22.0,307.0,0.0
4028,630,Srirasha Hotsauce,1.0,0.0,5.0,-2.0,10.0,27.0,307.0,0.0


In [10]:
all_steps = np.vstack(sequences)  # concat de todos os episódios (2D)
scaler = StandardScaler().fit(all_steps)
sequences_scaled = [scaler.transform(s) for s in sequences]

# pad_sequences: tenta importar do tensorflow, se não existir usa implementação numpy
try:
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    use_tf_pad = True
except Exception:
    use_tf_pad = False

def pad_sequences_np(sequences, maxlen=None, dtype="float32", padding="post", value=0.0):
    if maxlen is None:
        maxlen = max(len(s) for s in sequences)
    n_features = sequences[0].shape[1]
    X = np.full((len(sequences), maxlen, n_features), fill_value=value, dtype=dtype)
    for i, s in enumerate(sequences):
        length = min(len(s), maxlen)
        if padding == "post":
            X[i, :length, :] = s[:length, :]
        else:
            X[i, -length:, :] = s[:length, :]
    return X

max_len = max(len(s) for s in sequences_scaled)
n_features = sequences_scaled[0].shape[1]
if use_tf_pad:
    X = pad_sequences(sequences_scaled, maxlen=max_len, dtype="float32", padding="post", value=0.0)
else:
    X = pad_sequences_np(sequences_scaled, maxlen=max_len, dtype="float32", padding="post", value=0.0)

y = np.array(labels).astype("float32")
print("X shape (padded):", X.shape, " y shape:", y.shape)

X shape (padded): (629, 14, 4)  y shape: (629,)


In [11]:
# Bloco 5: train/test
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

unique, counts = np.unique(y, return_counts=True)
print("Contagem labels:", dict(zip(unique.astype(int), counts)))
stratify_param = y if (len(np.unique(y))>1 and np.min(np.bincount(y.astype(int)))>1) else None

X_train, X_test, y_train, y_test, meta_train, meta_test = train_test_split(
    X, y, meta, test_size=0.2, random_state=42, stratify=stratify_param
)
print("Shapes -> X_train:", X_train.shape, "X_test:", X_test.shape)

# class weight (útil se houver muito desbalanceamento)
if len(np.unique(y_train))>1:
    classes = np.unique(y_train.astype(int))
    cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_train.astype(int))
    class_weight = dict(zip(classes, cw))
    print("class_weight:", class_weight)
else:
    class_weight = None
    print("Não há variação de classes no treino para calcular class_weight.")


Contagem labels: {0: 575, 1: 54}
Shapes -> X_train: (503, 14, 4) X_test: (126, 14, 4)
class_weight: {0: 0.5467391304347826, 1: 5.848837209302325}


In [12]:
# Bloco 6: construir e treinar RNN — requer tensorflow
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Masking, LSTM, Dense, Dropout
    from tensorflow.keras.callbacks import EarlyStopping

    tf.keras.backend.clear_session()
    model = Sequential([
        Masking(mask_value=0., input_shape=(max_len, n_features)),
        LSTM(32, return_sequences=False),
        Dropout(0.3),
        Dense(16, activation="relu"),
        Dense(1, activation="sigmoid")
    ])
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    print("Modelo compilado. Resumo:")
    model.summary()

    # Treino (poucas épocas para demo — aumente quando tiver mais dados)
    es = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True, verbose=1)
    history = model.fit(
        X_train, y_train,
        validation_split=0.2 if len(X_train)>1 else 0.0,
        epochs=20,
        batch_size=8,
        class_weight=class_weight,
        callbacks=[es],
        verbose=2
    )
    print("Treino finalizado.")
except Exception as e:
    print("Erro: não foi possível importar TensorFlow ou treinar. Erro:", e)
    print("Se você ainda não instalou TensorFlow, rode: !pip install tensorflow e execute este bloco novamente.")





  super().__init__(**kwargs)


Modelo compilado. Resumo:


Epoch 1/20
51/51 - 3s - 53ms/step - accuracy: 0.4701 - loss: 0.6252 - val_accuracy: 0.7228 - val_loss: 0.6522
Epoch 2/20
51/51 - 0s - 5ms/step - accuracy: 0.7214 - loss: 0.5078 - val_accuracy: 0.7822 - val_loss: 0.5484
Epoch 3/20
51/51 - 0s - 5ms/step - accuracy: 0.7562 - loss: 0.4441 - val_accuracy: 0.8317 - val_loss: 0.4300
Epoch 4/20
51/51 - 0s - 5ms/step - accuracy: 0.7687 - loss: 0.4001 - val_accuracy: 0.8317 - val_loss: 0.4248
Epoch 5/20
51/51 - 0s - 5ms/step - accuracy: 0.7687 - loss: 0.3802 - val_accuracy: 0.8317 - val_loss: 0.4128
Epoch 6/20
51/51 - 0s - 5ms/step - accuracy: 0.7662 - loss: 0.3677 - val_accuracy: 0.8416 - val_loss: 0.4047
Epoch 7/20
51/51 - 0s - 5ms/step - accuracy: 0.7562 - loss: 0.3629 - val_accuracy: 0.8317 - val_loss: 0.3939
Epoch 8/20
51/51 - 0s - 5ms/step - accuracy: 0.7687 - loss: 0.3422 - val_accuracy: 0.8317 - val_loss: 0.3822
Epoch 9/20
51/51 - 0s - 5ms/step - accuracy: 0.7711 - loss: 0.3436 - val_accuracy: 0.8317 - val_loss: 0.3925
Epoch 10/20
51/51 

In [13]:
# Bloco 7: avaliação e exemplos (requer que o modelo tenha sido treinado)
try:
    # avaliação
    results = model.evaluate(X_test, y_test, verbose=0)
    print("Teste - loss: {:.4f}, accuracy: {:.4f}".format(results[0], results[1]))

    # predições (probabilidades)
    y_prob = model.predict(X_test).ravel()
    y_pred = (y_prob >= 0.5).astype(int)

    # métricas extras
    from sklearn.metrics import roc_auc_score, classification_report
    if len(np.unique(y_test))>1:
        print("ROC AUC:", roc_auc_score(y_test, y_prob))
    print("\nClassification report (test):")
    print(classification_report(y_test, y_pred, zero_division=0))

    # mostrar exemplos
    print("\nExemplos de predições:")
    for i in range(min(8, len(y_test))):
        print(f"Queen: {meta_test[i]['queen']}, season: {meta_test[i]['season']}, n_eps: {meta_test[i]['n_eps']}, true: {int(y_test[i])}, prob: {y_prob[i]:.4f}")

    # demonstrar probabilidade usando apenas primeiros k episódios da primeira sequência de teste
    def predict_with_first_k(seq_padded, k):
        arr = seq_padded.copy()
        if k < seq_padded.shape[0]:
            arr[k:,:] = 0.0
        return model.predict(arr[None, ...])[0,0]

    if len(X_test) > 0:
        idx = 0
        print("\nEvolução de probabilidade com os primeiros k episódios (primeira sequência de teste):")
        for k in range(1, meta_test[idx]['n_eps']+1):
            print(f"k={k} -> prob: {predict_with_first_k(X_test[idx], k):.4f}")

except Exception as e:
    print("Erro durante avaliação/predição — verifique se o modelo foi treinado. Erro:", e)


Teste - loss: 0.3019, accuracy: 0.8413
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
ROC AUC: 0.9256916996047431

Classification report (test):
              precision    recall  f1-score   support

         0.0       1.00      0.83      0.90       115
         1.0       0.35      1.00      0.52        11

    accuracy                           0.84       126
   macro avg       0.68      0.91      0.71       126
weighted avg       0.94      0.84      0.87       126


Exemplos de predições:
Queen: Leona Winter, season: 3015, n_eps: 7, true: 0, prob: 0.8071
Queen: Sanjina DaBish Queen, season: 504, n_eps: 4, true: 0, prob: 0.1457
Queen: Alexis Mateo, season: 301, n_eps: 12, true: 0, prob: 0.3458
Queen: Amanda Tears, season: 1012, n_eps: 2, true: 0, prob: 0.0403
Queen: Jade Sotomayor, season: 101, n_eps: 4, true: 0, prob: 0.0347
Queen: Sherry Pie, season: 1201, n_eps: 12, true: 0, prob: 0.7900
Queen: Sminty Drop, season: 402, n_eps: 4, true: 0, prob: 0.1441
Queen

In [18]:
# 1) Obter as probabilidades
y_probs = model.predict(X_test).flatten()

# 2) Juntar meta_data das queens de teste
results = []
for idx, meta in enumerate(meta_test):   # meta_test = infos das queens no conjunto de teste
    results.append({
        "season": meta["season"],
        "queen": meta["queen"],
        "n_eps": meta["n_eps"],
        "prob_vencedora": float(y_probs[idx]),
        "real_vencedora": int(y_test[idx])
    })

df_results = pd.DataFrame(results)

# 3) Exportar para Excel
df_results.to_excel("probabilidades_queens.xlsx", index=False)

print("Arquivo 'probabilidades_queens.xlsx' salvo!")
print(df_results.head(10))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
Arquivo 'probabilidades_queens.xlsx' salvo!
   season                 queen  n_eps  prob_vencedora  real_vencedora
0    3015          Leona Winter      7        0.807085               0
1     504  Sanjina DaBish Queen      4        0.145716               0
2     301          Alexis Mateo     12        0.345755               0
3    1012          Amanda Tears      2        0.040292               0
4     101        Jade Sotomayor      4        0.034696               0
5    1201            Sherry Pie     12        0.790034               0
6     402           Sminty Drop      4        0.144131               0
7     206             Eva Blunt     10        0.841298               1
8     205         Beverly Kills      6        0.002638               0
9     602       Zahirah Zapanta      4        0.012125               0


In [25]:
results = []

n_features_model = X_train.shape[2]  # número de features que o modelo espera

for (season, qid), g in queen_por_pontuacao.groupby(["tempfranquia","queen"]):
    g = g.sort_values("ep")
    nome = g["queen"].iloc[0]

    row = {"queen": nome, "temporada": season}
    feats = g[["ep","bom","ruim","media","colocacao"]].values

    for t in range(1, len(feats)+1):
        prefix = feats[:t]

        padded = np.zeros((max_len, n_features_model))
        padded[:t, :prefix.shape[1]] = prefix  # garante compatibilidade

        padded_batch = np.expand_dims(padded, axis=0)
        prob = model.predict(padded_batch, verbose=0).ravel()[0]

        row[f"episodio_{t}"] = f"{prob*100:.1f}%"

    results.append(row)

df_results = pd.DataFrame(results)
df_results.to_excel("probabilidades_por_episodio.xlsx", index=False)
print(df_results.head())


ValueError: could not broadcast input array from shape (1,5) into shape (1,4)