In [34]:
import pandas as pd

In [35]:
# Load the training set of cards from a CSV expected in the working directory
df = pd.read_csv('challenge_train.csv')

# Treinamento do modelo
Dado o dataset abaixo contendo informações relacionadas aos cards de Gods
Unchained, implemente e treine um modelo para classificar se o card se
enquadra em uma estratégia de “early” ou “late” game, com base nas features
“mana”, “attack”, “health”, “type” e “god”.

Descrição dos dados:
id (int) - Identificador único da carta no banco de dados do jogo
name (str) - Nome da carta
mana (int) - Custo de mana para colocar a carta na mesa
attack (int) - Dano que a carta causa ao oponente
health (int) - Resistência ao dano ou durabilidade da carta
type (str) - ['spell','creature','weapon','god power']
god (str) - ['death','neutral','deception','nature','light','war','magic']
strategy (str) - Estágio do jogo ['early','late']

In [36]:
# Display the loaded training frame
df

Unnamed: 0,id,name,mana,attack,health,type,god,strategy
0,1118,Firewine,5,0,0,spell,nature,early
1,1036,Leyhoard Hatchling,10,2,1,creature,magic,late
2,244,Aetherfuel Alchemist,6,4,4,creature,neutral,late
3,215,Millenium Matryoshka,4,2,2,creature,neutral,late
4,87013,Poison Peddler,4,1,3,creature,neutral,late
...,...,...,...,...,...,...,...,...
783,1252,Living Container,2,1,1,creature,death,early
784,1028,Famished Ghoul,4,4,4,creature,death,late
785,393,The Iron Horse,7,5,5,creature,neutral,late
786,73,Golem Excavator,7,6,7,creature,neutral,late


In [37]:
# Build the lookup tables between each card type and its integer code.
# Codes follow the order in which the types first appear in df.
types = df['type'].unique().tolist()
id2type = dict(enumerate(types))
type2id = {label: code for code, label in id2type.items()}

In [38]:
# Build the lookup tables between each god and its integer code.
# Codes follow the order in which the gods first appear in df.
gods = df['god'].unique().tolist()
id2god = dict(enumerate(gods))
god2id = {label: code for code, label in id2god.items()}

In [39]:
# Build the lookup tables between each strategy label and its integer code.
# Codes follow the order in which the labels first appear in df, so the
# meaning of 0/1 depends on the row order of the loaded file.
strategies = df['strategy'].unique().tolist()
id2strategy = dict(enumerate(strategies))
strategy2id = {label: code for code, label in id2strategy.items()}

In [40]:
# Encode the card type as its integer code (None when the type is not in type2id)
df['type_'] = df['type'].map(type2id.get)

In [41]:
# Encode the god as its integer code (None when the god is not in god2id)
df['god_'] = df['god'].map(god2id.get)

In [42]:
# Encode the target label as its integer code (None when the label is not in strategy2id)
df['strategy_'] = df['strategy'].map(strategy2id.get)

In [43]:
# Keep only the numeric model inputs and the encoded target
df_new = df.loc[:, ['mana', 'attack', 'health', 'type_', 'god_', 'strategy_']]

In [44]:
# Share of rows in each encoded strategy class
df_new['strategy_'].value_counts(normalize=True)

strategy_
0    0.548223
1    0.451777
Name: proportion, dtype: float64

In [45]:
# Display the modelling frame (numeric features plus the encoded target)
df_new

Unnamed: 0,mana,attack,health,type_,god_,strategy_
0,5,0,0,0,0,0
1,10,2,1,1,1,1
2,6,4,4,1,2,1
3,4,2,2,1,2,1
4,4,1,3,1,2,1
...,...,...,...,...,...,...
783,2,1,1,1,3,0
784,4,4,4,1,3,1
785,7,5,5,1,2,1
786,7,6,7,1,2,1


In [46]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [47]:
# Split the modelling frame into the target vector and the feature matrix
y = df_new['strategy_']
X = df_new.drop(columns='strategy_')  # every remaining column is a model input

In [48]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Candidate classifiers, keyed by the name used in the reports.
# RandomForest and DecisionTree are randomized estimators: their seed is pinned
# (same value as the train/test split) so the cross-validation scores and the
# test reports are reproducible after a kernel restart.
classifiers = {
    'logistic_regression': LogisticRegression(),
    'random_forest': RandomForestClassifier(random_state=42),
    'svm': SVC(),
    'knn': KNeighborsClassifier(),
    'decision_tree': DecisionTreeClassifier(random_state=42),
}

# One pipeline per candidate: standardize the features, then fit the classifier.
# Each pipeline gets its own StandardScaler instance so no fitted state is shared.
# NOTE(review): type_ and god_ are integer codes assigned by order of appearance,
# so scaling treats them as ordinal values — consider one-hot encoding them.
pipelines = {
    name: Pipeline([('scaler', StandardScaler()), ('clf', clf)])
    for name, clf in classifiers.items()
}


In [50]:
# Score every candidate pipeline with 5-fold cross-validation on the training split;
# results maps the model name to its array of per-fold accuracies
results = {
    model_name: cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    for model_name, candidate in pipelines.items()
}

In [51]:
# Summarize the cross-validation accuracy of each model (mean and spread over the folds)
for model_name, fold_scores in results.items():
    mean_acc = fold_scores.mean()
    std_acc = fold_scores.std()
    print(f"{model_name}: Mean accuracy = {mean_acc:.4f}, Std deviation = {std_acc:.4f}")


logistic_regression: Mean accuracy = 0.9984, Std deviation = 0.0032
random_forest: Mean accuracy = 0.9746, Std deviation = 0.0137
svm: Mean accuracy = 0.9810, Std deviation = 0.0119
knn: Mean accuracy = 0.9651, Std deviation = 0.0205
decision_tree: Mean accuracy = 0.9683, Std deviation = 0.0123


In [52]:
# Fit each pipeline on the training split and report its metrics on the held-out test split
for model_name, model in pipelines.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"\n{model_name} Classification Report:")
    print(report)


logistic_regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        92
           1       1.00      1.00      1.00        66

    accuracy                           1.00       158
   macro avg       1.00      1.00      1.00       158
weighted avg       1.00      1.00      1.00       158


random_forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        92
           1       0.98      0.98      0.98        66

    accuracy                           0.99       158
   macro avg       0.99      0.99      0.99       158
weighted avg       0.99      0.99      0.99       158


svm Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        92
           1       1.00      0.94      0.97        66

    accuracy                           0.97       158
   macro avg      

In [53]:
import joblib 

In [54]:
# Refit the selected pipeline on the training split and persist it to disk.
# NOTE(review): only the pipeline is saved; the type2id / god2id / id2strategy
# mappings needed to encode new cards and decode predictions are not persisted.
best_model = pipelines['logistic_regression'].fit(X_train, y_train)
joblib.dump(best_model, 'best_model.pkl')
print("Modelo salvo como best_model.pkl")

Modelo salvo como best_model.pkl


In [110]:
# Load the training cards and the test cards from the working directory
df1 = pd.read_csv('challenge_train.csv')
df2 = pd.read_csv('challenge_test.csv')

# Stack both sets into a single frame and index it by card id.
# This rebinds df, replacing the training frame used by the cells above.
df = pd.concat([df1, df2], ignore_index=True).set_index('id')

In [114]:
# Look up the card with id 1118; the list selector keeps the result as a DataFrame
gg = df.loc[[1118]]

In [115]:
# Display the selected card
gg

Unnamed: 0_level_0,name,mana,attack,health,type,god,strategy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1118,Firewine,5,0,0,spell,nature,early


In [78]:
# Display the test-set frame; its strategy column is empty in the recorded output.
# NOTE(review): this cell's execution count is lower than the cells above it,
# so the notebook was run out of order.
df2

Unnamed: 0,id,name,mana,attack,health,type,god,strategy
0,100042,Walking Plant,1,1,1,creature,nature,
1,1018,Black Jaguar,2,3,3,creature,nature,
2,129,Avatar of Death,6,6,6,creature,death,
3,87069,Mind Jolt,5,0,0,spell,magic,
4,1090,Minotaur Phalanx,6,4,8,creature,neutral,
...,...,...,...,...,...,...,...,...
193,1128,Wiccan Trapper,1,3,1,creature,neutral,
194,65,Pickpocket,4,0,0,spell,deception,
195,229,Phalanx Champion,4,3,4,creature,light,
196,883,Fated Arrival,3,0,0,spell,light,


In [80]:
# Display df.
# NOTE(review): the recorded output has *_x / *_y suffixed columns and a default
# integer index, which does not match the concat + set_index('id') cell above —
# it appears to come from a cell that is no longer in the notebook. Re-run or clear it.
df

Unnamed: 0,id,name_x,mana_x,attack_x,health_x,type_x,god_x,strategy_x,name_y,mana_y,attack_y,health_y,type_y,god_y,strategy_y
0,1118,Firewine,5.0,0.0,0.0,spell,nature,early,,,,,,,
1,1036,Leyhoard Hatchling,10.0,2.0,1.0,creature,magic,late,,,,,,,
2,244,Aetherfuel Alchemist,6.0,4.0,4.0,creature,neutral,late,,,,,,,
3,215,Millenium Matryoshka,4.0,2.0,2.0,creature,neutral,late,,,,,,,
4,87013,Poison Peddler,4.0,1.0,3.0,creature,neutral,late,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
981,1128,,,,,,,,Wiccan Trapper,1.0,3.0,1.0,creature,neutral,
982,65,,,,,,,,Pickpocket,4.0,0.0,0.0,spell,deception,
983,229,,,,,,,,Phalanx Champion,4.0,3.0,4.0,creature,light,
984,883,,,,,,,,Fated Arrival,3.0,0.0,0.0,spell,light,
