In [40]:
# Imports
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import numpy as np

# Load data

In [41]:
# Load the operator stats table. The relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="..//ingestion//arknights_operator_stats.csv")
# Preview the first rows as a quick sanity check of the load
df.head()

Unnamed: 0,operator,rarity,class,promotion_level,level,hp,attack,defense,resistance,redeployment_time,dp_cost,block_count,attack_interval,cn_release_date,global_release_date,is_limited
0,Mizuki,6.0,Specialist,0,1,760,372,155,10,70,19,0,3.5,8/3/2021,,0.0
1,Mizuki,6.0,Specialist,0,2,765,375,157,10,70,19,0,3.5,8/3/2021,,0.0
2,Mizuki,6.0,Specialist,0,3,771,379,158,10,70,19,0,3.5,8/3/2021,,0.0
3,Mizuki,6.0,Specialist,0,4,776,382,160,10,70,19,0,3.5,8/3/2021,,0.0
4,Mizuki,6.0,Specialist,0,5,782,385,161,10,70,19,0,3.5,8/3/2021,,0.0


# Pre-process data

In [42]:
# Inspect the column names available in the loaded frame
df.columns

Index(['operator', 'rarity', 'class', 'promotion_level', 'level', 'hp',
       'attack', 'defense', 'resistance', 'redeployment_time', 'dp_cost',
       'block_count', 'attack_interval', 'cn_release_date',
       'global_release_date', 'is_limited'],
      dtype='object')

In [43]:
# Collapse the table to a single row per operator:
#   1. discard the release-date and limited-flag columns (not used as features);
#   2. take the column-wise maximum within each operator group;
#   3. move "operator" back from the index into a regular column.
# NOTE(review): max() is computed for each column independently, so a row equals
# the operator's maxed-out stats only if no stat decreases with level — confirm.
df_max_stats = (
    df.drop(columns=["cn_release_date", "global_release_date", "is_limited"])
    .groupby("operator")
    .max()
    .reset_index()
)

In [44]:
# Put the label ("class") in the last position, keeping every other column in its
# current relative order
new_col_order = list(df_max_stats.columns)
new_col_order.append(new_col_order.pop(new_col_order.index("class")))
df_max_stats = df_max_stats.loc[:, new_col_order]

In [45]:
# Keep the operator names aside, in row order
operators = df_max_stats["operator"].to_list()
# From here on the frame holds only the numeric features and the "class" label
df_max_stats = df_max_stats.drop(columns="operator")

In [46]:
# Scaling step for the numeric features, wrapped in its own single-step pipeline
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Positional indices of the feature columns: every column except the last one,
# which holds the "class" label after the column reordering done earlier
numeric_features = list(range(len(df_max_stats.columns) - 1))

# Apply the scaling pipeline to those columns
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)],
)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('scaler', StandardScaler())]),
                                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])])

# Build the Decision Tree model

In [49]:
# Columns used as model inputs
features_list = [
    "rarity",
    "promotion_level",
    "level",
    "hp",
    "attack",
    "defense",
    "resistance",
    "redeployment_time",
    "dp_cost",
    "block_count",
    "attack_interval",
]

# Target column. Selecting with a one-element list keeps the label array 2-D
# (one column), which the misclassification report further down relies on.
label_list = ["class"]

features = df_max_stats.loc[:, features_list].to_numpy()
label = df_max_stats.loc[:, label_list].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    shuffle=True,
    random_state=123,
)

In [50]:
# Create the model as a pipeline that includes the pre-processing steps and the
# algorithm, so the scaler is fitted on the training split only and re-applied
# unchanged at prediction time.
# The tree is given a fixed random_state (the same seed as the train/test split):
# scikit-learn randomly permutes the candidate features at every split, so without
# a seed the fitted tree, and therefore the reported accuracy, can change between runs.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", tree.DecisionTreeClassifier(random_state=123))
])
# Train the model
pipeline.fit(x_train, y_train)
# Make predictions
y_pred = pipeline.predict(x_test)

In [51]:
# Fraction of test rows whose class was predicted correctly, expressed as a
# percentage rounded to two decimals
accuracy = round(100 * metrics.accuracy_score(y_true=y_test, y_pred=y_pred), 2)
f"Accuracy: {accuracy}%"

'Accuracy: 78.57%'

In [56]:
# Print every test sample whose predicted class differs from the true one.
# y_test is a column vector, so it is flattened in order to compare plain class
# names instead of one-element arrays. The loop variable is named `actual` so it
# does not overwrite the notebook-level `label` array built in the train/test
# split cell.
for actual, pred in zip(y_test.ravel(), y_pred):
    if actual != pred:
        print(f"{actual} was wrongly predicted as {pred}")

Vanguard was wrongly predicted as Guard
Defender was wrongly predicted as Sniper
Vanguard was wrongly predicted as Guard
Guard was wrongly predicted as Vanguard
Supporter was wrongly predicted as Specialist
Vanguard was wrongly predicted as Supporter
Sniper was wrongly predicted as Specialist
Supporter was wrongly predicted as Specialist
Specialist was wrongly predicted as Sniper
