In [1]:
# Imports
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

# Load data

In [2]:
# Load the operator stats CSV (the path is relative to the notebook's working directory)
df = pd.read_csv("..//ingestion//arknights_operator_stats.csv")
# Preview the first rows to check that the columns were parsed as expected
df.head()

Unnamed: 0,operator,rarity,class,promotion_level,level,hp,attack,defense,resistance,redeployment_time,dp_cost,block_count,attack_interval,cn_release_date,global_release_date,is_limited
0,Pallas,6.0,Guard,0,1,794,302,213,0,70,15,2,1.05,7/2/2021,,False
1,Pallas,6.0,Guard,0,2,800,305,215,0,70,15,2,1.05,7/2/2021,,False
2,Pallas,6.0,Guard,0,3,806,307,217,0,70,15,2,1.05,7/2/2021,,False
3,Pallas,6.0,Guard,0,4,812,310,218,0,70,15,2,1.05,7/2/2021,,False
4,Pallas,6.0,Guard,0,5,818,313,220,0,70,15,2,1.05,7/2/2021,,False


# Pre-process data

In [3]:
# List every column so we can decide which ones to drop before modelling
df.columns

Index(['operator', 'rarity', 'class', 'promotion_level', 'level', 'hp',
       'attack', 'defense', 'resistance', 'redeployment_time', 'dp_cost',
       'block_count', 'attack_interval', 'cn_release_date',
       'global_release_date', 'is_limited'],
      dtype='object')

In [4]:
# Build one row per operator in a single chain:
#   1. drop the release-date / limited-banner columns, which are not used as features
#   2. collapse all rows of an operator into the per-column maximum
#   3. turn the "operator" group key back into a regular column
# NOTE(review): max() is computed independently for every column, so the result is
# the operator's maxed-out row only if each stat peaks at the highest level —
# confirm this holds for columns such as attack_interval and redeployment_time.
df_max_stats = (
    df
    .drop(columns=["cn_release_date", "global_release_date", "is_limited"])
    .groupby("operator")
    .max()
    .reset_index()
)

In [5]:
# Move the label column ("class") to the end, keeping every other column in place
new_col_order = list(df_max_stats.columns)
# Pop "class" from its current position and re-append it as the last entry
new_col_order.append(new_col_order.pop(new_col_order.index("class")))
df_max_stats = df_max_stats[new_col_order]

In [6]:
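# Shuffle the rows (left disabled: train_test_split is called with shuffle=True
# further down, so the train/test split is already randomized)
#df_max_stats= df_max_stats.sample(frac=1).reset_index(drop=True)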

In [7]:
# Encode the operator names once, and derive both the lookup list and the
# integer ids from the same Categorical so the two can never get out of sync.
operator_categorical = pd.Categorical(df_max_stats["operator"])
# operators[code] is the name behind each integer code. Taking the categories
# (instead of the column values in row order) keeps the lookup valid even if
# the rows are not sorted by name or contain duplicate names.
operators = operator_categorical.categories.tolist()
# Replace the names with their integer codes in the DF (the model can't be trained with string data!)
df_max_stats["operator"] = operator_categorical.codes
# NOTE: this cell overwrites the "operator" column, so re-running it on its own
# would rebuild `operators` from the integer codes; re-run from the
# pre-processing cell instead.

In [8]:
# Preview the pre-processed frame: one row per operator, "operator" holds integer codes, "class" is the last column
df_max_stats.head()

Unnamed: 0,operator,rarity,promotion_level,level,hp,attack,defense,resistance,redeployment_time,dp_cost,block_count,attack_interval,class
0,0,2.0,0,30,1378,400,50,10,70,24,1,2.9,Caster
1,1,6.0,2,90,2034,703,152,10,70,13,1,1.3,Specialist
2,2,5.0,2,80,1420,633,124,20,80,22,1,1.6,Caster
3,3,4.0,2,70,1576,735,209,0,70,18,1,1.6,Sniper
4,4,3.0,1,55,1080,365,134,0,70,11,1,1.0,Sniper


# Build the Decision Tree model

In [9]:
# Column holding the target to predict
label_list = ["class"]

# Columns fed to the model, in the order they appear in the feature matrix
features_list = [
    "operator",
    "rarity",
    "promotion_level",
    "level",
    "hp",
    "attack",
    "defense",
    "resistance",
    "redeployment_time",
    "dp_cost",
    "block_count",
    "attack_interval",
]

# Plain NumPy arrays for scikit-learn; `label` stays 2-D (one column) because
# it is selected with a list of column names
features = df_max_stats.loc[:, features_list].to_numpy()
label = df_max_stats.loc[:, label_list].to_numpy()

# Hold out 20% of the operators for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    shuffle=True,
    random_state=123,
)

In [10]:
# Create the model. A fixed random_state makes the fitted tree (and therefore
# the accuracy reported below) reproducible: scikit-learn randomly permutes the
# candidate features at every split, so an unseeded tree can pick different
# splits between runs when several are equally good.
model = tree.DecisionTreeClassifier(random_state=123)
# Train
model.fit(x_train, y_train)
# And make predictions using the test dataset
y_pred = model.predict(x_test)
# NOTE(review): the first feature, "operator", is an integer code standing in for
# the operator's name, i.e. an identifier rather than a stat. The tree is free to
# split on it, which cannot generalize to unseen operators — consider training
# without that column (the review cell still needs it to recover the names).

In [11]:
# Fraction of test operators whose class was predicted correctly
accuracy_ratio = metrics.accuracy_score(y_test, y_pred)
# Express it as a percentage with two decimals for display
accuracy = round(accuracy_ratio * 100, 2)
f"Accuracy: {accuracy}%"

'Accuracy: 80.95%'

## Review wrong predictions

In [12]:
# Rebuild a DF from the test features so each prediction can be inspected by name
review_df = pd.DataFrame(x_test, columns=features_list)
# Attach the true labels and the model's predictions
review_df["class_real"] = y_test
review_df["class_predicted"] = y_pred

# Translate the numeric operator ids back into names
# (the ids come back as floats from the NumPy feature matrix, hence the int cast)
review_df["operator"] = [operators[int(operator_id)] for operator_id in review_df["operator"]]

# Only these columns are needed to review a prediction
review_columns = ["operator", "class_real", "class_predicted"]
# Show just the rows where the prediction disagrees with the real class
mispredicted = review_df["class_real"] != review_df["class_predicted"]
review_df.loc[mispredicted, review_columns]

Unnamed: 0,operator,class_real,class_predicted
0,Spot,Defender,Guard
1,Scene,Supporter,Specialist
4,Tachanka,Guard,Specialist
6,Frostleaf,Guard,Specialist
12,Durin,Caster,Supporter
14,Rangers,Sniper,Supporter
17,Tsukinogi,Supporter,Specialist
19,Bagpipe,Vanguard,Guard
