In [None]:
import pandas as pd

In [None]:
# Load the tab-separated data file (read_table uses "\t" as its default separator).
df = pd.read_table("data/dnd_chars_all.tsv")
# Preview the first rows to sanity-check the parse.
df.head()

In [None]:
# Remove the ip / finger / hash columns; none of the visible cells use them.
# Rebinding `df` (rather than inplace=True) together with errors="ignore"
# keeps this cell idempotent: running it a second time no longer raises
# KeyError once the columns are already gone.
df = df.drop(columns=["ip", "finger", "hash"], errors="ignore")

In [114]:
# Inspect the AC column.
# A cell renders only its final expression; to show several objects from one
# cell, wrap them in display(...) instead of listing bare expressions.
df["AC"]

0        10
1        10
2        21
3        16
4        16
         ..
10889    13
10890    16
10891    19
10892    14
10893    13
Name: AC, Length: 10894, dtype: int64

In [None]:
# Keep only rows that have a class and both race fields, and attach the list
# of classes obtained by splitting the "|"-separated justClass string.
cdf = (
    df.dropna(subset=["justClass", "race", "processedRace"])
    .assign(justClass_expanded=lambda frame: frame["justClass"].str.split("|"))
)

# One row per (original row, class) pair, plus trimmed lower-case versions of
# the race, class and background columns.
cdf_expand = cdf.explode("justClass_expanded").assign(
    normalized_race=lambda frame: frame["processedRace"].str.strip().str.lower(),
    normalized_class=lambda frame: frame["justClass_expanded"].str.strip().str.lower(),
    normalized_background=lambda frame: frame["background"].str.strip().str.lower(),
)
cdf_expand.head()

In [None]:
# List the distinct values present in the processedAlignment column.
pd.unique(cdf_expand["processedAlignment"])

In [None]:
import matplotlib.pyplot as plt

In [None]:

# Count rows per race and keep only races with more than 100 rows so the
# stacked bars stay readable.
race_counts = cdf_expand["normalized_race"].value_counts()
races_to_keep = race_counts.loc[race_counts.gt(100)].index

# Race x class contingency table, restricted to the retained races.
pivot_table = pd.crosstab(cdf_expand["normalized_race"], cdf_expand["normalized_class"])
pivot_table_clean = pivot_table.loc[races_to_keep]

# Transpose so each bar is a class and each stacked segment is a race.
ax = pivot_table_clean.T.plot.bar(stacked=True, figsize=(14, 6))
ax.set_title("Race Distribution per Class")
ax.set_xlabel("Class")
ax.set_ylabel("Number of Characters")
# Place the legend outside the plotting area, to the right of the axes.
ax.legend(title='Race', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Binary task: use normalized_race, normalized_background and
# processedAlignment to predict whether a row's class is 'cleric'.

# NOTE(review): not referenced by any cell visible here; kept in case a later
# cell relies on the name.
training_data = []

features = cdf_expand[["normalized_race", "normalized_background", "processedAlignment"]]

# 1 for cleric rows, 0 for everything else (vectorised instead of a per-row lambda).
target = cdf_expand["normalized_class"].eq("cleric").astype(int)

In [123]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection  import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer

In [None]:
# Dense (non-sparse) output so the encoded matrix can be handed straight to
# pd.DataFrame in the next cell.
encoder = OneHotEncoder(
    sparse_output=False,
)

In [None]:
# Fit the one-hot encoder on the categorical features and wrap the resulting
# matrix in a DataFrame whose columns carry the generated feature names.
encoded_features = encoder.fit_transform(features)
encoded_feature_names = encoder.get_feature_names_out(features.columns)
encoded_features_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
# Show the generated column names as the cell output.
encoded_feature_names

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
split = train_test_split(encoded_features_df, target, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Logistic regression with default hyperparameters, fitted on the training part.
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# 5-fold cross-validated accuracy on the training part (cross_val_score fits
# clones, so the already-fitted `model` is left untouched).
cross_val_accuracy = cross_val_score(
    estimator=model, X=X_train, y=y_train, cv=5, scoring="accuracy"
)

# Accuracy of the fitted model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')

cross_val_accuracy

In [None]:
from sklearn.preprocessing import LabelEncoder

In [127]:
# Multi-class task: predict normalized_class from categorical attributes plus
# the six numeric stat columns.
label_encoder = LabelEncoder()

# Column groups are defined once so the transformer and the feature selection
# below cannot drift apart.
categorical_columns = ["normalized_race", "normalized_background", "good", "lawful", "countryCode"]
numeric_columns = ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']

column_transformer = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(sparse_output=False), categorical_columns),
    ("num", StandardScaler(), numeric_columns)
])

classifier_target = cdf_expand["normalized_class"]
classifier_target_encoded = label_encoder.fit_transform(classifier_target)
classifier_features = cdf_expand[categorical_columns + numeric_columns]

# NOTE(review): the transformer is fitted on all rows before the split, so the
# held-out rows influence the scaling statistics and category set. Consider a
# Pipeline fitted on the training part only.
classifier_encoded_features = column_transformer.fit_transform(classifier_features)
classifier_encoded_features_df = pd.DataFrame(
    classifier_encoded_features,
    columns=column_transformer.get_feature_names_out(classifier_features.columns),
)

X_train, X_test, y_train, y_test = train_test_split(classifier_encoded_features_df, classifier_target_encoded, test_size=0.2, random_state=42)
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100]
}
# `multi_class` is omitted: it is deprecated since scikit-learn 1.5, and with
# the saga solver a multi-class target is already fitted as multinomial.
# saga shuffles the data, so random_state is pinned to make the reported
# accuracy reproducible across runs.
model = LogisticRegression(max_iter=100, solver='saga', class_weight='balanced', random_state=42)
#grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
#grid_search.fit(X_train, y_train)
model.fit(X_train, y_train)

predictions = model.predict(X_test)

#print("Best parameters:", grid_search.best_params_)

# Map the integer predictions back to class names.
decoded_predictions = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

#cross_val_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
#cross_val_accuracy
# Get the unique classes from the predictions
#unique_pred_classes = pd.unique(predictions)

# Filter target names to match the unique classes in predictions
#filtered_target_names = label_encoder.inverse_transform(unique_pred_classes)

#print(classification_report(y_test, predictions, target_names=filtered_target_names))




Accuracy: 0.52




In [None]:
# Display the encoded feature matrix (one-hot categorical columns plus the
# scaled numeric columns) produced by the column transformer.
classifier_encoded_features_df

In [None]:
# Report which encoded class labels actually occur in the held-out targets.
unique_classes = pd.unique(y_test)
for line in (
    f"Unique classes in y_test: {unique_classes}",
    f"Number of unique classes in y_test: {len(unique_classes)}",
):
    print(line)

In [130]:
from sklearn.svm import SVC



# Fit an SVC with default hyperparameters to predict normalized_class from the
# one-hot encoded categorical columns and the standardised numeric columns.
CATEGORICAL_COLUMNS = ["normalized_race", "normalized_background", "good", "lawful", "countryCode"]
NUMERIC_COLUMNS = ['Str', 'Dex', 'Con', 'Int', 'Wis', 'Cha']

label_encoder = LabelEncoder()
column_transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(sparse_output=False), CATEGORICAL_COLUMNS),
        ("num", StandardScaler(), NUMERIC_COLUMNS),
    ]
)

# Integer-encode the class labels and select the model inputs.
classifier_target = cdf_expand["normalized_class"]
classifier_target_encoded = label_encoder.fit_transform(classifier_target)
classifier_features = cdf_expand[CATEGORICAL_COLUMNS + NUMERIC_COLUMNS]

# Encode / scale every row, then label the resulting matrix columns.
classifier_encoded_features = column_transformer.fit_transform(classifier_features)
encoded_column_names = column_transformer.get_feature_names_out(classifier_features.columns)
classifier_encoded_features_df = pd.DataFrame(classifier_encoded_features, columns=encoded_column_names)

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    classifier_encoded_features_df,
    classifier_target_encoded,
    test_size=0.2,
    random_state=42,
)

classifier = SVC().fit(X_train, y_train)
predictions = classifier.predict(X_test)

# Integer predictions mapped back to class names.
decoded_predictions = label_encoder.inverse_transform(predictions)

accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

#cross_val_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
#cross_val_accuracy
# Get the unique classes from the predictions
#unique_pred_classes = pd.unique(predictions)

# Filter target names to match the unique classes in predictions
#filtered_target_names = label_encoder.inverse_transform(unique_pred_classes)

#print(classification_report(y_test, predictions, target_names=filtered_target_names))


Accuracy: 0.60
