In [2]:
import sys
import os
from dotenv import load_dotenv
import pandas as pd
from BloomtechMonsterLab import Monster

# Load environment variables from .env
load_dotenv()

# Set working directory to project root
os.chdir(r"C:\Users\Owner\Documents\labs ds\BandersnatchStarter")

# Add project root to Python path
sys.path.append(os.getcwd())

# Import Database
from app.data import Database

# Initialize database
db = Database(collection="monsters")

# Seed data ONLY if empty
if db.count() == 0:
    print("Seeding monster data...")
    db.seed(1000)
else:
    print("Monster data already exists.")

# Load data into DataFrame
df = db.dataframe()

print("Data shape:", df.shape)
print("Columns:", df.columns)

df.head()




Seeding monster data...
Data shape: (1000, 9)
Columns: Index(['Name', 'Type', 'Level', 'Rarity', 'Damage', 'Health', 'Energy',
       'Sanity', 'Timestamp'],
      dtype='object')


Unnamed: 0,Name,Type,Level,Rarity,Damage,Health,Energy,Sanity,Timestamp
0,Spore Spirit,Fey,9,Rank 0,9d2,17.87,17.52,17.39,2025-12-31 08:12:27 AM
1,Imp,Demonic,7,Rank 4,7d10,65.99,65.71,74.86,2025-12-31 08:12:27 AM
2,Flame Mephit,Elemental,3,Rank 2,3d6+2,19.51,15.43,16.79,2025-12-31 08:12:27 AM
3,Hell Hound,Demonic,2,Rank 1,2d4,6.18,9.39,8.78,2025-12-31 08:12:27 AM
4,Magma Devil,Devilkin,10,Rank 0,10d2,20.52,20.45,20.81,2025-12-31 08:12:27 AM


In [None]:
# 2️ Prepare features and target for model training

target_col = "Rarity"

# Use ONLY numeric feature columns
feature_cols = ["Level", "Health", "Energy", "Sanity"]

X = df[feature_cols]
y = df[target_col]

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)



Training set size: (800, 4)
Test set size: (200, 4)


In [None]:
# 3️ Train multiple models and compare accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
}

# Train, predict, and store accuracy
accuracies = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")


RandomForest Accuracy: 0.9600
DecisionTree Accuracy: 0.9200
LogisticRegression Accuracy: 0.9750


In [None]:
# 4 Pick the best model based on accuracy
best_model_name = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model_name]

print(f"Best model: {best_model_name} with accuracy {best_accuracy:.4f}")

# Optional: store the trained model for later
best_model = models[best_model_name]


Best model: LogisticRegression with accuracy 0.9750
