***Link:*** [Kaggle dataset](https://www.kaggle.com/datasets/nudratabbas/pokmon-stats-and-types-generation-1-2-index/data)

In [1]:
#!/usr/bin/env python3
"""
POKEMON IDENTIFIER - ML-based Pokemon identification
K-Nearest Neighbors
"""

import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import pairwise_distances
import warnings

warnings.filterwarnings('ignore')

# Column groups fed to the three identifiers (the order here is the column
# order every query vector must follow).
physical_features = [
    'height', 'weight', 'type_1_encoded', 'type_2_encoded', 'bmi',
    'is_dual_type'
]
stats_features = [
    'hp', 'attack', 'defense', 'special_attack', 'special_defense', 'speed',
    'total_stats'
]
combined_features = physical_features + stats_features

# One encoder per categorical column; they are kept around so that queries
# can be encoded (types) and predictions decoded (names) later on.
le_type1 = LabelEncoder()
le_type2 = LabelEncoder()
le_pokemon = LabelEncoder()

df = (
    pd.read_csv('pokemon_stats_2025.csv')
    # Single-type Pokemon have no secondary type; give them an explicit label
    # so the column can be label-encoded.
    .assign(type_2=lambda d: d['type_2'].fillna('none'))
    .assign(
        type_1_encoded=lambda d: le_type1.fit_transform(d['type_1']),
        type_2_encoded=lambda d: le_type2.fit_transform(d['type_2']),
        pokemon_id=lambda d: le_pokemon.fit_transform(d['name']),
        # Weight divided by squared height, in the dataset's own units.
        bmi=lambda d: d['weight'] / d['height'] ** 2,
        is_dual_type=lambda d: d['type_2'].ne('none').astype(int),
        total_stats=lambda d: d[[
            'hp', 'attack', 'defense', 'special_attack', 'special_defense',
            'speed'
        ]].sum(axis=1),
    )
)

# Target: the encoded Pokemon name of every row.
y = df['pokemon_id'].values

def _fit_knn_identifier(frame, feature_names, labels):
    """Standardise the selected columns and fit a 1-nearest-neighbour model.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding at least the columns listed in ``feature_names``.
    feature_names : list of str
        Columns used as model inputs, in the order queries must follow.
    labels : array-like
        One target label per row of ``frame``.

    Returns
    -------
    tuple
        ``(X, scaler, X_scaled, model)``: the raw feature matrix, the fitted
        ``StandardScaler``, the scaled matrix and the fitted
        ``KNeighborsClassifier``.
    """
    X = frame[feature_names].values

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    model = KNeighborsClassifier(n_neighbors=1, weights='distance')
    model.fit(X_scaled, labels)

    return X, scaler, X_scaled, model


# NOTE: every "Accuracy" printed below is measured on the same rows the model
# was fitted on. It is a lookup sanity check, not an estimate of how well the
# model generalises.

# Identifier driven by physical traits (size, types, derived ratio).
X_physical, scaler_physical, X_physical_scaled, knn_physical = (
    _fit_knn_identifier(df, physical_features, y))

train_acc = knn_physical.score(X_physical_scaled, y)
print(f"\nAccuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")

# Identifier driven by base stats only.
X_stats, scaler_stats, X_stats_scaled, knn_stats = (
    _fit_knn_identifier(df, stats_features, y))

train_acc_stats = knn_stats.score(X_stats_scaled, y)
print(f"\nAccuracy: {train_acc_stats:.4f} ({train_acc_stats*100:.2f}%)")

# Identifier using both feature groups together.
X_combined, scaler_combined, X_combined_scaled, knn_combined = (
    _fit_knn_identifier(df, combined_features, y))

train_acc_combined = knn_combined.score(X_combined_scaled, y)
print(f"\nAccuracy: {train_acc_combined:.4f} ({train_acc_combined*100:.2f}%)")

# Each bundle carries what is needed to reload one identifier and query it:
# the fitted model and scaler, the feature column order, the encoders and a
# reference copy of the data the model was fitted on.
pkg_physical = {
    'model': knn_physical,
    'scaler': scaler_physical,
    'features': physical_features,
    'le_pokemon': le_pokemon,
    'le_type1': le_type1,
    'le_type2': le_type2,
    'dataset': df[['name', 'height', 'weight', 'type_1', 'type_2']],
    'type': 'physical'
}

# The stats bundle needs no type encoders: its features are all numeric.
pkg_stats = {
    'model': knn_stats,
    'scaler': scaler_stats,
    'features': stats_features,
    'le_pokemon': le_pokemon,
    'dataset': df[['name'] + stats_features],
    'type': 'stats'
}

# The combined bundle ships the full engineered table.
pkg_combined = {
    'model': knn_combined,
    'scaler': scaler_combined,
    'features': combined_features,
    'le_pokemon': le_pokemon,
    'le_type1': le_type1,
    'le_type2': le_type2,
    'dataset': df,
    'type': 'combined'
}

# Serialise every bundle to its own file. Pickle files should only ever be
# loaded back from a trusted source.
for pkl_path, package in (
    ('pokemon_identifier_physical.pkl', pkg_physical),
    ('pokemon_identifier_stats.pkl', pkg_stats),
    ('pokemon_identifier_combined.pkl', pkg_combined),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(package, f)


def predict_top_k(model, scaler, X, le_pokemon, k=5):
    """Return the ``k`` closest training Pokemon for every query row.

    Parameters
    ----------
    model : fitted single-output nearest-neighbour classifier
        Must expose ``kneighbors``, ``classes_`` and ``_y`` (as
        ``KNeighborsClassifier`` does after ``fit``).
    scaler : fitted scaler
        Its ``transform`` is applied to ``X`` before the neighbour search.
    X : array-like of shape (n_queries, n_features)
        Raw (unscaled) query rows, columns in the order used for fitting.
    le_pokemon : fitted label encoder
        Its ``inverse_transform`` turns encoded ids back into names.
    k : int, default 5
        Number of neighbours to return per query.

    Returns
    -------
    list of list of (name, score)
        One inner list per query row, ordered as returned by
        ``model.kneighbors``. ``score`` is ``100 * (1 - d / d_max)``, where
        ``d_max`` is the largest of the k distances for that query. It is a
        relative closeness score, not a probability: the farthest neighbour
        scores 0 unless every distance is 0, in which case all score 100.
    """
    X_scaled = scaler.transform(X)

    distances, indices = model.kneighbors(X_scaled, n_neighbors=k)

    # `model._y` stores positions into `model.classes_`, not the labels given
    # to fit(). Map through `classes_` so the ids handed to the encoder are
    # right even when the fitted labels are not exactly 0..n_classes-1.
    neighbor_ids = model.classes_[model._y[indices]]

    results = []
    for pokemon_ids, dists in zip(neighbor_ids, distances):
        pokemon_names = le_pokemon.inverse_transform(pokemon_ids)

        # Guard against division by zero when every neighbour is an exact hit.
        farthest = dists.max()
        max_dist = farthest if farthest > 0 else 1
        confidences = 100 * (1 - dists / max_dist)

        results.append(list(zip(pokemon_names, confidences)))

    return results



Accuracy: 1.0000 (100.00%)

Accuracy: 0.9880 (98.80%)

Accuracy: 1.0000 (100.00%)


In [2]:
# TEST 1: query the physical model with Charizard's own profile.
print("\nCharizard (17 dm, 905 hg, fire/flying)")

height_dm, weight_hg = 17, 905
(t1_enc,) = le_type1.transform(['fire'])
(t2_enc,) = le_type2.transform(['flying'])
bmi = weight_hg / height_dm**2

# Column order follows `physical_features`:
# height, weight, type_1_encoded, type_2_encoded, bmi, is_dual_type
X_test = np.array([[height_dm, weight_hg, t1_enc, t2_enc, bmi, 1]])
top_k = predict_top_k(knn_physical, scaler_physical, X_test, le_pokemon, k=5)

# The figure labelled "probability" is the relative closeness score returned
# by predict_top_k, so the farthest of the five neighbours shows 0%.
print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], start=1):
    print(f"    {i}. {name:15s} (probability: {conf:.1f}%)")



Charizard (17 dm, 905 hg, fire/flying)

  Top 5:
    1. Charizard       (probability: 100.0%)
    2. Moltres         (probability: 20.3%)
    3. Zapdos          (probability: 11.6%)
    4. Dodrio          (probability: 6.5%)
    5. Articuno        (probability: 0.0%)


In [3]:
# TEST 2: query the physical model with a single-type profile (Pikachu).
print("\nPikachu (4 dm, 60 hg, electric)")

height_dm, weight_hg = 4, 60
(t1_enc,) = le_type1.transform(['electric'])
# Single-type Pokemon use the 'none' placeholder as their secondary type.
(t2_enc,) = le_type2.transform(['none'])
bmi = weight_hg / height_dm**2

# Column order follows `physical_features`:
# height, weight, type_1_encoded, type_2_encoded, bmi, is_dual_type
X_test = np.array([[height_dm, weight_hg, t1_enc, t2_enc, bmi, 0]])
top_k = predict_top_k(knn_physical, scaler_physical, X_test, le_pokemon, k=5)

# The figure labelled "probability" is the relative closeness score returned
# by predict_top_k, so the farthest of the five neighbours shows 0%.
print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], start=1):
    print(f"    {i}. {name:15s} (probability: {conf:.1f}%)")



Pikachu (4 dm, 60 hg, electric)

  Top 5:
    1. Pikachu         (probability: 100.0%)
    2. Voltorb         (probability: 65.7%)
    3. Cleffa          (probability: 52.7%)
    4. Jolteon         (probability: 15.4%)
    5. Tyrogue         (probability: 0.0%)


In [4]:
# TEST 3: query the stats model with Charizard's base stats.
print("\nIdentify stats (78, 84, 78, 109, 85, 100)")

# hp, attack, defense, special_attack, special_defense, speed
base_stats = [78, 84, 78, 109, 85, 100]
# `stats_features` ends with total_stats, so append the sum (534) as the
# seventh column.
stats = np.array([base_stats + [sum(base_stats)]])
top_k_stats = predict_top_k(knn_stats, scaler_stats, stats, le_pokemon, k=5)

# The figure labelled "probability" is the relative closeness score returned
# by predict_top_k; neighbours at distance 0 all show 100%.
print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k_stats[0], start=1):
    print(f"    {i}. {name:15s} (probability: {conf:.1f}%)")



Identify stats (78, 84, 78, 109, 85, 100)

  Top 5:
    1. Typhlosion      (probability: 100.0%)
    2. Charizard       (probability: 100.0%)
    3. Golduck         (probability: 13.5%)
    4. Zapdos          (probability: 6.6%)
    5. Starmie         (probability: 0.0%)


In [5]:
# TEST 4: query the physical model with a made-up profile and list the
# closest known Pokemon.
print("\nUnknown parameters (15 dm, 700 hg, water/flying)")

height_dm, weight_hg = 15, 700
(t1_enc,) = le_type1.transform(['water'])
(t2_enc,) = le_type2.transform(['flying'])
bmi = weight_hg / height_dm**2

# Column order follows `physical_features`:
# height, weight, type_1_encoded, type_2_encoded, bmi, is_dual_type
X_test = np.array([[height_dm, weight_hg, t1_enc, t2_enc, bmi, 1]])
top_k = predict_top_k(knn_physical, scaler_physical, X_test, le_pokemon, k=5)

# The figure labelled "probability" is the relative closeness score returned
# by predict_top_k, so the farthest of the five neighbours shows 0%.
print("\n  Top 5:")
for i, (name, conf) in enumerate(top_k[0], start=1):
    print(f"    {i}. {name:15s} (probability: {conf:.1f}%)")


Unknown parameters (15 dm, 700 hg, water/flying)

  Top 5:
    1. Skarmory        (probability: 24.3%)
    2. Aerodactyl      (probability: 19.1%)
    3. Quagsire        (probability: 3.6%)
    4. Poliwrath       (probability: 2.8%)
    5. Crobat          (probability: 0.0%)
