In [1]:
import os
import numpy as np
import pandas as pd
import re  
from pathlib import Path
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch

image_dir_path = 'dataset'
paths = []
pokemonNames, pokemonTarget, pokemonRGB, pokemonImage = [], [], [], []

def img_normalize():
    removeBadFiles()
    renameFile()
    removeAlphaTransparency()
    resize()

def removeBadFiles():
    for pokeFolder in os.listdir(image_dir_path):
        for imageFile in os.listdir(f"{image_dir_path}/{pokeFolder}"):
            try:
                if not imageFile[imageFile.index('.'):].lower() in ['.png', '.jpg', '.jpeg']:
                    os.remove(f"{image_dir_path}/{pokeFolder}/{imageFile}")
            except:
                pass

def renameFile():
    for pokeFolder in os.listdir(image_dir_path):
        for i, imageFile in enumerate(os.listdir(f"{image_dir_path}/{pokeFolder}")):
            pattern = "image\d*.*(p|P)*(g|G)"
            result = re.search(pattern, f"{image_dir_path}/{pokeFolder}/{imageFile}")
            if result is None:
                os.rename(f"{image_dir_path}/{pokeFolder}/{imageFile}", f"{image_dir_path}/{pokeFolder}/image{i}.jpg")

def removeAlphaTransparency():
    for pokeFolder in os.listdir(image_dir_path):
        for imageFile in os.listdir(f"{image_dir_path}/{pokeFolder}"):
            img = Image.open(f"{image_dir_path}/{pokeFolder}/{imageFile}")
            if not img.mode == 'RGB':
                img.convert('RGB').save(f"{image_dir_path}/{pokeFolder}/{imageFile}")

def resize():
    for pokeFolder in os.listdir(image_dir_path):
        for imageFile in os.listdir(f"{image_dir_path}/{pokeFolder}"):
            img = Image.open(f"{image_dir_path}/{pokeFolder}/{imageFile}")
            newImg = img.resize((128,128))
            newImg.save(f"{image_dir_path}/{pokeFolder}/{imageFile}")
    
img_normalize()

def createDataDebug(pokemon_bunch_data):
    print(pokemon_bunch_data['data'].shape)
    print(pokemon_bunch_data['images'].shape)
    print(pokemon_bunch_data['target'].shape)
    print(pokemon_bunch_data['target_names'].shape)

def createData():

    for path in Path(image_dir_path).rglob('*.jpg'):
        if path.parts[1] not in pokemonNames:
            pokemonNames.append(path.parts[1])
        paths.append(path.parts[-2:]) 
    image_dataset = pd.DataFrame(data=paths, columns=['Pokemon', 'Image'])

    for index in range(image_dataset.shape[0]):
        pokemonTarget.append(pokemonNames.index(image_dataset.loc[index]['Pokemon']))
        img = Image.open('dataset/' + image_dataset.loc[index]['Pokemon'] + '/' + image_dataset.loc[index]['Image'])
        pixel = img.load()
        tempRGB, tempRow = [], []
        for i in range(img.height):
            tempImage = []
            for j in range(img.width):
                tempRGB.extend([pixel[i, j][0], pixel[i, j][1], pixel[i, j][2]])
                tempImage.append(np.asarray(pixel[i, j]))
            tempRow.append(tempImage)
        pokemonImage.append(tempRow)
        pokemonRGB.append(tempRGB)

    pokemon_bunch_data = Bunch( data=np.asarray(pokemonRGB), 
                                images=np.asarray(pokemonImage),
                                target=np.asarray(pokemonTarget),
                                target_names=np.asarray(pokemonNames) )

    #createDataDebug(pokemon_bunch_data)
    return pokemon_bunch_data

pokemon_bunch_data = createData()

pokeTrain, pokeTest, labelTrain, labelTest = \
    train_test_split(pokemon_bunch_data['data'], pokemon_bunch_data['target'], test_size=0.1)

########### Random Forest Algorithm ###########

forest = RandomForestClassifier(n_estimators=200, max_depth=3, min_samples_leaf=9).fit(pokeTrain, labelTrain)
print("Now Testing RandomForest: \n " + "Accuracy: " + str(forest.score(pokeTest, labelTest)) + "%\n")

########### KNearestNeighbors Algorithm ###########

pokemonKNN = KNeighborsClassifier(n_neighbors = 3).fit(pokeTrain, labelTrain)
print("Now Testing kNN: \n " + "Accuracy: " +  str(pokemonKNN.score(pokeTest, labelTest))+ "%")

# All 150 pokemon takes ~40 minutes are so

# Next goal: Refactoring



Now Testing RandomForest: 
 Accuracy: 0.8620689655172413%

Now Testing kNN: 
 Accuracy: 0.7701149425287356%
