In [86]:
import os
import numpy as np
from pathlib import Path
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch
import time, string, random

def poke_predict(n, otherCases):
    # poke_predict takes an n INTEGER to determine how many times the user wants to iterate the machine learning process
    # if n = 1, the machine learning will be run once, and if n = 100, machine learning will be run 100 times
    # It is highly recommended to not use any n value higher than 10 as it will take a very long time.
    # otherCases is a BOOLEAN that signifies whether or not you want the other extra cases to be tested
    # By default, it is False, but you can set it to True. However, it will take much longer. 
    print("#################### Processing and Normalizing Images \n")

    # Specifies the directory of all the images
    image_dir_path = 'dataset'
    # Declares the four lists that will be needed to create the data for machine learning
    pokemonRGB, pokemonImage, pokemonTarget, pokemonNames = [], [], [], []

    # Starts the process of normalizing all the images taken from Kaggle
    def img_normalize():
        # This goes through every file in the directory
        for i, path in enumerate(Path(image_dir_path).rglob('*.*')):
            # If the file is NOT either a png or jpg
            if not path.parts[2][path.parts[2].index('.'):].lower() in ['.png', '.jpg', '.jpeg']:
                # Remove the file; we don't want it
                os.remove(path)
            # Otherwise check if it starts with 'new' to signify that it has been normalized already
            elif not path.parts[2].startswith('new'):
                # Opens the image
                with Image.open(path) as img:
                    # Checks if the image has been normalized by height already or not
                    if not (img.width == 64 and img.height == 64):
                        # Generates a 3 character long string
                        ranString = ''.join(random.choices(string.ascii_uppercase + string.digits, k=3))
                        # Does 3 things: it resizes all the images to a 64x64 image,
                        # Then it converts every image to RGB, so it gets rid of the Alpha channel if it has, 
                        # or other formats of images so that every image is the same
                        # Saves every image as a new name and as a png
                        img.resize((64,64)).convert('RGB').save(f"{path.parts[0]}/{path.parts[1]}/new{i}-{ranString}.png")
                        # Removes the old image file
                        os.remove(path)

    # Starts a timer to count how long it will take to normalize images
    start = time.perf_counter()
    img_normalize()
    end = time.perf_counter() 
    # On average, if the images are not normalized already, this will take about 30 seconds or so
    # If the images are normalized already, then it should take only  0.01 to check
    print(f"Time Elapsed to Process and Normalize Data: {end-start:0.4f} seconds!\n")

    print("#################### Creating Data From Images \n")

    # Starts the process of turning the images into data for machine learning
    def createData():
        # Again, goes through every image file
        for path in Path(image_dir_path).rglob('*.*'):
            # Checks if a Pokemon is in a list, if not, then append
            if path.parts[1] not in pokemonNames:
                pokemonNames.append(path.parts[1])
            # Appends the index value of the pokemon from pokemonNames to the pokemonTarget; think as if they correspond.
            # If Abra was the first pokemon in pokemonNames, the first value in pokemonTarget would be 0 (because 0th index)
            pokemonTarget.append(pokemonNames.index(path.parts[1]))
            # Opens the image
            with Image.open(path) as img:
                # Loads the image as tuple of RGB values 
                pixel = img.load()
                # Create two temporary lists
                tempRGB, tempRow = [], []   
                # Iterates by width and height (so every pixel)
                for i in range(img.width):
                    # Creates a new temporary list to help
                    tempImage = []
                    for j in range(img.height):
                        # tempRGB will be a 1D list that has ALL of the RGB values not in tuple form
                        tempRGB.extend([pixel[i, j][0], pixel[i, j][1], pixel[i, j][2]])
                        # tempImage will be a 1D list that has the RGB for EACH ROW
                        tempImage.append(np.asarray(pixel[i, j]))
                    # tempRow appends each row of RGB values from tempImage so it will be a 2D list of rows and columns of RGB values
                    # Essentially tempRow will make an image from rows of RGB values
                    tempRow.append(tempImage)
                # Each image in 2D will be appended to another list that will store all of the images from the directory
                pokemonImage.append(tempRow)
                # Takes the 1D array from tempRGB and appends it to one list
                pokemonRGB.append(tempRGB)

        # Takes all of the lists from above and uses thes scikit-learn Bunch class to create a Bunch object to use for Machine Learning
        # This is why the data has to be made like so
        # Referencing from our lecture and documentations, we can see that the machine learning algorithms for images uses Bunch for 
        # machine learning.
        return Bunch( data=np.asarray(pokemonRGB), 
                      images=np.asarray(pokemonImage),
                      target=np.asarray(pokemonTarget),
                      target_names=np.asarray(pokemonNames) )

    # Again, starts a timer to see how long it will take to create data
    start = time.perf_counter()
    pokemon_bunch_data = createData()
    end = time.perf_counter()
    # On average, it takes about 13 seconds or so to create data
    print(f"Time Elapsed to Create Data: {end-start:0.4f} seconds!\n")

    # Function to debug code if the Bunch object is incorrectly done
    def createDataDebug(pokemon_bunch_data):
        print(pokemon_bunch_data['data'].shape)
        print(pokemon_bunch_data['images'].shape)
        print(pokemon_bunch_data['target'].shape)
        print(pokemon_bunch_data['target_names'].shape)

    # This part takes very long
    forestData = []
    forestData2 = []
    knnData = []
    knnData2 = []
    for i in range(n):
        # We are purposely having random_state = i so we can measure the difference in the parameters for each case
        # Test size being 0.15 seems like a fair amount given the amount of images that we are working with
        pokeTrain, pokeTest, labelTrain, labelTest = \
            train_test_split(pokemon_bunch_data['data'], pokemon_bunch_data['target'], test_size=0.15, random_state=i)

        # For the sake of time, we have selected only two major cases that is of interest to compare
        # It is being compared to the default value of RandomForest
        # The paramater we are manipulating for RandomForest are:
        #   n_estimators
        #   criterion
        #   min_samples_leaf
        # Documetnation can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        # To see what which parameter does
        print(f"#################### Training RandomForest Classifier #{i+1}\n")
        start = time.perf_counter()
        pokemonForest = RandomForestClassifier().fit(pokeTrain, labelTrain)
        end = time.perf_counter()
        score = pokemonForest.score(pokeTest, labelTest)
        print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees!\n")
        forestData.append(score)

        start = time.perf_counter()
        pokemonForest = RandomForestClassifier(n_estimators=200, criterion='entropy').fit(pokeTrain, labelTrain)
        end = time.perf_counter()
        score = pokemonForest.score(pokeTest, labelTest)
        print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and entropy!\n")
        forestData2.append(score)

        # All other cases will not have their scores factored into the averages for the sake of skewing the accuracy.
        # It is pretty interesting to run this once, however, to see the accuracy of all the other cases.
        if otherCases:
            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(criterion='entropy').fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees and entropy!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(criterion='entropy', min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees and entropy and min sample leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(criterion='log_loss').fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees and log_loss!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(criterion='log_loss', min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees and log_loss and min sample leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(max_depth=3).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees with a max-depth of 3!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees with a min samples leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(max_depth=3, min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 100 trees with a max-depth of 3 and a min sample leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200, criterion='entropy', min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and entropy and min samples leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200, criterion='log_loss').fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and log_loss!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200, criterion='log_loss', min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and log_loss and a min samples leaf of 8!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200, max_depth=3).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and max-depth of 3!\n")

            start = time.perf_counter()
            pokemonForest = RandomForestClassifier(n_estimators=200, max_depth=3, min_samples_leaf=8).fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score = pokemonForest.score(pokeTest, labelTest)
            print(f"Testing RandomForest with {score:0.4f}% Accuracy in {end-start:0.4f} seconds with 200 trees and max-depth of 3 and a min sample leaf of 8!\n")
       
        # It is being compared to the default value of kNN
        # The paramater we are manipulating for kNN are:
        #   n_neighbors
        #   leaf_size
        #   weights
        # Documentation can be found here: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
        # To see what which parameter does
        print(f"#################### Training kNearestNeighbor Classifier #{i+1}\n")
        start = time.perf_counter()
        pokemonKNN = KNeighborsClassifier(n_neighbors = 7, leaf_size = 100, weights = "distance").fit(pokeTrain, labelTrain)
        end = time.perf_counter()
        score2 = pokemonKNN.score(pokeTest, labelTest)
        print(f"Testing kNN with {score2:0.4f}% Accuracy in {end-start:0.4f} seconds with 7 neighbors, leaf size 100, and distance method!\n")
        knnData2.append(score2)

        start = time.perf_counter()
        pokemonKNN = KNeighborsClassifier().fit(pokeTrain, labelTrain)
        end = time.perf_counter()
        score2 = pokemonKNN.score(pokeTest, labelTest)
        print(f"Testing kNN with {score2:0.4f}% Accuracy in {end-start:0.4f} seconds with 5 neighbors!\n")
        knnData.append(score2)

        # All other cases will not have their scores factored into the averages for the sake of skewing the accuracy.
        if otherCases:
            start = time.perf_counter()
            pokemonKNN = KNeighborsClassifier(leaf_size = 100, weights = "distance").fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score2 = pokemonKNN.score(pokeTest, labelTest)
            print(f"Testing kNN with {score2:0.4f}% Accuracy in {end-start:0.4f} seconds with 5 neighbors, leaf size 100, and distance method!\n")

            start = time.perf_counter()
            pokemonKNN = KNeighborsClassifier(n_neighbors = 3, leaf_size = 100, weights = "distance").fit(pokeTrain, labelTrain)
            end = time.perf_counter()
            score2 = pokemonKNN.score(pokeTest, labelTest)
            print(f"Testing kNN with {score2:0.4f}% Accuracy in {end-start:0.4f} seconds with 3 neighbors, leaf size 100, and distance method!\n")

    # Outputs the accuracy of both machine learning algorithms to see how they stack up
    # From even one trial run, we can see that randomForest is generally the better algorithm for handling images than kNN is
    # by over 15 points every time. 
    print(f"""#################### Conclusion \n
Over {n} trial(s), default RandomForest has an average of {sum(forestData)/len(forestData):0.4f}% accuracy.\n
Over {n} trial(s), modded RandomForest has an average of {sum(forestData2)/len(forestData2):0.4f}% accuracy.\n
Over {n} trial(s), default kNN has an average of {sum(knnData)/len(knnData):0.4f}% accuracy.\n
Over {n} trial(s), modded kNN has an average of {sum(knnData2)/len(knnData2):0.4f}% accuracy.\n""")

# Use this by default for quick results. About 1 minute, may take awhile depend on your computer/laptop.
poke_predict(1, False) 

# Run the bottom after running the above to see other performances. About 2 minutes 20 minutes or so
# poke_predict(1, True)

# Run to see the aggregate average over 10 trials, or more. About 3 minutes and 30 seconds or so
# poke_predict(10, False)

# The following below is a example of running poke_predict 10 times: 

#################### Conclusion 

# Over 10 trial(s), default RandomForest has an average of 0.8432% accuracy.

# Over 10 trial(s), modded RandomForest has an average of 0.8523% accuracy.

# Over 10 trial(s), default kNN has an average of 0.6355% accuracy.

# Over 10 trial(s), modded kNN has an average of 0.7236% accuracy.

#################### Processing and Normalizing Images 

Time Elapsed to Process and Normalize Data: 0.0137 seconds!

#################### Creating Data From Images 

Time Elapsed to Create Data: 11.4738 seconds!

#################### Training RandomForest Classifier #1

Testing RandomForest with 0.8591% Accuracy in 3.7848 seconds with 100 trees!

Testing RandomForest with 0.8727% Accuracy in 11.2050 seconds with 200 trees and entropy!

#################### Training kNearestNeighbor Classifier #1

Testing kNN with 0.6955% Accuracy in 0.0005 seconds with 7 neighbors, leaf size 100, and distance method!

Testing kNN with 0.6455% Accuracy in 0.0002 seconds with 5 neighbors!

#################### Conclusion 

Over 1 trial(s), default RandomForest has an average of 0.8591% accuracy.

Over 1 trial(s), modded RandomForest has an average of 0.8727% accuracy.

Over 1 trial(s), default kNN has an average of 0.6455% accuracy.

Over 1 trial(s), modded kNN has an average of 0.6955% accuracy.

