In [1]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
optical_recognition_of_handwritten_digits = fetch_ucirepo(id=80) 
  
# data (as pandas dataframes) 
X = optical_recognition_of_handwritten_digits.data.features 
y = optical_recognition_of_handwritten_digits.data.targets 
  
# metadata 
print(optical_recognition_of_handwritten_digits.metadata) 

# variable information 
print(optical_recognition_of_handwritten_digits.variables) 

{'uci_id': 80, 'name': 'Optical Recognition of Handwritten Digits', 'repository_url': 'https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits', 'data_url': 'https://archive.ics.uci.edu/static/public/80/data.csv', 'abstract': 'Two versions of this database available; see folder', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 5620, 'num_features': 64, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C50P49', 'creators': ['E. Alpaydin', 'C. Kaynak'], 'intro_paper': {'title': 'Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition', 'authors': 'C. Kaynak', 'published_in': 'MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University', 

In [2]:
y= y.to_numpy()
y =y.ravel()

In [3]:
import numpy as np
import random

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9750889679715302


In [6]:
def cal_pop_fitness(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:,columns]
        rf_classifier.fit(X_train_ind, y_train)

        y_pred = rf_classifier.predict(X_test.iloc[:,columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def select_mating_pool(pop, fitness, num_parents):
    # Selecting the best individuals in the current generation as parents for producing the offspring of the next generation.
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness == np.max(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = -1
    return parents

def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)

    for k in range(offspring_size[0]):
        # Always perform one-point crossover.
        crossover_point = np.random.randint(1, parents.shape[1])
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, :crossover_point] = parents[parent1_idx, :crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]

    return offspring

def mutation(offspring_crossover):
    # Mutation changes a single gene in each offspring randomly.
    for idx in range(offspring_crossover.shape[0]):
        # The random value to be added to the gene.
        random_value = np.random.uniform(-1.0, 1.0, 1)
        offspring_crossover[idx, 4] = offspring_crossover[idx, 4] + random_value
    return offspring_crossover

In [7]:
col_len = 16

sol_per_pop = 8
num_parents_mating = 4

# Defining the population size.
pop_size = (sol_per_pop,col_len) # The population will have sol_per_pop chromosome where each chromosome has num_weights genes.
#Creating the initial population.
new_population = np.random.randint(63, size=pop_size)
print(new_population)

[[19 30 42 49 17 21 34  9 49 13 14 41 27  4 55 23]
 [44 48 46 41 25  5 58  9 18 32 54 28 30 46 43 19]
 [27 36 37 55 52 33 60 12 56 28 22 47 47 31  7 29]
 [28 31  3 62 18 33 22  8 60 47 30  7 18 59  7 27]
 [11 16 14 36 62 53 50  9 39 29 17 48 46 62 32 14]
 [ 3 49 15 29  7 26 22 46 24 40 18 61  7  6  2 23]
 [20 22 35 21 30 22 19 10 13 56  1 40 39 61 53 29]
 [38 54 46 40 15  9 43 19 47 36 50 10 15 17 32 34]]


In [8]:
num_generations = 10
for generation in range(num_generations):
    # Measing the fitness of each chromosome in the population.
    fitness = cal_pop_fitness(new_population, X_train,y_train,X_test,y_test)

    print(fitness)

    # Selecting the best parents in the population for mating.
    parents = select_mating_pool(new_population, fitness, 
                                      num_parents_mating)
    
    # print(parents)

    # Generating next generation using crossover.
    offspring_crossover = crossover(parents,
                                       offspring_size=(pop_size[0]-parents.shape[0], col_len))


    # # Adding some variations to the offsrping using mutation.
    # offspring_mutation = mutation(offspring_crossover)

    # print(offspring_mutation)

    # Creating the new population based on the parents and offspring.
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_crossover  # should be offspring_mutation

    # new_population[0:parents.shape[0], :] = offspring_crossover

    # The best result in the current iteration.
    if generation%10 ==0:
        print("Generation : ", generation)
        # print(offspring_crossover)
        print()

[0.8778173190984578, 0.9045077105575327, 0.8440094899169632, 0.8001186239620404, 0.8155397390272835, 0.8185053380782918, 0.858244365361803, 0.8677342823250297]
Generation :  0

[0.9045077105575327, 0.8778173190984578, 0.8677342823250297, 0.858244365361803, 0.8374851720047449, 0.8558718861209964, 0.8623962040332147, 0.8920521945432978]
[0.9045077105575327, 0.8920521945432978, 0.8778173190984578, 0.8677342823250297, 0.8997627520759194, 0.8641755634638197, 0.8422301304863582, 0.7900355871886121]
[0.9045077105575327, 0.8997627520759194, 0.8920521945432978, 0.8778173190984578, 0.9045077105575327, 0.8997627520759194, 0.8641755634638197, 0.9199288256227758]
[0.9199288256227758, 0.9045077105575327, 0.9045077105575327, 0.8997627520759194, 0.9199288256227758, 0.9045077105575327, 0.9009489916963227, 0.9264531435349941]
[0.9264531435349941, 0.9199288256227758, 0.9199288256227758, 0.9045077105575327, 0.9264531435349941, 0.9199288256227758, 0.9134045077105575, 0.9045077105575327]
[0.9264531435349941

In [9]:
# Getting the best solution after iterating finishing all generations.
#At first, the fitness is calculated for each solution in the final generation.
fitness = cal_pop_fitness(new_population, X_train, y_train, X_test,y_test)
# Then return the index of that solution corresponding to the best fitness.
best_match_idx = np.where(fitness == np.max(fitness))

print("Best solution : ", new_population[best_match_idx, :])
print("Best solution fitness : ", fitness[best_match_idx])

Best solution :  [[[44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]
  [44 30 42 49 17 21 34  9 18 32 54 28 30 46 43 19]]]


TypeError: list indices must be integers or slices, not tuple