In [1]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
optical_recognition_of_handwritten_digits = fetch_ucirepo(id=80) 
  
# data (as pandas dataframes) 
X = optical_recognition_of_handwritten_digits.data.features 
y = optical_recognition_of_handwritten_digits.data.targets 
  
# metadata 
print(optical_recognition_of_handwritten_digits.metadata) 

# variable information 
print(optical_recognition_of_handwritten_digits.variables) 

{'uci_id': 80, 'name': 'Optical Recognition of Handwritten Digits', 'repository_url': 'https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits', 'data_url': 'https://archive.ics.uci.edu/static/public/80/data.csv', 'abstract': 'Two versions of this database available; see folder', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 5620, 'num_features': 64, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C50P49', 'creators': ['E. Alpaydin', 'C. Kaynak'], 'intro_paper': {'title': 'Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition', 'authors': 'C. Kaynak', 'published_in': 'MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University', 

In [2]:
y= y.to_numpy()
y =y.ravel()

In [3]:
import numpy as np
import random

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9750889679715302


In [6]:
svm_classifier = SVC(kernel='linear', C=1.0)

svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9733096085409253


In [7]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.966785290628707


In [8]:
def cal_pop_fitness(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:,columns]
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_classifier.fit(X_train_ind, y_train)

        y_pred = rf_classifier.predict(X_test.iloc[:,columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def fit_svm(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:, columns]
        svm_classifier = SVC(kernel='linear', C=1.0)
        svm_classifier.fit(X_train_ind, y_train)

        y_pred = svm_classifier.predict(X_test.iloc[:, columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def fit_xgboost(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:, columns]
        xgb_classifier = xgb.XGBClassifier()
        xgb_classifier.fit(X_train_ind, y_train)

        y_pred = xgb_classifier.predict(X_test.iloc[:, columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def select_mating_pool(pop, fitness, num_parents):
    # Selecting the best individuals in the current generation as parents for producing the offspring of the next generation.
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness == np.max(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = -1
    return parents

def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)

    for k in range(offspring_size[0]):
        # Always perform one-point crossover.
        crossover_point = np.random.randint(1, parents.shape[1])
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, :crossover_point] = parents[parent1_idx, :crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]

    return offspring

def mutation(offspring_crossover):
    # Mutation changes a single gene in each offspring randomly.
    for idx in range(offspring_crossover.shape[0]):
        # The random value to be added to the gene.
        random_value = np.random.uniform(-1.0, 1.0, 1)
        offspring_crossover[idx, 4] = offspring_crossover[idx, 4] + random_value
    return offspring_crossover

In [9]:
col_len = 8

sol_per_pop = 16
num_parents_mating = 8

# Defining the population size.
pop_size = (sol_per_pop,col_len) # The population will have sol_per_pop chromosome where each chromosome has num_weights genes.
#Creating the initial population.
new_population = np.random.randint(63, size=pop_size)
print(new_population)

[[ 5 34 15 33  0 60 37 51]
 [32 28 42 58 31 53 38 27]
 [24 16 23 40 33  7 54 41]
 [47 62 48 21 28 53 20 58]
 [43 56  6  9 38 61 54 32]
 [31 32 62 33 16 36 20 19]
 [53 20 48 24 16 35 47 56]
 [62 10 10  2 44 32 32 36]
 [31 14 61 21 14 23 24 18]
 [ 3 35 45 18 12 23 58  6]
 [25  6 52  6 58 27 60 11]
 [34 30  1 15  8 61 12 48]
 [38 13 35 56 37 17  4 60]
 [19 28 23 14  0  0  1 34]
 [16  4  7 24 59  9 28 35]
 [12 59 45 58 60 30 58 53]]


In [10]:
num_generations = 10
for generation in range(num_generations):
    # Measing the fitness of each chromosome in the population.
    fitness = fit_xgboost(new_population, X_train,y_train,X_test,y_test)

    print(fitness)

    # Selecting the best parents in the population for mating.
    parents = select_mating_pool(new_population, fitness, 
                                      num_parents_mating)
    
    # print(parents)

    # Generating next generation using crossover.
    offspring_crossover = crossover(parents,
                                       offspring_size=(pop_size[0]-parents.shape[0], col_len))


    # # Adding some variations to the offsrping using mutation.
    # offspring_mutation = mutation(offspring_crossover)

    # print(offspring_mutation)

    # Creating the new population based on the parents and offspring.
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_crossover  # should be offspring_mutation

    # new_population[0:parents.shape[0], :] = offspring_crossover

    # The best result in the current iteration.
    if generation%10 ==0:
        print("Generation : ", generation)
        # print(offspring_crossover)
        print()

[0.6346381969157769, 0.7230130486358244, 0.41637010676156583, 0.7461447212336892, 0.6565836298932385, 0.6500593119810202, 0.44780545670225386, 0.5693950177935944, 0.5255041518386714, 0.6850533807829181, 0.6690391459074733, 0.5652431791221827, 0.6731909845788849, 0.529655990510083, 0.5652431791221827, 0.6838671411625148]
Generation :  0

[0.7461447212336892, 0.7230130486358244, 0.6850533807829181, 0.6838671411625148, 0.6731909845788849, 0.6690391459074733, 0.6565836298932385, 0.6500593119810202, 0.6447212336892052, 0.66785290628707, 0.7153024911032029, 0.6441281138790036, 0.7034400948991696, 0.6393831553973903, 0.7437722419928826, 0.6500593119810202]
[0.7461447212336892, 0.7437722419928826, 0.7230130486358244, 0.7153024911032029, 0.7034400948991696, 0.6850533807829181, 0.6838671411625148, 0.6731909845788849, 0.7443653618030842, 0.6524317912218268, 0.6631079478054567, 0.7093712930011863, 0.5889679715302492, 0.7117437722419929, 0.7188612099644128, 0.7876631079478055]
[0.7876631079478055, 

In [11]:
# Getting the best solution after iterating finishing all generations.
#At first, the fitness is calculated for each solution in the final generation.
fitness = fit_xgboost(new_population, X_train, y_train, X_test,y_test)
# Then return the index of that solution corresponding to the best fitness.
best_match_idx = np.where(fitness == np.max(fitness))

print("Best solution : ", new_population[best_match_idx, :])
print("Best solution fitness : ", fitness[best_match_idx[0][0]])

Best solution :  [[[38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]
  [38 13 45 18 60 30 20 58]]]
Best solution fitness :  0.8214709371293001
