In [1]:
from ucimlrepo import fetch_ucirepo 

# fetch dataset 
optical_recognition_of_handwritten_digits = fetch_ucirepo(id=80) 
  
# data (as pandas dataframes) 
X = optical_recognition_of_handwritten_digits.data.features 
y = optical_recognition_of_handwritten_digits.data.targets 
  
# metadata 
print(optical_recognition_of_handwritten_digits.metadata) 

# variable information 
print(optical_recognition_of_handwritten_digits.variables) 

{'uci_id': 80, 'name': 'Optical Recognition of Handwritten Digits', 'repository_url': 'https://archive.ics.uci.edu/dataset/80/optical+recognition+of+handwritten+digits', 'data_url': 'https://archive.ics.uci.edu/static/public/80/data.csv', 'abstract': 'Two versions of this database available; see folder', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 5620, 'num_features': 64, 'feature_types': ['Integer'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1998, 'last_updated': 'Wed Aug 23 2023', 'dataset_doi': '10.24432/C50P49', 'creators': ['E. Alpaydin', 'C. Kaynak'], 'intro_paper': {'title': 'Methods of Combining Multiple Classifiers and Their Applications to Handwritten Digit Recognition', 'authors': 'C. Kaynak', 'published_in': 'MSc Thesis, Institute of Graduate Studies in Science and Engineering, Bogazici University', 

In [2]:
y= y.to_numpy()
y =y.ravel()

In [3]:
import numpy as np
import random

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9750889679715302


In [6]:
svm_classifier = SVC(kernel='linear', C=1.0)

svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9733096085409253


In [8]:
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train, y_train)

y_pred = xgb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.966785290628707


In [7]:
def cal_pop_fitness(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:,columns]
        rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_classifier.fit(X_train_ind, y_train)

        y_pred = rf_classifier.predict(X_test.iloc[:,columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def fit_svm(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:, columns]
        svm_classifier = SVC(kernel='linear', C=1.0)
        svm_classifier.fit(X_train_ind, y_train)

        y_pred = svm_classifier.predict(X_test.iloc[:, columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def fit_xgboost(pop, X_train, y_train, X_test, y_test):
    fitness = []
    for i in range(len(pop)):
        columns = np.unique(pop[i])
        
        X_train_ind = X_train.iloc[:, columns]
        xgb_classifier = xgb.XGBClassifier()
        xgb_classifier.fit(X_train_ind, y_train)

        y_pred = xgb_classifier.predict(X_test.iloc[:, columns])

        accuracy = accuracy_score(y_test, y_pred)

        fitness.append(accuracy)

    return fitness

def select_mating_pool(pop, fitness, num_parents):
    # Selecting the best individuals in the current generation as parents for producing the offspring of the next generation.
    parents = np.empty((num_parents, pop.shape[1]))
    for parent_num in range(num_parents):
        max_fitness_idx = np.where(fitness == np.max(fitness))
        max_fitness_idx = max_fitness_idx[0][0]
        parents[parent_num, :] = pop[max_fitness_idx, :]
        fitness[max_fitness_idx] = -1
    return parents

def crossover(parents, offspring_size):
    offspring = np.empty(offspring_size)

    for k in range(offspring_size[0]):
        # Always perform one-point crossover.
        crossover_point = np.random.randint(1, parents.shape[1])
        parent1_idx = k % parents.shape[0]
        parent2_idx = (k + 1) % parents.shape[0]
        offspring[k, :crossover_point] = parents[parent1_idx, :crossover_point]
        offspring[k, crossover_point:] = parents[parent2_idx, crossover_point:]

    return offspring

def mutation(offspring_crossover):
    # Mutation changes a single gene in each offspring randomly.
    for idx in range(offspring_crossover.shape[0]):
        # The random value to be added to the gene.
        random_value = np.random.uniform(-1.0, 1.0, 1)
        offspring_crossover[idx, 4] = offspring_crossover[idx, 4] + random_value
    return offspring_crossover

In [8]:
col_len = 32

sol_per_pop = 8
num_parents_mating = 4

# Defining the population size.
pop_size = (sol_per_pop,col_len) # The population will have sol_per_pop chromosome where each chromosome has num_weights genes.
#Creating the initial population.
new_population = np.random.randint(63, size=pop_size)
print(new_population)

[[28  0 47 18 25 53 22 41 61 14 11 22 47 32  1 61 51 23 42  2 58 31 29  9
  44 19  8 21 35 12 55 26]
 [21 10 16 20 15 50  4  9 46 14 18 25  5 14 41 25 27  2 30 40 38 42 14 38
  44 17 54 12 29 11 23 11]
 [29 38 54 57 41 47 44 49 15 23 48 18  5 26 31  1 54 55 13 51 36 26 17 49
  33 47 35 60  5 45 43  4]
 [27 36  7 52 13 24 41 44  5  6 26 52 34 50  8 14 48 52 12 24 57 41 40 14
  27 32 21  7  2 12 37 56]
 [61 11 14  6  8  9 12  7 53  8 46 13 30 23 61 23 54 11 41 41 44 61  2 59
  26 18 48 12 44 41 46 45]
 [44 11 46 58 14  5 26 36  3 22 17 62 55 14 13 21 59 49 34  6 61  8 52 11
  16 30 14 45 40 45  3  9]
 [57 60 44  6 61 62 40 30 25 35 23 49  9 57 12 11 58 11 60 47 62 58 59 25
  15 20 42  1 35 29 16  0]
 [43  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7 32
  42 36 47 56  5 44 50 29]]


In [9]:
num_generations = 10
for generation in range(num_generations):
    # Measing the fitness of each chromosome in the population.
    fitness = fit_svm(new_population, X_train,y_train,X_test,y_test)

    print(fitness)

    # Selecting the best parents in the population for mating.
    parents = select_mating_pool(new_population, fitness, 
                                      num_parents_mating)
    
    # print(parents)

    # Generating next generation using crossover.
    offspring_crossover = crossover(parents,
                                       offspring_size=(pop_size[0]-parents.shape[0], col_len))


    # # Adding some variations to the offsrping using mutation.
    # offspring_mutation = mutation(offspring_crossover)

    # print(offspring_mutation)

    # Creating the new population based on the parents and offspring.
    new_population[0:parents.shape[0], :] = parents
    new_population[parents.shape[0]:, :] = offspring_crossover  # should be offspring_mutation

    # new_population[0:parents.shape[0], :] = offspring_crossover

    # The best result in the current iteration.
    if generation%10 ==0:
        print("Generation : ", generation)
        # print(offspring_crossover)
        print()

[0.933570581257414, 0.9341637010676157, 0.8991696322657177, 0.8991696322657177, 0.8825622775800712, 0.9062870699881376, 0.8784104389086596, 0.933570581257414]
Generation :  0

[0.9341637010676157, 0.933570581257414, 0.933570581257414, 0.9062870699881376, 0.938908659549229, 0.9400948991696323, 0.9122182680901542, 0.9199288256227758]
[0.9400948991696323, 0.938908659549229, 0.9341637010676157, 0.933570581257414, 0.9365361803084223, 0.9341637010676157, 0.928232502965599, 0.8926453143534994]
[0.9400948991696323, 0.938908659549229, 0.9365361803084223, 0.9341637010676157, 0.938908659549229, 0.938908659549229, 0.9181494661921709, 0.9317912218268091]
[0.9400948991696323, 0.938908659549229, 0.938908659549229, 0.938908659549229, 0.9341637010676157, 0.9329774614472124, 0.9424673784104389, 0.9317912218268091]
[0.9424673784104389, 0.9400948991696323, 0.938908659549229, 0.938908659549229, 0.9400948991696323, 0.9442467378410438, 0.9341637010676157, 0.9448398576512456]
[0.9448398576512456, 0.9442467378

In [11]:
# Getting the best solution after iterating finishing all generations.
#At first, the fitness is calculated for each solution in the final generation.
fitness = fit_svm(new_population, X_train, y_train, X_test,y_test)
# Then return the index of that solution corresponding to the best fitness.
best_match_idx = np.where(fitness == np.max(fitness))

print("Best solution : ", new_population[best_match_idx, :])
print("Best solution fitness : ", fitness[best_match_idx[0][0]])

Best solution :  [[[28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]
  [28  1 51 18 16 62 53 52  9 36  0 30  2  2  0 27  3 12 50 13 37  6  7
   32 42 36 47 21 35 12 55 26]]]
Best solution fitness :  0.9448398576512456
