In [31]:
import numpy as np
import matplotlib.pyplot as plt

In [32]:
#@title NB codes
import random

DATA_WIDTH=28
DATA_HEIGHT=28
NUMBER_OF_TRAINING_EXAMPLES=5000
NUMBER_OF_VALIDATION_EXAMPLES=1000

ALL_TRAINING_IMAGES=[]
ALL_TRAINING_LABELS=[]
ALL_VALIDATION_IMAGES=[]
ALL_VALIDATION_LABELS=[]

'''
Convert ASCII pixels into numerical data and vice versa
    - ' ' is converted to 0, which means it's part of the background
    - '#' is converted to 1, part of the image interior
    - '+' is converted to 2, part of the edges
    
'''

def _pixel_to_value(character):
    if(character == ' '):
        return 0
    elif(character == '#'):
        return 1
    elif(character == '+'):
        return 2  
    
def _value_to_pixel(value):
    if(value == 0):
        return ' '
    elif(value == 1):
        return '#'
    elif(value == 2):
        return '+'
'''
Function for loading data and label files
'''

def _load_data_file(filename, n, width, height):
    """Read ``n`` ASCII-art images of ``height`` text lines each from a file.

    Args:
        filename: path of the text file holding the images back to back.
        n: number of images to read from the start of the file.
        width: accepted for interface compatibility; row width is not
            enforced, each row is as long as its line in the file.
        height: number of text lines that make up one image.

    Returns:
        A list of ``n`` images. Each image is a list of ``height`` rows and
        each row is a list of the values produced by ``_pixel_to_value``.

    Raises:
        IndexError: if the file holds fewer than ``n * height`` lines.
    """
    # The context manager closes the file; only the newline is stripped, so a
    # final line without a trailing newline keeps its last pixel.
    with open(filename) as handle:
        lines = [line.rstrip('\n') for line in handle]

    if len(lines) < n * height:
        raise IndexError(
            "%s has %d lines, need %d for %d images of height %d"
            % (filename, len(lines), n * height, n, height))

    items = []
    for index in range(n):
        block = lines[index * height:(index + 1) * height]
        # Rows are materialised as lists: a bare map object can be iterated
        # only once, so printing an image would otherwise empty it.
        items.append([list(map(_pixel_to_value, line)) for line in block])
    return items
        
def _load_label_file(filename, n):
    fin = [l[:-1] for l in open(filename).readlines()]
    labels = []
    for i in range(n):
        labels.append(int(fin[i]))
    return labels

def _load_all_data():
    """Fill the module-level image/label lists from the four data files.

    Reads "trainingimages"/"traininglabels" and
    "validationimages"/"validationlabels" from the working directory and
    rebinds the ALL_* globals to the loaded data.
    """
    global ALL_TRAINING_IMAGES, ALL_TRAINING_LABELS
    global ALL_VALIDATION_IMAGES, ALL_VALIDATION_LABELS

    # Training split
    n_train = NUMBER_OF_TRAINING_EXAMPLES
    ALL_TRAINING_IMAGES = _load_data_file(
        "trainingimages", n_train, DATA_WIDTH, DATA_HEIGHT)
    ALL_TRAINING_LABELS = _load_label_file("traininglabels", n_train)

    # Validation split
    n_valid = NUMBER_OF_VALIDATION_EXAMPLES
    ALL_VALIDATION_IMAGES = _load_data_file(
        "validationimages", n_valid, DATA_WIDTH, DATA_HEIGHT)
    ALL_VALIDATION_LABELS = _load_label_file("validationlabels", n_valid)

'''
Function for printing digits
'''

def _print_digit_image(data):
    """Print an image as ASCII art, one text line per row of ``data``."""
    for pixels in data:
        line = ''.join(_value_to_pixel(value) for value in pixels)
        print(line)
    
#%% [code]
'''
Extract 'basic' features, i.e., whether a pixel is in the background or part of the digit
'''

'''
Extract advanced features that you will come up with. For example: 
**Note that it's not mandatory to use the features listed above.
'''

    # Calculate the width and height of the digit
    # Calculate the number of #s and +s in the top half ( or in any other part)
    # Define a rowlimit and investigate if the amount of #s/+s surpasses the limit
    




"\nExtract advanced features that you will come up with. For example: \n**Note that it's not mandatory to use the features listed above.\n"

In [33]:
_load_all_data()

In [59]:
# random.randint includes BOTH endpoints, so randint(0, N) can return N and
# index one past the end of the training data; randrange(N) yields 0..N-1.
example_number = random.randrange(NUMBER_OF_TRAINING_EXAMPLES)
print("Printing digit example #" + str(example_number + 1) + " with label: " \
        + str(ALL_TRAINING_LABELS[example_number]))
_print_digit_image(ALL_TRAINING_IMAGES[example_number])

Printing digit example #4516 with label: 5
                            
                            
                            
                            
                            
                      ++    
                  +++####   
            +++++########   
          +############++   
        ++#########++++     
       +#### +++++          
       +####                
       +#####++             
       +#######+            
        +++#####+           
           +++###+          
              +###+         
     +#+       +###+        
     ###+       +###        
     +####+      ###        
      +####++    ###        
       +#####++++###        
         +##########        
          ++# #####+        
             +++++          
                            
                            
                            


In [34]:
# Turn the loaded images into numpy arrays; every row is materialised with
# list() so the array holds plain pixel values.
x_train = np.array([[list(row) for row in img] for img in ALL_TRAINING_IMAGES])

# The validation split plays the role of the test set below.
x_test = np.array([[list(row) for row in img] for img in ALL_VALIDATION_IMAGES])

# Labels stay as the plain Python lists produced by the loader.
y_train, y_test = ALL_TRAINING_LABELS, ALL_VALIDATION_LABELS

In [50]:
#@title functions for extracting the features
def calculate_height(image):
  """Return the height (in rows) of the first vertical run of non-empty rows.

  A row counts as non-empty when the sum of its pixel values is positive.
  Scanning starts at the top; the run ends at the first all-zero row after it
  has started, or at the bottom edge of the image.

  Args:
    image: sequence of rows, each a sequence of numeric pixel values.

  Returns:
    Number of rows in the run, or 0 when no row is non-empty.
  """
  top = None
  for row in range(len(image)):
    row_sum = sum(image[row])
    if top is None:
      if row_sum > 0:
        top = row
    elif row_sum == 0:
      return row - top
  if top is None:
    return 0
  # The digit touches the bottom edge: no blank row ever follows it, so the
  # run ends at the image boundary (previously this case gave a negative
  # height).
  return len(image) - top

def count_hashtags(image):
  """Count the pixels whose value is 1 (the '#' code) in the whole image."""
  total = 0
  for pixels in image:
    is_hash = np.array(pixels) == 1
    total = total + np.sum(is_hash.astype(int))
  return total

def count_plus(image):
  """Count the pixels whose value is 2 (the '+' code) in the whole image."""
  total = 0
  for pixels in image:
    is_plus = np.array(pixels) == 2
    total = total + np.sum(is_plus.astype(int))
  return total

def first_row(image):
  """Return the pixel-value sum of the topmost row whose sum is positive.

  Returns 0 when no row has a positive sum.
  """
  for pixels in image:
    row_total = sum(pixels)
    if row_total > 0:
      return row_total
  return 0


In [51]:
def extract_features(images):
  """Compute the four features for every image, grouped per feature.

  Returns a list of four lists: heights, '#' counts, '+' counts and
  first-row sums, each with one entry per image in ``images``.
  """
  extractors = (calculate_height, count_hashtags, count_plus, first_row)
  features = [[] for _ in extractors]
  for image in images:
    for column, extractor in zip(features, extractors):
      column.append(extractor(image))
  return features

def each_class_data():
  """Group the training images by label.

  Reads the module-level ``x_train`` and ``y_train`` and returns a list of
  ten lists; entry ``k`` holds the images whose label is ``k``.
  """
  classes = [[] for _ in range(10)]
  for i, image in enumerate(x_train):
    classes[y_train[i]].append(image)
  return classes

def calculate_mean_var(classes):
  """Estimate per-class mean and variance of each of the four features.

  Returns one entry per class; each entry is a list of four
  ``[mean, variance]`` pairs in the feature order of ``extract_features``.
  """
  def summarize(values):
    return [np.mean(values), np.var(values)]

  parameters = []
  for each_class_images in classes:
    features = extract_features(each_class_images)
    parameters.append([summarize(features[i]) for i in range(4)])
  return parameters

def calculate_x(x_test):
  """Build one 4-element feature vector per image.

  Each vector is [height, '#' count, '+' count, first-row sum].
  """
  return [
      [calculate_height(image), count_hashtags(image),
       count_plus(image), first_row(image)]
      for image in x_test
  ]

def gaussian(mean, var, x, min_var=1e-9):
  """Unnormalised Gaussian likelihood of ``x`` given ``mean`` and ``var``.

  Computes exp(-(x - mean)^2 / (2 * var)) / sqrt(var). The 1/sqrt(2*pi)
  factor is left out, so the value is proportional to the normal density.

  Args:
    mean: mean of the distribution.
    var: variance of the distribution.
    x: point at which to evaluate.
    min_var: lower bound applied to ``var``. A feature that is constant
      within a class has variance 0, which would otherwise divide by zero
      and produce nan/inf scores.

  Returns:
    The likelihood value as a numpy float (or array if inputs are arrays).
  """
  var = np.maximum(var, min_var)
  return np.exp(-0.5*((x-mean)**2)/var)/(var**0.5)

def calculate_prior_prob(y_train):
  """Return the relative frequency of each label 0..9 in ``y_train``.

  The result is a length-10 float array; entry ``k`` is the fraction of
  labels equal to ``k``.
  """
  labels = np.array(y_train)
  total = len(y_train)
  priors = np.zeros(10)
  for digit in range(10):
    matches = (labels == digit).astype(int)
    priors[digit] = np.sum(matches) / total
  return priors

def accuracy(parameters, x_test, y_test,y_train):
  """Classify every test image with naive Bayes and print the hit rate.

  For each image the score of class ``j`` is the class prior times the four
  per-feature ``gaussian`` likelihoods; the predicted label is the class
  with the highest score. Prints 'accuracy:' followed by the fraction of
  predictions equal to ``y_test``. Returns nothing.

  Args:
    parameters: per-class list of four [mean, variance] pairs.
    x_test: images to classify.
    y_test: true labels for ``x_test``.
    y_train: training labels, used only to estimate the class priors.
  """
  feature_rows = calculate_x(x_test)
  priors = calculate_prior_prob(y_train)

  y_predict = []
  for sample in feature_rows:
    scores = []
    for digit in range(len(parameters)):
      class_params = parameters[digit]
      # Multiply left to right: prior, then features 0..3.
      score = priors[digit]
      for k in range(4):
        score = score * gaussian(class_params[k][0], class_params[k][1], sample[k])
      scores.append(score)
    y_predict.append(scores.index(max(scores)))

  hits = 0
  for i in range(len(y_test)):
    if y_test[i] == y_predict[i]:
      hits += 1
  print('accuracy:', hits/len(y_test))

In [52]:
classes = each_class_data()
parameters = calculate_mean_var(classes)
accuracy(parameters, x_test, y_test,y_train)

accuracy: 0.333
