In [51]:
import numpy as np
import pandas as pd
#import csv

# Digit Classification - Naive Bayes

In [3]:
# Load the digit data. Image files are read as one text line per row, label
# files as one value per row (see the shapes printed in the next cell).
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single-column frame is squeezed to a Series explicitly instead.
train_img_d_raw = pd.read_csv("data/digitdata/trainingimages",
                              skip_blank_lines=False, header=None).squeeze("columns")
train_lbl_d_raw = pd.read_csv("data/digitdata/traininglabels",
                              skip_blank_lines=False, header=None)
test_img_d_raw = pd.read_csv("data/digitdata/testimages",
                              skip_blank_lines=False, header=None).squeeze("columns")
test_lbl_d_raw = pd.read_csv("data/digitdata/testlabels",
                              skip_blank_lines=False, header=None)

In [4]:
# Shapes of the raw image/label data: training set first, then test set
print(f"{train_img_d_raw.shape} {train_lbl_d_raw.shape}")
print(f"{test_img_d_raw.shape} {test_lbl_d_raw.shape}")

(140000,) (5000, 1)
(28000,) (1000, 1)


In [5]:
# First 28 text rows, i.e. the first digit image
train_img_d_raw[:28]

0                                 
1                                 
2                                 
3                                 
4                                 
5                     +++++##+    
6             +++++######+###+    
7            +##########+++++     
8             #######+##          
9             +++###  ++          
10               +#+              
11               +#+              
12                +#+             
13                +##++           
14                 +###++         
15                  ++##++        
16                    +##+        
17                     ###+       
18                  +++###        
19                ++#####+        
20              ++######+         
21            ++######+           
22           +######+             
23        ++######+               
24        +####++                 
25                                
26                                
27                                
Name: 0, dtype: object

In [6]:
# Preview of the pixel encoding used by convert(): '+' and '#' become '1',
# a blank becomes '0'. Works on a fresh Series, the raw data is left untouched.
def _as_bits(line):
    return line.replace('+', '1').replace('#', '1').replace(' ', '0')

test = train_img_d_raw.apply(_as_bits)
test[0:28]

0     0000000000000000000000000000
1     0000000000000000000000000000
2     0000000000000000000000000000
3     0000000000000000000000000000
4     0000000000000000000000000000
5     0000000000000000111111110000
6     0000000011111111111111110000
7     0000000111111111111111100000
8     0000000011111111110000000000
9     0000000011111100110000000000
10    0000000000011100000000000000
11    0000000000011100000000000000
12    0000000000001110000000000000
13    0000000000001111100000000000
14    0000000000000111111000000000
15    0000000000000011111100000000
16    0000000000000000111100000000
17    0000000000000000011110000000
18    0000000000000011111100000000
19    0000000000001111111100000000
20    0000000000111111111000000000
21    0000000011111111100000000000
22    0000000111111110000000000000
23    0000111111111000000000000000
24    0000111111100000000000000000
25    0000000000000000000000000000
26    0000000000000000000000000000
27    0000000000000000000000000000
Name: 0, dtype: object

In [7]:
# Converts string image (arr) into integer array
def convert(arr):
    """Convert rows of ASCII-art pixels into a 2-D list of 0/1 integers.

    '+' and '#' map to 1; every other character (including ' ') maps to 0.

    Parameters
    ----------
    arr : iterable of str
        One string per pixel row (e.g. a list of str or a pandas Series).
        Rows are read in iteration order, never by index label, so a Series
        slice that does not start at label 0 is handled correctly.

    Returns
    -------
    list of list of int
        One inner list per input row. Every inner list has the width of the
        first row: longer rows are truncated, shorter rows are padded with 0.
        An empty input yields [].
    """
    rows = list(arr)
    if not rows:
        return []
    width = len(rows[0])
    return [
        # ljust pads short rows with blanks, which encode as 0.
        [1 if ch in ('+', '#') else 0 for ch in row[:width].ljust(width)]
        for row in rows
    ]

In [8]:
# Sanity check for convert(): number of set pixels in row 23 of the first image
first_d = convert(train_img_d_raw[:28])
sum(first_d[23])

9

In [64]:
# Divides a square digit image into feat_size x feat_size blocks and counts the
# set pixels in each block
def partition(feat_size, arr, img_size=28):
    """Sum the pixel values of `arr` over non-overlapping square blocks.

    Parameters
    ----------
    feat_size : int
        Side length of each block, in pixels.
    arr : sequence of sequences of numbers
        Image indexed as arr[row][col]; must be at least img_size x img_size.
    img_size : int, optional
        Side length of the image region to cover (default 28, the value that
        was previously hard-coded).

    Returns
    -------
    list of list
        A (img_size // feat_size) x (img_size // feat_size) grid whose entry
        [r][c] is the sum of the block at block-row r, block-column c. Pixels
        beyond the last complete block are ignored.
    """
    blocks = img_size // feat_size
    return [
        [
            sum(
                arr[px_row][px_col]
                for px_row in range(blk_row*feat_size, (blk_row + 1)*feat_size)
                for px_col in range(blk_col*feat_size, (blk_col + 1)*feat_size)
            )
            for blk_col in range(blocks)
        ]
        for blk_row in range(blocks)
    ]

In [10]:
# Sanity check for partition(): 7x7-pixel block counts of the first image
first_d = convert(train_img_d_raw[:28])
partition(7, first_d)

[[0, 6, 12, 6], [0, 29, 17, 2], [0, 7, 36, 0], [6, 23, 4, 0]]

In [65]:
# Builds one flat feature row per digit image (labels are not added here)
def feature_ext(feat_size, data):
    """Flatten the per-block pixel counts of every 28-row image in `data`.

    `data` is cut into consecutive groups of 28 rows. Each group is passed to
    partition(feat_size, ...) and the returned grid is read row by row into a
    flat list. Every output row has int((28/feat_size)**2) entries; positions
    the grid does not fill stay 0.
    """
    grid_side = int(28/feat_size)
    row_width = int((28/feat_size)**2)
    features = []
    for digit in range(int(len(data)/28)):
        first_row = 28*digit
        grid = partition(feat_size, data[first_row:first_row + 28])
        flat = [grid[row][col] for row in range(grid_side) for col in range(grid_side)]
        # Keep the declared width even when feat_size does not divide 28 evenly.
        flat += [0]*(row_width - len(flat))
        features.append(flat)
    return features

In [39]:
# Training method

obs_num = 5000                        # number of training images used
feat_size = 4                         # side length, in pixels, of each square feature block
feature_num = int((28/feat_size)**2)  # number of features per image

train_img_d = convert(train_img_d_raw[0:28*obs_num])
features = pd.DataFrame(feature_ext(feat_size, train_img_d))

# Append the training labels as the last column (named feature_num).
# rename() returns a new frame, so the slice of train_lbl_d_raw is never
# modified in place.
labels = train_lbl_d_raw[:obs_num].rename(columns={0: feature_num})
features = pd.concat([features, labels], axis=1)

# P(Class)
class_counts = features.iloc[:, -1].value_counts().sort_index()
p_class = class_counts/obs_num

# P(Data | Class)
# One table per digit: rows are the possible feature values
# (0 .. feat_size**2 set pixels), columns are the features.
value_range = range(feat_size**2 + 1)
per_digit = []
for dig in range(10):
    digit_rows = features.loc[features.iloc[:, -1] == dig]
    # Build each column in one step. The previous chained assignment
    # `frame[feature][value] = n` writes to a temporary under pandas
    # Copy-on-Write and would leave the table all zeros.
    counts = pd.DataFrame(
        {feature: digit_rows[feature].value_counts().reindex(value_range, fill_value=0)
         for feature in range(feature_num)},
        index=value_range, columns=range(feature_num))
    likelihood = counts / class_counts[dig]
    likelihood.columns = pd.MultiIndex.from_product([[dig], range(feature_num)], names=['Digit', 'Feature'])
    per_digit.append(likelihood)
p_data_class = pd.concat(per_digit, axis=1)

In [40]:
# Testing method

obs_num = 1000                       # number of test images used
feat_size = 4                        # side length, in pixels, of each square feature block
feature_num = int((28/feat_size)**2) # number of features

observation = feature_ext(feat_size, convert(test_img_d_raw[0:28*obs_num]))

p_data_class = p_data_class.replace(0, 0.00000000001) # Removes 0% probabilities

# Score in log space: a product of many small probabilities can underflow to
# 0.0 for every class, which makes the argmax meaningless. The sum of logs has
# the same argmax without that problem.
log_prior = np.log(p_class)
col_digit = p_data_class.columns.get_level_values(0)
# One (feature value x feature) array per digit. Row position equals feature
# value because the table's index is 0 .. feat_size**2.
log_tables = [np.log(p_data_class.iloc[:, col_digit == dig].to_numpy()) for dig in range(10)]
feat_idx = np.arange(feature_num)

total_correct = 0
for num, obs in enumerate(observation[:obs_num]):
    values = np.asarray(obs[:feature_num])
    scores = [log_tables[dig][values, feat_idx].sum() + log_prior[dig] for dig in range(10)]
    # argmax returns the first maximum, matching list.index(max(...)).
    if test_lbl_d_raw[0][num] == int(np.argmax(scores)):
        total_correct += 1
print('Total classified correct:', total_correct/obs_num)

Total classified correct: 0.747


# Face Classification - Naive Bayes

In [41]:
# Load the face data. Image files are read as one text line per row, label
# files as one value per row.
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single-column frame is squeezed to a Series explicitly instead.
train_img_f_raw = pd.read_csv("data/facedata/facedatatrain",
                              skip_blank_lines=False, header=None).squeeze("columns")
train_lbl_f_raw = pd.read_csv("data/facedata/facedatatrainlabels",
                              skip_blank_lines=False, header=None)
test_img_f_raw = pd.read_csv("data/facedata/facedatatest",
                              skip_blank_lines=False, header=None).squeeze("columns")
test_lbl_f_raw = pd.read_csv("data/facedata/facedatatestlabels",
                              skip_blank_lines=False, header=None)

In [42]:
# Shapes of the raw image/label data: training set first, then test set
print(f"{train_img_f_raw.shape} {train_lbl_f_raw.shape}")
print(f"{test_img_f_raw.shape} {test_lbl_f_raw.shape}")

(31570,) (451, 1)
(10500,) (150, 1)


In [68]:
# Divides a face image into feat_size x feat_size blocks and counts the set
# pixels in each block.
# NOTE: this reuses the name `partition` from the digit section, so the digit
# version must be re-defined before the digit cells are re-run.
def partition(feat_size, arr, height=70, width=60):
    """Sum the pixel values of `arr` over non-overlapping square blocks.

    The block window now follows `feat_size`. Previously the window was fixed
    at 2x2 while the stride used `feat_size`, so only feat_size == 2 worked.

    Parameters
    ----------
    feat_size : int
        Side length of each block, in pixels.
    arr : sequence of sequences of numbers
        Image indexed as arr[row][col]; must be at least height x width.
    height, width : int, optional
        Size of the image region to cover. The defaults (70, 60) give the
        35 x 30 grid that was previously hard-coded for feat_size == 2.

    Returns
    -------
    list of list
        A (height // feat_size) x (width // feat_size) grid of block sums.
    """
    block_rows = height // feat_size
    block_cols = width // feat_size
    return [
        [
            sum(
                arr[px_row][px_col]
                for px_row in range(blk_row*feat_size, (blk_row + 1)*feat_size)
                for px_col in range(blk_col*feat_size, (blk_col + 1)*feat_size)
            )
            for blk_col in range(block_cols)
        ]
        for blk_row in range(block_rows)
    ]

In [69]:
# Builds one flat feature row per face image (labels are not added here)
def feature_ext(feat_size, data):
    """Flatten the per-block pixel counts of every 70-row image in `data`.

    `data` is cut into consecutive groups of 70 rows. Each group is passed to
    partition(feat_size, ...) and the first 35 x 30 entries of the returned
    grid are read row by row into a flat list of 1050 values.
    """
    rows_per_image = 70
    features = []
    for digit in range(int(len(data)/rows_per_image)):
        first_row = rows_per_image*digit
        grid = partition(feat_size, data[first_row:first_row + rows_per_image])
        features.append([grid[row][col] for row in range(35) for col in range(30)])
    return features

In [82]:
# Training method

obs_num = 451       # number of training images used
feat_size = 2       # side length, in pixels, of each square feature block
feature_num = 1050  # number of features per image

train_img_f = convert(train_img_f_raw[0:70*obs_num])
features = pd.DataFrame(feature_ext(feat_size, train_img_f))

# Append the training labels as the last column (named 1050).
# rename() returns a new frame, so the slice of train_lbl_f_raw is never
# modified in place.
labels = train_lbl_f_raw[:obs_num].rename(columns={0: 1050})
features = pd.concat([features, labels], axis=1)

# P(Class)
class_counts = features.iloc[:, -1].value_counts().sort_index()
p_class = class_counts/obs_num

# P(Data | Class)
# One table per class: rows are the possible feature values
# (0 .. feat_size**2 set pixels), columns are the features.
value_range = range(feat_size**2 + 1)
per_class = []
for classif in range(2):
    class_rows = features.loc[features.iloc[:, -1] == classif]
    # Build each column in one step. The previous chained assignment
    # `frame[feature][value] = n` writes to a temporary under pandas
    # Copy-on-Write and would leave the table all zeros.
    counts = pd.DataFrame(
        {feature: class_rows[feature].value_counts().reindex(value_range, fill_value=0)
         for feature in range(feature_num)},
        index=value_range, columns=range(feature_num))
    likelihood = counts / class_counts[classif]
    likelihood.columns = pd.MultiIndex.from_product([[classif], range(feature_num)], names=['Class', 'Feature'])
    per_class.append(likelihood)
p_data_class = pd.concat(per_class, axis=1)

In [83]:
# Testing method

obs_num = 150       # number of test images used
feat_size = 2       # side length, in pixels, of each square feature block
feature_num = 1050  # number of features

observation = feature_ext(feat_size, convert(test_img_f_raw[0:70*obs_num]))

p_data_class = p_data_class.replace(0, 0.00000000001) # Removes 0% probabilities

# Score in log space: a product of 1050 probabilities underflows to 0.0 in
# float64, which ties the classes and makes the argmax meaningless. The sum of
# logs has the same argmax without that problem.
log_prior = np.log(p_class)
col_class = p_data_class.columns.get_level_values(0)
# One (feature value x feature) array per class. Row position equals feature
# value because the table's index is 0 .. feat_size**2.
log_tables = [np.log(p_data_class.iloc[:, col_class == classif].to_numpy()) for classif in range(2)]
feat_idx = np.arange(feature_num)

total_correct = 0
for num, obs in enumerate(observation[:obs_num]):
    values = np.asarray(obs[:feature_num])
    scores = [log_tables[classif][values, feat_idx].sum() + log_prior[classif] for classif in range(2)]
    # argmax returns the first maximum, matching list.index(max(...)).
    if test_lbl_f_raw[0][num] == int(np.argmax(scores)):
        total_correct += 1
print('Total classified correct:', total_correct/obs_num)

Total classified correct: 0.6866666666666666


# Digit Classification - KNN

In [52]:
# Reload the digit data for the KNN section (same files as the Naive Bayes
# section).
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single-column frame is squeezed to a Series explicitly instead.
train_img_d_raw = pd.read_csv("data/digitdata/trainingimages",
                              skip_blank_lines=False, header=None).squeeze("columns")
train_lbl_d_raw = pd.read_csv("data/digitdata/traininglabels",
                              skip_blank_lines=False, header=None)
test_img_d_raw = pd.read_csv("data/digitdata/testimages",
                              skip_blank_lines=False, header=None).squeeze("columns")
test_lbl_d_raw = pd.read_csv("data/digitdata/testlabels",
                              skip_blank_lines=False, header=None)

In [138]:
# Converts string image (arr) into integer array
# (re-definition of the helper from the Naive Bayes section, so that the KNN
# section can be run on its own)
def convert(arr):
    """Convert rows of ASCII-art pixels into a 2-D list of 0/1 integers.

    '+' and '#' map to 1; every other character (including ' ') maps to 0.

    Parameters
    ----------
    arr : iterable of str
        One string per pixel row (e.g. a list of str or a pandas Series).
        Rows are read in iteration order, never by index label, so a Series
        slice that does not start at label 0 is handled correctly.

    Returns
    -------
    list of list of int
        One inner list per input row. Every inner list has the width of the
        first row: longer rows are truncated, shorter rows are padded with 0.
        An empty input yields [].
    """
    rows = list(arr)
    if not rows:
        return []
    width = len(rows[0])
    return [
        # ljust pads short rows with blanks, which encode as 0.
        [1 if ch in ('+', '#') else 0 for ch in row[:width].ljust(width)]
        for row in rows
    ]

In [255]:
# K-nearest-neighbours on raw digit pixels (7 neighbours, squared Euclidean distance)
obs_num = 50 # train set num
tmp = convert(train_img_d_raw[0:28*obs_num])
# One 28-row DataFrame per training image
train_dig_df = [pd.DataFrame(tmp[28*dig:28*(dig + 1)]) for dig in range(obs_num)]

test_obs = 10 # test set num
tmp2 = convert(test_img_d_raw[0:28*test_obs])
# One 28-row DataFrame per test image
test_dig_df = [pd.DataFrame(tmp2[28*dig:28*(dig + 1)]) for dig in range(test_obs)]

correct = 0
for obs, test_img in enumerate(test_dig_df):
    # Distance from this test image to every training image
    dists = pd.Series([((test_img - img)**2).sum().sum() for img in train_dig_df])
    nearest = dists.sort_values()[0:7]
    # Labels of the nearest training images; the most frequent one wins
    votes = pd.Series([train_lbl_d_raw[0][i] for i in nearest.index])
    maj_val = votes.value_counts().index[0]
    if maj_val == test_lbl_d_raw[0][obs]:
        correct += 1

correct / test_obs

0.4

# Face Classification - KNN

In [215]:
# Reload the face data for the KNN section (same files as the Naive Bayes
# section).
# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0,
# so the single-column frame is squeezed to a Series explicitly instead.
train_img_f_raw = pd.read_csv("data/facedata/facedatatrain",
                              skip_blank_lines=False, header=None).squeeze("columns")
train_lbl_f_raw = pd.read_csv("data/facedata/facedatatrainlabels",
                              skip_blank_lines=False, header=None)
test_img_f_raw = pd.read_csv("data/facedata/facedatatest",
                              skip_blank_lines=False, header=None).squeeze("columns")
test_lbl_f_raw = pd.read_csv("data/facedata/facedatatestlabels",
                              skip_blank_lines=False, header=None)

In [253]:
# K-nearest-neighbours on raw face pixels (7 neighbours, squared Euclidean distance)
obs_num = 451 # train set num
tmp = convert(train_img_f_raw[0:70*obs_num])
# One 70-row DataFrame per training image
train_face_df = [pd.DataFrame(tmp[70*face:70*(face + 1)]) for face in range(obs_num)]

test_obs = 150 # test set num
tmp2 = convert(test_img_f_raw[0:70*test_obs])
# One 70-row DataFrame per test image
test_face_df = [pd.DataFrame(tmp2[70*face:70*(face + 1)]) for face in range(test_obs)]

correct = 0
for obs, test_img in enumerate(test_face_df):
    # Distance from this test image to every training image
    dists = pd.Series([((test_img - img)**2).sum().sum() for img in train_face_df])
    nearest = dists.sort_values()[0:7]
    # Labels of the nearest training images; the most frequent one wins
    votes = pd.Series([train_lbl_f_raw[0][i] for i in nearest.index])
    maj_val = votes.value_counts().index[0]
    if maj_val == test_lbl_f_raw[0][obs]:
        correct += 1

correct / test_obs

0.5866666666666667

In [254]:
# K-nearest-neighbours on a cropped window of each face image:
# rows 25..49 and columns 15..44 (7 neighbours, squared Euclidean distance)
obs_num = 451 # train set num
tmp = convert(train_img_f_raw[0:70*obs_num])
train_face_df = []
for face in range(obs_num):
    top = 70*face
    window = pd.DataFrame(tmp[top + 25:top + 50]).iloc[:, 15:45]
    train_face_df.append(window)

test_obs = 150 # test set num
tmp2 = convert(test_img_f_raw[0:70*test_obs])
test_face_df = []
for face in range(test_obs):
    top = 70*face
    window = pd.DataFrame(tmp2[top + 25:top + 50]).iloc[:, 15:45]
    test_face_df.append(window)

correct = 0
for obs, test_img in enumerate(test_face_df):
    # Distance from this test window to every training window
    dists = pd.Series([((test_img - img)**2).sum().sum() for img in train_face_df])
    nearest = dists.sort_values()[0:7]
    # Labels of the nearest training images; the most frequent one wins
    votes = pd.Series([train_lbl_f_raw[0][i] for i in nearest.index])
    maj_val = votes.value_counts().index[0]
    if maj_val == test_lbl_f_raw[0][obs]:
        correct += 1

correct / test_obs

0.74