In [461]:
import numpy as np
import pandas as pd
import time
import random
import itertools

In [645]:
# Visualize all columns/rows
pd.options.display.max_columns = 60
pd.options.display.max_rows = 70

# Naive Bayes

## Import and Data Processing

### Digit Data

In [451]:
train_img_d_raw = pd.read_csv("data/digitdata/trainingimages",
                              skip_blank_lines=False, squeeze=True, header=None)
train_lbl_d_raw = pd.read_csv("data/digitdata/traininglabels",
                              skip_blank_lines=False, header=None)
test_img_d_raw = pd.read_csv("data/digitdata/testimages",
                              skip_blank_lines=False, squeeze=True, header=None)
test_lbl_d_raw = pd.read_csv("data/digitdata/testlabels",
                              skip_blank_lines=False, header=None)

In [669]:
print(train_img_d_raw.shape, train_lbl_d_raw.shape)
print(test_img_d_raw.shape, test_lbl_d_raw.shape)

(140000,) (5000, 1)
(28000,) (1000, 1)


In [646]:
# Visualization of first digit of raw digit data
train_img_d_raw[0:28]

0                                 
1                                 
2                                 
3                                 
4                                 
5                     +++++##+    
6             +++++######+###+    
7            +##########+++++     
8             #######+##          
9             +++###  ++          
10               +#+              
11               +#+              
12                +#+             
13                +##++           
14                 +###++         
15                  ++##++        
16                    +##+        
17                     ###+       
18                  +++###        
19                ++#####+        
20              ++######+         
21            ++######+           
22           +######+             
23        ++######+               
24        +####++                 
25                                
26                                
27                                
Name: 0, dtype: obje

In [548]:
# Method to convert all the image data into array full of 0s and 1s
def convert(data):
    converted = data.copy()
    for i in range(converted.shape[0]):
        converted[i] = converted[i].replace(' ', '0')
        converted[i] = converted[i].replace('#', '1')
        converted[i] = converted[i].replace('+', '1')
    converted = converted.apply(lambda x: pd.Series(list(x)))
    converted = converted.apply(pd.to_numeric)
    return converted

In [599]:
# Converting all raw data
train_img_d = convert(train_img_d_raw)
test_img_d = convert(test_img_d_raw)

In [629]:
# Visualization of first digit of converted digit data
train_img_d[0:28]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0
6,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0
7,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0


### Face Data

In [671]:
train_img_f_raw = pd.read_csv("data/facedata/facedatatrain",
                              skip_blank_lines=False, squeeze=True, header=None)
train_lbl_f_raw = pd.read_csv("data/facedata/facedatatrainlabels",
                              skip_blank_lines=False, header=None)
test_img_f_raw = pd.read_csv("data/facedata/facedatatest",
                              skip_blank_lines=False, squeeze=True, header=None)
test_lbl_f_raw = pd.read_csv("data/facedata/facedatatestlabels",
                              skip_blank_lines=False, header=None)

In [670]:
print(train_img_f_raw.shape, train_lbl_f_raw.shape)
print(test_img_f_raw.shape, test_lbl_f_raw.shape)

(31570,) (451, 1)
(10500,) (150, 1)


In [648]:
# Visualization of first digit of raw digit data
train_img_f_raw[0:70]

0                                                   ...
1                                                   ...
2                                  ###              ...
3                               ###   ###           ...
4                             ##         ####       ...
5                           ##               ###    ...
6                          #                    ### ...
7                         #                        #...
8                         #                         ...
9                     #   #                         ...
10                   #   #                          ...
11                  #    #          ##              ...
12                  #    #         #  #             ...
13                 #     #    #    #   ###          ...
14                 #     #   #     #  #   #         ...
15                #      # ##      #  #   #         ...
16                #       #       #        #        ...
17               #                #   #    #    

In [649]:
# Converting all raw data
train_img_f = convert(train_img_f_raw)
test_img_f = convert(test_img_f_raw)

In [650]:
# Visualization of first digit of converted digit data
train_img_f[0:70]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0


## Classification Methods

In [789]:
# Creates an array where each row contains the feature values for one image observation.
def partition(data, feat_size, wid_feat_num, ht_feat_num):
    features = [[0 for col in range(wid_feat_num)] for row in range(ht_feat_num)]
    
    for row in range(ht_feat_num):
        for col in range(wid_feat_num):
            for mat_row in range(feat_size):
                for mat_col in range(feat_size):
                    features[row][col] += data[mat_row + row*feat_size][mat_col + col*feat_size]
    return features

In [790]:
# Creates dataframe that counts how often each feature value appears in the data
def feature_ext(data, feat_size, wid_feat_num, ht_feat_num):
    img_ht = int(feat_size*ht_feat_num)
    features = [[0 for col in range(int(wid_feat_num*ht_feat_num))]
                for row in range(int(len(data)/img_ht))]
    
    for classif in range(int(len(data)/img_ht)):
        tmp = partition(data[0 + img_ht*classif:img_ht + img_ht*classif],
                         feat_size, wid_feat_num, ht_feat_num)
        count = 0
        for row in range(ht_feat_num):
            for col in range(wid_feat_num):
                features[classif][count] = tmp[row][col]
                count += 1    
    return features

In [798]:
# Training method for Naive Bayes model
def train_nb(train_data, train_lbl, class_num, feat_size, wid_feat_num, ht_feat_num, split):
    start = time.time()
    
    # Parameters
    obs_num = int(train_lbl.shape[0]*split)
    img_wid = int(train_data.shape[1])
    img_ht = int(train_data.shape[0] / train_lbl.shape[0])
    feature_num = int(wid_feat_num*ht_feat_num)
    
    # Random sample of training set
    random.seed(1)
    sample_num = random.sample(range(train_lbl.shape[0]), obs_num)
    sample_range = []
    sample_data = train_data.copy()

    for obs in sample_num:
        a = 0
        b = img_ht
        a = a + img_ht*obs
        b = b + img_ht*obs
        sample_range.append(range(a, b))
    sample_range = list(chain(*sample_range))
    sample_data = sample_data.iloc[sample_range].reset_index(drop=True).values.tolist()

    # Extracting features
    features = pd.DataFrame(feature_ext(sample_data, feat_size, wid_feat_num, ht_feat_num))

    # Add training labels to feature set
    tmp = train_lbl.iloc[sample_num].reset_index(drop=True)
    tmp.rename(columns={0:feature_num}, inplace=True)
    features = pd.concat([features, tmp], axis=1)

    # P(Class)
    """Find all value counts of each unique class and divide by the total number of observations."""
    p_class = features.iloc[:, -1].value_counts()
    p_class.sort_index(inplace=True)
    p_class = p_class/obs_num

    # P(Data | Class)
    """Use the features dataframe (df) extracted earlier where each row of the df contains feature values
       for that particular observation. Convert this features df to a dataframe that contains contains all
       feature values and the number of times each feature value appears in the training set. Then calculate
       the probability that certain feature values appear per class for P(Data | Class).
    """
    p_data_class_total = []
    
    for classif in range(class_num):
        p_data_class = pd.DataFrame([[0 for col in range(feature_num)] for row in range(feat_size**2 + 1)])
        for feature in range(feature_num):           
            
            # Finds the unique feature values per class and inserts into our future P(Data | Class) df
            tmp = features.loc[features.iloc[:,-1] == classif][feature].value_counts()    
            for feature_val in tmp.index:
                p_data_class[feature][feature_val] = tmp[feature_val]
        
        # Convert number of feature values into probability
        p_data_class = p_data_class / features.iloc[:, -1].value_counts().sort_index()[classif]
        
        p_data_class.columns = pd.MultiIndex.from_product([[classif], range(feature_num)], names=['Digit', 'Feature'])
        if classif == 0:
            p_data_class_total = p_data_class
        else:
            p_data_class_total = p_data_class_total.join(p_data_class)
    p_data_class = p_data_class_total
    
    end = time.time()
    performance = end - start
    
    return p_class, p_data_class, performance

In [799]:
# Testing method for Naive Bayes model
def test_nb(test_data, test_lbl, class_num, feat_size, wid_feat_num, ht_feat_num, p_class, p_data_class):
    start = time.time()
    
    # Parameters
    obs_num = test_lbl.shape[0]
    feature_num = int(wid_feat_num*ht_feat_num)
    img_ht = int(test_data.shape[0] / test_lbl.shape[0])
    
    # Extracting features
    tmp = test_data.copy().values.tolist()
    observation = feature_ext(tmp, feat_size, wid_feat_num, ht_feat_num)
    
    # Replace when class probabilities are 0 with a small value to avoid P(Data | Class) = 0
    p_data_class = p_data_class.replace(0, 0.00000000001)
    
    # Calculates the probabilities of each class for each test observation
    total_correct = 0
    for obs, num in zip(observation, range(obs_num)):
        prob_f = []
        correct = 1
        for classif in range(class_num):
            tmp = p_data_class.iloc[:, p_data_class.columns.get_level_values(0)==classif]
            prob = []
            
            # P(Class | Data) = P(Data | Class) * P(Class)
            for feat in range(feature_num):
                prob.append(tmp.iloc[:, feat][obs[feat]])
            prob_f.append(np.prod(prob)*p_class[classif])
        
        # Checks if prediction matches image class
        if test_lbl[0][num] == prob_f.index(max(prob_f)):
            correct = True
            total_correct += 1
        else:
            correct = False
    end = time.time()
    
    print('Total classified correct:', total_correct/obs_num)
    print('Time:', end - start)

## Training and Testing Models

### Digit Classification

In [800]:
p_class, p_data_class, perf = train_nb(train_img_d, train_lbl_d_raw, 10, 4, 7, 7, 1)

In [801]:
test_nb(test_img_d, test_lbl_d_raw, 10, 4, 7, 7, p_class, p_data_class)

Total classified correct: 0.747
Time: 44.63312530517578


### Face Classification

In [802]:
p_class, p_data_class, perf = train_nb(train_img_f, train_lbl_f_raw, 2, 2, 30, 35, 1)

In [803]:
test_nb(test_img_f, test_lbl_f_raw, 2, 2, 30, 35, p_class, p_data_class)

Total classified correct: 0.6866666666666666
Time: 26.25709080696106


In [811]:
# Consider changing features

# Digit Classification - KNN

In [19]:
train_img_d_raw = pd.read_csv("data/digitdata/trainingimages",
                              skip_blank_lines=False, squeeze=True, header=None)
train_lbl_d_raw = pd.read_csv("data/digitdata/traininglabels",
                              skip_blank_lines=False, header=None)
test_img_d_raw = pd.read_csv("data/digitdata/testimages",
                              skip_blank_lines=False, squeeze=True, header=None)
test_lbl_d_raw = pd.read_csv("data/digitdata/testlabels",
                              skip_blank_lines=False, header=None)

In [20]:
# Converts string image (arr) into integer array
def convert(arr):
    train_img_d_arr = [[0 for col in range(len(arr[0]))] for row in range(len(arr))]
    for row in range(len(arr)):
        for col in range(len(arr[0])):
            if arr[row][col] == '+':
                train_img_d_arr[row][col] = 1
            if arr[row][col] == '#':
                train_img_d_arr[row][col] = 1
            if arr[row][col] == ' ':
                train_img_d_arr[row][col] = 0
    return train_img_d_arr

In [23]:
start = time.time()
obs_num = 5000 # train set num

tmp = convert(train_img_d_raw[0:28*obs_num])
train_dig_df = []
for dig in range(obs_num):
    train_dig_df.append(pd.DataFrame(tmp[0 + 28*dig:28 + 28*dig]))

test_obs = 50 # test set num

tmp2 = convert(test_img_d_raw[0:28*test_obs])
test_dig_df = []
for dig in range(test_obs):
    test_dig_df.append(pd.DataFrame(tmp2[0 + 28*dig:28 + 28*dig]))


correct = 0
for obs in range(test_obs):
    k = []
    for img in train_dig_df:
        k.append(((test_dig_df[obs] - img)**2).sum().sum())
    k = pd.DataFrame(k)
    k = k[0].sort_values()[0:7]
    maj_class = []


    for i in k.index:
        maj_class.append(train_lbl_d_raw[0][i])
    maj_val = pd.DataFrame(maj_class)[0].value_counts().index[0]
    if maj_val == test_lbl_d_raw[0][obs]:
        correct += 1
correct / test_obs
end = time.time()

print('Total classified correct:', correct / test_obs)
print('Time:', end - start)

Total classified correct: 0.9
Time: 7452.307159662247


# Face Classification

In [24]:
train_img_f_raw = pd.read_csv("data/facedata/facedatatrain",
                              skip_blank_lines=False, squeeze=True, header=None)
train_lbl_f_raw = pd.read_csv("data/facedata/facedatatrainlabels",
                              skip_blank_lines=False, header=None)
test_img_f_raw = pd.read_csv("data/facedata/facedatatest",
                              skip_blank_lines=False, squeeze=True, header=None)
test_lbl_f_raw = pd.read_csv("data/facedata/facedatatestlabels",
                              skip_blank_lines=False, header=None)

In [27]:
start = time.time()
obs_num = 451 # train set num

tmp = convert(train_img_f_raw[0:70*obs_num])
train_face_df = []
for face in range(obs_num):
    train_face_df.append(pd.DataFrame(tmp[0 + 70*face:70 + 70*face]))

test_obs = 150 # test set num

tmp2 = convert(test_img_f_raw[0:70*test_obs])
test_face_df = []
for face in range(test_obs):
    test_face_df.append(pd.DataFrame(tmp2[0 + 70*face:70 + 70*face]))


correct = 0
for obs in range(test_obs):
    k = []
    for img in train_face_df:
        k.append(((test_face_df[obs] - img)**2).sum().sum())
    k = pd.DataFrame(k)
    k = k[0].sort_values()[0:7]
    #print('k', k)
    maj_class = []

    for i in k.index:
        maj_class.append(train_lbl_f_raw[0][i])
    maj_val = pd.DataFrame(maj_class)[0].value_counts().index[0]
    #print(maj_val, test_lbl_f_raw[0][obs])
    if maj_val == test_lbl_f_raw[0][obs]:
        correct += 1
end = time.time()

print('Total classified correct:', correct / test_obs)
print('Time:', end - start)

Total classified correct: 0.5866666666666667
Time: 1003.8585085868835


In [28]:
start = time.time()
obs_num = 451 # train set num

tmp = convert(train_img_f_raw[0:70*obs_num])
train_face_df = []
for face in range(obs_num):
    train_face_df.append(pd.DataFrame(tmp[25 + 70*face:50 + 70*face]).iloc[:, 15:45])

test_obs = 150 # test set num

tmp2 = convert(test_img_f_raw[0:70*test_obs])
test_face_df = []
for face in range(test_obs):
    test_face_df.append(pd.DataFrame(tmp2[25 + 70*face:50 + 70*face]).iloc[:, 15:45])


correct = 0
for obs in range(test_obs):
    k = []
    for img in train_face_df:
        k.append(((test_face_df[obs] - img)**2).sum().sum())
    k = pd.DataFrame(k)
    k = k[0].sort_values()[0:7]
    #print('k', k)
    maj_class = []

    for i in k.index:
        maj_class.append(train_lbl_f_raw[0][i])
    maj_val = pd.DataFrame(maj_class)[0].value_counts().index[0]
    if maj_val == test_lbl_f_raw[0][obs]:
        correct += 1
end = time.time()

print('Total classified correct:', correct / test_obs)
print('Time:', end - start)

Total classified correct: 0.74
Time: 576.3693146705627


# Old Methods

In [None]:
# Divides a 28x28 digit image into 28/feat_size x 28/feat_size matrices
def partition(feat_size, arr):
    
    matrix_num = int(28/feat_size)
    features = [[0 for col in range(matrix_num)] for row in range(matrix_num)]
    
    for row in range(matrix_num):
        for col in range(matrix_num):
            for mat_row in range(feat_size):
                for mat_col in range(feat_size):
                    features[row][col] += arr[mat_row + row*feat_size][mat_col + col*feat_size]
    return features

# Find features for 2000 digits, last column being the response var
def feature_ext(feat_size, data):
    
    features = [[1 for col in range(int((28/feat_size)**2))] for row in range(int(len(data)/28))]
    
    for digit in range(int(len(data)/28)):
        tmp = partition(feat_size, data[0 + 28*digit:28 + 28*digit])
        count = 0
        
        for row in range(int(28/feat_size)):
            for col in range(int(28/feat_size)):
                features[digit][count] = tmp[row][col]
                count += 1    
    return features

# Old convert method
def convertX(arr):
    train_img_d_arr = [[0 for col in range(len(arr[0]))] for row in range(len(arr))]
    for row in range(len(arr)):
        for col in range(len(arr[0])):
            if arr[row][col] == '+':
                train_img_d_arr[row][col] = 1
            if arr[row][col] == '#':
                train_img_d_arr[row][col] = 1
            if arr[row][col] == ' ':
                train_img_d_arr[row][col] = 0
    return train_img_d_arr



# Training method
start = time.time()
obs_num = 173                  # pixels*num of observations
feat_size = 4                     # pixels per feature that make up X by X matrix
feature_num = int((28/feat_size)**2) # number of features

train_img_d = convertX(train_img_d_raw[0:28*obs_num])

end = time.time()
print(end - start)

features = pd.DataFrame(feature_ext(feat_size, train_img_d))


# Add training labels to dataset (1000 default value)
tmp = train_lbl_d_raw[:obs_num]
tmp.rename(columns={0:int((28/feat_size)**2)}, inplace=True)
features = pd.concat([features, tmp], axis=1)

# P(Class)
p_class = features.iloc[:, -1].value_counts()
p_class.sort_index(inplace=True)
p_class = p_class/obs_num

# P(Data | Class)
final = []
for dig in range(10):
    p_data_class = pd.DataFrame([[0 for col in range(feature_num*1)] for row in range(feat_size**2 + 1)])
    for feature in range(feature_num):
        tmp = features.loc[features.iloc[:,-1] == dig][feature].value_counts()
        for feature_val in tmp.index:
            p_data_class[feature][feature_val] = tmp[feature_val]
    p_data_class = p_data_class / features.iloc[:, -1].value_counts().sort_index()[dig]
    p_data_class.columns = pd.MultiIndex.from_product([[dig], range(feature_num)], names=['Digit', 'Feature'])
    if dig == 0:
        final = p_data_class
    else:
        final = final.join(p_data_class)
p_data_class = final
# end = time.time()

# print('Time:', end - start)


# Testing method
start = time.time()
obs_num = 47                        # pixels*num of observations
feat_size = 4                        # pixels per feature that make up X by X matrix
feature_num = int((28/feat_size)**2) # number of features

observation = feature_ext(feat_size, convertX(test_img_d_raw[0:28*obs_num]))

p_data_class = p_data_class.replace(0, 0.00000000001) # Removes 0% probabilities

test_lbl_d_raw[:obs_num] # test set

total_correct = 0
for obs, num in zip(observation, range(obs_num)):
    prob_f = []
    correct = 1
    for dig in range(10):
        tmp = p_data_class.iloc[:, p_data_class.columns.get_level_values(0)==dig]
        prob = []
        for feat in range(feature_num):
            prob.append(tmp.iloc[:, feat][obs[feat]])
        prob_f.append(np.prod(prob)*p_class[dig])
    if test_lbl_d_raw[0][num] == prob_f.index(max(prob_f)):
        correct = True
        total_correct += 1
    else:
        correct = False
end = time.time()

print('Total classified correct:', total_correct/obs_num)
print('Time:', end - start)

# # Tried to improve time efficiency of algorithm

# def convert2(data):
#     converted = data.copy()
#     for i in range(converted.shape[0]):
#         converted[i] = converted[i].replace(' ', '0')
#         converted[i] = converted[i].replace('#', '1')
#         converted[i] = converted[i].replace('+', '1')
#     converted = converted.apply(lambda x: pd.Series(list(x)))
#     converted = converted.apply(pd.to_numeric)
#     return converted

# def feature_ext2(data, feat_len, img_wid, img_ht):
#     obs_num = int(data.shape[0] / img_ht)
#     features = pd.DataFrame([])
#     for obs in range(obs_num):
#         tmp = []
#         for row in range(int(img_ht / feat_len)):
#             for col in range(int(img_wid / feat_len)):
#                 tmp.append(data.iloc[(0 + feat_len*row) + obs*img_ht:(feat_len + feat_len*row) + obs*img_ht,
#                                      0 + feat_len*col:feat_len + feat_len*col].values.sum())
#         features = features.append(pd.DataFrame([tmp]))
#         features.reset_index(drop = True, inplace=True)
#     return features


# # Training Method
# start = time.time()
# obs_num = 500
# img_wid = 28
# img_ht = 28
# feat_len = 4
# feat_num = int(img_wid*img_ht / (feat_len**2))

# data = convert2(train_img_d_raw[0:img_ht*obs_num])

# features = feature_ext2(data, feat_len, img_wid, img_ht)

# # Add training labels to dataset (1000 default value)
# tmp = train_lbl_d_raw[:obs_num]
# tmp.rename(columns={0:feat_num}, inplace=True)
# features = features.join(tmp)

# # P(Class)
# p_class = features.iloc[:, -1].value_counts()
# p_class.sort_index(inplace=True)
# p_class = p_class/obs_num

# # P(Data | Class)
# # final = []
# # for dig in range(10):
# #     p_data_class = pd.DataFrame([[0 for feature in range(feature_num)] for feature_val in range(feat_len**2 + 1)])
# #     for feat in range(feat_num):
# #         feature_values = features[feat].value_counts()
# #         for feat_val in feature_values.index:
# #             p_data_class[feat][feat_val] = feature_values[feat_val]
# #     p_data_class = p_data_class / features.shape[0]
# #     p_data_class.columns = pd.MultiIndex.from_product([[dig], range(feature_num)], names=['Digit', 'Feature'])
# #     if dig == 0:
# #         final = p_data_class
# #         break
# #     else:
# #         final = final.join(p_data_class)
# # p_data_class = final

# # P(Data | Class)
# final = []
# for dig in range(10):
#     p_data_class = pd.DataFrame([[0 for col in range(feature_num)] for row in range(feat_len**2 + 1)])
#     for feature in range(feature_num):
#         tmp = features.loc[features.iloc[:,-1] == dig][feature].value_counts()
#         for feature_val in tmp.index:
#             p_data_class[feature][feature_val] = tmp[feature_val]
#     p_data_class = p_data_class / features.iloc[:, -1].value_counts().sort_index()[dig]
#     p_data_class.columns = pd.MultiIndex.from_product([[dig], range(feature_num)], names=['Digit', 'Feature'])
#     if dig == 0:
#         final = p_data_class
#     else:
#         final = final.join(p_data_class)
# p_data_class = final
# end = time.time()
# print(end - start)