In [1]:
import os
import urllib.request 
import operator
from collections import Counter

### USE OpenCV KERNEL ###
#import cv2
import numpy as np
from scipy.io import loadmat

from sklearn.decomposition import PCA
    
# default plot styling changes
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")
colors = sns.color_palette('Set2',12)

# ignore deprecation warnings, etc.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

In [3]:
# plotly username and API key are read from a local credentials file
# (single line of the form "<username>, <api_key>")
with open('../_credentials/plotly.txt', 'r') as infile:
    credentials = infile.read().strip().split(', ')
user, pw = credentials

plotly.tools.set_credentials_file(username=user, api_key=pw)

text_color = 'rgb(107, 107, 107)'

# named colours used by the plotly charts
colors_dict = {
    'grey': 'rgb(189, 195, 199)',
    'aqua': 'rgb( 54, 215, 183)',
    'navy': 'rgb( 31,  58, 147)',
    'purple': 'rgb(142,  68, 173)',
    'blue': 'rgb( 25, 181, 254)',
    'green': 'rgb( 46, 204, 113)',
    'yellow': 'rgb(253, 231,  76)',
    'orange': 'rgb(250, 121,  33)',
    'red': 'rgb(242,  38,  19)',
}

# the same colours as an ordered list, indexed by trace position
colors_lst = [
    colors_dict[color_name]
    for color_name in ('yellow', 'orange', 'red',
                       'green', 'blue', 'purple',
                       'navy', 'aqua', 'grey')
]

# Download Data Sets

In [4]:
def get_datasets(raw_dir='data/raw_data', base_url='http://ufldl.stanford.edu/housenumbers/'):
    '''
    Download the three SVHN 32x32 .mat files into raw_dir, skipping files already present.

    input: raw_dir  - directory that receives the files (created if missing)
           base_url - URL prefix the file names are appended to
    output: None; prints one "Downloaded: <file>" line per file present in raw_dir
            after the call (including files that were already there)
    '''
    os.makedirs(raw_dir, exist_ok=True)

    filenames = ('train_32x32.mat', 'test_32x32.mat', 'extra_32x32.mat')

    for filename in filenames:
        local_path = os.path.join(raw_dir, filename)

        if not os.path.isfile(local_path):
            # strip the trailing slash so the URL does not contain '//'
            url = '{0}/{1}'.format(base_url.rstrip('/'), filename)

            # download to a temporary name first so an interrupted transfer never
            # leaves a truncated file that a later run would treat as complete
            partial_path = local_path + '.part'
            try:
                urllib.request.urlretrieve(url, partial_path)
                os.replace(partial_path, local_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        if os.path.isfile(local_path):
            print('Downloaded: {0}'.format(filename))

    return None

In [5]:
# fetch the three SVHN .mat files into data/raw_data (files already on disk are not re-downloaded)
get_datasets()

Downloaded: train_32x32.mat
Downloaded: test_32x32.mat
Downloaded: extra_32x32.mat


# Load Raw Data (.mat Files) into Memory

In [6]:
def load_data(lim=0.1, _set='train', raw_dir='data/raw_data'):
    '''
    Load the leading fraction of one SVHN .mat file into memory.

    input: lim     - fraction of the data set to keep, 0 < lim <= 1 (leading samples, not a random sample)
           _set    - which file to read: 'train', 'test' or 'extra'
           raw_dir - directory containing the *_32x32.mat files
    output: (data, labels) where data is the 'X' variable sliced along its last axis
            and labels is the 'y' variable sliced along its first axis
    raises: ValueError if lim is outside (0, 1] or _set is not a known data set
    '''
    fraction = float(lim)
    if not 0.0 < fraction <= 1.0:
        raise ValueError('lim must be in the interval (0, 1], got {0!r}'.format(lim))

    filenames = {
        'train': 'train_32x32.mat',
        'test': 'test_32x32.mat',
        'extra': 'extra_32x32.mat',
    }
    if _set not in filenames:
        raise ValueError('_set must be one of {0}, got {1!r}'.format(sorted(filenames), _set))

    # read both variables in a single pass instead of parsing the file twice
    contents = loadmat(os.path.join(raw_dir, filenames[_set]), variable_names=['X', 'y'])
    data, labels = contents['X'], contents['y']

    # limit number of data points, based on the number of samples actually stored
    keep = round(data.shape[3] * fraction)

    return data[:, :, :, :keep], labels[:keep]

In [23]:
# keep only the leading 10% of the train/test sets and the leading 5% of the extra set
# (load_data slices from the start of each file; it does not sample randomly)
train_data, train_labels = load_data(lim=0.1, _set='train')
test_data, test_labels = load_data(lim=0.1, _set='test')
extra_data, extra_labels = load_data(lim=0.05, _set='extra')

In [24]:
# report the array shapes of every loaded subset
shape_report = (
    ('Training data shape: {0} | Training labels shape: {1}', train_data, train_labels),
    ('Testing data shape: {0} | Testing labels shape: {1}', test_data, test_labels),
    ('Extra data shape: {0} | Extra labels shape: {1}', extra_data, extra_labels),
)
for template, subset_data, subset_labels in shape_report:
    print(template.format(subset_data.shape, subset_labels.shape))

Training data shape: (32, 32, 3, 7326) | Training labels shape: (7326, 1)
Testing data shape: (32, 32, 3, 2603) | Testing labels shape: (2603, 1)
Extra data shape: (32, 32, 3, 26557) | Extra labels shape: (26557, 1)


# Distribution of Labels in Test and Train Datasets

In [25]:
# relabel class 10 as 0, in place, in every label array
for label_arr in (train_labels, test_labels, extra_labels):
    label_arr[label_arr == 10] = 0

In [26]:
# flatten each label array into a plain list ...
train_lbl_lst, test_lbl_lst, extra_lbl_lst = (
    labels.ravel().tolist()
    for labels in (train_labels, test_labels, extra_labels)
)

# ... and tally how often each class label occurs
train_cnt_dict, test_cnt_dict, extra_cnt_dict = (
    dict(Counter(lbl_lst))
    for lbl_lst in (train_lbl_lst, test_lbl_lst, extra_lbl_lst)
)

In [27]:
def make_bar_trace(x_labels, y_counts, _set, clr):
    '''
    input: x values, y values, trace name and marker colour
    output: plotly Bar trace drawn at 80% opacity
    '''
    bar_options = dict(
        x=x_labels,
        y=y_counts,
        name=_set,
        marker=dict(color=clr),
        opacity=0.8,
    )
    return go.Bar(**bar_options)

In [28]:
lst_dataset_dicts = [('Test', test_cnt_dict), ('Train',train_cnt_dict), ('Extra',extra_cnt_dict)]

# one bar trace per data set, coloured by its position in the list
traces = []
for i, dataset_dict in enumerate(lst_dataset_dicts):
    set_name, cnt_dict = dataset_dict
    bar = make_bar_trace(list(cnt_dict), list(cnt_dict.values()), set_name, colors_lst[i])
    traces.append(bar)

In [29]:
data = traces

# shared font settings for the axis ticks and the y-axis title
axis_font_color = 'rgb(107, 107, 107)'
x_axis = dict(
    title='Class Labels',
    tickfont=dict(size=14, color=axis_font_color),
    dtick=1,
)
y_axis = dict(
    title='Frequency',
    titlefont=dict(size=16, color=axis_font_color),
    tickfont=dict(size=14, color=axis_font_color),
)

layout = go.Layout(
    title='Frequency Distribution of Class Labels',
    xaxis=x_axis,
    yaxis=y_axis,
    barmode='stack',
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='SVHN Label Distribution')

# Reshape Dataset

In [None]:
# move the sample axis (last) to the front; the three image axes keep their order
sample_first = (3, 0, 1, 2)
train_data = np.transpose(train_data, sample_first)
test_data = np.transpose(test_data, sample_first)
#extra_data = extra_data.transpose((3,0,1,2))

In [None]:
# flatten the label arrays to one dimension
train_labels = train_labels.ravel()
test_labels = test_labels.ravel()
#extra_labels = np.ravel(extra_labels)

In [None]:
# report the shapes after moving the sample axis and flattening the labels
reshaped_report = (
    ('Training data shape: {0} | Training labels shape: {1}', train_data, train_labels),
    ('Testing data shape:  {0}  | Testing labels shape:  {1}', test_data, test_labels),
)
for template, subset_data, subset_labels in reshaped_report:
    print(template.format(subset_data.shape, subset_labels.shape))
#print('Extra data shape:    {0}     | Extra labels shape:    {1}'.format(extra_data.shape, extra_labels.shape))

# Look at One Image

In [None]:
# first image of the training set
one_img = train_data[0]
one_img.shape

In [None]:
# 3x3 inch figure without axis ticks
fig = plt.gcf()
fig.set_size_inches(3, 3)
for hide_ticks in (plt.xticks, plt.yticks):
    hide_ticks([])
_ = plt.imshow(one_img)

# Custom Conversion with Limited Number of Colors

In [None]:
def gray_conv(image, num_shades):
    '''
    Convert a colour image to grayscale limited to num_shades evenly spaced levels.

    input: image array (h, w, channels) and number of desired gray shades (2 to 256)
    output: 1-D integer ndarray of length h*w (row-major) holding each pixel's
            channel average snapped to the nearest of the num_shades levels in 0..255
    raises: ValueError if num_shades is outside 2..256 or image is not 3-dimensional
    '''
    if not 2 <= num_shades <= 256:
        raise ValueError('num_shades must be between 2 and 256, got {0!r}'.format(num_shades))

    pixels = np.asarray(image)
    if pixels.ndim != 3:
        raise ValueError('image must have shape (h, w, channels), got {0}'.format(pixels.shape))

    # distance between two neighbouring gray levels
    conv = 255 / (num_shades - 1)

    # widen to float before summing so uint8 channel values cannot wrap around
    avg = pixels.astype(np.float64).sum(axis=2) / pixels.shape[2]

    # snap to the nearest level (half-to-even, like round()), then truncate to int
    gray = (np.rint(avg / conv) * conv).astype(int)

    return gray.ravel()

In [None]:
def img_arr_conv(lst,row,col):
    '''
    ONLY NEEDED FOR VIEWING IMAGES - NOT ANALYSIS
    input: flat list of pixel values plus the image height (row) and width (col)
    output: the same values as an ndarray of shape (row, col)
    '''
    return np.array(lst).reshape((row, col))

In [None]:
def conv_CustGray_2d_arr(img_arr, col_l, col_r, num_shades):
    '''
    input: 4-d ndarray of images (image index first), left col start and right col end
            for image slicing, and the gray shade limit passed on to gray_conv
    output: ndarray with one row per image holding the gray_conv result of the
            column-trimmed image
    '''
    image_count, _rows, _cols, _channels = img_arr.shape

    converted = [
        gray_conv(img_arr[idx][:, col_l:col_r], num_shades)
        for idx in range(image_count)
    ]

    return np.asarray(converted)

# Demo of Custom Conversion

In [None]:
img_num = 5
shades = 32

# crop the columns around the digit, then quantise to `shades` gray levels
orig_img = train_data[img_num][:, 6:26]
height, width = orig_img.shape[:2]  # reshape with the cropped size instead of hard-coded 32x20
scaled_img = img_arr_conv(gray_conv(orig_img, shades), height, width)

titles = ['Original', 'Custom Conversion']
images = [orig_img, scaled_img]

# start a fresh figure: resizing the `fig` left over from an earlier cell
# has no effect on what this cell draws
fig = plt.figure(figsize=(3, 3))
for i in range(2):
    plt.subplot(1,2,i+1),plt.imshow(images[i],'gray')
    plt.title(titles[i])
    plt.axis('off')
    plt.xticks([]),plt.yticks([])
plt.show()

# OpenCV Thresholds

In [None]:
select_img = 7
block_sz = 7
const = 2 # subtracted from the mean or weighted mean calculated

# original image, cropped to the columns around the digit
img = train_data[select_img]
img = img[:,6:26]
# the arrays are RGB (they are shown unmodified with plt.imshow elsewhere in this
# notebook), so use the RGB conversion code rather than OpenCV's BGR default
img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

# Otsu's binarization (on a Gaussian-blurred copy)
blur = cv2.GaussianBlur(img,(5,5),0)
ret, th1 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Adaptive mean
th2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, block_sz, const)

# Adaptive Gaussian
th3 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_sz, const)

titles = ['Gray Orig.', 'Otsu\'s Binarization', 'Adpt. Mean', 'Adpt. Gaussian']
images = [img, th1, th2, th3]

# start a fresh figure: resizing the `fig` left over from an earlier cell
# has no effect on what this cell draws
fig = plt.figure(figsize=(12, 6))
for i in range(4):

    plt.subplot(1,4,i+1),plt.imshow(images[i],'gray')
    plt.title(titles[i])
    plt.axis('off')
    plt.xticks([]),plt.yticks([])
plt.show()

In [None]:
# savefig does not create missing directories
os.makedirs('imgs', exist_ok=True)

plt.imshow(th3,'gray')
plt.axis('off')
plt.xticks([]),plt.yticks([])
plt.savefig('imgs/agt_7.png', bbox_inches='tight')

# Otsu's Binarization Thresholding

In [None]:
def conv_OBT_2d_arr(img_arr, col_l, col_r, block_size):
    '''
    input: 4-d ndarray of RGB images (image index first), left col start and right col end
            for image slicing, and the Gaussian blur kernel size (made odd if even)
    output: ndarray of ndarrays (images, pixels) thresholded using Otsu's binarization thresholding
    source: http://docs.opencv.org/trunk/d7/d4d/tutorial_py_thresholding.html#gsc.tab=0
    '''
    num_imgs, _, _, _ = img_arr.shape

    # make sure the blur block size is odd (done once; it does not depend on the image)
    if block_size % 2 == 0:
        block_size += 1

    arr_2d = []

    for i in range(num_imgs):
        # convert to grayscale; the notebook's images are RGB, not OpenCV's default BGR
        img = cv2.cvtColor(img_arr[i], cv2.COLOR_RGB2GRAY)

        # trim width (focus on digit)
        img = img[:, col_l:col_r]

        # Otsu's binarization on a blurred copy
        blur = cv2.GaussianBlur(img, (block_size, block_size), 0)
        _, img = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        arr_2d.append(np.ravel(img))

    return np.asarray(arr_2d)

# Adaptive Mean Thresholding

In [None]:
def conv_AMT_2d_arr(img_arr, col_l, col_r, block_size, const):
    '''
    input: 4-d ndarray of RGB images (image index first), left col start and right col end
            for image slicing, neighbourhood block size (made odd if even) and the constant
            passed to cv2.adaptiveThreshold
    output: ndarray of ndarrays (images, pixels) thresholded using adaptive mean
    source: http://docs.opencv.org/trunk/d7/d4d/tutorial_py_thresholding.html#gsc.tab=0
    '''
    num_imgs, _, _, _ = img_arr.shape

    # make sure the block size is odd (done once; it does not depend on the image)
    if block_size % 2 == 0:
        block_size += 1

    arr_2d = []

    for i in range(num_imgs):
        # convert to grayscale; the notebook's images are RGB, not OpenCV's default BGR
        img = cv2.cvtColor(img_arr[i], cv2.COLOR_RGB2GRAY)

        # trim width (focus on digit)
        img = img[:, col_l:col_r]

        # Adaptive mean
        img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, block_size, const)

        arr_2d.append(np.ravel(img))

    return np.asarray(arr_2d)

# Adaptive Gaussian Thresholding

In [None]:
def conv_AGT_2d_arr(img_arr, col_l, col_r, block_size, const):
    '''
    input: 4-d ndarray of RGB images (image index first), left col start and right col end
            for image slicing, neighbourhood block size (made odd if even) and the constant
            passed to cv2.adaptiveThreshold
    output: ndarray of ndarrays (images, pixels) thresholded using adaptive Gaussian
    source: http://docs.opencv.org/trunk/d7/d4d/tutorial_py_thresholding.html#gsc.tab=0
    '''
    num_imgs, _, _, _ = img_arr.shape

    # make sure the block size is odd (done once; it does not depend on the image)
    if block_size % 2 == 0:
        block_size += 1

    arr_2d = []

    for i in range(num_imgs):
        # convert to grayscale; the notebook's images are RGB, not OpenCV's default BGR
        img = cv2.cvtColor(img_arr[i], cv2.COLOR_RGB2GRAY)

        # trim width (focus on digit)
        img = img[:, col_l:col_r]

        # Adaptive Gaussian
        img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, const)

        arr_2d.append(np.ravel(img))

    return np.asarray(arr_2d)

# PCA Transformation

In [None]:
def conv_PCA_2d_arr(img_arr, col_l, col_r, numb_components, pca=None, return_pca=False):
    '''
    input: 4-d ndarray of RGB images (image index first), left col start and right col end
            for image slicing and number of components
           pca        - optional, already fitted PCA to project with instead of fitting a
                        new one on img_arr (lets a second data set share the first one's
                        components); numb_components is ignored when it is given
           return_pca - when True, also return the PCA object that was used
    output: ndarray (images, components) of the whitened PCA projection of the trimmed
            grayscale images; a (projection, pca) tuple when return_pca is True
    source: https://medium.com/@dimart/pok%C3%A9mon-recognition-d3ad5cadc61e#.whsqst6us
    '''
    num_imgs, _, _, _ = img_arr.shape

    arr_2d = []

    for i in range(num_imgs):
        # convert to grayscale; the notebook's images are RGB, not OpenCV's default BGR
        img = cv2.cvtColor(img_arr[i], cv2.COLOR_RGB2GRAY)

        # trim width (focus on digit)
        img = img[:, col_l:col_r]

        arr_2d.append(np.ravel(img))

    # converted images into 2d grayscale
    arr_2d = np.asarray(arr_2d)

    # apply principal component analysis, fitting only when no fitted model was supplied
    if pca is None:
        pca = PCA(n_components=numb_components, whiten=True).fit(arr_2d)
    pca_arr = pca.transform(arr_2d)

    if return_pca:
        return pca_arr, pca
    return pca_arr

# Pickle Numpy Arrays (Output Processed Data)

In [None]:
# create the output directory if needed (exist_ok avoids the check-then-create race)
output_directory = 'generated_data'
os.makedirs(output_directory, exist_ok=True)

## Custom Conversion Output

In [None]:
cl, cr = 6, 26
ns = 7 # number of shades

train_f = 'train_CustGray_2d.dat'
test_f = 'test_CustGray_2d.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

train_CustGray_arr = conv_CustGray_2d_arr(train_data, col_l=cl, col_r=cr, num_shades=ns)
train_CustGray_arr.dump('{0}/{1}'.format(output_directory, train_f))

test_CustGray_arr = conv_CustGray_2d_arr(test_data, col_l=cl, col_r=cr, num_shades=ns)
test_CustGray_arr.dump('{0}/{1}'.format(output_directory, test_f))

In [None]:
# shapes of the custom-gray feature matrices (train, then test)
for converted in (train_CustGray_arr, test_CustGray_arr):
    print(converted.shape)

## Otsu's Binarization Thresholding Output

In [None]:
cl, cr = 6, 26
bs = 7 # block size

train_f = 'train_OBT_2d.dat'
test_f = 'test_OBT_2d.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

train_OBT_arr = conv_OBT_2d_arr(train_data, col_l=cl, col_r=cr, block_size=bs)
train_OBT_arr.dump('{0}/{1}'.format(output_directory, train_f))

test_OBT_arr = conv_OBT_2d_arr(test_data, col_l=cl, col_r=cr, block_size=bs)
test_OBT_arr.dump('{0}/{1}'.format(output_directory, test_f))

In [None]:
# shapes of the Otsu-thresholded feature matrices (train, then test)
for converted in (train_OBT_arr, test_OBT_arr):
    print(converted.shape)

## Adaptive Mean Thresholding Output

In [None]:
cl, cr = 6, 26
bs = 7 # block size
c = 2 # constant

train_f = 'train_AMT_2d.dat'
test_f = 'test_AMT_2d.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

train_AMT_arr = conv_AMT_2d_arr(train_data, col_l=cl, col_r=cr, block_size=bs, const=c)
train_AMT_arr.dump('{0}/{1}'.format(output_directory, train_f))

test_AMT_arr = conv_AMT_2d_arr(test_data, col_l=cl, col_r=cr, block_size=bs, const=c)
test_AMT_arr.dump('{0}/{1}'.format(output_directory, test_f))

In [None]:
# shapes of the adaptive-mean feature matrices (train, then test)
for converted in (train_AMT_arr, test_AMT_arr):
    print(converted.shape)

## Adaptive Gaussian Thresholding Output

In [None]:
cl, cr = 6, 26
bs = 7 # block size
c = 2 # constant

train_f = 'train_AGT_2d.dat'
test_f = 'test_AGT_2d.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

train_AGT_arr = conv_AGT_2d_arr(train_data, col_l=cl, col_r=cr, block_size=bs, const=c)
train_AGT_arr.dump('{0}/{1}'.format(output_directory, train_f))

test_AGT_arr = conv_AGT_2d_arr(test_data, col_l=cl, col_r=cr, block_size=bs, const=c)
test_AGT_arr.dump('{0}/{1}'.format(output_directory, test_f))

In [None]:
# shapes of the adaptive-Gaussian feature matrices (train, then test)
for converted in (train_AGT_arr, test_AGT_arr):
    print(converted.shape)

## Principal Component Analysis Output

In [None]:
cl, cr = 6, 26
nc = 40 # number of components

train_f = 'train_PCA_2d.dat'
test_f = 'test_PCA_2d.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

# NOTE(review): each call below fits its own PCA on the array it is given, so the
# train and test projections use different components - confirm this is intended
# before feeding both to the same classifier
train_PCA_arr = conv_PCA_2d_arr(train_data, col_l=cl, col_r=cr, numb_components=nc)
train_PCA_arr.dump('{0}/{1}'.format(output_directory, train_f))

test_PCA_arr = conv_PCA_2d_arr(test_data, col_l=cl, col_r=cr, numb_components=nc)
test_PCA_arr.dump('{0}/{1}'.format(output_directory, test_f))

In [None]:
# shapes of the PCA feature matrices (train, then test)
for converted in (train_PCA_arr, test_PCA_arr):
    print(converted.shape)

## Label Output

In [None]:
def pickle_labels(arr, directory, filename):
    '''
    input: ndarray, target directory and file name
    output: None; the array is pickled to <directory>/<filename> via ndarray.dump
    '''
    destination = '{0}/{1}'.format(directory,filename)
    arr.dump(destination)

train_f = 'train_labels.dat'
test_f = 'test_labels.dat'

# remove existing output files; they live in output_directory (not the working
# directory), and each file is handled separately so one missing file does not
# skip the other
for stale_f in (train_f, test_f):
    try:
        os.remove('{0}/{1}'.format(output_directory, stale_f))
    except FileNotFoundError:
        pass

# output to pickle files
pickle_labels(train_labels, output_directory, train_f)
pickle_labels(test_labels, output_directory, test_f)

In [None]:
# shapes of the label vectors (train, then test)
for label_vec in (train_labels, test_labels):
    print(label_vec.shape)

# Demo of PCA

In [None]:
# grayscale the first 1000 training images (255 shades), crop columns 6..25
# and flatten each one into a 640-value row
exp = []
for i in range(1000):
    orig_img = train_data[i]
    gray_values = gray_conv(orig_img, 255)
    scaled_img = img_arr_conv(gray_values, 32, 32)[:,6:26]
    exp.append(scaled_img)

exp = np.asarray(exp).reshape((1000,640))
exp.shape

In [None]:
# Apply PCA
h, w = 32, 20
n_components = 200

pca = PCA(n_components=n_components, whiten=True).fit(exp)
train_pca = pca.transform(exp)

In [None]:
from PIL import Image

# map one projected image back to pixel space and view it as an h x w image
reconstruction = pca.inverse_transform(train_pca[7])
im = Image.fromarray(reconstruction.reshape(h,w))

plt.imshow(im, cmap='Greys_r')
plt.axis('off')
plt.show()  # the original assigned plt.show without calling it, so nothing was flushed

# savefig does not create missing directories
os.makedirs('imgs', exist_ok=True)

plt.imshow(im,'gray')
plt.axis('off')
plt.xticks([]),plt.yticks([])
plt.savefig('imgs/pca_7.png', bbox_inches='tight')