# Dot Checks
Using a cleaned-up version of the code from our other notebook, let's iterate over each image and check whether the number of dots we detect matches the expected count.

In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib.image as mpimg
import numpy as np
from PIL import Image
from scipy import sparse
from scipy import ndimage
from scipy.ndimage import gaussian_filter
from skimage import data
from skimage import img_as_float
from skimage import morphology, measure
from skimage.color import label2rgb

%matplotlib inline

In [23]:
# Same value as the default of find_all_dots' `jpeg_threshold` parameter.
# NOTE(review): in the cells shown here nothing reads this global; the
# functions receive the threshold as an argument instead.
jpeg_threshold = 0.15


def find_all_dots(raw_path, dot_path, expected_counts, jpeg_threshold=0.15, mask_dilation=5):
    """Count the dots in a dotted image and print the count next to the expected one.

    The dotted image is compared pixel-by-pixel with the raw image. Pixels
    whose colour differs by at least ``jpeg_threshold`` are kept, cleaned up
    with a morphological closing and one erosion, and the remaining blobs are
    counted as connected components (diagonal neighbours are connected).

    Parameters
    ----------
    raw_path : str
        Path of the image without dots, read with ``mpimg.imread``.
    dot_path : str
        Path of the dotted image, read with ``mpimg.imread``.
    expected_counts : object
        Value printed after ``Expected=``; it is not used in the computation.
    jpeg_threshold : float
        Per-pixel differences below this value are zeroed.
    mask_dilation : int
        Number of binary dilation passes applied to the mask of black
        regions in the dotted image.

    Returns
    -------
    int
        Number of connected components found (the value printed after
        ``Labels=``).
    """
    # Work in floats scaled by each image's own maximum.
    raw_image = mpimg.imread(raw_path).astype(float)
    dot_image = mpimg.imread(dot_path).astype(float)
    raw_image = raw_image / raw_image.max()
    dot_image = dot_image / dot_image.max()

    # Dot images have some black artifacts. Build a 2-D mask of the
    # (near-)black pixels and grow it so the artifact edges are covered too.
    dot_norm = np.linalg.norm(dot_image, axis=2)
    threshold = (dot_norm.max() - dot_norm.min()) * 0.005
    artifact_mask = dot_norm <= threshold
    for _ in range(mask_dilation):
        artifact_mask = ndimage.binary_dilation(artifact_mask)

    # Remove the background: only pixels that differ from the raw image
    # survive, and everything under the artifact mask is discarded.
    dot_diff = np.linalg.norm(dot_image - raw_image, axis=2)
    dot_diff[artifact_mask] = 0

    # Remove jpeg artifact noise.
    dot_diff[dot_diff < jpeg_threshold] = 0

    # Close small holes inside each blob, then erode once to drop specks and
    # to separate blobs that only just touch.
    dot_mask = dot_diff > 0.01
    dot_mask = morphology.closing(dot_mask)
    dot_mask = ndimage.binary_erosion(dot_mask)

    _, label_count = morphology.label(dot_mask, return_num=True, connectivity=2)
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print('Labels={}, Expected={}'.format(label_count, expected_counts))
    return label_count

In [21]:
# Table of per-image counts. check_counts sums every column after the first
# of row i to get the expected number of dots for image i.
values = pd.read_csv('../input/Train/train.csv')

def check_counts(i, threshold=0.15, dilation=5):
    """Run the dot finder on image ``i`` and compare with its expected total.

    The expected total is the sum of every column after the first in row
    ``i`` of the module-level ``values`` table. The raw and dotted image
    paths are both derived from ``i``. ``threshold`` and ``dilation`` are
    passed through to ``find_all_dots`` as its ``jpeg_threshold`` and
    ``mask_dilation`` arguments.
    """
    row_counts = values.iloc[i][1:]
    image_paths = [
        template.format(i)
        for template in ('../input/Train/{}.jpg', '../input/TrainDotted/{}.jpg')
    ]
    find_all_dots(image_paths[0], image_paths[1], sum(row_counts), threshold, dilation)
    
# Check images 0 through 10 with the default threshold and dilation; each
# call prints one "Labels=..., Expected=..." line.
for i in range(11):
    check_counts(i)

Labels=946, Expected=946
Labels=34, Expected=34
Labels=58, Expected=60
Labels=28333, Expected=99
Labels=17, Expected=17
Labels=47, Expected=47
Labels=45, Expected=45
Labels=2745, Expected=60
Labels=146, Expected=145
Labels=3289, Expected=116
Labels=21, Expected=21
