##### Final Solution
posted here: https://stackoverflow.com/questions/27969091/processing-an-image-of-a-table-to-get-data-from-it

In [None]:
import cv2
import numpy as np
import os

# the list of images (tables); hough_transform_p reads each one from 'imgs/'
images = ['table1.png', 'table2.png', 'table3.png', 'table4.png', 'table5.png']

# the list of templates (meant for template matching); process() forwards
# templates[0], but hough_transform_p never reads its `template` argument
templates = ['train1.png']

def remove_duplicates(lines):
    """Drop near-duplicate axis-aligned lines, keeping the first of each cluster.

    Two horizontal lines (y1 == y2) are near-duplicates when their y values
    differ by more than 0 and less than 10 pixels; two vertical lines
    (x1 == x2) likewise on x.  Lines at exactly the same offset, and lines
    that are neither horizontal nor vertical, are never removed.

    Args:
        lines: list of [x1, y1, x2, y2] segments.

    Returns:
        The same list object, filtered in place.
    """
    kept = []
    for line in lines:
        x1, y1, x2, y2 = line
        is_duplicate = False
        for x3, y3, x4, y4 in kept:
            if y1 == y2 and y3 == y4:
                diff = abs(y1 - y3)
            elif x1 == x2 and x3 == x4:
                diff = abs(x1 - x3)
            else:
                diff = 0
            # compare by value; `diff is not 0` only worked through CPython's
            # small-int caching
            if 0 < diff < 10:
                is_duplicate = True
                break
        if not is_duplicate:
            kept.append(line)
    # build the survivors separately and swap them in afterwards: deleting
    # from `lines` while iterating over it skips elements
    lines[:] = kept
    return lines


def sort_line_list(lines):
    """Split segments into horizontal and vertical groups, each sorted.

    A segment [x1, y1, x2, y2] counts as vertical when x1 == x2, otherwise as
    horizontal when y1 == y2; anything else is dropped.  Vertical segments are
    ordered by plain list comparison, horizontal ones by their y coordinate.

    Returns:
        (horizontal, vertical) as two new lists.
    """
    upright = [seg for seg in lines if seg[0] == seg[2]]
    # a segment that already qualified as vertical is not counted twice
    level = [seg for seg in lines if seg[0] != seg[2] and seg[1] == seg[3]]
    return sorted(level, key=lambda seg: seg[1]), sorted(upright)


def hough_transform_p(image, template, tableCnt):
    """Find the grid lines of a table image and classify every cell.

    Reads 'imgs/<image>', detects straight lines with the probabilistic Hough
    transform and treats consecutive horizontal / vertical lines as cell
    borders.  Each cell crop is written to 'imgs/table<tableCnt+1>/' and is
    marked 'x' when thresholding it yields more than one contour, '_'
    otherwise.  The image with the detected lines drawn in red is saved as
    'imgs/<name>-hough_p.png'.

    Args:
        image: file name of the table image inside 'imgs/'.
        template: unused; kept so existing callers keep working.
        tableCnt: zero-based table index, used to name the output directory.

    Returns:
        List of rows, each a list of 'x' / '_' strings (one per cell).

    Raises:
        IOError: if the image cannot be read.
    """
    # open and process images
    img = cv2.imread('imgs/'+image)
    if img is None:
        # imread reports failure by returning None instead of raising
        raise IOError('could not read image: imgs/%s' % image)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)

    # probabilistic hough transform
    detected = cv2.HoughLinesP(edges, 1, np.pi/180, 200, minLineLength=20, maxLineGap=999)
    # None means nothing was found.  The array is (1, N, 4) on OpenCV 2.4 and
    # (N, 1, 4) on later versions; reshape flattens both to N rows of 4.
    lines = [] if detected is None else detected.reshape(-1, 4).tolist()

    # remove duplicates
    lines = remove_duplicates(lines)

    # draw image
    for x1, y1, x2, y2 in lines:
        cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 1)

    # sort lines into vertical & horizontal lists
    horizontal, vertical = sort_line_list(lines)

    out_dir = 'imgs/table%s' % (tableCnt+1)
    img_w = img.shape[1]

    # every pair of consecutive horizontal lines bounds one row
    rows = []
    for i in range(len(horizontal) - 1):
        top = horizontal[i][1]
        cell_h = horizontal[i+1][1] - top
        cell_w = None
        row = []
        for j, v in enumerate(vertical):
            left = v[0]
            if j < len(vertical) - 1:
                # every cell before the last: runs up to the next vertical line
                cell_w = vertical[j+1][0] - left
            elif cell_w is None:
                # only one vertical line, so there is no previous cell width
                # to reuse; extend to the right edge of the image instead
                cell_w = img_w - left
            # otherwise: the last cell reuses the previous cell's width

            # get roi (region of interest) to find an x
            roi = img[top:top+cell_h, left:left+cell_w]
            if roi.size == 0:
                # coincident lines give a zero-area cell; nothing to inspect
                row.append('_')
                continue

            # save image (for testing)
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            cv2.imwrite('%s/roi_r%s-c%s.png' % (out_dir, i, j), roi)

            # if roi contains an x, add x to array, else add _
            roi_gry = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
            _, thresh = cv2.threshold(roi_gry, 127, 255, 0)
            # findContours returns 2 or 3 values depending on the OpenCV
            # version; the contour list is always second from the end
            contours = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
            row.append('x' if len(contours) > 1 else '_')

        # the final entry starts at the last vertical line, i.e. it lies
        # outside the grid, so it is discarded
        if row:
            row.pop()
        rows.append(row)

    # save image (for testing)
    fn = os.path.splitext(image)[0] + '-hough_p.png'
    cv2.imwrite('imgs/'+fn, img)

    return rows


def process():
    """Run hough_transform_p on every entry of `images`, in list order."""
    for table_index, image_name in enumerate(images):
        # the enumerate index doubles as the table counter; the first
        # template is handed to every call
        hough_transform_p(image_name, templates[0], table_index)


# process the whole image list when this code runs as the main program
if __name__ == '__main__':
    process()

##### Playing around with the Hough Transformation
From the python cv2 page: http://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_imgproc/py_houghlines/py_houghlines.html?highlight=detect%20lines

In [47]:
import cv2
import numpy as np

# load the sample image, then derive a grayscale copy and its Canny edge map
img = cv2.imread('images/sudoku-original.jpg')
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray,50,150,apertureSize = 3)

# standard Hough transform: rho step 1, theta step pi/180, threshold 1
# NOTE(review): a threshold of 1 is extremely permissive — confirm it is
# intentional rather than a leftover from experimenting.
lines = cv2.HoughLines(edges, 1, np.pi/180, 1)
# NOTE(review): only lines[0] is iterated; depending on the OpenCV version's
# output layout that may be a single (rho, theta) pair — TODO confirm.
# NOTE(review): no guard for `lines` being None.
for rho,theta in lines[0]:
    # (x0, y0) = (a*rho, b*rho); the two endpoints are placed 1000 units
    # away from it in the directions (-b, a) and (b, -a)
    a = np.cos(theta)
    b = np.sin(theta)
    x0 = a*rho
    y0 = b*rho
    x1 = int(x0 + 1000*(-b))
    y1 = int(y0 + 1000*(a))
    x2 = int(x0 - 1000*(-b))
    y2 = int(y0 - 1000*(a))

    # draw onto the original image in BGR (0,0,255), thickness 2
    cv2.line(img,(x1,y1),(x2,y2),(0,0,255),2)

# save the intermediate images and the annotated result
cv2.imwrite('images/sudoku-original-gray.jpg', gray)
cv2.imwrite('images/sudoku-original-edges.jpg', edges)
cv2.imwrite('images/houghlines1.jpg', img)


True

<img src='images/sudoku-original-edges.jpg'>

<img src='images/houghlines1.jpg'>

##### Playing around with the Hough Transformation 2
From this stack overflow question: https://stackoverflow.com/questions/45322630/how-to-detect-lines-in-opencv

In [121]:
# load the test image, convert to grayscale and compute the Canny edge map
img = cv2.imread('images/voltest-original.png')
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray,50,150,apertureSize = 3)

rho = 1  # distance resolution in pixels of the Hough grid (originally 1)
theta = np.pi / 180  # angular resolution in radians of the Hough grid
threshold = 200  # minimum number of votes (intersections in Hough grid cell) (originally 15)
min_line_length = 55  # minimum number of pixels making up a line (originally 50)
max_line_gap = 1  # maximum gap in pixels between connectable line segments (originally 20)
line_image = np.copy(img) * 0  # creating a blank to draw lines on

# Run Hough on edge detected image
# Output "lines" is an array containing endpoints of detected line segments
# (np.array([]) fills the positional slot before min_line_length)
lines = cv2.HoughLinesP(edges, rho, theta, threshold, np.array([]),
                    min_line_length, max_line_gap)

# drawing is disabled here, so line_image stays all zeros after this cell
#for line in lines:
#    for x1,y1,x2,y2 in line:
#        cv2.line(line_image, (x1,y1), (x2,y2), (255,0,0), 5)

In [117]:
# Draw the lines on the image: weighted sum 0.8 * img + 1 * line_image + 0
# NOTE(review): relies on `img` and `line_image` left in the kernel by another cell.
lines_edges = cv2.addWeighted(img, 0.8, line_image, 1, 0)
# write the blended image and the line layer on its own
cv2.imwrite('images/voltest-lines.jpg', lines_edges)
cv2.imwrite('images/voltest-lines-separate.jpg', line_image)

True

##### Best Hough Transformation
Below is my approach to tuning parameters and then applying the hough transformation to the image

In [96]:
def draw_image(img, lines, params):
    """Render Hough line segments on a black canvas and save it as a JPEG.

    Args:
        img: image whose shape and dtype the canvas copies.
        lines: result of cv2.HoughLinesP.  None (no segment detected) is
            treated as "no lines", so an all-black canvas is still written.
        params: sequence of three tuning values; they are embedded in the
            output file name in order.
    """
    # Create a blank image to draw lines on
    line_image = np.zeros_like(img)

    # Draw lines from hough transform
    if lines is not None:
        # flatten to rows of x1, y1, x2, y2 regardless of the nesting used by
        # the installed OpenCV version, and hand plain Python ints to cv2.line
        for x1, y1, x2, y2 in np.asarray(lines).reshape(-1, 4).tolist():
            cv2.line(line_image, (x1, y1), (x2, y2), (255, 0, 0), 5)

    # Output image to file
    filename = 'images/voltest3-lines-{}-{}-{}.jpg'.format(params[0], params[1], params[2])
    cv2.imwrite(filename, line_image)
    
def hough_transform(img, threshold, min_line_length, max_line_gap):
    """Detect line segments in `img` and pass them to draw_image.

    Args:
        img: BGR image to analyse.
        threshold: minimum number of votes (intersections in a Hough grid cell).
        min_line_length: minimum number of pixels making up a line.
        max_line_gap: maximum gap in pixels between connectable line segments.

    The Hough grid resolution is fixed at 1 pixel and pi/180 radians.
    """
    # Grayscale -> Canny edges
    edge_map = cv2.Canny(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), 50, 150, apertureSize=3)

    # Probabilistic Hough transform on the edge map
    segments = cv2.HoughLinesP(
        edge_map,
        1,            # rho: distance resolution in pixels
        np.pi / 180,  # theta: angular resolution in radians
        threshold,
        np.array([]),
        min_line_length,
        max_line_gap,
    )

    # The three tuning values travel along so draw_image can name its output
    draw_image(img, segments, [threshold, min_line_length, max_line_gap])

# Load original image
img = cv2.imread('images/voltest3.png')

# Tuning parameters
# np.arange excludes the stop value, so as written these grids are
# [200], [10] and [0, 5, 10] — three runs in total
thresholds = np.arange(200, 300, 100)
min_line_lengths = np.arange(10, 26, 25)
max_line_gaps = np.arange(0,15,5)

# Run hough transform with all parameters (full cartesian product)
for threshold_val in thresholds:
    for minline_val in min_line_lengths:
        for maxgap_val in max_line_gaps:
            hough_transform(img, threshold_val, minline_val, maxgap_val)

    

Consider contour detection combined with the Hough transform, and template matching.

Helpful Links:
https://stackoverflow.com/questions/10196198/how-to-remove-convexity-defects-in-a-sudoku-square

https://stackoverflow.com/questions/27969091/processing-an-image-of-a-table-to-get-data-from-it

https://www.google.com/search?q=opencv+dave.jpg&rlz=1C1SQJL_enUS762US762&tbm=isch&tbo=u&source=univ&sa=X&ved=2ahUKEwiclLHf3d3cAhVowFQKHd7-BmwQsAR6BAgFEAE&biw=2000&bih=958#imgrc=VS-1FritL6V2MM

http://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_imgproc/py_houghlines/py_houghlines.html?highlight=detect%20lines

https://stackoverflow.com/questions/27969091/processing-an-image-of-a-table-to-get-data-from-it

https://stackoverflow.com/questions/45322630/how-to-detect-lines-in-opencv

##### Contour Detection
Below is my approach to using contour detection (instead of the hough transformation) to detect the tables in the image

In [30]:
### Import libraries
import numpy as np
import cv2

# Prep image
img = cv2.imread('images/voltest-original.png')
imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# Run contour analysis
ret, thresh = cv2.threshold(imgray, 127, 255, 0)
# findContours returns 3 values on OpenCV 3.x and 2 on 2.4 / 4.x; the last
# two are always (contours, hierarchy)
contours, hierarchy = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]

# Sort contours, largest area first
contours = sorted(contours, key=cv2.contourArea, reverse = True)
manual_tbl_countours = contours[1:4]

# next two lines were pulled from stack question
perimeters = [cv2.arcLength(cnt, True) for cnt in contours]
# inspect at most the 15 largest contours without indexing past the end of
# the list when fewer were found
listindex = [i for i in range(min(15, len(perimeters))) if perimeters[i] > perimeters[0]/2]

# from here: https://www.pyimagesearch.com/2017/07/17/credit-card-ocr-with-opencv-and-python/
c = contours[1]
(x, y, w, h) = cv2.boundingRect(c)

# Show image
imgcont = img.copy()
#[cv2.drawContours(imgcont, [contours[i]], 0, (0,255,0), 5) for i in listindex]
#plt.imshow(imgcont)
# draws only the contour at index 10 of the area-sorted list
cv2.drawContours(imgcont, contours, 10, (0,255,0), 3)
cv2.imwrite("filename.jpg", imgcont)

True

In [None]:
# from here: https://www.pyimagesearch.com/2017/07/17/credit-card-ocr-with-opencv-and-python/
# bounding rectangle of the contour at index 2 of `contours`
# (relies on `contours` left in the kernel by another cell)
c = contours[2]
(x, y, w, h) = cv2.boundingRect(c)

print(x,y,w,h)

1. First, sort the contours by area and keep the 4 largest.
2. Second, sort those contours from left to right.

In [62]:
# From here: https://stackoverflow.com/questions/28759253/how-to-crop-the-internal-area-of-a-contour

def remove_duplicates(lines):
    """Drop near-duplicate axis-aligned lines, keeping the first of each cluster.

    Two horizontal lines (y1 == y2) are near-duplicates when their y values
    differ by more than 0 and less than 10 pixels; two vertical lines
    (x1 == x2) likewise on x.  Lines at exactly the same offset, and lines
    that are neither horizontal nor vertical, are never removed.

    Args:
        lines: segments as a flat sequence of [x1, y1, x2, y2], or the nested
            array returned by cv2.HoughLinesP (one segment per entry), or
            None when nothing was detected.

    Returns:
        A new list of [x1, y1, x2, y2] lists; the input is not modified.
    """
    if lines is None:
        return []
    # flatten any nesting to rows of four coordinates; array elements cannot
    # be deleted in place, so the result is built as a fresh Python list
    segments = np.asarray(lines).reshape(-1, 4).tolist()

    kept = []
    for seg in segments:
        x1, y1, x2, y2 = seg
        is_duplicate = False
        for x3, y3, x4, y4 in kept:
            if y1 == y2 and y3 == y4:
                diff = abs(y1 - y3)
            elif x1 == x2 and x3 == x4:
                diff = abs(x1 - x3)
            else:
                diff = 0
            # compare by value, not identity
            if 0 < diff < 10:
                is_duplicate = True
                break
        if not is_duplicate:
            kept.append(seg)
    # return only after every segment has been examined
    return kept

def sort_line_list(lines):
    """Separate segments by orientation and return both groups sorted.

    Each segment is indexed as [x1, y1, x2, y2].  x1 == x2 makes it vertical;
    failing that, y1 == y2 makes it horizontal; any other segment is ignored.

    Returns:
        (horizontal, vertical): horizontal ordered by y, vertical ordered by
        ordinary sequence comparison.
    """
    def orientation(seg):
        # the vertical test wins when both coordinates pairs match
        if seg[0] == seg[2]:
            return 'v'
        if seg[1] == seg[3]:
            return 'h'
        return None

    tagged = [(orientation(seg), seg) for seg in lines]
    vertical = sorted(seg for tag, seg in tagged if tag == 'v')
    horizontal = sorted((seg for tag, seg in tagged if tag == 'h'),
                        key=lambda seg: seg[1])
    return horizontal, vertical

# Below script gets the table outline for contour index 3
# (relies on `img` and `contours` left in the kernel by another cell)
idx = 3
mask = np.zeros_like(img) # Create mask where white is what we want, black otherwise
cv2.drawContours(mask, contours, idx, (255,255,255), -1) # Draw filled contour in mask
out = np.zeros_like(img) # Extract out the object and place into output image
out[mask == 255] = img[mask == 255]
# Crop masked image (disabled)
#(x, y) = np.where(mask == 255)
#print(np.where(mask == 255))
#(topx, topy) = (np.min(x), np.min(y))
#(bottomx, bottomy) = (np.max(x), np.max(y))
#out = out[topx:bottomx+1, topy:bottomy+1]

# Convert to grayscale and detect edges
gray = cv2.cvtColor(out, cv2.COLOR_BGR2GRAY)
edges = cv2.Canny(gray, 50, 150, apertureSize = 3)

# Apply Hough Transformation
# NOTE(review): no guard for `lines` being None before the loop below.
lines = cv2.HoughLinesP(edges, rho = 1, theta = np.pi / 180, threshold = 200, minLineLength = 20, maxLineGap = 50)

# Create a blank image to draw lines on
# (not drawn on in this cell: the loop below paints onto `out` instead)
line_image = np.copy(img) * 0  

# Draw lines from hough transform
vertical = []
horizontal = []
for line in lines:
    for x1, y1, x2, y2 in line:
        # paint the detected segment white on the masked image
        cv2.line(out, (x1,y1), (x2,y2), (255,255,255), 5)
        # classify by orientation; the whole `line` entry is stored, not the
        # unpacked coordinates, and the signed length is printed
        if x1 == x2:
            vertical.append(line)
            print("vertical")
            print(y2 - y1)
        elif y1 == y2:
            horizontal.append(line)
            print("horizontal")
            print(x2 - x1)



cv2.imwrite("output.jpg", out)      
# sort them (disabled)
#vertical = np.sort(vertical, axis=0)
#print(vertical)
#horizontal = np.sort(horizontal, axis = 1)

# remove duplicates (disabled)
#lines = remove_duplicates(lines)            

# sort lines into vertical & horizontal lists (disabled)
#horizontal, vertical = sort_line_list(lines)


horizontal
935
horizontal
935
horizontal
935
horizontal
935
horizontal
931
horizontal
935
horizontal
935
horizontal
935
horizontal
935
horizontal
931
horizontal
935
horizontal
935
horizontal
935
horizontal
935
horizontal
935
horizontal
935
vertical
-606
vertical
-613
vertical
-613
vertical
-606
vertical
-605
vertical
-605
vertical
-606
vertical
-613
vertical
-605
vertical
-605
vertical
-613
vertical
-606


True

In [52]:
# use my function defined above on the masked image `out`
# positional arguments: threshold=200, min_line_length=20, max_line_gap=50
hough_transform(out, 200, 20, 50)

In [69]:
# fall back to the PIL package when a top-level `Image` module is absent
try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract

# If you don't have tesseract executable in your PATH, include the following:
# NOTE(review): machine-specific Windows path — consider making it configurable.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

# Define config parameters.
# '-l eng'  for using the English language
# '--oem 1' for using LSTM OCR Engine
# '--psm 3' sets the page segmentation mode
config = ('-l eng --oem 1 --psm 3')
 
# Read image from disk
#im = cv2.imread(imPath, cv2.IMREAD_COLOR)
# NOTE(review): no visible cell writes 'output_cropped.jpg'; presumably it was
# produced by hand — confirm where it comes from.
img_crop = cv2.imread('output_cropped.jpg')
# Run tesseract OCR on image
text = pytesseract.image_to_string(img_crop, config=config)

# split the recognised text on whitespace; tokens stay strings
counts = text.split()
print(counts)

['14', '20', '10', '30', '36', '22', '1061', '1071', '535', '889', '815', '382', '116', '98', '48', '74', '64', '45', '1191', '1189', '593', '993', '915', '449']


New Workflow:

1. Detect large boxes using contours
2. Crop image (function)
3. Using hough transformation, detect table lines (function)
4. Recolor table lines to white, crop image
5. Run tesseract on resulting table, extract numbers

In [3]:
### Import libraries
import numpy as np
import cv2
from operator import itemgetter, attrgetter
try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract

# Set path to tesseract executable
# NOTE(review): machine-specific Windows path — consider making it configurable.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
# Define config parameters.
# '-l eng'  for using the English language
# '--oem 0' selects the OCR engine mode (this string passes 0, not 1)
#           NOTE(review): 0 is presumably the legacy, non-LSTM engine — confirm.
# '--psm 10000' sets the page segmentation mode
#           NOTE(review): unusual value; check it against tesseract's supported modes.
# '-c tessedit_char_whitelist=0123456789' limits recognised characters to digits
config = ('-l eng --oem 0 --psm 10000 -c tessedit_char_whitelist=0123456789')

def GetContours(img):
    """Return the contours of a BGR image, largest area first.

    The image is converted to grayscale and thresholded at 127 before the
    contour search (cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE).

    Args:
        img: BGR image array.

    Returns:
        List of contours sorted by cv2.contourArea, descending.
    """
    # Prep image
    imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(imgray, 127, 255, 0)
    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 2.4 and 4.x; the contour list is always the
    # second-to-last item, so index from the end instead of unpacking.
    contours = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2]
    # Sort by contour area (descending)
    return sorted(contours, key=cv2.contourArea, reverse=True)

def CropImage(img, contour):
    """Return the part of `img` inside the bounding rectangle of `contour`."""
    left, top, width, height = cv2.boundingRect(contour)
    # rows are indexed by y, columns by x
    return img[top:top + height, left:left + width]

def TesseractText(img):
    """OCR `img` using the module-level `config` and return the tokens as ints.

    The recognised text is split on whitespace and every token is converted
    with int(); a token that is not an integer literal raises ValueError.
    """
    raw_text = pytesseract.image_to_string(img, config=config)
    # TODO (original author's note): maybe concatenate everything here,
    # because the caller ends up using only the first item of this list
    return [int(token) for token in raw_text.split()]

def ExtractCellVal(cells, img):
    """Read one number from every cell of `cells`.

    Each cell is indexed as [_, x, y, w, h]; the matching region of `img` is
    cropped and passed to TesseractText, whose first value is kept.

    Returns:
        List with one value per cell, in the order of `cells`.
    """
    def first_value(cell):
        x, y, w, h = cell[1], cell[2], cell[3], cell[4]
        return TesseractText(img[y:y + h, x:x + w])[0]

    return [first_value(cell) for cell in cells]
    
def SortPedCells(contours, cells_per_column=6):
    """Order cell contours into a left and a right column, each top to bottom.

    Args:
        contours: cell contours, as accepted by cv2.boundingRect.
        cells_per_column: how many of the leftmost cells form the first
            column; the remainder form the second.  The default of 6 is the
            split this function always used.

    Returns:
        (pedvol, schvol): two object arrays whose rows are
        [contour, x, y, w, h].  pedvol holds the leftmost cells, schvol the
        rest; both are sorted by ascending y (top of the image first).
    """
    # Get the bounding box of each contour
    records = []
    for contour in contours:
        (x, y, w, h) = cv2.boundingRect(contour)
        records.append((contour, x, y, w, h))

    # Sort by x coordinate, then split into the two columns
    records.sort(key=lambda rec: rec[1])
    left_column = records[:cells_per_column]
    right_column = records[cells_per_column:]

    # Sort each column top to bottom (ascending y)
    left_column.sort(key=lambda rec: rec[2])
    right_column.sort(key=lambda rec: rec[2])
    return (_as_object_rows(left_column), _as_object_rows(right_column))


def _as_object_rows(records):
    """Pack [contour, x, y, w, h] records into an (n, 5) object array."""
    # np.array() on these records is a ragged conversion (a contour array next
    # to plain ints), which recent NumPy versions reject; filling a
    # preallocated object array element by element avoids that.
    table = np.empty((len(records), 5), dtype=object)
    for row_idx, record in enumerate(records):
        for col_idx, value in enumerate(record):
            table[row_idx, col_idx] = value
    return table
    
def AnalyzePedCrossingTable(img, pedtbl_contour):
    """Extract the 'Ped' and 'Sch' values of one pedestrian-crossing table.

    Args:
        img: full page image.
        pedtbl_contour: record whose item 0 is the contour of the table.

    Returns:
        dict with keys "Ped" and "Sch", each mapping to a list of values.
    """
    # Crop to the table, then look for contours inside the crop
    table_img = CropImage(img, pedtbl_contour[0])
    # items 2..13 of the area-sorted contours are used as the cells
    cell_contours = GetContours(table_img)[2:14]
    ped_cells, sch_cells = SortPedCells(cell_contours)

    volumes = {}
    volumes["Ped"] = ExtractCellVal(ped_cells, table_img)
    volumes["Sch"] = ExtractCellVal(sch_cells, table_img)
    return volumes
    
def GetPedData(img):
    """Extract pedestrian and school volumes for the four crossing legs.

    Contours 5..8 of the area-sorted contour list are taken as the four
    tables.  They are ordered by y, ties by x, and assigned in that order to
    the legs 'SL', 'NL', 'WL', 'EL'.

    Args:
        img: BGR image of the count sheet.

    Returns:
        List of dicts with keys 'xing_leg', 'type' and 'volume', where
        'volume' is the sum of the values read for that leg and type.

    Raises:
        IndexError: if fewer than four table contours are available.
    """
    legs = ('SL', 'NL', 'WL', 'EL')

    ped_tbls = []
    for ped_tbl_contour in GetContours(img)[5:9]:
        (x, y, w, h) = cv2.boundingRect(ped_tbl_contour)
        ped_tbls.append([ped_tbl_contour, x, y, w, h])
    if len(ped_tbls) < len(legs):
        raise IndexError('expected %d pedestrian tables, found %d'
                         % (len(legs), len(ped_tbls)))

    # Sort the plain list directly: converting these records with np.array()
    # is a ragged conversion that recent NumPy versions reject.  One sort on
    # (y, x) gives the same order as sorting by x and then stably by y.
    ped_tbls.sort(key=itemgetter(2, 1))

    ped_sch_extract = {}
    for leg, ped_tbl in zip(legs, ped_tbls):
        ped_sch_extract[leg] = AnalyzePedCrossingTable(img, ped_tbl)

    # Format as final df
    ped_sch_data = []
    for leg in legs:
        for pedtype, volumes in ped_sch_extract[leg].items():
            ped_sch_data.append({
                'xing_leg': leg,
                'type': pedtype,
                'volume': sum(volumes),
            })

    return ped_sch_data

# load one count sheet and store its pedestrian data under 'Pedestrian'
img = cv2.imread('images2/1ST.FRESNO.160608-MAN.png')
# NOTE(review): ManualTC is not defined anywhere in the visible notebook; it
# must already exist in the kernel as a mapping for this assignment to work.
ManualTC['Pedestrian'] = GetPedData(img)


[{'xing_leg': 'SL', 'type': 'Ped', 'volume': 73}, {'xing_leg': 'SL', 'type': 'Sch', 'volume': 29}, {'xing_leg': 'NL', 'type': 'Ped', 'volume': 0}, {'xing_leg': 'NL', 'type': 'Sch', 'volume': 0}, {'xing_leg': 'WL', 'type': 'Ped', 'volume': 32}, {'xing_leg': 'WL', 'type': 'Sch', 'volume': 1}, {'xing_leg': 'EL', 'type': 'Ped', 'volume': 35}, {'xing_leg': 'EL', 'type': 'Sch', 'volume': 9}]


()

##### Convert PDF Images to PNG for Processing with CV2
Before converting PDFs to PNG, install ImageMagick (along with the `wand` Python bindings) and Ghostscript. **TODO:** add more detailed installation steps.

In [188]:
# note: this `Image` rebinds the name that earlier cells imported from PIL
from wand.image import Image, Color
import glob, os

# Grab all PDFs within the folder
# NOTE(review): absolute, machine-specific path — consider a configurable directory.
files = glob.glob('C:/Users/Tim/Documents/GitHub/vehicle-vol-pdf-scrape/HoughTranformTest/images2/*.pdf')
file_names = [os.path.abspath(file) for file in files]

for file_name in file_names:
    # output path: same name with the extension replaced by .png
    fin, file_extension = os.path.splitext(file_name)
    fout = fin + '.png'

    # open at resolution=300, set a white background, remove the alpha
    # channel, then save under the .png name
    with Image(filename=file_name, resolution=300) as img:
        img.background_color = Color("white")
        img.alpha_channel = 'remove'
        img.save(filename=fout)



##### Testing Structural Similarity Index (SSIM)


In [203]:
# import the necessary packages
# (ssim, mse and plt are only referenced by the commented-out lines below)
# NOTE(review): glob and os are used below but not imported in this cell;
# they must come from an earlier cell.
from skimage.measure import compare_ssim as ssim
from skimage.measure import compare_mse as mse
import matplotlib.pyplot as plt
import numpy as np
import cv2

#hist1 = cv2.calcHist([image],[0],None,[256],[0,256])
#hist2 = cv2.calcHist([image1],[0],None,[256],[0,256])
#compare = cv2.compareHist(hist1,hist2,CV_COMP_CORREL)

# Import the reference image and build its 256-bin histogram of channel 0
imageA = cv2.imread("images2/1ST.ALAMEDA.140409LATESHIFT-MAN.png")
#imageA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
histA = cv2.calcHist([imageA],[0],None,[256],[0,256])

# Grab all PNGs within the folder
# NOTE(review): absolute, machine-specific path — consider a configurable directory.
files = glob.glob('C:/Users/Tim/Documents/GitHub/vehicle-vol-pdf-scrape/HoughTranformTest/images2/*.png')
file_names = [os.path.abspath(file) for file in files]

for file_name in file_names:
    imageB = cv2.imread(file_name)
    
    # Print filename
    print(file_name)

    # grayscale conversion is disabled; the histogram uses channel 0 of the
    # image as loaded
    #imageB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)
    histB = cv2.calcHist([imageB],[0],None,[256],[0,256])

    # SSIM / MSE are disabled; the histograms are compared instead
    #s = ssim(imageA, imageB)
    #m = mse(imageA, imageB)
    #print(s, m)
    # NOTE(review): method 0 is presumably the correlation metric — confirm.
    compare = cv2.compareHist(histA, histB, 0)
    print(compare)

C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.ALAMEDA.140409LATESHIFT-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.ALAMEDAEARLYSHIFT.140227-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.BEAUDRY.150203-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.CUMMINGS.120614-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.DACOTAH.120615-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.DACOTAH.160602-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.EVERGREEN.160519-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.FRESNO.160608-MAN.png
1.0
C:\Users\Tim\Documents\GitHub\vehicle-vol-pdf-scrape\HoughTranformTest\images2\1ST.HILL.160202-MAN-