# Load packages and define the preprocessing functions

In [17]:
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
import pandas as pd
# Point pytesseract at the Tesseract executable; this is a machine-specific
# Windows path and must be adjusted on other machines.
pytesseract.pytesseract.tesseract_cmd = r"d:\Program Files\Tesseract-OCR\tesseract.exe"


# Load a sample image from the working directory.
img = cv2.imread('image.jpg')

# get grayscale image
def get_grayscale(image):
    """Convert a BGR colour image to grayscale via ``cv2.cvtColor``."""
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return gray

# noise removal
def remove_noise(image, ksize=5):
    """Smooth *image* with a median filter.

    Args:
        image: Image to filter, passed straight to ``cv2.medianBlur``.
        ksize: Aperture size of the median filter. OpenCV requires an odd
            integer greater than 1. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The filtered image produced by ``cv2.medianBlur``.
    """
    return cv2.medianBlur(image, ksize)
 
#thresholding
def thresholding(image):
    """Binarize *image* to 0/255 using Otsu's threshold selection."""
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, thresholded_image); only the
    # image is of interest here.
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate *image* with a square structuring element of ones.

    Args:
        image: Image to dilate, passed straight to ``cv2.dilate``.
        kernel_size: Side length of the square ``uint8`` kernel.
            Defaults to 5, the value that was previously hard-coded.
        iterations: Number of times the dilation is applied. Defaults to 1.

    Returns:
        The dilated image produced by ``cv2.dilate``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode *image* with a square structuring element of ones.

    Args:
        image: Image to erode, passed straight to ``cv2.erode``.
        kernel_size: Side length of the square ``uint8`` kernel.
            Defaults to 5, the value that was previously hard-coded.
        iterations: Number of times the erosion is applied. Defaults to 1.

    Returns:
        The eroded image produced by ``cv2.erode``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (erosion followed by dilation).

    Args:
        image: Image to process, passed straight to ``cv2.morphologyEx``.
        kernel_size: Side length of the square ``uint8`` kernel of ones.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        The opened image produced by ``cv2.morphologyEx``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image, threshold1=100, threshold2=200):
    """Run Canny edge detection on *image*.

    Args:
        image: Image to process, passed straight to ``cv2.Canny``.
        threshold1: First hysteresis threshold. Defaults to 100, the value
            that was previously hard-coded.
        threshold2: Second hysteresis threshold. Defaults to 200, the value
            that was previously hard-coded.

    Returns:
        The edge map produced by ``cv2.Canny``.
    """
    return cv2.Canny(image, threshold1, threshold2)

#skew correction
def deskew(image):
    """Rotate *image* about its centre to compensate for estimated skew.

    The skew angle is taken from the minimum-area rectangle enclosing all
    non-zero pixels of *image*. The output has the same width and height as
    the input; border pixels are replicated to fill uncovered areas.

    NOTE(review): the ``angle < -45`` branch assumes ``cv2.minAreaRect``
    reports angles in [-90, 0); newer OpenCV releases appear to use a
    different range -- confirm against the installed version.
    """
    # Index positions of every non-zero pixel, one row per pixel.
    nonzero_positions = np.column_stack(np.where(image > 0))
    rect_angle = cv2.minAreaRect(nonzero_positions)[-1]
    correction = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation = cv2.getRotationMatrix2D(pivot, correction, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

#template matching
def match_template(image, template):
    """Slide *template* over *image* and return the ``TM_CCOEFF_NORMED`` result map."""
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

##
def meme_process(img):
    """Binarize a meme in place so near-white text becomes black on white.

    A pixel counts as "white" when every channel value is >= 250. White
    pixels are set to 0 and all other pixels to 255. The work is done with a
    vectorized NumPy mask instead of a per-pixel Python loop, which gives the
    same result far faster on full-size images.

    Args:
        img: NumPy image array, either multi-channel ``(H, W, C)`` or
            single-channel ``(H, W)``, or ``None``.

    Returns:
        The same array object, modified in place, or ``None`` when *img*
        is ``None`` (e.g. a failed ``cv2.imread``).
    """
    if img is None:
        return None

    near_white = img >= 250
    if img.ndim > 2:
        # A colour pixel is white only if all of its channels are.
        near_white = near_white.all(axis=-1)

    # The mask is computed before any write, so the two assignments below
    # cannot influence each other.
    img[near_white] = 0
    img[~near_white] = 255
    return img

def detect_text_range(img):
    """Run Tesseract on *img* and display the confidently detected boxes.

    Every detection whose confidence exceeds 60 is outlined with a green
    rectangle on a copy of the image, which is then shown in a window named
    ``'img'`` until a key is pressed.

    Args:
        img: Image to analyse; it is not modified.

    Returns:
        The dictionary returned by ``pytesseract.image_to_data`` with
        ``output_type=Output.DICT``.
    """
    d = pytesseract.image_to_data(img, output_type=Output.DICT)

    # Draw on a copy so the caller's image stays clean for later OCR passes.
    annotated = img.copy()
    boxes = zip(d['conf'], d['left'], d['top'], d['width'], d['height'])
    for conf, x, y, w, h in boxes:
        # The confidence may be fractional or a numeric string (e.g. '96.06'),
        # on which a bare int() raises ValueError; go through float first.
        if int(float(conf)) > 60:
            cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 255, 0), 2)

    cv2.imshow('img', annotated)
    cv2.waitKey(0)
    return d


## Read meme and detect text

In [15]:
# Compare OCR output on the raw meme with OCR output on the binarized meme.
image = cv2.imread('img/4.jpg')
if image is None:
    # cv2.imread signals failure by returning None rather than raising.
    raise FileNotFoundError('img/4.jpg')

# Pass copies to detect_text_range so any boxes it draws for display never
# end up in the image that is handed to the OCR calls below.
detect_text_range(image.copy())
custom_config = r'-l eng --psm 6'
print(pytesseract.image_to_string(image, config=custom_config))

image = meme_process(image)
d = detect_text_range(image.copy())
print(pytesseract.image_to_string(image, config=custom_config))

'INH\nLT\na. sk\nMe\nSes\nMea se\n\x0c'

# Preprocess

In [117]:
# Check whether the meme image for row 36 can be loaded from disk.
cap = df['caption'][36]
img_id = df['img_id'][36]
image = cv2.imread('memes/'+str(img_id)+'.jpg')
# Use an identity test: cv2.imread returns None on failure, and ``== None``
# on a successfully loaded ndarray compares element-wise instead of
# producing a single boolean.
image is None

True

In [138]:
# Evaluate OCR quality on the first 20 memes: binarize each image, run
# Tesseract, and score the extracted words against the website caption with
# the Jaccard similarity (in percent) of the two word sets.
df = pd.read_csv("data.csv")
custom_config = r'-l eng --psm 1 -c tessedit_char_blacklist=1?|><.'
reses = []  # one similarity score per successfully loaded image
rows = []   # caption/OCR word lists, turned into df2 after the loop
for cap, img_id in zip(df['caption'].iloc[:20], df['img_id'].iloc[:20]):
    image = cv2.imread('memes/' + str(img_id) + '.jpg')
    if image is None:
        # Image file missing or unreadable; skip this caption.
        continue
    image_new = meme_process(image)
    result2 = pytesseract.image_to_string(image_new, config=custom_config)
    words_real = cap.split()
    test_list1 = [x.lower() for x in result2.split()]

    extracted = set(test_list1)
    expected = set(words_real)
    union = extracted | expected
    # An empty caption together with an empty OCR result has an empty union;
    # score it as 0 instead of dividing by zero.
    res = len(extracted & expected) / len(union) * 100 if union else 0.0
    print(words_real, 'Caption from website')
    print(test_list1, 'Extraction result')
    reses.append(res)
    rows.append({'Caption from web': words_real, 'Exraction by OCR': test_list1})

# Build the frame once; DataFrame.append was removed in pandas 2.0.
df2 = pd.DataFrame(rows, columns=['Caption from web', 'Exraction by OCR'])
# Average over all collected scores, not just the last image's score.
print(np.mean(reses) if reses else float('nan'))

['i', 'bet', 'you', 'cant', 'press', 'ctrl', 'and', 'like', 'this', 'meme'] Caption from website
['i', 'bet', 'you', 'cant', 'press', 'ctrl', 'and', 'like', 'this', 'meme'] Extraction result
['i', 'dont', 'always', 'make', 'a', 'meme', 'but', 'when', 'i', 'do', 'i', 'immediately', 'press', 'th'] Caption from website
['i', 'dont', 'always', 'make', 'a', 'meme', 'but', 'when', 'ido', 'immediately', 'press', 'the', 'up', 'arrow'] Extraction result
['if', 'you', 'are', 'what', 'you', 'eat', 'then', 'arent', 'cannibals', 'the', 'only', 'real', 'humans'] Caption from website
['if', 'you', 'are', 'what', 'you', 'eat', 'then', "aren't", 'cannibals', 'the', 'only', 'real', 'humans'] Extraction result
['if', 'bars', 'cant', 'serve', 'drunk', 'people', 'mcdonalds', 'shouldnt', 'be', 'able', 'to', 'serve', 'fat', 'people'] Caption from website
['if', 'bars', "can't", 'serve', 'drunk', 'people', "mcdonald's", "shouldn't", 'be', 'able', 'to', 'serve', 'fat', 'people'] Extraction result
['you', 'get'

In [139]:
# Preview the first rows of the caption / OCR comparison table.
df2.head()

Unnamed: 0,Caption from web,Exraction by OCR
0,"[i, bet, you, cant, press, ctrl, and, like, th...","[i, bet, you, cant, press, ctrl, and, like, th..."
1,"[i, dont, always, make, a, meme, but, when, i,...","[i, dont, always, make, a, meme, but, when, id..."
2,"[if, you, are, what, you, eat, then, arent, ca...","[if, you, are, what, you, eat, then, aren't, c..."
3,"[if, bars, cant, serve, drunk, people, mcdonal...","[if, bars, can't, serve, drunk, people, mcdona..."
4,"[you, get, b, on, blood, test, failure, run, t...","[you, get, b+, on, blood, test, failure, run, ..."
