# The purpose of this notebook is to use different methods to calculate the similarity between two documents. 

## Check the similarity of the entire document text

In [1]:
import json
import spacy
import time

startTime = time.time()
nlp = spacy.load('en')

#The json is provided by the OCR process and is stored in Azure
jsonPath1 = "" #Enter the path to the document json here.
jsonPath2 = "" #Enter the path to the document json here. 

with open(jsonPath1, encoding='utf-8') as doc1Json:
    doc1Data = json.load(doc1Json)

with open(jsonPath2, encoding='utf-8') as doc2Json:
    doc2Data = json.load(doc2Json)
    
doc1Text = nlp(doc1Data["Text"].strip())
doc2Text = nlp(doc2Data["Text"].strip())
docSimilarity = doc1Text.similarity(doc2Text)
totalSimilarityTime = time.time() - startTime

## Check the luminance of every pixel, then match pixels between the two images. 
## This assumes a pixel is either black or white based on the luminance calculation.

In [2]:
import cv2
startLuminanceTime = time.time()

imgPath1 = "" #Add the path of the image you want to analyze here
imgPath2 = "" #Add the path of the image you want to analyze here
img = cv2.imread(imgPath1)
img2 = cv2.imread(imgPath2)
smallImg2 = cv2.resize(img2, (img.shape[1], img.shape[0]))
smallImg = cv2.resize(img, None, fx=0.05, fy=0.05, interpolation = cv2.INTER_AREA)
smallImg2 = cv2.resize(img2, None, fx=0.05, fy=0.05, interpolation = cv2.INTER_AREA)

def GetLuminancePerPixel(image):
    testtime = time.time()
    pixelLuminance = []
    height, width = image.shape[0], image.shape[1]
    for y in range(height):
        for x in range(width):
            R = image[y, x][2]
            G = image[y, x][1]
            B = image[y, x][0]
            luminance = ((R / 255.0) * 0.3 + (G / 255.0) * 0.59 + (B / 255.0) * 0.11);
            if(luminance < 0.75):
                pixelLuminance.append(True)
            else:
                pixelLuminance.append(False)
    return pixelLuminance

imgLuminance = GetLuminancePerPixel(smallImg)
img2Luminance = GetLuminancePerPixel(smallImg2)

pixelsMatched = 0;
for a, b in zip(imgLuminance, img2Luminance):
    if(a == b):
        pixelsMatched += 1
totalLuminanceTime = time.time() - startLuminanceTime

# To see the output images, use the code below
# cv2.imshow("small", smallImg) 
# cv2.imshow("small2", smallImg2)
# cv2.waitKey(0)


## SSIM (Structural similarity index) calculation

In [3]:
from skimage.measure import compare_ssim
import argparse
import imutils

startSSIMTime = time.time()

imageA = img
imageB = img2

height, width, channel = imageA.shape
imageB = cv2.resize(imageB, (width, height))

grayA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
grayB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)

grayA = cv2.bitwise_not(grayA)
grayB = cv2.bitwise_not(grayB)

(score, diff) = compare_ssim(grayA, grayB, full=True)
diff = (diff * 255).astype("uint8")

totalSSIMTime = time.time() - startSSIMTime

# To see the output images, use the code below
# thresh = cv2.threshold(diff, 0, 255,
# cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
# cnts = cv2.findContours(thresh.copy(), cv2.RETR_EXTERNAL,
# cv2.CHAIN_APPROX_SIMPLE)
# cnts = cnts[0] if imutils.is_cv2() else cnts[1]
# for c in cnts:
#     (x, y, w, h) = cv2.boundingRect(c)
#     cv2.rectangle(imageA, (x, y), (x + w, y + h), (0, 0, 255), 2)
#     cv2.rectangle(imageB, (x, y), (x + w, y + h), (0, 0, 255), 2)

# imageAResized = imutils.resize(imageA, height = 900)
# imageBResized = imutils.resize(imageB, height = 900)
# diff = imutils.resize(diff, height = 900)

# cv2.imshow("Original", imageAResized)
# cv2.imshow("Modified", imageBResized)
# cv2.imshow("Diff", diff)
# cv2.waitKey(0)


## Compare the semantic similarity of each paragraph on two given pages and then get the average semantic similarity between the two pages.

In [4]:
startParagraphTime = time.time()

def getPageParagraphs(path):
    with open(path, encoding='utf-8') as jsonFile:
        data = json.load(jsonFile)

    pageParagraphs = []
    for i in range(len(data)):
        pageParagraphs.append(data[i]["Text"])

    return pageParagraphs

page1JsonPath = "" #Json of the page to analyze
page2JsonPath = "" #Json of the page to analyze
p1Paragraphs = getPageParagraphs(page1JsonPath)
p2Paragraphs = getPageParagraphs(page2JsonPath)

def getSemanticSimilarityForParagraph(paragraph, paragraphList): 
    highestSemanticSimilarity = 0;
    for i in range(len(paragraphList)):
        text1 = nlp(paragraph.strip())
        text2 = nlp(paragraphList[i].strip())
        similarity = text1.similarity(text2)
        if(similarity == 100):
            return similarity
        if(similarity > highestSemanticSimilarity):
            highestSemanticSimilarity = similarity
    return highestSemanticSimilarity

similaritySum = 0

for i in range(len(p1Paragraphs)):
    similaritySum  += getSemanticSimilarityForParagraph(p1Paragraphs[i], p2Paragraphs)

totalParagraphTime = time.time() - startParagraphTime
totalTime = time.time() - startTime

## The results

In [7]:
print("This operation took:", "%.2f" %totalTime,"s")
print("Semantic similarity for the entire doc text:", "%.2f" % (docSimilarity * 100), "%. This took:", "%.2f" % totalSimilarityTime, "s")
print("Pixel match:", pixelsMatched, "out of", len(imgLuminance), "px.", "%.2f" % (pixelsMatched / len(imgLuminance) * 100), "% match. This took:", "%.2f" % totalLuminanceTime, "s")
print("SSIM:", "%.2f" % (score * 100), "%. This took:", "%.2f" % totalSSIMTime, "s")
print("Average semantic similarity across paragraphs:", "%.2f" % (similaritySum / len(p1Paragraphs) * 100), "%. This took:", "%.2f" % totalParagraphTime, "s")

This operation took: 10.00 s
Semantic similarity for the entire doc text: 99.09 %. This took: 0.88 s
Pixel match: 17039 out of 20111 px. 84.72 % match. This took: 0.43 s
SSIM: 76.60 %. This took: 1.30 s
Average semantic similarity across paragraphs: 85.24 %. This took: 7.00 s
