In [3]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import os
import gensim
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity
from gensim.models import ldamodel, doc2vec, LsiModel 
import nltk
# nltk.download('punkt')
import string
import csv
import math
import statistics
import datetime
from nltk.corpus import stopwords
from nltk.util import ngrams
# nltk.download('stopwords')
from collections import OrderedDict, Counter, namedtuple
import random
import codecs, difflib, Levenshtein, distance
import rpy2
from datasketch import MinHash
import warnings
warnings.filterwarnings('ignore')
nlp = spacy.load('en_core_web_lg')
%load_ext rpy2.ipython


In [4]:
%%R 
install.packages("textreuse", repos='http://cran.us.r-project.org', quiet=TRUE)
install.packages("readr", repos='http://cran.us.r-project.org', quiet=TRUE)
library("textreuse")
library("readr")

R[write to console]: 
Attaching package: ‘readr’


R[write to console]: The following object is masked from ‘package:textreuse’:

    tokenize




In [4]:
#Import text data

In [5]:
# 1. First Test: Egyptian Gazette 1947

In [6]:
eg_unordered = pd.read_csv('./data/ocr_test_newspaper_egyptian_gazette_one_page_unordered.csv')
eg_ordered = pd.read_csv('./data/ocr_test_newspaper_egyptian_gazette_one_page_ordered.csv')
ocr_values = [eg_unordered['base_file_name'].iloc[0]]

In [8]:
# 2. Arab Scribe January 5 1964

In [7]:

eg_unordered = pd.read_csv('./data/ocr_test_magazine_arab_scribe_unordered.csv')
eg_ordered = pd.read_csv('./data/ocr_test_magazine_arab_scribe_ordered.csv')

eg_ordered['contains_image'].fillna(value=False, inplace=True)

for index, row in eg_ordered.iterrows():
    if math.isnan(row['page_number']):
        pgn = row['base_file_name'].split('imagefile')[0][-3:]
        pgn = pgn.split('_')[0]
        eg_ordered.loc[index, 'page_number'] = int(pgn)

groupby_df = eg_ordered.groupby('page_number')['google_vision_text'].apply(' '.join).reset_index()
eg_ordered = eg_ordered.drop_duplicates(subset=['page_number'], keep='first')
eg_ordered = eg_ordered.drop(columns='google_vision_text')
final_df = pd.merge(eg_ordered, groupby_df, on='page_number', how='outer')
eg_ordered = final_df.drop(columns='id')

In [10]:
# Full Issue

In [11]:
# Tokenize words w/ or w/o punctuation and stopwords

In [10]:
def custom_tokenize(text):
    if not text:
#       print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

def process_text(df, punc):

    final_doc = []
    for index, row in df.iterrows():
        page = []
        raw_text = row['google_vision_text']
        tokens = custom_tokenize(raw_text)
        for t in tokens:
            if punc:
                if t.lower() in string.punctuation:
                    continue
                elif t.lower() in stopwords.words('english'):
                    continue
                else:
                    final_doc.append(t.lower())
                    page.append(t.lower())
            else: 
                final_doc.append(t.lower())
    text = ' '.join(final_doc)
    return final_doc, text


In [11]:
order_list, order_text = process_text(eg_ordered, True)
unorder_list, unorder_text = process_text(eg_unordered, True)
all_documents = [order_text, unorder_text]

In [12]:
random_order = eg_ordered.sample(frac=1).reset_index(drop=True)
random_unorder = eg_unordered.sample(frac=1).reset_index(drop=True)
rorder_list, rorder_text = process_text(random_order, True)
runorder_list, runorder_text = process_text(random_unorder, True)
random_all_documents = [rorder_text, runorder_text]

In [89]:
def process_full_issue(all_documents, ocr_values, full_issues_ocr):
    # Count n grams frequencies and calculate cosine similarity between two docs. 
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])
    
    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])
    
    # Calculate similarity using GLOVE and SPACY
    order_doc = nlp(order_text)
    unorder_doc = nlp(unorder_text)
    sim_doc = order_doc.similarity(unorder_doc)
    print('Spacy GLOVE', sim_doc)
    #https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    ocr_values.append(sim_doc)
    
    # Calculate jaccard ratio. Takes list of tokens
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)
    
    # use gensim's similarity matrix and lsi to calculate cosine
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)
    #https://radimrehurek.com/gensim/tut3.html
    
    if os.path.isfile(full_issues_ocr):
        final_metrics = pd.read_csv(full_issues_ocr)
        ocr_values.append(datetime.date.today())
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        final_metrics.to_csv(full_issues_ocr, index=False)
    else:
        cols = ['base_file_name', 'num_pages', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos', 'date_run']
        ocr_values.append(datetime.date.today())
        final_df = pd.DataFrame([ocr_values], columns=cols)
        final_df.to_csv(full_issues_ocr, index=False)
    

In [30]:
ocr_values = [eg_unordered['base_file_name'].iloc[0], len(eg_unordered.index)]
process_full_issue(all_documents, ocr_values, 'ocr_accuracy_full_issue_arab_scribe.csv')

Count Vectorizer 0.959538261301
TF-IDF Vectorizer [[ 1.          0.93357973]]
Spacy GLOVE 0.999952728742
Jaccard 0.8636978579481398
LSI 0.997655


In [56]:
def align_pages(order, unorder):
    %%R -i order
    %%R -i unorder
#     order <- read_file("order_doc.txt")
#     unorder <- read_file("unorder_doc.txt")
    %%R perfect = align_local(order, order)
    %%R actual = align_local(order, unorder)
    %%R smw <- actual$score / perfect$score
    %%R smw
    %%R -o smw
    print(smw)
    return smw[0]

In [15]:
def process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, page_ocr):
    # Count n grams frequencies and calculate cosine similarity between two docs. 
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])
    
    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])
    
#     # Calculate similarity using GLOVE and SPACY
#     order_doc = nlp(order_text)
#     unorder_doc = nlp(unorder_text)
#     sim_doc = order_doc.similarity(unorder_doc)
#     print('Spacy GLOVE', sim_doc)
#     #https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
#     ocr_values.append(sim_doc)
    
    # Calculate jaccard ratio. Takes list of tokens
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)
    
    # use gensim's similarity matrix and lsi to calculate cosine
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)
    #https://radimrehurek.com/gensim/tut3.html
    
#     align = align_pages(order_text, unorder_text)
#     print('smw', align)
#     ocr_values.append(align)
    
#     if os.path.isfile(page_ocr):
#         final_metrics = pd.read_csv(page_ocr)
#         ocr_values.append(len(order_text))
#         ocr_values.append(len(unorder_text))
#         ocr_values.append(datetime.date.today())
#         final_metrics.loc[len(final_metrics.index)] = ocr_values
#         print(final_metrics)
#         final_metrics.to_csv(page_ocr, index=False)
#     else:
#         ocr_values.append(len(order_text))
#         ocr_values.append(len(unorder_text))
#         ocr_values.append(datetime.date.today())
#         cols = ['base_file_name', 'page_number', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos','smw_align', 'len_order', 'len_unorder', 'date_run']
#         final_df = pd.DataFrame([ocr_values], columns=cols)
#         final_df.to_csv(page_ocr, index=False)

In [17]:
for i in range(0, len(eg_unordered.index)):
    ocr_values = [eg_unordered['base_file_name'].iloc[i], i]
    order_list, order_text = process_text(eg_ordered.iloc[[i]], True)
    unorder_list, unorder_text = process_text(eg_unordered.iloc[[i]], True)
    all_documents = [order_text, unorder_text]
    process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, 'ocr_accuracy_page_level_arab_scribe.csv')

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.82814880195
TF-IDF Vectorizer [[ 1.          0.89692841]]
Jaccard 0.8888888888888888
LSI 0.960863
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.577078469596
TF-IDF Vectorizer [[ 1.          0.90497357]]
Jaccard 0.902127659574468
LSI 0.97168
Count Vectorizer 0.667523014697
TF-IDF Vectorizer [[ 1.          0.98945976]]
Jaccard 0.9911504424778761
LSI 0.995149
Count Vectorizer 0.880603989861
TF-IDF Vectorizer [[ 1.         0.8319078]]
Jacca

Count Vectorizer 0.877887273731
TF-IDF Vectorizer [[ 1.          0.90131917]]
Jaccard 0.92
LSI 0.964979
Count Vectorizer 0.565495039179
TF-IDF Vectorizer [[ 1.          0.81886335]]
Jaccard 0.8441558441558441
LSI 0.935804
Count Vectorizer 0.946328221974
TF-IDF Vectorizer [[ 1.          0.93843714]]
Jaccard 0.9318181818181818
LSI 0.987714
Count Vectorizer 0.607973599046
TF-IDF Vectorizer [[ 1.          0.78125839]]
Jaccard 0.7939110070257611
LSI 0.923838
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1. 

In [16]:
random_order = eg_ordered.sample(frac=1).reset_index(drop=True)
random_unorder = eg_unordered.sample(frac=1).reset_index(drop=True)
rorder_list, rorder_text = process_text(random_order, True)
runorder_list, runorder_text = process_text(random_unorder, True)
random_all_documents = [rorder_text, runorder_text]
for i in range(0, len(random_order.index)):
    ocr_values = [random_order['base_file_name'].iloc[i], i]
    rorder_list, rorder_text = process_text(random_order.iloc[[i]], True)
    order_list, order_text = process_text(eg_ordered.iloc[[i]], True)
#     runorder_list, runorder_text = process_text(random_unorder.iloc[[i]], True)
    random_all_documents = [rorder_text, order_text]
    process_page(all_documents, rorder_text, order_text, rorder_list, rorder_list, ocr_values, 'ocr_accuracy_page_level_arab_scribe.csv')

Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [

Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0
Count Vectorizer 0.94301968224
TF-IDF Vectorizer [[ 1.         0.9322488]]
Jaccard 1.0
LSI 1.0


In [8]:
# Count n grams frequencies and calculate cosine similarity between two docs. 

In [9]:
counts = CountVectorizer(ngram_range=(1,5))
counts_matrix = counts.fit_transform(all_documents)
cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
print(cos[0][1])
ocr_values.append(cos[0][1])

0.959538261301


In [10]:
# Calculate tf-idf cosine similarity (nltk or spacy text the same)

In [11]:

tokenize = lambda doc: doc.lower().split(" ")
tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
tfidf_matrix = tfidf.fit_transform(all_documents)

cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
print(cos)
ocr_values.append(cos[0][1])

[[ 1.          0.93357973]]


In [12]:
# Calculate similarity using GLOVE and SPACY

In [14]:
nlp = spacy.load('en_core_web_lg')
order_doc = nlp(order_text)
unorder_doc = nlp(unorder_text)
sim_doc = order_doc.similarity(unorder_doc)
print(sim_doc)
#https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
ocr_values.append(sim_doc)

0.999952728742


In [15]:
# Write spacy texts for R
f = open('order_doc.txt', 'wt', encoding='utf-8')
f.write(order_text)
f = open('unorder_doc.txt', 'wt', encoding='utf-8')
f.write(unorder_text)
#Create tokens from spacy tokens
# order_doc_tokens = []
# for t in order_doc:
#     order_doc_tokens.append(t.text)
# unorder_doc_tokens = []
# for t in jane_doc:
#     unorder_doc_tokens.append(t.text)

205680

In [16]:
# Test with completely random text

In [17]:
# print(len(unorder_text))
# with open('jane_austen.txt', 'r') as myfile:
#   jane = myfile.read()
# jane = jane[:17457]

In [18]:
# Calculate jaccard ratio. Takes list of tokens

In [19]:
# intersection : new set with elements common to s and t
# union : new set with elements from both s and t
# difference: new set with elements in s but not in t
# symmetric difference: new set with elements in either s or t but not both

In [20]:
jac = 1 - distance.jaccard(order_list, unorder_list)
print(jac)
# m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
# for d in order_list:
#     m1.update(d.encode('utf8'))
# for d in unorder_list:
#     m2.update(d.encode('utf8'))
# print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
ocr_values.append(jac)
# ocr_values.append(0)

0.8636978579481398


In [21]:
# use gensim's similarity matrix and lsi to calculate cosine

In [22]:
all_tokens = [order_list, unorder_list]
dictionary = Dictionary(all_tokens)
corpus = [dictionary.doc2bow(text) for text in all_tokens]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
sim = MatrixSimilarity(lsi[corpus])
lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
lsi_cos = lsi_cos[0]
print(lsi_cos)
ocr_values.append(lsi_cos)
#https://radimrehurek.com/gensim/tut3.html

0.997655


In [None]:
# Use textreuse align local for Smith Waterman

In [23]:
%load_ext rpy2.ipython

In [24]:
%%R 
install.packages("textreuse", repos='http://cran.us.r-project.org', quiet=TRUE)
install.packages("readr", repos='http://cran.us.r-project.org', quiet=TRUE)
library("textreuse")
library("readr")

In [None]:
%%R -o smw
order <- read_file("order_doc.txt")
unorder <- read_file("unorder_doc.txt")
perfect = align_local(order, order)
actual = align_local(order, unorder)
actual
smw <- actual$score / perfect$score

In [None]:
ocr_values.append(smw[0])


In [None]:
smw

In [553]:
cols = ['base_file_name', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos', 'smw_align']
final_df = pd.DataFrame([ocr_values], columns=cols)
final_df
# final_df.to_csv('ocr_quality_metrics.csv')

Unnamed: 0,base_file_name,countsvec_cos,tfidfvec_cos,spacy_sim,jaccard_sim,smw_align
0,image_lucida_app/media/Egyptian_Gazette_1947_J...,0.524825,0.58027,0.999314,0.522356,0.019961


In [None]:
#An important class of problems that Jaccard similarity addresses well is that of finding textually similar documents in a large corpus such as the Web or a collection of news articles. We should understand that the aspect of similarity we are looking at here is character-level similarity, not “similar meaning,” which requires us to examine the words in the documents and their uses.