In [None]:
# Import required libraries

In [8]:
import spacy
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import gensim
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity
from gensim.models import ldamodel, doc2vec, LsiModel 
import nltk
# nltk.download('punkt')
import string
import csv
import math
import statistics
import datetime
from nltk.corpus import stopwords
from nltk.util import ngrams
# nltk.download('stopwords')
from collections import OrderedDict, Counter, namedtuple
import random
import codecs, difflib, distance
import rpy2
from datasketch import MinHash
import warnings
warnings.filterwarnings('ignore')
nlp = spacy.load('en_core_web_lg')
%load_ext rpy2.ipython

In [9]:
%%R 
# Install the R packages used for text alignment only when they are missing,
# so re-running this cell does not download and reinstall them every time.
for (pkg in c("textreuse", "readr")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
        install.packages(pkg, repos='http://cran.us.r-project.org', quiet=TRUE)
    }
}
library("textreuse")
library("readr")

In [414]:
# Import text data

In [415]:
# 1. First Test: Egyptian Gazette 1947

In [544]:
# Read the unordered export first, then the ordered one, for the single-page
# Egyptian Gazette test.
eg_unordered, eg_ordered = map(pd.read_csv, (
    'ocr_test_newspaper_egyptian_gazette_one_page_unordered.csv',
    'ocr_test_newspaper_egyptian_gazette_one_page_ordered.csv',
))
# Start the result row with the file name taken from the first unordered record.
ocr_values = [eg_unordered['base_file_name'].iat[0]]

In [541]:
# 2. Arab Scribe January 5 1964

In [None]:
# Load the unordered and ordered OCR exports of the Arab Scribe issue.
eg_unordered = pd.read_csv('ocr_test_magazine_arab_scribe_unordered.csv')
eg_ordered = pd.read_csv('ocr_test_magazine_arab_scribe_ordered.csv')

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# a selected column can act on a temporary object and leave the frame unchanged.
eg_ordered['contains_image'] = eg_ordered['contains_image'].fillna(False)

# Rows without a page number get one parsed from the file name: the last three
# characters before 'imagefile', cut at the first underscore.
# NOTE(review): this assumes the file-name layout always yields digits here - confirm.
for index in eg_ordered.index[eg_ordered['page_number'].isna()]:
    pgn = eg_ordered.loc[index, 'base_file_name'].split('imagefile')[0][-3:]
    pgn = pgn.split('_')[0]
    eg_ordered.loc[index, 'page_number'] = int(pgn)

# Join the text of all rows that share a page number into one string per page.
# Missing text is blanked first because ' '.join raises on NaN values.
groupby_df = (
    eg_ordered
    .assign(google_vision_text=eg_ordered['google_vision_text'].fillna(''))
    .groupby('page_number')['google_vision_text']
    .apply(' '.join)
    .reset_index()
)

# Keep the first row of every page for its metadata and attach the joined text.
eg_ordered = eg_ordered.drop_duplicates(subset=['page_number'], keep='first')
eg_ordered = eg_ordered.drop(columns='google_vision_text')
final_df = pd.merge(eg_ordered, groupby_df, on='page_number', how='outer')
eg_ordered = final_df.drop(columns='id')

In [542]:
# Tokenize the text, optionally removing punctuation and stopwords

In [520]:
def custom_tokenize(text):
    """Split ``text`` into word tokens with ``nltk.word_tokenize``.

    Blank or missing input -- ``None``, ``''``, any other falsy value, or the
    float ``NaN`` that pandas uses for an empty CSV cell -- yields an empty
    token list instead of being passed to the tokenizer.

    Args:
        text: The raw text to tokenize, or a missing-value placeholder.

    Returns:
        list: The tokens in document order; ``[]`` for blank or missing text.
    """
    # NaN is truthy, so a plain ``if not text`` check would let it through to
    # word_tokenize, which expects a string.
    if not text or (isinstance(text, float) and math.isnan(text)):
        return []
    return nltk.word_tokenize(text)

def process_text(df, punc):
    """Tokenize and lowercase the ``google_vision_text`` of every row of ``df``.

    Args:
        df: DataFrame with a ``google_vision_text`` column; rows are read in
            their existing order.
        punc: When truthy, tokens found in ``string.punctuation`` and English
            stopwords are dropped. When falsy, every token is kept.

    Returns:
        tuple: ``(tokens, text)`` where ``tokens`` is the list of lowercased
        tokens across all rows and ``text`` is those tokens joined by single
        spaces.
    """
    # Build the stopword lookup once; it was previously rebuilt for every token.
    stop_words = set(stopwords.words('english')) if punc else set()

    final_doc = []
    for raw_text in df['google_vision_text']:
        for token in custom_tokenize(raw_text):
            lowered = token.lower()
            # Compare the lowercased token so capitalised stopwords such as
            # "The" are removed as well.
            if punc and (token in string.punctuation or lowered in stop_words):
                continue
            final_doc.append(lowered)
    text = ' '.join(final_doc)
    return final_doc, text


In [545]:
# Tokenize both versions of the issue with punctuation and stopwords removed;
# each call returns the token list and the space-joined text.
order_list, order_text = process_text(df=eg_ordered, punc=True)
unorder_list, unorder_text = process_text(df=eg_unordered, punc=True)
# The ordered text goes first: the similarity functions compare document 0 with document 1.
all_documents = [order_text, unorder_text]

In [None]:
# Process Full Issue

In [None]:
def process_full_issue(all_documents, ocr_values, full_issues_ocr):
    """Score the similarity of a whole issue's two texts and log the result to CSV.

    Five similarity measures are computed between ``all_documents[0]`` and
    ``all_documents[1]``, printed, and appended to ``ocr_values`` in the order
    of the CSV columns: count-vector cosine, TF-IDF cosine, spaCy document
    similarity, Jaccard similarity and LSI cosine. Today's date is appended
    last and the row is written to ``full_issues_ocr``.

    Args:
        all_documents: Sequence whose first two items are the ordered and the
            unordered text. They are assumed to be the space-joined token
            strings built by ``process_text``; token lists are recovered by
            splitting on whitespace.
        ocr_values: List already holding the leading columns
            (``base_file_name``, ``num_pages``). It is mutated in place.
        full_issues_ocr: Path of the results CSV. A row is appended when the
            file exists; otherwise the file is created with a header.

    Returns:
        None
    """
    # Derive every input from the arguments. The notebook later rebinds the
    # globals order_text / order_list and friends inside its per-page loop, so
    # reading them here would mix issue-level and last-page data on a re-run.
    order_text, unorder_text = all_documents[0], all_documents[1]
    order_list, unorder_list = order_text.split(), unorder_text.split()

    # Count 1- to 5-gram frequencies and take the cosine similarity of the two docs.
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    # TF-IDF cosine similarity over the same n-gram range.
    # min_df=1 keeps the same vocabulary as the former min_df=0 (every fitted
    # term occurs in at least one document) and is also accepted by
    # scikit-learn releases that reject an integer min_df below 1.
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,5))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])

    # Document similarity from the spaCy pipeline loaded as `nlp`.
    # https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    order_doc = nlp(order_text)
    unorder_doc = nlp(unorder_text)
    sim_doc = order_doc.similarity(unorder_doc)
    print('Spacy GLOVE', sim_doc)
    ocr_values.append(sim_doc)

    # Jaccard similarity of the two token lists (1 - Jaccard distance).
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)

    # Cosine similarity in a 2-topic LSI space built with gensim.
    # https://radimrehurek.com/gensim/tut3.html
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    # Each item yielded by `sim` holds one document's similarities to all
    # documents; keep document 0's similarity to document 1.
    lsi_cos = [doc_sims[1] for doc_sims in sim][0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)

    # Stamp the row, then append to the existing CSV or start a new one.
    ocr_values.append(datetime.date.today())
    if os.path.isfile(full_issues_ocr):
        final_metrics = pd.read_csv(full_issues_ocr)
        final_metrics.loc[len(final_metrics.index)] = ocr_values
    else:
        cols = ['base_file_name', 'num_pages', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos', 'date_run']
        final_metrics = pd.DataFrame([ocr_values], columns=cols)
    final_metrics.to_csv(full_issues_ocr, index=False)
    

In [None]:
# Seed the result row with the issue's file name and its number of unordered
# records, then score the whole issue and log it.
ocr_values = [eg_unordered['base_file_name'].iat[0], eg_unordered.shape[0]]
process_full_issue(
    all_documents,
    ocr_values,
    full_issues_ocr='ocr_accuracy_full_issue_arab_scribe.csv',
)

In [None]:
# Process Individual Pages

In [None]:
def align_pages(order, unorder):
    %%R -i order
    %%R -i unorder
#     order <- read_file("order_doc.txt")
#     unorder <- read_file("unorder_doc.txt")
    %%R perfect = align_local(order, order)
    %%R actual = align_local(order, unorder)
    %%R smw <- actual$score / perfect$score
    %%R smw
    %%R -o smw
    print(smw)
    return smw[0]

In [None]:
def process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, page_ocr):
    """Score the similarity of one page's two texts and log the result to CSV.

    Six measures are computed, printed, and appended to ``ocr_values`` in the
    order of the CSV columns: count-vector cosine, TF-IDF cosine, spaCy
    document similarity, Jaccard similarity, LSI cosine and the local-alignment
    ratio from ``align_pages``. Today's date is appended last and the row is
    written to ``page_ocr``.

    Args:
        all_documents: The two texts to vectorize; document 0 is compared with
            document 1.
        order_text: Ordered text of the page.
        unorder_text: Unordered text of the page.
        order_list: Tokens of the ordered text.
        unorder_list: Tokens of the unordered text.
        ocr_values: List already holding the leading columns
            (``base_file_name``, ``page_number``). It is mutated in place.
        page_ocr: Path of the results CSV. A row is appended when the file
            exists; otherwise the file is created with a header.

    Returns:
        None
    """
    # Count 1- to 5-gram frequencies and take the cosine similarity of the two docs.
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])

    # TF-IDF cosine similarity over the same n-gram range.
    # min_df=1 keeps the same vocabulary as the former min_df=0 (every fitted
    # term occurs in at least one document) and is also accepted by
    # scikit-learn releases that reject an integer min_df below 1.
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2', min_df=1, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,5))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])

    # Document similarity from the spaCy pipeline loaded as `nlp`.
    # https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    order_doc = nlp(order_text)
    unorder_doc = nlp(unorder_text)
    sim_doc = order_doc.similarity(unorder_doc)
    print('Spacy GLOVE', sim_doc)
    ocr_values.append(sim_doc)

    # Jaccard similarity of the two token lists (1 - Jaccard distance).
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)

    # Cosine similarity in a 2-topic LSI space built with gensim.
    # https://radimrehurek.com/gensim/tut3.html
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    # Each item yielded by `sim` holds one document's similarities to all
    # documents; keep document 0's similarity to document 1.
    lsi_cos = [doc_sims[1] for doc_sims in sim][0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)

    # Local-alignment score relative to a perfect self-alignment.
    align = align_pages(order_text, unorder_text)
    print('smw', align)
    ocr_values.append(align)

    # Stamp the row, then append to the existing CSV or start a new one.
    ocr_values.append(datetime.date.today())
    if os.path.isfile(page_ocr):
        final_metrics = pd.read_csv(page_ocr)
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        print(final_metrics)
    else:
        cols = ['base_file_name', 'page_number', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos','smw_align', 'date_run']
        final_metrics = pd.DataFrame([ocr_values], columns=cols)
    final_metrics.to_csv(page_ocr, index=False)

In [None]:
# Score every page: the row at each position of the unordered frame is compared
# with the row at the same position of the ordered frame. The position is
# logged as the page_number column of the results file.
for page_idx, base_name in enumerate(eg_unordered['base_file_name']):
    ocr_values = [base_name, page_idx]
    # iloc[[page_idx]] keeps a one-row DataFrame, which process_text expects.
    order_list, order_text = process_text(eg_ordered.iloc[[page_idx]], punc=True)
    unorder_list, unorder_text = process_text(eg_unordered.iloc[[page_idx]], punc=True)
    all_documents = [order_text, unorder_text]
    process_page(
        all_documents,
        order_text,
        unorder_text,
        order_list,
        unorder_list,
        ocr_values,
        page_ocr='ocr_accuracy_page_level_arab_scribe.csv',
    )