In [92]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import os
import gensim
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity
from gensim.models import ldamodel, doc2vec, LsiModel 
import nltk
# nltk.download('punkt')
import string
import csv
import math
import statistics
import datetime
from nltk.corpus import stopwords
from nltk.util import ngrams
# nltk.download('stopwords')
from collections import OrderedDict, Counter, namedtuple
import networkx as nx
import matplotlib.pyplot as plt
from networkx.readwrite import json_graph
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.layouts import row, column
import random
import codecs, difflib, Levenshtein, distance
import rpy2
from datasketch import MinHash
import warnings
warnings.filterwarnings('ignore')
nlp = spacy.load('en_core_web_lg')
%load_ext rpy2.ipython


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [18]:
%%R 
install.packages("textreuse", repos='http://cran.us.r-project.org', quiet=TRUE)
install.packages("readr", repos='http://cran.us.r-project.org', quiet=TRUE)
library("textreuse")
library("readr")

In [4]:
#Import text data

In [5]:
# 1. First Test: Egyptian Gazette 1947

In [6]:
eg_unordered = pd.read_csv('ocr_test_newspaper_egyptian_gazette_one_page_unordered.csv')
eg_ordered = pd.read_csv('ocr_test_newspaper_egyptian_gazette_one_page_ordered.csv')
ocr_values = [eg_unordered['base_file_name'].iloc[0]]

In [7]:
# 2. Arab Scribe January 5 1964

In [42]:

eg_unordered = pd.read_csv('ocr_test_magazine_arab_scribe_unordered.csv')
eg_ordered = pd.read_csv('ocr_test_magazine_arab_scribe_ordered.csv')

eg_ordered['contains_image'].fillna(value=False, inplace=True)

for index, row in eg_ordered.iterrows():
    if math.isnan(row['page_number']):
        pgn = row['base_file_name'].split('imagefile')[0][-3:]
        pgn = pgn.split('_')[0]
        eg_ordered.loc[index, 'page_number'] = int(pgn)

groupby_df = eg_ordered.groupby('page_number')['google_vision_text'].apply(' '.join).reset_index()
eg_ordered = eg_ordered.drop_duplicates(subset=['page_number'], keep='first')
eg_ordered = eg_ordered.drop(columns='google_vision_text')
final_df = pd.merge(eg_ordered, groupby_df, on='page_number', how='outer')
eg_ordered = final_df.drop(columns='id')

Unnamed: 0,base_file_name,contains_image,page_number,google_vision_text
0,image_lucida_app/media/The_Scribe_And_The_Arab...,False,0.0,The Scribe\nTHE ARAB REVIEW\n4th Year - Vol. V...


In [10]:
# Full Issue

In [11]:
# Tokenize words w/ or w/o punctuation and stopwords

In [100]:
def custom_tokenize(text):
    if not text:
#       print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

def process_text(df, punc):

    final_doc = []
    for index, row in df.iterrows():
        page = []
        raw_text = row['google_vision_text']
        tokens = custom_tokenize(raw_text)
        for t in tokens:
            if punc:
                if t in string.punctuation:
                    pass
                elif t in stopwords.words('english'):
                    pass
                else:
                    final_doc.append(t.lower())
                    page.append(t.lower())
            else: 
                final_doc.append(t.lower())
    text = ' '.join(final_doc)
    return final_doc, text


In [101]:
order_list, order_text = process_text(eg_ordered, True)
unorder_list, unorder_text = process_text(eg_unordered, True)
all_documents = [order_text, unorder_text]





In [89]:
def process_full_issue(all_documents, ocr_values, full_issues_ocr):
    # Count n grams frequencies and calculate cosine similarity between two docs. 
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])
    
    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])
    
    # Calculate similarity using GLOVE and SPACY
    order_doc = nlp(order_text)
    unorder_doc = nlp(unorder_text)
    sim_doc = order_doc.similarity(unorder_doc)
    print('Spacy GLOVE', sim_doc)
    #https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    ocr_values.append(sim_doc)
    
    # Calculate jaccard ratio. Takes list of tokens
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)
    
    # use gensim's similarity matrix and lsi to calculate cosine
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)
    #https://radimrehurek.com/gensim/tut3.html
    
    if os.path.isfile(full_issues_ocr):
        final_metrics = pd.read_csv(full_issues_ocr)
        ocr_values.append(datetime.date.today())
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        final_metrics.to_csv(full_issues_ocr, index=False)
    else:
        cols = ['base_file_name', 'num_pages', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos', 'date_run']
        ocr_values.append(datetime.date.today())
        final_df = pd.DataFrame([ocr_values], columns=cols)
        final_df.to_csv(full_issues_ocr, index=False)
    

In [30]:
ocr_values = [eg_unordered['base_file_name'].iloc[0], len(eg_unordered.index)]
process_full_issue(all_documents, ocr_values, 'ocr_accuracy_full_issue_arab_scribe.csv')

Count Vectorizer 0.959538261301
TF-IDF Vectorizer [[ 1.          0.93357973]]
Spacy GLOVE 0.999952728742
Jaccard 0.8636978579481398
LSI 0.997655


In [56]:
def align_pages(order, unorder):
    %%R -i order
    %%R -i unorder
#     order <- read_file("order_doc.txt")
#     unorder <- read_file("unorder_doc.txt")
    %%R perfect = align_local(order, order)
    %%R actual = align_local(order, unorder)
    %%R smw <- actual$score / perfect$score
    %%R smw
    %%R -o smw
    print(smw)
    return smw[0]

In [90]:
def process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, page_ocr):
    # Count n grams frequencies and calculate cosine similarity between two docs. 
    counts = CountVectorizer(ngram_range=(1,5))
    counts_matrix = counts.fit_transform(all_documents)
    cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
    print('Count Vectorizer', cos[0][1])
    ocr_values.append(cos[0][1])
    
    # Calculate tf-idf cosine similarity (nltk or spacy text the same)
    tokenize = lambda doc: doc.lower().split(" ")
    tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
    tfidf_matrix = tfidf.fit_transform(all_documents)

    cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
    print('TF-IDF Vectorizer', cos)
    ocr_values.append(cos[0][1])
    
    # Calculate similarity using GLOVE and SPACY
    order_doc = nlp(order_text)
    unorder_doc = nlp(unorder_text)
    sim_doc = order_doc.similarity(unorder_doc)
    print('Spacy GLOVE', sim_doc)
    #https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
    ocr_values.append(sim_doc)
    
    # Calculate jaccard ratio. Takes list of tokens
    jac = 1 - distance.jaccard(order_list, unorder_list)
    print('Jaccard', jac)
    ocr_values.append(jac)
    
    # use gensim's similarity matrix and lsi to calculate cosine
    all_tokens = [order_list, unorder_list]
    dictionary = Dictionary(all_tokens)
    corpus = [dictionary.doc2bow(text) for text in all_tokens]
    lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
    sim = MatrixSimilarity(lsi[corpus])
    lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
    lsi_cos = lsi_cos[0]
    print('LSI', lsi_cos)
    ocr_values.append(lsi_cos)
    #https://radimrehurek.com/gensim/tut3.html
    
    align = align_pages(order_text, unorder_text)
    print('smw', align)
    ocr_values.append(align)
    
    if os.path.isfile(page_ocr):
        final_metrics = pd.read_csv(page_ocr)
        ocr_values.append(len(order_text))
        ocr_values.append(len(unorder_text))
        ocr_values.append(datetime.date.today())
        final_metrics.loc[len(final_metrics.index)] = ocr_values
        print(final_metrics)
        final_metrics.to_csv(page_ocr, index=False)
    else:
        ocr_values.append(len(order_text))
        ocr_values.append(len(unorder_text))
        ocr_values.append(datetime.date.today())
        cols = ['base_file_name', 'page_number', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos','smw_align', 'len_order', 'len_unorder', 'date_run']
        final_df = pd.DataFrame([ocr_values], columns=cols)
        final_df.to_csv(page_ocr, index=False)

In [91]:
for i in range(0, len(eg_unordered.index)):
    ocr_values = [eg_unordered['base_file_name'].iloc[i], i]
    order_list, order_text = process_text(eg_ordered.iloc[[i]], True)
    unorder_list, unorder_text = process_text(eg_unordered.iloc[[i]], True)
    all_documents = [order_text, unorder_text]
    process_page(all_documents, order_text, unorder_text, order_list, unorder_list, ocr_values, 'ocr_accuracy_page_level_arab_scribe.csv')

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                      base_file_name  page_number  \
0  image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1  image_lucida_app/media/The_Scribe_And_The_Arab...            1   

   countsvec_cos  tfidfvec_cos  spacy_sim  jaccard_sim  lsi_cos  smw_align  \
0            1.0           1.0        1.0          1.0      1.0        1.0   
1            1.0           1.0        1.0          1.0      1.0        1.0   

   len_order  len_unorder    date_run  
0        180          180  2018-06-13  
1         36           36  2018-06-13  
Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                      base_file_name  page_number  \
0  image_lucida_app/media/The_Scribe_And_The_Arab...          

TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                      base_file_name  page_number  \
0  image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1  image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2  image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3  image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4  image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5  image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6  image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7  image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8  image_lucida_app/media/The_Scribe_And_The_Arab...            8   

   countsvec_cos  tfidfvec_cos  spacy_sim  jaccard_sim   lsi_cos  smw_align  \
0       1.000000      1.000000    1.00000     1.000000  1.000000   1.000000   
1       1.000000      1.000000    1.00000     1.000000  1.000000   1.0

Count Vectorizer 0.516269850126
TF-IDF Vectorizer [[ 1.          0.95875673]]
Spacy GLOVE 0.999750555616
Jaccard 0.9547872340425532
LSI 0.986802
[1] 0.04142012

smw 0.04142011834319527
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Spacy GLOVE 0.994012154592
Jaccard 0.8164556962025317
LSI 0.92853
[1] 0.1147059

smw 0.11470588235294117
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...  

Count Vectorizer 0.816143163213
TF-IDF Vectorizer [[ 1.          0.85869652]]
Spacy GLOVE 0.997999986515
Jaccard 0.8498168498168498
LSI 0.956048
[1] 0.4954268

smw 0.4954268292682927
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.974818130946
TF-IDF Vectorizer [[ 1.          0.99168803]]
Spacy GLOVE 0.999999954616
Jaccard 0.9918367346938776
LSI 0.998028
[1] 0.6003236

smw 0.6003236245954693
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.918908315728
TF-IDF Vectorizer [[ 1.          0.93300623]]
Spacy GLOVE 0.999653236166
Jaccard 0.9296875
LSI 0.982053
[1] 0.9464286

smw 0.9464285714285714
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...   

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Count Vectorizer 0.88091570418
TF-IDF Vectorizer [[ 1.         0.9057245]]
Spacy GLOVE 0.999570149496
Jaccard 0.9
LSI 0.977332
[1] 0.1696078

smw 0.1696078431372549
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           1

Count Vectorizer 0.647918850689
TF-IDF Vectorizer [[ 1.          0.85257017]]
Spacy GLOVE 0.999441237932
Jaccard 0.8010471204188482
LSI 0.977484
[1] 0.02875399

smw 0.02875399361022364
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Count Vectorizer 0.666386711464
TF-IDF Vectorizer [[ 1.          0.89812423]]
Spacy GLOVE 0.999645621987
Jaccard 0.8763157894736842
LSI 0.983354
[1] 0.03935599

smw 0.03935599284436494
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Count Vectorizer 0.576459773536
TF-IDF Vectorizer [[ 1.          0.90484069]]
Spacy GLOVE 0.99930718627
Jaccard 0.8873563218390804
LSI 0.975601
[1] 0.04545455

smw 0.045454545454545456
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Count Vectorizer 0.556305820904
TF-IDF Vectorizer [[ 1.         0.8719533]]
Spacy GLOVE 0.999636652339
Jaccard 0.856353591160221
LSI 0.968646
[1] 0.03887689

smw 0.038876889848812095
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_And_The_Arab...           12 

Count Vectorizer 0.907757114443
TF-IDF Vectorizer [[ 1.          0.95225154]]
Spacy GLOVE 0.999669112627
Jaccard 0.9554140127388535
LSI 0.980395
[1] 0.8228883

smw 0.8228882833787466
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.900528790392
TF-IDF Vectorizer [[ 1.          0.91116682]]
Spacy GLOVE 0.999636950764
Jaccard 0.9006211180124224
LSI 0.985789
[1] 0.3468271

smw 0.34682713347921224
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 0.932257064692
TF-IDF Vectorizer [[ 1.          0.93690718]]
Spacy GLOVE 0.999586546369
Jaccard 0.9351351351351351
LSI 0.984099
[1] 0.6689189

smw 0.668918918918919
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Count Vectorizer 0.971767066141
TF-IDF Vectorizer [[ 1.          0.98979063]]
Spacy GLOVE 0.999969074052
Jaccard 0.9881656804733728
LSI 0.997626
[1] 0.370019

smw 0.3700189753320683
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Count Vectorizer 0.937854649541
TF-IDF Vectorizer [[ 1.          0.94467639]]
Spacy GLOVE 0.999826493856
Jaccard 0.9307228915662651
LSI 0.993082
[1] 0.2935421

smw 0.29354207436399216
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 0.96567394909
TF-IDF Vectorizer [[ 1.          0.98286543]]
Spacy GLOVE 0.999073758799
Jaccard 0.9777070063694268
LSI 0.970788
[1] 0.9237805

smw 0.9237804878048781
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Count Vectorizer 0.986769248913
TF-IDF Vectorizer [[ 1.          0.96877249]]
Spacy GLOVE 0.999859745143
Jaccard 0.9702970297029703
LSI 0.992042
[1] 0.9937759

smw 0.9937759336099585
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.714836744175
TF-IDF Vectorizer [[ 1.          0.92838193]]
Spacy GLOVE 0.99983119968
Jaccard 0.9092872570194385
LSI 0.983511
[1] 0.2562777

smw 0.25627769571639586
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.685340804492
TF-IDF Vectorizer [[ 1.          0.89363558]]
Spacy GLOVE 0.999672580751
Jaccard 0.88
LSI 0.970701
[1] 0.2272727

smw 0.22727272727272727
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...       

Count Vectorizer 0.96489261736
TF-IDF Vectorizer [[ 1.          0.94765481]]
Spacy GLOVE 0.9998270706
Jaccard 0.9442231075697212
LSI 0.981566
[1] 0.4095442

smw 0.40954415954415957
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ara

Count Vectorizer 0.981025805172
TF-IDF Vectorizer [[ 1.          0.96271204]]
Spacy GLOVE 0.999906775845
Jaccard 0.9520833333333333
LSI 0.993364
[1] 0.9827586

smw 0.9827586206896551
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Spacy GLOVE 0.999496070884
Jaccard 0.8454706927175843
LSI 0.959105
[1] 0.4948119

smw 0.49481193255512324
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab... 

Count Vectorizer 0.962875646372
TF-IDF Vectorizer [[ 1.         0.9333224]]
Spacy GLOVE 0.999824991039
Jaccard 0.9315589353612167
LSI 0.982353
[1] 0.4133598

smw 0.41335978835978837
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Count Vectorizer 0.85823604858
TF-IDF Vectorizer [[ 1.          0.84944778]]
Spacy GLOVE 0.999289656419
Jaccard 0.8243512974051896
LSI 0.954488
[1] 0.4803371

smw 0.4803370786516854
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Spacy GLOVE 0.999780284757
Jaccard 0.9333333333333333
LSI 0.979129
[1] 0.2616099

smw 0.26160990712074306
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab... 

LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_And_The_Arab...           12 

                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_And_The_Arab...           12   
13  image_lucida_app

Count Vectorizer 0.701598816845
TF-IDF Vectorizer [[ 1.          0.75706785]]
Spacy GLOVE 0.996702081802
Jaccard 0.7545454545454545
LSI 0.918226
[1] 0.6170635

smw 0.6170634920634921
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.491184211853
TF-IDF Vectorizer [[ 1.          0.96101213]]
Spacy GLOVE 0.999922385384
Jaccard 0.9594272076372315
LSI 0.990211
[1] 0.04007634

smw 0.04007633587786259
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Count Vectorizer 0.470578146603
TF-IDF Vectorizer [[ 1.          0.80702399]]
Spacy GLOVE 0.998004837182
Jaccard 0.8114406779661016
LSI 0.936267
[1] 0.05722326

smw 0.05722326454033771
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The

Count Vectorizer 0.98453038674
TF-IDF Vectorizer [[ 1.          0.93589298]]
Spacy GLOVE 0.999442189838
Jaccard 0.9416058394160584
LSI 0.98036
[1] 0.9907407

smw 0.9907407407407407
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ara

Count Vectorizer 0.660813445942
TF-IDF Vectorizer [[ 1.          0.82869391]]
Spacy GLOVE 0.998713582334
Jaccard 0.8297872340425532
LSI 0.954642
[1] 0.2474359

smw 0.24743589743589745
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 0.603063949168
TF-IDF Vectorizer [[ 1.          0.96614247]]
Spacy GLOVE 0.999799474659
Jaccard 0.9671532846715328
LSI 0.986854
[1] 0.2954545

smw 0.29545454545454547
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Count Vectorizer 0.732095258271
TF-IDF Vectorizer [[ 1.         0.7959094]]
Spacy GLOVE 0.998821000556
Jaccard 0.8262411347517731
LSI 0.92161
[1] 0.2442623

smw 0.2442622950819672
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab

Count Vectorizer 0.917741100403
TF-IDF Vectorizer [[ 1.          0.96091627]]
Spacy GLOVE 0.999397325101
Jaccard 0.9669421487603306
LSI 0.971473
[1] 0.3505535

smw 0.3505535055350554
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.730609935912
TF-IDF Vectorizer [[ 1.          0.79581167]]
Spacy GLOVE 0.997773090862
Jaccard 0.7723214285714286
LSI 0.941721
[1] 0.1728625

smw 0.17286245353159851
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 0.94655712533
TF-IDF Vectorizer [[ 1.          0.96292462]]
Spacy GLOVE 0.999907571653
Jaccard 0.9698275862068966
LSI 0.989424
[1] 0.3356164

smw 0.3356164383561644
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ar

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_A

Count Vectorizer 0.374355890853
TF-IDF Vectorizer [[ 1.         0.6392834]]
Spacy GLOVE 0.988131112417
Jaccard 0.6509090909090909
LSI 0.83632
[1] 0.04237288

smw 0.0423728813559322
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Ara

Count Vectorizer 0.492310752064
TF-IDF Vectorizer [[ 1.          0.96127528]]
Spacy GLOVE 0.999840809258
Jaccard 0.9610091743119266
LSI 0.99055
[1] 0.03611111

smw 0.03611111111111111
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_

Count Vectorizer 0.42068087859
TF-IDF Vectorizer [[ 1.        0.813102]]
Spacy GLOVE 0.995657106398
Jaccard 0.8253968253968254
LSI 0.925948
[1] 0.05938697

smw 0.05938697318007663
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

TF-IDF Vectorizer [[ 1.          0.77644421]]
Spacy GLOVE 0.997488584154
Jaccard 0.8024193548387096
LSI 0.930267
[1] 0.04545455

smw 0.045454545454545456
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  im

Spacy GLOVE 0.998163977629
Jaccard 0.9222222222222223
LSI 0.970231
[1] 0.4080189

smw 0.4080188679245283
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...  

Spacy GLOVE 0.996789193587
Jaccard 0.8481012658227848
LSI 0.939044
[1] 0.3625

smw 0.3625
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   


Count Vectorizer 0.949757852811
TF-IDF Vectorizer [[ 1.          0.94377258]]
Spacy GLOVE 0.999697775538
Jaccard 0.9354838709677419
LSI 0.988852
[1] 0.8948413

smw 0.8948412698412699
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

Count Vectorizer 0.622105065185
TF-IDF Vectorizer [[ 1.          0.78814975]]
Spacy GLOVE 0.999045275316
Jaccard 0.7972972972972973
LSI 0.926165
[1] 0.3398357

smw 0.3398357289527721
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_A

[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_And_The_Arab...           12   
13  i

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_A

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Count Vectorizer 1.0
TF-IDF Vectorizer [[ 1.  1.]]
Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...     

Spacy GLOVE 1.0
Jaccard 1.0
LSI 1.0
[1] 1

smw 1.0
                                       base_file_name  page_number  \
0   image_lucida_app/media/The_Scribe_And_The_Arab...            0   
1   image_lucida_app/media/The_Scribe_And_The_Arab...            1   
2   image_lucida_app/media/The_Scribe_And_The_Arab...            2   
3   image_lucida_app/media/The_Scribe_And_The_Arab...            3   
4   image_lucida_app/media/The_Scribe_And_The_Arab...            4   
5   image_lucida_app/media/The_Scribe_And_The_Arab...            5   
6   image_lucida_app/media/The_Scribe_And_The_Arab...            6   
7   image_lucida_app/media/The_Scribe_And_The_Arab...            7   
8   image_lucida_app/media/The_Scribe_And_The_Arab...            8   
9   image_lucida_app/media/The_Scribe_And_The_Arab...            9   
10  image_lucida_app/media/The_Scribe_And_The_Arab...           10   
11  image_lucida_app/media/The_Scribe_And_The_Arab...           11   
12  image_lucida_app/media/The_Scribe_A

In [8]:
# Count n grams frequencies and calculate cosine similarity between two docs. 

In [9]:
counts = CountVectorizer(ngram_range=(1,5))
counts_matrix = counts.fit_transform(all_documents)
cos = cosine_similarity(counts_matrix[0:1], counts_matrix)
print(cos[0][1])
ocr_values.append(cos[0][1])

0.959538261301


In [10]:
# Calculate tf-idf cosine similarity (nltk or spacy text the same)

In [11]:

tokenize = lambda doc: doc.lower().split(" ")
tfidf = TfidfVectorizer(norm='l2',min_df=0, use_idf=True, smooth_idf=False, sublinear_tf=True, tokenizer=tokenize, ngram_range=(1,1))
tfidf_matrix = tfidf.fit_transform(all_documents)

cos = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
print(cos)
ocr_values.append(cos[0][1])

[[ 1.          0.93357973]]


In [12]:
# Calculate similarity using GLOVE and SPACY

In [14]:
nlp = spacy.load('en_core_web_lg')
order_doc = nlp(order_text)
unorder_doc = nlp(unorder_text)
sim_doc = order_doc.similarity(unorder_doc)
print(sim_doc)
#https://stats.stackexchange.com/questions/304217/how-is-the-similarity-method-in-spacy-computed
ocr_values.append(sim_doc)

0.999952728742


In [15]:
# Write spacy texts for R
f = open('order_doc.txt', 'wt', encoding='utf-8')
f.write(order_text)
f = open('unorder_doc.txt', 'wt', encoding='utf-8')
f.write(unorder_text)
#Create tokens from spacy tokens
# order_doc_tokens = []
# for t in order_doc:
#     order_doc_tokens.append(t.text)
# unorder_doc_tokens = []
# for t in jane_doc:
#     unorder_doc_tokens.append(t.text)

205680

In [16]:
# Test with completely random text

In [17]:
# print(len(unorder_text))
# with open('jane_austen.txt', 'r') as myfile:
#   jane = myfile.read()
# jane = jane[:17457]

In [18]:
# Calculate jaccard ratio. Takes list of tokens

In [19]:
# intersection : new set with elements common to s and t
# union : new set with elements from both s and t
# difference: new set with elements in s but not in t
# symmetric difference: new set with elements in either s or t but not both

In [20]:
jac = 1 - distance.jaccard(order_list, unorder_list)
print(jac)
# m1, m2 = MinHash(num_perm=256), MinHash(num_perm=256)
# for d in order_list:
#     m1.update(d.encode('utf8'))
# for d in unorder_list:
#     m2.update(d.encode('utf8'))
# print("Estimated Jaccard for data1 and data2 is", m1.jaccard(m2))
ocr_values.append(jac)
# ocr_values.append(0)

0.8636978579481398


In [21]:
# use gensim's similarity matrix and lsi to calculate cosine

In [22]:
all_tokens = [order_list, unorder_list]
dictionary = Dictionary(all_tokens)
corpus = [dictionary.doc2bow(text) for text in all_tokens]
lsi = LsiModel(corpus, id2word=dictionary, num_topics=2)
sim = MatrixSimilarity(lsi[corpus])
lsi_cos = [ t[1][1] for t in list(enumerate(sim))]
lsi_cos = lsi_cos[0]
print(lsi_cos)
ocr_values.append(lsi_cos)
#https://radimrehurek.com/gensim/tut3.html

0.997655


In [None]:
# Use textreuse align local for Smith Waterman

In [23]:
%load_ext rpy2.ipython

In [24]:
%%R 
install.packages("textreuse", repos='http://cran.us.r-project.org', quiet=TRUE)
install.packages("readr", repos='http://cran.us.r-project.org', quiet=TRUE)
library("textreuse")
library("readr")

In [None]:
%%R -o smw
order <- read_file("order_doc.txt")
unorder <- read_file("unorder_doc.txt")
perfect = align_local(order, order)
actual = align_local(order, unorder)
actual
smw <- actual$score / perfect$score

In [None]:
ocr_values.append(smw[0])


In [None]:
smw

In [553]:
cols = ['base_file_name', 'countsvec_cos', 'tfidfvec_cos', 'spacy_sim', 'jaccard_sim', 'lsi_cos', 'smw_align']
final_df = pd.DataFrame([ocr_values], columns=cols)
final_df
# final_df.to_csv('ocr_quality_metrics.csv')

Unnamed: 0,base_file_name,countsvec_cos,tfidfvec_cos,spacy_sim,jaccard_sim,smw_align
0,image_lucida_app/media/Egyptian_Gazette_1947_J...,0.524825,0.58027,0.999314,0.522356,0.019961


In [None]:
#An important class of problems that Jaccard similarity addresses well is that of finding textually similar documents in a large corpus such as the Web or a collection of news articles. We should understand that the aspect of similarity we are looking at here is character-level similarity, not “similar meaning,” which requires us to examine the words in the documents and their uses.

In [None]:
def get_ngram(df):

    num_words = []
    for index, row in df.iterrows():
        raw_text = row['google_vision_text']
        sents = nltk.sent_tokenize(raw_text)
        for s in sents:
            tokens = nltk.word_tokenize(s)
            tokens = [t for t in tokens if t not in stopwords.words('english')]
            num_words.append(len(tokens))
    print(num_words)
    med = statistics.median(num_words)
    avg = statistics.mean(num_words)
    pstd = statistics.pstdev(num_words)
    sstd = statistics.stdev(num_words)
    return med, avg, pstd, sstd

val1, val2, val3, val4 = get_ngram(eg_ordered)
print(val1, val2, val3, val4)

def count_ngrams(words, ngram):
    counts_n = ngrams(words,ngram)
    collects_n = Counter(counts_n)
    print(collects_n)
    
def smith_waterman(a: str, b: str, alignment_score: float = 1, gap_cost: float = 1) -> float:
  """
  Compute the Smith-Waterman alignment score for two strings.
  See https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm#Algorithm
  This implementation has a fixed gap cost (i.e. extending a gap is considered
  free). In the terminology of the Wikipedia description, W_k = {c, c, c, ...}.
  This implementation also has a fixed alignment score, awarded if the relevant
  characters are equal.
  Kinda slow, especially for large (50+ char) inputs.
  https://gist.github.com/nornagon/6326a643fc30339ece3021013ed9b48c
  """
  # H holds the alignment score at each point, computed incrementally
  H = np.zeros((len(a) + 1, len(b) + 1))
  for i in range(1, len(a) + 1):
    for j in range(1, len(b) + 1):
      # The score for substituting the letter a[i-1] for b[j-1]. Generally low
      # for mismatch, high for match.
      match = H[i-1,j-1] + (alignment_score if a[i-1] == b[j-1] else 0)
      # The scores for for introducing extra letters in one of the strings (or
      # by symmetry, deleting them from the other).
      delete = H[1:i,j].max() - gap_cost if i > 1 else 0
      insert = H[i,1:j].max() - gap_cost if j > 1 else 0
      H[i,j] = max(match, delete, insert, 0)
  # The highest score is the best local alignment.
  # For our purposes, we don't actually care _what_ the alignment was, just how
  # aligned the two strings were.
  return H.max()

def smith_waterman_distance(seq1, seq2, match=3, mismatch=-1, insertion=-1, deletion=-1, normalize=1):
    '''simple and general smith waterman distance for NLP feature extraction'''
    # switch sequences, so that seq1 is the longer sequence to search for seq2
    if len(seq2) > len(seq1): seq1, seq2 = seq2, seq1
    # create the distance matrix
    mat = np.zeros((len(seq2) + 1, len(seq1) + 1))
    # iterate over the matrix column wise
    for i in range(1, mat.shape[0]):
        # iterate over the matrix row wise
        for j in range(1, mat.shape[1]):
            # set the current matrix element with the maximum of 4 different cases
            mat[i, j] = max(
                # negative values are not allowed
                0,
                # if previous character matches increase the score by match, else decrease it by mismatch
                mat[i - 1, j - 1] + (match if seq1[j - 1] == seq2[i - 1] else mismatch),
                # one character is missing in seq2, so decrease the score by deletion
                mat[i - 1, j] + deletion,
                # one additional character is in seq2, so decrease the scare by insertion
                mat[i, j - 1] + insertion
            )
    # the maximum of mat is now the score, which is returned raw or normalized (with a range of 0-1)
    return np.max(mat) / (len(seq2) * match) if normalize else np.max(mat)

def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)

def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    if count == 0:
        return 0
    return 1 + math.log(count)

def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values

def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents

def cosine_similarity(vector1, vector2):
    dot_product = sum(p*q for p,q in zip(vector1, vector2))
    magnitude = math.sqrt(sum([val**2 for val in vector1])) * math.sqrt(sum([val**2 for val in vector2]))
    if not magnitude:
        return 0
    return dot_product/magnitude

tfidf_representation = tfidf(all_documents)
our_tfidf_comparisons = []
for count_0, doc_0 in enumerate(tfidf_representation):
    for count_1, doc_1 in enumerate(tfidf_representation):
        our_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

skl_tfidf_comparisons = []
for count_0, doc_0 in enumerate(sklearn_representation.toarray()):
    for count_1, doc_1 in enumerate(sklearn_representation.toarray()):
        skl_tfidf_comparisons.append((cosine_similarity(doc_0, doc_1), count_0, count_1))

for x in zip(sorted(our_tfidf_comparisons, reverse = True), sorted(skl_tfidf_comparisons, reverse = True)):
    print(x)

def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    print(len(intersection))
    union = set(query).union(set(document))
    print(len(union))
    return len(intersection)/len(union)

def jaccard_distance(a, b):
    """Calculate the jaccard distance between sets A and B"""
    a = set(a)
    b = set(b)
    print(a, b)
    return 1.0 * len(a&b)/len(a|b)

all_diff = []
all_lev = []
all_sor = []
for sent1, sent2 in list(zip(order_doc.sents, jane)):
    t1 = sent1.text
    t2 = sent2
    diffl = difflib.SequenceMatcher(isjunk=None, a=t1, b=t2, autojunk=True).ratio()
    lev = Levenshtein.ratio(t1, t2) 
    sor = 1 - distance.sorensen(t1, t2)
    all_diff.append(diffl)
    all_lev.append(lev)
    all_sor.append(sor)
    
# diffl = difflib.SequenceMatcher(isjunk=None, a=order_doc.text, b=unorder_doc.text, autojunk=True).real_quick_ratio()
seq = difflib.SequenceMatcher()
seq.set_seqs(order_doc.text, jane)
blocks = seq.get_matching_blocks()
diffl = seq.ratio()
med = statistics.median(all_sor)
avg = statistics.mean(all_sor)
pstd = statistics.pstdev(all_sor)
sstd = statistics.stdev(all_sor)
lev = Levenshtein.ratio(order_doc.text, jane) 
sor = 1 - distance.sorensen(order_doc.text, jane)

print(diffl, lev, sor, med, avg, pstd, sstd)

def cosine_similarity_ngrams(a, b, ngram):
    counts_a = ngrams(a, ngram)
    counts_b = ngrams(b, ngram)
    vec1 = Counter(counts_a)
    vec2 = Counter(counts_b)
#     print('vec1', set(vec1.keys()), 'sum1', sum([vec1[x]**2 for x in vec1.keys()]), 'vec2', set(vec2.keys()), 'sum2', sum([vec2[x]**2 for x in vec2.keys()]))
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
#     print('intersection', numerator)
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
#     print('union', denominator)
    if not denominator:
        return 0.0
    return float(numerator) / denominator
print("Cosine: {}".format(cosine_similarity_ngrams(order_list, unorder_list, 3)))
print("Cosine: {}".format(cosine_similarity_ngrams(order_doc_tokens, unorder_doc_tokens, 2)))

print(len(ordered_text), len(unordered_text))
diff_words = set(unordered_text).difference(ordered_text)
print(len(diff_words))
sys_diff = set(unordered_text).symmetric_difference(ordered_text)
print(len(sys_diff))

test = difflib.SequenceMatcher(None, unorder_text, order_text, autojunk=False).ratio()
# https://stackoverflow.com/questions/4802137/how-to-use-sequencematcher-to-find-similarity-between-two-strings
#difflib.SequenceMatcher uses the Ratcliff/Obershelp algorithm it computes the doubled number of matching characters divided by the total number of characters in the two strings.

for t1, t2 in list(zip(order_tx, unorder_tx)):
    diffl = difflib.SequenceMatcher(isjunk=None, a=t1, b=t2, autojunk=True).quickratio()
    lev = Levenshtein.ratio(t1, t2) 
    sor = 1 - distance.sorensen(t1, t2)
    all_metrics = [diffl, lev, sor]
    med = statistics.median(all_metrics)
    avg = statistics.mean(all_metrics)
    pstd = statistics.pstdev(all_metrics)
    sstd = statistics.stdev(all_metrics)
    print('t1', t1, 't2', t2, all_metrics, med, avg, pstd, sstd)
    
def get_ngram(df1, df2):

    df1_tokens = []
    df1_text = []
    for index, row in df1.iterrows():
        raw_text = row['google_vision_text']
        sents = nltk.sent_tokenize(raw_text)
        for s in sents:
            tokens = nltk.word_tokenize(s)
            tokens = [t for t in tokens if t not in stopwords.words('english') and t not in string.punctuation]
            text = ' '.join(tokens)
            df1_tokens.append(tokens)
            df1_text.append(text)
        
    df2_tokens = []
    df2_text = []
    for index, row in df2.iterrows():
        raw_text = row['google_vision_text']
        sents = nltk.sent_tokenize(raw_text)
        for s in sents:
            tokens = nltk.word_tokenize(s)
            tokens = [t for t in tokens if t not in stopwords.words('english') and t not in string.punctuation]
            text = ' '.join(tokens)
            df2_tokens.append(tokens)
            df2_text.append(text)
    
    return df1_tokens, df1_text, df2_tokens, df2_text
order_tokens, order_tx, unorder_tokens, unorder_tx = get_ngram(eg_ordered, eg_unordered)