In [1]:
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import nltk
# nltk.download('punkt')
import string
import csv
from nltk.corpus import stopwords
# nltk.download('stopwords')
from collections import OrderedDict
from sklearn.feature_extraction.text import CountVectorizer

import networkx as nx
import matplotlib.pyplot as plt
from networkx.readwrite import json_graph
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.layouts import row, column
import random

import pyLDAvis.gensim as gensimvis
import pyLDAvis

In [2]:
#LOAD DATA for each case
full_df=pd.read_csv('full_page_image_lucida_test.csv')
split_df = pd.read_csv('split_page_image_lucida_test.csv', encoding='utf-8')
# order_df.isnull().values.any()
order_df = pd.read_csv('ordered_text_image_lucida_test.csv', encoding="ISO-8859-1")
order_df.fillna(0, inplace=True)


In [9]:
nlp = spacy.load('en_core_web_lg')

In [None]:
# Check Co-Occurence Matrices

In [43]:
def custom_tokenize(text):
    if not text:
#       print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

def process_text(df, types, graph_settings):
    doc = []
    final_doc = []
    for index, row in df.iterrows():
        raw_text = row['google_vision_text']
        tokens = custom_tokenize(raw_text)
        page_terms = ''
        for t in tokens:
            if t in string.punctuation:
                pass
            elif t in stopwords.words('english'):
                pass
            else:
                page_terms += t.lower() + ' '
        doc.append(page_terms)

    for sent in doc:
        sent_terms = ''
        spacy_text = nlp(sent)
        for ent in spacy_text.ents:
            if ent.label_ in types:
                sent_terms += ent.text + ' '
        final_doc.append(sent_terms)
    return create_matrix(final_doc, graph_settings)
    
def create_matrix(ents, graph_settings):
    count_model = CountVectorizer(ngram_range=(1,1)) # default unigram model
    X = count_model.fit_transform(ents)
    Xc = (X.T * X)
    vocab = count_model.vocabulary_
    vocab2 = {y:x for x,y in vocab.items()}
    return create_network(Xc, vocab2, graph_settings)
    #ALTERNATIVE WAY TO COMPUTE MATRIX
    # occurrences = OrderedDict((name, OrderedDict((name, 0) for name in termSplit)) for name in termSplit)
    # # Find the co-occurrences:
    # for l in document:
    #     for i in range(len(l)):
    #         for item in l[:i] + l[i + 1:]:
    #             occurrences[l[i]][item] += 1
    # # Print the matrix:
    # print(' ', ' '.join(occurrences.keys()))
    # for name, values in occurrences.items():
    #     print(name, ' '.join(str(i) for i in values.values()))
    
def create_network(matrix, vocab, graph_settings):
    G = nx.from_scipy_sparse_matrix(matrix)
    H = nx.relabel_nodes(G, vocab)
    data = json_graph.node_link_data(H)
    T = json_graph.node_link_graph(data)
    ns = list(T.nodes)
    es = list(T.edges)
    final_nodes = []
    for n in G.nodes:
        nod = {'name': ns[n], 'id':n}
        final_nodes.append(nod)

    N = len(T.nodes)
    counts = np.zeros((N, N))
    for e in G.edges(data=True):
        source, target, w = e
        counts[[source], [target]] = w['weight']
        counts[[target], [source]] = w['weight']
    print(len(final_nodes))
    return draw_graph(counts, final_nodes, graph_settings, ns)
        
def draw_graph(counts, nodes, graph_settings, list_nodes):
    xname = []
    yname = []
    color = []
    alpha = []
#     colormap = ["#444444", "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99","#e31a1c", "#fdbf6f", "#ff7f00", "#cab2d6", "#6a3d9a"]
    for i, node1 in enumerate(nodes):
        for j, node2 in enumerate(nodes):
            xname.append(node1['name'])
            yname.append(node2['name'])

            alpha.append(min(counts[i,j]/4.0, 0.9) + 0.1)

    for i in range(len(xname)):
        al = alpha[i]
        if  al == 0.35:
            color.append('#ce93d8')
        elif al == 0.6:
            color.append('#ba68c8')
        elif al == 0.85:
            color.append('#9c27b0')
        elif al == 1.0:
            color.append('#7b1fa2')
        else:
            color.append('lightgrey')
    
    source = ColumnDataSource(data=dict(
        xname=xname,
        yname=yname,
        colors=color,
        alphas=alpha,
        count=counts.flatten(),
    ))

    p = figure(title=graph_settings['title'],
               x_axis_location="above", tools="hover,save",
               x_range=list(reversed(list_nodes)), y_range=list_nodes)

    p.plot_width = graph_settings['width']
    p.plot_height = graph_settings['height']
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.axis.major_tick_line_color = None
    p.axis.major_label_text_font_size = "5pt"
    p.axis.major_label_standoff = 0
    p.xaxis.major_label_orientation = np.pi/3

    p.rect('xname', 'yname', 0.9, 0.9, source=source,
           color='colors', alpha='alphas', line_color=None,
           hover_line_color='black', hover_color='colors')

    p.select_one(HoverTool).tooltips = [
        ('names', '@yname, @xname'),
        ('count', '@count'),
    ]

   

    return p # show the plot

In [44]:
output_file("OCR_coocurence_matrix.html", title='Co-Occurence Test')
order_settings = {'title': 'Ordered_Text_AO', 'height': 600, 'width': 600}
order_types = ['GPE']
order_p = process_text(order_df, order_types, order_settings)

full_settings = {'title': 'Full_Text_AO', 'height': 600, 'width': 600}
full_types = ['GPE']
full_p = process_text(full_df, full_types, full_settings)

split_settings = {'title': 'Split_Text_AO', 'height': 600, 'width': 600}
split_types = ['GPE']
split_p = process_text(split_df, split_types, split_settings)

show(row(order_p, split_p, full_p))

48
48
46


In [None]:
#Check Topics

In [3]:
def custom_tokenize(text):
    if not text:
#       print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

def process_model_text(df):

    final_doc = []
    for index, row in df.iterrows():
        raw_text = row['google_vision_text']
        tokens = custom_tokenize(raw_text)
        doc = []
        for t in tokens:
            
            if t in string.punctuation:
                pass
            elif t in stopwords.words('english'):
                pass
            else:
                doc.append(t.lower())
        final_doc.append(doc)
    create_models(final_doc)

def create_models(texts):
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20, passes=10)
    print(lda.show_topics())
    return pyLDAvis.gensim.prepare(lda, corpus, dictionary)

In [5]:
pyLDAvis.enable_notebook()
vis_data = process_model_text(order_df)
vis_data

[(11, '0.012*"the" + 0.009*"one" + 0.005*"world" + 0.005*"workers" + 0.004*"kennedy" + 0.004*"egypt" + 0.004*"it" + 0.003*"american" + 0.003*"mrs." + 0.003*"time"'), (18, '0.016*"the" + 0.009*"arab" + 0.008*"workers" + 0.006*"one" + 0.005*"petroleum" + 0.005*"dr." + 0.005*"\'s" + 0.004*"beethoven" + 0.004*"cairo" + 0.004*"federation"'), (5, '0.024*"the" + 0.020*"funds" + 0.016*"mr." + 0.013*"hammer" + 0.013*"chairman" + 0.011*"uia" + 0.010*"zionist" + 0.009*"agency" + 0.008*"shekel" + 0.007*"received"'), (13, '0.016*"the" + 0.011*"workers" + 0.010*"it" + 0.007*"federation" + 0.006*"arab" + 0.006*"\'s" + 0.006*"would" + 0.006*"president" + 0.005*"poetry" + 0.005*"petroleum"'), (17, '0.008*"poetry" + 0.007*"art" + 0.007*"professor" + 0.006*"new" + 0.005*"decamps" + 0.005*"ziegler" + 0.004*"the" + 0.004*"though" + 0.004*"work" + 0.004*"way"'), (0, '0.020*"the" + 0.012*"arab" + 0.008*"\'s" + 0.008*"kennedy" + 0.007*"israel" + 0.006*"president" + 0.005*"made" + 0.005*"man" + 0.005*"party" +

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [None]:
# Check Most Similar Words

In [None]:
def custom_tokenize(text):
    if not text:
#       print('The text to be tokenized is a None type. Defaulting to blank string.')
        text = ''
    return nltk.word_tokenize(text)

def process_text(df):
    doc = []
    final_doc = []
    for index, row in df.iterrows():
        raw_text = row['google_vision_text']
        tokens = custom_tokenize(raw_text)
        page_terms = ''
        for t in tokens:
            if t in string.punctuation:
                pass
            elif t in stopwords.words('english'):
                pass
            else:
                page_terms += t.lower() + ' '
        doc.append(page_terms)

    for sent in doc:
        sent_terms = ''
        spacy_text = nlp(sent)
        for ent in spacy_text.ents:
            if ent.label_ in types:
                sent_terms += ent.text + ' '
        final_doc.append(sent_terms)
    return create_matrix(final_doc, graph_settings)