In [78]:
%matplotlib inline

import json, os, sys, re, math
import numpy as np
import pandas as pd
import cv2  
from shutil import copy
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [79]:
# Root directory of the data set (relative path).
path = '../data/CNNTest/'

# Load the bounding-box annotations for the training split. The context
# manager guarantees the file handle is closed once parsing is finished.
with open(path+'train/bounding_box_data.json', 'r') as bbs_file:
    train_bbs = json.load(bbs_file)

In [None]:
# Build two views of the annotations in `train_bbs`:
#   * papers    - {paper: {'pages_count': int, 'pages': {page: [(width, height), ...]}}}
#   * papers_df - one row per annotated region
# Keys of `train_bbs` are split on '-': the first piece is used as the paper id
# and the last piece (with '.png' removed) as the page number.

# Column layout of papers_df; passing it explicitly keeps the columns present
# even when no region is found at all.
REGION_COLUMNS = ['paper', 'page', 'width', 'height', 'entity_id', 'top', 'left']

papers = {}
region_rows = []  # collected records; the DataFrame is built once after the loop
for paper_page in train_bbs:
    paper = paper_page.split('-')[0]
    page = f"page{paper_page.replace('.png','').split('-')[-1]}"
    if paper in papers:
        papers[paper]['pages_count'] += 1
    else:
        # The first page seen for a paper counts as a page too, so start at 1
        # (starting at 0 under-counted every paper by one page).
        papers[paper] = {'pages_count': 1, 'pages': {}}

    temp_papers = []
    regions = train_bbs[paper_page]['regions']
    for region in regions:
        shape = regions[region]['shape_attributes']
        xs = shape['all_points_x']
        ys = shape['all_points_y']
        # NOTE(review): assumes a fixed vertex order in which points 0 and 2
        # differ in x and points 0 and 1 differ in y - confirm against the
        # annotation tool's output.
        width = xs[2] - xs[0]
        height = ys[1] - ys[0]
        region_rows.append({'paper': paper, 'page': page, 'width': width, 'height': height,
                            'entity_id': shape['entity_id'], 'top': ys[0], 'left': xs[0]})
        temp_papers.append((width, height))
    papers[paper]['pages'][page] = temp_papers

# Single construction instead of DataFrame.append per row (quadratic, and
# removed in pandas 2.0).
papers_df = pd.DataFrame(region_rows, columns=REGION_COLUMNS)

# Aggregate statistics derived from the nested `papers` dictionary.

# Page count recorded for each paper, in the iteration order of `papers`.
pages_per_paper = [info['pages_count'] for info in papers.values()]

# Every page's list of (width, height) tuples, flattened across papers.
page_regions = [boxes for info in papers.values() for boxes in info['pages'].values()]

# Equation counts per page and per paper.
eqns_per_page = [len(boxes) for boxes in page_regions]
eqns_per_paper = [
    sum(len(boxes) for boxes in info['pages'].values())
    for info in papers.values()
]

# Width and height of every equation box.
widths = [box[0] for boxes in page_regions for box in boxes]
heights = [box[1] for boxes in page_regions for box in boxes]

# Overall totals.
num_papers = len(papers)
num_pages = sum(pages_per_paper)
num_eqns = sum(eqns_per_page)
    

In [None]:
# Display the paper, page and equation totals as a tuple.
num_papers, num_pages, num_eqns

In [None]:


# Overlaid percentage histograms of the equation box widths and heights.
fig = go.Figure()
for column, label in (("width", "Widths"), ("height", "Heights")):
    fig.add_trace(
        go.Histogram(x=papers_df[column], bingroup=1, name=label, histnorm="percent")
    )

# Semi-transparent bars keep both overlaid distributions visible.
fig.update_traces(opacity=0.65)
fig.update_layout(
    barmode='overlay',  # draw the two histograms on top of each other
    title_x=0.5,
    title_text='Histograms of Equation Heights and Widths',
    xaxis_title_text='Pixels',
    yaxis_title_text='% Eqns',
    bargap=0,  # gap between bars of adjacent location coordinates
    bargroupgap=0.1,  # gap between bars of the same location coordinates
)

fig.show()



In [None]:


# Cumulative percentage histograms of the equation box widths and heights.
# The title and y-axis label say "cumulative" so this figure is not confused
# with the plain (non-cumulative) histogram of the same two columns.
fig = go.Figure()
fig.add_trace(go.Histogram(x=papers_df["width"], bingroup=1, name="Widths",
                           histnorm="percent", cumulative_enabled=True))
fig.add_trace(go.Histogram(x=papers_df["height"], bingroup=1, name="Heights",
                           histnorm="percent", cumulative_enabled=True))

# Overlay both histograms
fig.update_layout(barmode='overlay')
# Reduce opacity to see both histograms
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Cumulative Histograms of Equation Heights and Widths', # title of plot
    xaxis_title_text='Pixels', # xaxis label
    yaxis_title_text='Cumulative % Eqns', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()


# Box area (width x height), stored as a new column on papers_df.
papers_df['area'] = papers_df['width'] * papers_df['height']

# Percentage histogram of the box areas.
area_trace = go.Histogram(x=papers_df["area"], bingroup=1, name="Area", histnorm="percent")
fig = go.Figure()
fig.add_trace(area_trace)

fig.update_traces(opacity=0.65)
fig.update_layout(
    barmode='overlay',
    title_x=0.5,
    title_text='Histograms of Equation Area',
    xaxis_title_text='Pixels^2',
    yaxis_title_text='% Eqns',
    bargap=0,  # gap between bars of adjacent location coordinates
    bargroupgap=0.1,  # gap between bars of the same location coordinates
)

fig.show()



In [None]:


# Cumulative percentage histogram of the number of equations per page.
# The title and y-axis label state that the plot is cumulative, matching
# cumulative_enabled=True on the trace.
fig = go.Figure()
fig.add_trace(go.Histogram(x=eqns_per_page, bingroup=1, name="Eqns_per_page",
                           histnorm="percent", cumulative_enabled=True))

fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.65)
fig.update_layout(
    title_x=0.5,
    title_text='Cumulative Histogram of Equations per Page', # title of plot
    xaxis_title_text='Eqns', # xaxis label
    yaxis_title_text='Cumulative % Pages', # yaxis label
    bargap=0, # gap between bars of adjacent location coordinates
    bargroupgap=0.1 # gap between bars of the same location coordinates
)

fig.show()



In [None]:
# Sort the regions by paper, page, entity id and vertical position, keeping the
# result in its own frame (papers_df itself is left in its original order).
paper_df = papers_df.sort_values(['paper','page','entity_id','top'], ascending=True)
# Preview the sorted frame; previously the unsorted papers_df was shown, so
# the sort had no visible effect.
paper_df.head()

In [None]:
# Rows whose box height is below 7, ordered by height then width, largest first.
papers_df[papers_df.height<7].sort_values(['height','width'],ascending=False)
# Height < 7 is the cut-off chosen for merging.
# NOTE(review): the merge step itself is not visible in this part of the notebook - confirm where it is applied.

In [None]:
# The five tallest boxes among rows with a non-negative height.
papers_df[papers_df.height>=0].sort_values(['height'],ascending=False).head(5)


In [None]:
# Boxes at most 10 wide but at least 7 tall; first 20 rows after sorting by
# paper and page in descending order.
papers_df[(papers_df.width<=10) & (papers_df.height>=7)].sort_values(['paper','page'],ascending=False).head(20)

In [None]:
# All boxes at most 10 wide, widest first.
papers_df[(papers_df.width<=10)].sort_values(['width'],ascending=False)

In [None]:
# Number of equation boxes on each (paper, page).
total_eqns = papers_df.groupby(['paper','page']).size()

# Every distinct paper id present in papers_df.
total_papers = list(set(papers_df.paper))

# Papers to exclude. They are gathered in a set so that a paper matching
# several criteria is listed only once; otherwise len(papers_to_remove)
# over-counts. The final list is sorted to make its order deterministic.
excluded_papers = {"1501.00009", "1503.00066", "0705.00116"}  # page orientation
excluded_papers.add("0705.00017")  # random figures and artifacts as eqns
excluded_papers.update(papers_df.loc[papers_df.width <= 10, 'paper'])  # has a box at most 10 wide
excluded_papers.update(papers_df.loc[papers_df.height < 7, 'paper'])  # has a box less than 7 tall
papers_to_remove = sorted(excluded_papers)


def remove_papers(row):
    """Return 1 if ``row`` is contained in ``papers_to_remove``, otherwise 0.

    ``papers_to_remove`` is read from module scope at call time.
    """
    return 1 if row in papers_to_remove else 0

In [None]:
# Number of distinct papers vs. number of entries in the removal list.
len(total_papers), len(papers_to_remove)

In [None]:
# Count the equation boxes on every (paper, page) and attach that count to each
# region row as the `eqns` column.
page_keys = ['paper', 'page']
total_eqns = (
    papers_df.groupby(page_keys)
    .size()
    .reset_index(name='eqns')
    .sort_values(page_keys)
)
new_papers_df = papers_df.merge(total_eqns, how="left", on=page_keys)
new_papers_df.head()

In [None]:
# Rows whose `eqns` value is at most 20.
new_papers_df[new_papers_df['eqns']<=20]

In [None]:
# Flag each row: 1 if its paper is listed in papers_to_remove, else 0.
# Series.isin performs the membership test in one vectorised call instead of
# invoking a Python function once per row.
new_papers_df['remove'] = new_papers_df.paper.isin(papers_to_remove).astype(int)

In [None]:
# Rows with at most 20 in `eqns` and a `remove` flag of 0, with exact duplicate rows dropped.
new_papers_df[(new_papers_df['eqns']<=20) & (new_papers_df['remove']==0)].drop_duplicates()

In [None]:
# Scratch frames: `a` starts empty; `b` has columns A-D whose cells are the
# column letter followed by 8, 9, 10 and 11.
a = pd.DataFrame()
b = pd.DataFrame({col: [f"{col}{n}" for n in range(8, 12)] for col in "ABCD"})

In [71]:
# Append b's rows to a. Note: `a` is rebound to the result, so re-running this
# cell appends b's rows again.
a = pd.concat([a,b])

In [72]:
# Display the concatenated frame.
a

Unnamed: 0,A,B,C,D
0,A8,B8,C8,D8
1,A9,B9,C9,D9
2,A10,B10,C10,D10
3,A11,B11,C11,D11
