# part 1: process element into html_element

In [7]:
import pandas as pd
import json

In [8]:
# convert media name to bias rating
# Lookup table loaded once at module level; get_bias_leaning reads its 'news'
# and 'bias' columns.
media_df = pd.read_csv("../data/core_results.csv")

def get_bias_leaning(media_name, media_df = media_df):
    """Return the bias rating recorded for a media outlet.

    Parameters
    ----------
    media_name : str
        Outlet name with hyphens in place of spaces (the form used in the
        scraped file names, e.g. "ABC-News-(Online)"). Hyphens are turned
        back into spaces before the lookup.
    media_df : pandas.DataFrame
        Lookup table with a 'news' column (outlet name) and a 'bias' column.
        Defaults to the module-level table.

    Returns
    -------
    The 'bias' value of the first row whose 'news' equals the converted name.

    Raises
    ------
    IndexError
        If no row matches the converted name.
    """
    lookup_name = media_name.replace("-", " ")
    matches = media_df.loc[media_df['news'] == lookup_name, 'bias'].values
    if len(matches) == 0:
        # Same exception type the bare `result[0]` used to raise, but with a
        # message that names the outlet instead of "index 0 is out of bounds".
        raise IndexError("no bias rating found for media name {!r}".format(lookup_name))
    return matches[0]

In [9]:
# read json file

def read_json(path):
    """Load one scraped-page JSON file and parse its file name.

    The file name is expected to look like ``<media-name>_<timestamp>.json``.
    The split is made on the LAST underscore, so a media name that itself
    contains underscores is kept whole, and the extension is removed with
    ``os.path.splitext`` rather than by assuming it is five characters long.

    Parameters
    ----------
    path : str
        Path to the JSON file.

    Returns
    -------
    list
        ``[content, name, date]`` where ``content`` is the decoded JSON,
        ``name`` the media name and ``date`` the timestamp string.

    Raises
    ------
    IndexError
        If the file name contains no underscore (same exception type the
        previous index-based parsing raised).
    """
    import os  # local import: the notebook only imports os in a later cell

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as infile:
        content = json.load(infile)

    stem = os.path.splitext(os.path.basename(path))[0]
    name, separator, date = stem.rpartition("_")
    if not separator:
        raise IndexError("file name {!r} is not of the form <name>_<date>.json".format(stem))

    return [content, name, date]

In [10]:
# Smoke test: load one scraped page; the cell output shows the nested
# list/dict structure returned as the first item.
read_json("../data/sites_jsons/json/ABC-News-(Online)_20220101000143.json")

[[[{'tag_name': 'span',
    'text': 'ABC News',
    'location': {'x': 272, 'y': 7233},
    'size': {'height': 0, 'width': 0},
    'NER': [],
    'SA': {}},
   [{'tag_name': 'h4',
     'text': 'VIDEO',
     'location': {'x': 432, 'y': 7188},
     'size': {'height': 15, 'width': 48},
     'NER': [],
     'SA': {}},
    {'tag_name': 'h4',
     'text': 'LIVE',
     'location': {'x': 524, 'y': 7188},
     'size': {'height': 15, 'width': 34},
     'NER': [],
     'SA': {}},
    {'tag_name': 'h4',
     'text': 'SHOWS',
     'location': {'x': 602, 'y': 7188},
     'size': {'height': 15, 'width': 57},
     'NER': [],
     'SA': {}},
    {'tag_name': 'h4',
     'text': 'CORONAVIRUS',
     'location': {'x': 702, 'y': 7188},
     'size': {'height': 15, 'width': 112},
     'NER': [],
     'SA': {}}],
   {'tag_name': 'button',
    'text': 'LOG IN',
    'location': {'x': 1580, 'y': 7187},
    'size': {'height': 16, 'width': 52},
    'NER': [],
    'SA': {}}],
  [[{'tag_name': 'h2',
     'text': 'Bide

In [11]:
# class representing an html element
class html_element:
    """One node of a scraped page's element tree.

    The JSON for a page is a nesting of lists (containers) and dicts (leaves).
    A node built from a dict is a leaf: ``children`` is None, ``attr`` holds a
    copy of the dict and ``tag`` / ``text`` mirror its 'tag_name' / 'text'
    entries. A node built from a list is a container: ``children`` is a list of
    html_element and it has no ``attr``, ``tag`` or ``text``.

    Every node has ``x``, ``y``, ``width`` and ``height``. For a container they
    describe the bounding box of its children.

    Defining ``__eq__`` without ``__hash__`` makes instances unhashable.
    """

    def __init__(self, json_ele, media_name, date, leaning = None):
        """Build the node (and, recursively, its children).

        json_ele   -- dict (leaf) or list (container) from the page JSON
        media_name -- outlet name, stored as-is
        date       -- timestamp string, stored as-is
        leaning    -- bias rating; looked up with get_bias_leaning when None

        Raises ValueError if json_ele is an empty list.
        """
        self.media_name = media_name
        self.date = date
        # Children receive the parent's rating, so the lookup runs once per page.
        if leaning is None:
            self.leaning = get_bias_leaning(self.media_name)
        else:
            self.leaning = leaning

        # Order matters: a container's coordinates and size are derived from
        # its children, and its size needs its coordinates.
        self.__construct_children(json_ele)
        self.__construct_cord(json_ele)
        self.__construct_size(json_ele)

    def __construct_children(self, json_ele):
        """Set ``children`` and, for a leaf, ``attr``, ``tag`` and ``text``."""
        if isinstance(json_ele, dict):
            # no children
            self.children = None
            self.attr = dict(json_ele)
            # tag and text are read by __eq__ and get_json_repr; .get keeps
            # construction working for leaves that lack these keys.
            self.tag = json_ele.get("tag_name")
            self.text = json_ele.get("text")
        else:
            self.children = [
                html_element(json_ele = child, media_name = self.media_name,
                             date = self.date, leaning = self.leaning)
                for child in json_ele
            ]

    def __construct_cord(self, json_ele):
        """Set ``x`` and ``y`` (top-left corner)."""
        if isinstance(json_ele, dict):
            # no children
            self.x = json_ele["location"]["x"]
            self.y = json_ele["location"]["y"]
        else:
            if not self.children:
                # min() of an empty sequence would raise ValueError anyway;
                # raise it directly with a message that says what is wrong.
                raise ValueError("nested element has no children; cannot derive its coordinates")
            # cords of a nested element are the smallest x and y (topleft corner)
            self.x = min(child.x for child in self.children)
            self.y = min(child.y for child in self.children)

    def __construct_size(self, json_ele):
        """Set ``width`` and ``height``."""
        if isinstance(json_ele, dict):
            # no children
            self.width = json_ele["size"]["width"]
            self.height = json_ele["size"]["height"]
        else:
            # size of a nested element spans from its topleft corner to the
            # furthest right / bottom edge of any child
            right_bound = max(child.width + child.x for child in self.children)
            bottom_bound = max(child.height + child.y for child in self.children)
            self.width = right_bound - self.x
            self.height = bottom_bound - self.y

    def __eq__(self, other):
        """Structural equality.

        Two leaves are equal when their text and tag match; two containers are
        equal when their children lists are equal. Anything else is unequal.
        """
        # may need a more robust matching strategy
        # may need to include class attribute
        if not isinstance(other, type(self)):
            return False
        if (self.children is None) != (other.children is None):
            # a leaf never equals a container
            return False
        if self.children is None:
            return self.text == other.text and self.tag == other.tag
        return self.children == other.children

    def get_json_repr(self):
        """Return the node as nested lists (containers) and dicts (leaves).

        Leaf dicts carry 'tag_name', 'text', 'location' and 'size' only.
        """
        if self.children is not None:
            return [child.get_json_repr() for child in self.children]
        return {'tag_name': self.tag,
                'text': self.text,
                'location': {'x': self.x, 'y': self.y},
                'size': {'height': self.height, 'width': self.width}}
        

In [12]:
def processing_element(html_element):
    """Flatten an element tree into a list of leaf records.

    A node with a non-empty ``children`` list contributes the records of its
    children, in order. Any other node is treated as a leaf and contributes
    one dict built from its ``attr`` mapping plus the page-level
    ``media_name``, ``date`` and ``leaning``.
    """
    kids = html_element.children
    if not kids:
        leaf_attr = html_element.attr
        # Key order is kept as-is: it becomes the column order of the
        # DataFrame built from these records.
        record = {
            "x": leaf_attr["location"]["x"],
            "y": leaf_attr["location"]["y"],
            "height": leaf_attr["size"]["height"],
            "width": leaf_attr["size"]["width"],
            "NER": leaf_attr["NER"],
            "SA": leaf_attr["SA"],
            "tag": leaf_attr["tag_name"],
            "text": leaf_attr["text"],
            "name": html_element.media_name,
            "date": html_element.date,
            "leaning": html_element.leaning,
        }
        return [record]

    flattened = []
    for kid in kids:
        flattened.extend(processing_element(kid))
    return flattened

In [13]:
# Build the element tree for one page: read_json returns [content, name, date],
# which is unpacked into the constructor's first three arguments.
d1 = html_element(*read_json("../data/sites_jsons/json/ABC-News-(Online)_20220101000143.json"))

# Flatten the tree and show the first leaf record.
# Here e_sizes is a plain list of dicts; a later cell rebinds the name to a DataFrame.
e_sizes = processing_element(d1)
e_sizes[0]

{'x': 272,
 'y': 7233,
 'height': 0,
 'width': 0,
 'NER': [],
 'SA': {},
 'tag': 'span',
 'text': 'ABC News',
 'name': 'ABC-News-(Online)',
 'date': '20220101000143',
 'leaning': 'Lean Left'}

# create parquet

In [14]:
import os
import concurrent
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
import time

In [15]:
def size_worker(file_name):
    """Parse one JSON file from ``dir_path`` and return its flattened leaf records.

    ``dir_path`` is a module-level name that must be defined before the
    worker is called.
    """
    parsed = read_json(os.path.join(dir_path, file_name))
    return processing_element(html_element(*parsed))


# Directory scanned for page JSON files; also read by size_worker.
dir_path = "../data/sites_jsons/json"
e_sizes = []  # accumulates the leaf records of every file
count = 0     # running total of records; not read again in the visible cells

# One task per directory entry, spread over 6 worker processes.
with ProcessPoolExecutor(max_workers=6) as executor:
    future_to_file = {executor.submit(size_worker, file_name): file_name for file_name in os.listdir(dir_path)}

    # as_completed yields futures in completion order, so the row order of the
    # final frame is not tied to the listing order.
    for future in concurrent.futures.as_completed(future_to_file):
        # result() re-raises any exception raised inside the worker
        result = future.result()
        count += len(result)
        e_sizes += result

# One row per leaf element.
e_sizes = pd.DataFrame(e_sizes)

In [16]:
# NOTE(review): this writes Parquet data to a file named *.pkl, and a later cell
# loads 'elements_stats.pkl' with read_pickle — confirm the intended format and name.
e_sizes.to_parquet('./processed_data/raw_data.pkl')

In [18]:
# Display the raw flattened records.
e_sizes

Unnamed: 0,x,y,height,width,NER,SA,tag,text,name,date,leaning
0,8,13,15,1904,[],"{'pos': 0.12073664367198944, 'neg': 0.87926328...",pre,Could not establish a tunnel,Associated-Press,20220101004109,Lean Left
1,0,9051,9,78,[],{},a,Skip to main content,Axios,20220101001505,Lean Left
2,511,9090,23,71,[],{},button,Sections,Axios,20220101001505,Lean Left
3,646,9090,23,91,[],{},button,Local news,Axios,20220101001505,Lean Left
4,801,9090,23,95,[],{},button,About Axios,Axios,20220101001505,Lean Left
...,...,...,...,...,...,...,...,...,...,...,...
1725607,912,10219,13,80,[],{},a,Terms of Service,Time-Magazine,20220301204820,Lean Left
1725608,998,10219,13,70,[],{},a,Privacy Policy,Time-Magazine,20220301204820,Lean Left
1725609,1074,10219,13,147,[],{},a,Your California Privacy Rights,Time-Magazine,20220301204820,Lean Left
1725610,1248,10219,13,183,[],"{'pos': 0.7204211950302124, 'neg': 0.279578834...",a,Do Not Sell My Personal Information,Time-Magazine,20220301204820,Lean Left


In [25]:
# Show full cell contents (no column-width truncation). This option is global
# and stays in effect for every later display in the session.
pd.set_option('display.max_colwidth', None)
# Inspect every occurrence of one specific text across pages and dates.
e_sizes[e_sizes["text"] == "Exclusive Book: How the Specter of Communism Is Ruling Our World"]

Unnamed: 0,x,y,height,width,NER,SA,tag,text,name,date,leaning
8147,1252,4410,40,300,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220101005134,Lean Right
1332067,1252,8043,40,300,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220102105136,Lean Right
1334325,1252,3873,40,300,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220106105134,Lean Right
1335227,8,7824,18,1889,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220104104449,Lean Right
1336879,8,7759,18,1889,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220108205143,Lean Right
...,...,...,...,...,...,...,...,...,...,...,...
1672335,8,7774,18,1889,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220314160646,Lean Right
1672950,8,7351,18,1889,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220310064704,Lean Right
1673501,1252,3854,40,188,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220324163438,Lean Right
1674220,8,7351,18,1889,"[{'entity': 'B-PER', 'score': '0.8569692', 'index': 7, 'word': 'S', 'start': 24, 'end': 25}, {'entity': 'I-PER', 'score': '0.3834895', 'index': 8, 'word': '##pect', 'start': 25, 'end': 29}, {'entity': 'B-MISC', 'score': '0.6980022', 'index': 11, 'word': 'Co', 'start': 35, 'end': 37}, {'entity': 'I-MISC', 'score': '0.87380266', 'index': 12, 'word': '##mm', 'start': 37, 'end': 39}, {'entity': 'I-MISC', 'score': '0.8429744', 'index': 13, 'word': '##uni', 'start': 39, 'end': 42}, {'entity': 'I-MISC', 'score': '0.9559942', 'index': 14, 'word': '##sm', 'start': 42, 'end': 44}, {'entity': 'I-MISC', 'score': '0.9219736', 'index': 20, 'word': 'World', 'start': 59, 'end': 64}]","{'pos': 0.8878241181373596, 'neg': 0.1121758371591568}",div,Exclusive Book: How the Specter of Communism Is Ruling Our World,The-Epoch-Times,20220326163251,Lean Right


# filter text length

In [None]:
# drop words:
# Select the rows whose text is at most 19 characters long, for inspection.
# NOTE(review): df_frontpage is not defined in any cell visible here — confirm its origin.

df_one = df_frontpage[[len(x) <= 19 for x in df_frontpage["text"]]]
# df_one = df_one[df_one["word"].notna()]
df_one

In [None]:
# drop words:
# Select the rows whose text is at most 19 characters long, for inspection.
# NOTE(review): this cell is an exact copy of the cell before it, and
# df_frontpage is not defined in any cell visible here.

df_one = df_frontpage[[len(x) <= 19 for x in df_frontpage["text"]]]
# df_one = df_one[df_one["word"].notna()]
df_one

In [None]:
import matplotlib.pyplot as plt

# Split the (x, y) pairs in `percentages` into two parallel sequences.
# NOTE(review): `percentages` is not defined in any cell visible here — confirm its origin.
x, y = zip(*percentages)

# Line plot with a marker on every data point.
plt.plot(x, y, marker='o')
plt.xlabel('length of text (characters)')
plt.ylabel('percentage with at least one NER')
plt.title('length of text vs percentage with NER')

# Display the plot
plt.show()


In [None]:
# NOTE(review): cal_perc and df_frontpage are not defined in any cell visible here —
# confirm where they come from and what the second argument (20) means.
cal_perc(df_frontpage, 20) 

In [None]:
# Keep only rows whose text is longer than 19 characters.
# The frame is rebound in place, so re-running earlier cells that read
# df_frontpage will see the filtered data.
df_frontpage = df_frontpage[[len(x) > 19 for x in df_frontpage["text"]]]
# Random sample of 10 rows; no seed is set, so the sample differs between runs.
df_frontpage.sample(10)

# Merge tokens

In [None]:
# filter out short text

In [26]:
# merge may be too aggressive

In [1]:
import seaborn as sns    
import pandas as pd
import json
import copy
import regex as re

import matplotlib.pyplot as plt
import matplotlib.patches as patches

In [2]:
# load pickle
# NOTE(review): the last cell of this notebook writes './processed_data/elements_stats.pkl',
# while this cell reads 'elements_stats.pkl' from the working directory — confirm which file is meant.
e_sizes = pd.read_pickle('elements_stats.pkl')
e_sizes['text'] = e_sizes['text'].str.strip()

# ensure the element is not too small or invisible: width and height above 15,
# and the stripped text used as the third mask term.
# NOTE(review): the third term relies on pandas coercing the text column to booleans
# (presumably empty strings become False) — confirm on the pandas version in use.
e_sizes = e_sizes.loc[(e_sizes["width"] > 15) & (e_sizes["height"] > 15) & e_sizes["text"]] 

In [3]:
# remove some tags
# Rows whose 'tag' is in this list are dropped.
to_remove = ["button", "label", "date", "time", "u", "legend", "pre", "nav-search-bucket", "media-badge", "lit-timestamp"]
e_sizes = e_sizes[~e_sizes['tag'].isin(to_remove)]

In [4]:
# Display the frame after the size and tag filters.
e_sizes

Unnamed: 0,x,y,height,width,NER,SA,tag,text,name,date,leaning
6,272,167,19,325,"[{'entity': 'B-PER', 'score': '0.9978671', 'in...","{'pos': 0.1687677949666977, 'neg': 0.831232190...",h2,Biden threaten Putin with sanctions over Ukraine,ABC-News-(Online),20220101000143,Lean Left
7,272,207,19,324,[],"{'pos': 0.13421954214572906, 'neg': 0.86578053...",h2,Coaches made teen eat pizza despite religion: ...,ABC-News-(Online),20220101000143,Lean Left
8,272,247,19,297,[],"{'pos': 0.09297964721918106, 'neg': 0.90702039...",h2,Suspect arrested in shooting of police officers,ABC-News-(Online),20220101000143,Lean Left
9,272,287,19,309,"[{'entity': 'B-ORG', 'score': '0.9989704', 'in...","{'pos': 0.1945149451494217, 'neg': 0.805485069...",h2,DOJ drops case against cops in Epstein suicide,ABC-News-(Online),20220101000143,Lean Left
10,272,327,19,263,[],"{'pos': 0.8186860680580139, 'neg': 0.181313917...",h2,Powerball jackpot raised to $500 million,ABC-News-(Online),20220101000143,Lean Left
...,...,...,...,...,...,...,...,...,...,...,...
1725571,338,9594,71,285,"[{'entity': 'B-ORG', 'score': '0.9852402', 'in...","{'pos': 0.0774001032114029, 'neg': 0.922599911...",h3,Climate Change Will Mean More Expensive Grocer...,Time-Magazine,20220301204820,Lean Left
1725572,653,9594,47,285,"[{'entity': 'B-LOC', 'score': '0.99959725', 'i...","{'pos': 0.8407050371170044, 'neg': 0.159294947...",h3,Iowa is the Front Line in a Battle Over Carbon...,Time-Magazine,20220301204820,Lean Left
1725573,967,9594,71,285,"[{'entity': 'B-LOC', 'score': '0.999783', 'ind...","{'pos': 0.6308440566062927, 'neg': 0.369155913...",h3,Russia Tensions Could Push Europe to a Clean E...,Time-Magazine,20220301204820,Lean Left
1725574,1281,9594,47,285,"[{'entity': 'B-ORG', 'score': '0.9946989', 'in...","{'pos': 0.512549102306366, 'neg': 0.4874509572...",h3,The U.S. Military Might Never Be Sustainable,Time-Magazine,20220301204820,Lean Left


In [5]:
# Text of the fifth remaining row (positional index 4, not the index label).
e_sizes.iloc[4]["text"]

'Powerball jackpot raised to $500 million'

In [61]:
# NER post processing

# 1. merge adjacent word
def _strip_trailing_noise(word):
    """Drop trailing whitespace, backslashes and Unicode punctuation from word.

    Implemented with unicodedata so the result does not depend on which module
    the name `re` is bound to (the standard-library `re` rejects `\\p{P}`).
    """
    import unicodedata

    end = len(word)
    while end > 0:
        ch = word[end - 1]
        if ch.isspace() or ch == "\\" or unicodedata.category(ch).startswith("P"):
            end -= 1
        else:
            break
    return word[:end]


def merge_adjacent_ner(ner_list, text):
    """Merge token-level NER hits into whole-word entities.

    Parameters
    ----------
    ner_list : list of dict
        Token-level entities, each with at least 'entity', 'score', 'word',
        'start' and 'end' ('start'/'end' are offsets into ``text``).
        The input is not modified.
    text : str
        The text the offsets refer to.

    Returns
    -------
    list of dict
        Merged entities. A token is folded into the previous entity when its
        label starts with "I", or when it has the same label and only spaces
        or apostrophes lie between the two. A merged entity keeps the first
        token's other fields, the lowest score of its tokens, and 'word' /
        'end' covering the merged span. Each entity is then extended to the
        end of the word it stops in, and trailing whitespace, backslashes and
        punctuation are removed (with 'end' moved back accordingly).
        An empty input gives an empty list.
    """
    # Work on a copy: the dicts are edited in place below.
    tokens = copy.deepcopy(ner_list)
    if not tokens:
        return []

    merged = [tokens[0]]
    for token in tokens[1:]:
        last = merged[-1]
        gap = text[last["end"] : token["start"]]
        continues_entity = token["entity"].startswith("I")
        same_label_adjacent = (
            token["entity"] == last["entity"]
            and all(char in (" ", "'") for char in gap)
        )
        if continues_entity or same_label_adjacent:
            # Keep the lowest score. Compare numerically: scores may be
            # strings, and string comparison mis-orders values such as
            # '9.5e-05' and '0.5'. The stored value keeps its original type.
            if float(token["score"]) <= float(last["score"]):
                last["score"] = token["score"]
            last["word"] = text[last["start"] : token["end"]]
            last["end"] = token["end"]
        else:
            merged.append(token)

    for entity in merged:
        end = entity["end"]
        if end < len(text) and text[end] != " ":
            # expand to include the whole word
            next_space = text.find(" ", end)
            # str.find returns -1 when no space follows, i.e. the entity sits
            # in the last word of the text; the word then runs to the end.
            end = next_space if next_space != -1 else len(text)
            entity["end"] = end
            entity["word"] = text[entity["start"] : end]

        cleaned = _strip_trailing_noise(entity["word"])
        entity["end"] -= len(entity["word"]) - len(cleaned)
        entity["word"] = cleaned

    return merged

# Demo on one row, selected by index label 38 (not by position):
# print the raw token-level entities, then show the merged result.
row = e_sizes.loc[38]
print(row["NER"])
print("******")
merge_adjacent_ner(row["NER"], row["text"])

[{'entity': 'B-PER', 'score': '0.87566996', 'index': 1, 'word': 'Con', 'start': 0, 'end': 3}, {'entity': 'I-MISC', 'score': '0.59498155', 'index': 2, 'word': '##fe', 'start': 3, 'end': 5}, {'entity': 'B-MISC', 'score': '0.9719853', 'index': 7, 'word': 'New', 'start': 25, 'end': 28}, {'entity': 'I-MISC', 'score': '0.842094', 'index': 8, 'word': 'Year', 'start': 29, 'end': 33}, {'entity': 'I-MISC', 'score': '0.6991049', 'index': 9, 'word': "'", 'start': 33, 'end': 34}, {'entity': 'I-MISC', 'score': '0.983053', 'index': 10, 'word': 's', 'start': 34, 'end': 35}, {'entity': 'I-MISC', 'score': '0.8973793', 'index': 11, 'word': 'Eve', 'start': 36, 'end': 39}, {'entity': 'B-LOC', 'score': '0.9979153', 'index': 14, 'word': 'Times', 'start': 55, 'end': 60}, {'entity': 'I-LOC', 'score': '0.99908066', 'index': 15, 'word': 'Square', 'start': 61, 'end': 67}]
******


[{'entity': 'B-PER',
  'score': '0.59498155',
  'index': 1,
  'word': 'Confetti',
  'start': 0,
  'end': 8},
 {'entity': 'B-MISC',
  'score': '0.6991049',
  'index': 7,
  'word': "New Year's Eve",
  'start': 25,
  'end': 39},
 {'entity': 'B-LOC',
  'score': '0.9979153',
  'index': 14,
  'word': 'Times Square',
  'start': 55,
  'end': 67}]

In [48]:
# NOTE(review): sub_words is not read or written by any other code visible in this notebook.
sub_words = []

def post_processing(row):
    """Replace a row's "NER" entry with its merged entities and return the row.

    ``row`` is indexed by "NER" and "text"; it is modified in place and the
    same object is returned.
    """
    row["NER"] = merge_adjacent_ner(row["NER"], row["text"])
    return row

def print_rows(df):
    """Print a before/after NER report for every row of ``df`` that has entities.

    Each row is passed through post_processing. Rows whose processed "NER"
    is empty are skipped. For the others the index label, the text, the
    entity words before and after processing and the processed scores are
    printed, followed by a blank line.
    """
    for label, series in df.iterrows():
        # Captured before post_processing rebinds the row's "NER" entry.
        before = series["NER"]
        processed = post_processing(series)

        entities = processed["NER"]
        if not entities:
            continue

        words = [entity["word"] for entity in entities]
        scores = [entity["score"] for entity in entities]

        print(label)
        print(series["text"])
        print("original: " + str([entity["word"] for entity in before]))
        print("processed: " + str(words))
        print("score: " + str(scores))
        print()

In [63]:
# 3. report (half people half org)
# Run the NER merge on every row (axis = 1 passes each row to post_processing).
# NOTE(review): this cell's execution count (63) is higher than the next cell's (62),
# so the two were last run out of order.
post_processed_data = e_sizes.apply(post_processing, axis = 1)

In [62]:
# Report on rows 500-599 (positional slice).
# The input is already post-processed and print_rows processes each row again,
# so the "original" and "processed" lines in the output below are identical.
print_rows(post_processed_data[500:600])

693
The Ugly History of New Year’s Is Too Real for White GOPers
original: ['Ugly History of New Year’s']
processed: ['Ugly History of New Year’s']
score: ['0.6156665']

695
Enslaved Black people dreaded New Year’s in Civil War-era America, when they might be separated from loved ones. Just don’t tell the people yelling about “critical race theory.”
original: ['Black', 'Civil War-era', 'America']
processed: ['Black', 'Civil War-era', 'America']
score: ['0.99879706', '0.9985281', '0.99925584']

697
Betty White, Everybody’s Favorite Foxy Grandmother, Has Died
original: ['Betty White']
processed: ['Betty White']
score: ['0.99536884']

701
Take a Moment From Your Vacation, Ron, and Reassure Florida
original: ['Ron', 'Florida']
processed: ['Ron', 'Florida']
score: ['0.99628437', '0.86698437']

704
The 11 Best Movies of the Year
original: ['Best Movies of the']
processed: ['Best Movies of the']
score: ['0.78730094']

710
What’s With All the Singing in ‘Emily in Paris?’
original: ['Paris']
pro

In [None]:
# remove reCAPTCHA
# The mask is built from post_processed_data itself so that it always lines up
# with the rows being filtered (it was previously built from df_frontpage, a
# different frame). regex=False makes this a plain substring test; na=False
# treats missing text as "does not contain", so such rows are kept.
post_processed_data = post_processed_data[
    ~post_processed_data["text"].str.contains("reCAPTCHA", regex=False, na=False)
]

In [64]:
# Persist the post-processed frame.
# NOTE(review): an earlier cell reads 'elements_stats.pkl' from the working directory,
# not from ./processed_data/ — confirm the two paths are meant to differ.
post_processed_data.to_pickle('./processed_data/elements_stats.pkl')