In [1]:
### project steps 

################## get position on homepages
# step 1. layout detection to get bounding boxes 
# step 2. OCR the derived boxes and get text from the boxes
# step 3. filter out all OCR'ed text that doesn't match the text extracted in `links.json`
# step 4. build a distance matrix from the OCR'ed text to the link text in `links.json`
# step 5. use the Hungarian matching algorithm to find the min-cost bipartite matching between OCR'ed text and links

################## classifiers 
# step 6. now, we have x,y,w,h for each link and block of text on the homepage
# step 7. use common crawl to get article text for each link json 
# step 8. train classifiers to predict x,y,w,h for each text for each homepage

################## analysis
# step 9. analyze agreement between different outlets 
# step 10. analyze rank-ordering for unlabeled corpora
# step 11. get journalists to give feedback on which rank-orderings work best for them

In [149]:
# Is there a heuristic for parsing a URL to determine whether it is an article?

In [157]:
# Location of the labeled-URL CSV; the cells below read its `url` and `is_story` columns.
training_data = '../bin/storysniffer/_notebooks/input/labeled.csv'

In [172]:
# Load the labeled URLs into a DataFrame (one row per URL).
is_article_df = pd.read_csv(training_data)

In [234]:
import json
import re
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import tldextract
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [235]:
# Keep only the URL and its label, then derive URL-structure features:
#   domain / subdomain -> fields of tldextract's parse of the URL
#   path               -> path component reported by urlparse
#   num_url_parts      -> number of non-empty pieces after splitting the path on '/' or '-'
is_article_df = (
    is_article_df[['url', 'is_story']]
    .assign(
        domain=lambda frame: frame['url'].map(lambda link: tldextract.extract(link).domain),
        subdomain=lambda frame: frame['url'].map(lambda link: tldextract.extract(link).subdomain),
        path=lambda frame: frame['url'].map(lambda link: urlparse(link).path),
    )
    .assign(
        num_url_parts=lambda frame: frame['path'].map(
            lambda path: sum(1 for piece in re.split('/|-', path) if piece != '')
        )
    )
)

In [244]:
# Domain names (in the form tldextract reports as `.domain`) to exclude.
# NOTE(review): not referenced by any other visible cell; presumably these are
# social/ads/video hosts whose links are never stories -- confirm before use.
DOMAIN_BLACKLIST = (
    "google",
    "twitter",
    "facebook",
    "doubleclick",
    "eventbrite",
    "youtube",
    "vimeo",
    "instagram",
    "ceros",
)

# Subdomain names to exclude; the cells below match them against both the
# `subdomain` column and the slash-stripped `path` column.
SUBDOMAIN_BLACKLIST = (
    "careers",
    "mail",
    "account",
    "events",
)

In [248]:
# Show the first two labeled rows whose subdomain is on the subdomain blacklist.
is_article_df[is_article_df['subdomain'].isin(SUBDOMAIN_BLACKLIST)].head(2)

Unnamed: 0,url,is_story,domain,subdomain,path,num_url_parts
197,https://account.amestrib.com/,False,amestrib,account,/,0
474,https://careers.bleacherreport.com/,False,bleacherreport,careers,/,0


In [247]:
# Show the first two labeled rows whose path, with every '/' removed, is exactly
# one of the blacklisted subdomain names (e.g. '/events/').
is_article_df[
    is_article_df['path'].str.replace('/', '').isin(SUBDOMAIN_BLACKLIST)
].head(2)

Unnamed: 0,url,is_story,domain,subdomain,path,num_url_parts
51,/events,False,,,/events,1
401,/events/,False,,,/events/,1


In [249]:
# Baseline: F1 obtained when every URL is predicted to be a story.
f1_score(
    is_article_df['is_story'],
    is_article_df.assign(label=True)['label'],
)

0.5831903945111492

In [255]:
# Heuristic: predict "story" when the path has more than 3 non-empty parts.
f1_score(is_article_df['is_story'], is_article_df['num_url_parts'] > 3)

0.8839736127279781

In [256]:
# Same heuristic with a stricter threshold: more than 5 non-empty path parts.
f1_score(is_article_df['is_story'], is_article_df['num_url_parts'] > 5)

0.8952213941253836

In [193]:
# Heuristic: predict "story" when the URL path contains a run of three or more
# digits (the pattern needs 2-4 digits followed by at least one more digit).
# The pattern is a raw string so that `\d` reaches the regex engine untouched;
# in a plain string literal it is an invalid escape sequence that Python warns about.
# NOTE(review): `t_is_article_df` and its `url_path` column are not defined in any
# visible cell -- presumably an earlier variant of `is_article_df` / `path`; confirm.
(t_is_article_df
 .assign(has_date=lambda df: df['url_path'].apply(lambda x: re.search(r'\d{2,4}\d+', x) is not None))
 .pipe(lambda df: f1_score(df['is_story'], df['has_date']))
)

0.7784145176695321

In [326]:
# Add `path_words`: the path split on '-', '/', '#', '%' or '@', with empty pieces
# dropped, all-digit pieces replaced by '#', and the rest joined by single spaces.
is_article_df = is_article_df.assign(
    path_words=lambda frame: frame['path'].map(
        lambda path: ' '.join(
            '#' if token.isdigit() else token
            for token in re.split('-|/|#|%|@', path)
            if token != ''
        )
    )
)

In [366]:
# Hold out 10% of the labeled URLs for evaluation. The fixed seed makes the split
# identical across kernel restarts; the scores recorded in this notebook came
# from an unseeded split and may differ slightly.
train_df, test_df = train_test_split(is_article_df, train_size=.9, random_state=42)

In [431]:
# Character-level model: counts of 1- to 5-character n-grams (kept when they occur
# in at least 1% and at most 50% of documents) feeding a cross-validated
# logistic regression.
char_pipe = Pipeline(steps=[
    (
        'cv',
        CountVectorizer(analyzer="char", ngram_range=(1, 5), min_df=0.01, max_df=0.5),
    ),
    (
        'lr',
        LogisticRegressionCV(max_iter=5000),
    ),
])

In [432]:
# Fit the character n-gram pipeline on the raw URL paths of the training split.
char_pipe.fit(X=train_df['path'] , y=train_df['is_story'])

In [477]:
# Predict story / not-story for the held-out paths.
labels = char_pipe.predict(test_df['path'])

In [478]:
# F1 of the character n-gram pipeline on the held-out split.
f1_score(test_df['is_story'], labels)

0.9451476793248945

In [612]:
class Modeling():
    """Character n-gram logistic-regression classifier over URL paths.

    Holds a ``CountVectorizer`` (``self.cv``) and a logistic-regression model
    (``self.lr``), and can export / restore the fitted parameters as a plain
    dict so the model can be stored as JSON.
    """

    def __init__(self, *args, **kwargs):
        """Create an untrained model. Any arguments are accepted and ignored."""
        self.cv = None
        self.lr = None

    def training(self, X_train, y_train):
        """Fit the vectorizer and the classifier.

        Args:
            X_train: iterable of strings to vectorize (the notebook passes URL paths).
            y_train: labels aligned with ``X_train``.
        """
        self.cv = CountVectorizer(min_df=0.01, max_df=0.5, ngram_range=(1, 5), analyzer="char")
        self.lr = LogisticRegressionCV(max_iter=5000)
        x_mat_train = self.cv.fit_transform(X_train)
        self.lr.fit(x_mat_train, y_train)

    def get_predictions(self, candidate_url_list):
        """Return the classifier's ``predict_proba`` output for one or more URLs.

        Args:
            candidate_url_list: a single URL string, or any iterable of URL
                strings (list, tuple, pandas Series, ...). Only the path
                component of each URL is fed to the vectorizer.

        Returns:
            Whatever ``self.lr.predict_proba`` returns for the vectorized paths.

        Raises:
            AssertionError: if the model has not been trained or restored yet.
        """
        assert not (self.cv is None or self.lr is None), 'Train first...'
        if isinstance(candidate_url_list, (str, bytes)):
            # A lone URL: wrap it so the code below always sees a sequence.
            candidate_url_list = [candidate_url_list]
        elif not isinstance(candidate_url_list, list):
            # Other iterables used to be wrapped whole, as if they were one URL,
            # which then failed inside urlparse. Materialize them instead.
            candidate_url_list = list(candidate_url_list)

        clean_str = [urlparse(url).path for url in candidate_url_list]
        cand_mat = self.cv.transform(clean_str)
        cand_probas = self.lr.predict_proba(cand_mat)
        # rules

        return cand_probas

    def to_json(self):
        """Export the fitted parameters as a JSON-serializable dict.

        Keys: ``lr_coef``, ``lr_intercept``, ``vocab``, ``classes``,
        ``ngram_range`` and ``cv_analyzer``.
        """
        model_output = {}
        model_output['lr_coef'] = self.lr.coef_.tolist()
        model_output['lr_intercept'] = self.lr.intercept_.tolist()
        # Cast indices to built-in int so json can serialize them even when the
        # vocabulary holds NumPy integer scalars.
        model_output['vocab'] = {term: int(idx) for term, idx in self.cv.vocabulary_.items()}
        model_output['classes'] = self.lr.classes_.tolist()
        model_output['ngram_range'] = self.cv.ngram_range
        model_output['cv_analyzer'] = self.cv.analyzer
        return model_output

    @classmethod
    def from_json(cls, json_obj):
        """Rebuild a model from the dict produced by ``to_json``.

        The vectorizer is created with the stored fixed vocabulary, and the
        logistic regression gets its ``coef_``, ``intercept_`` and ``classes_``
        assigned directly instead of being refit.
        """
        # __init__ ignores its arguments, so none are passed.
        model = cls()
        cv = CountVectorizer(
            vocabulary=json_obj['vocab'],
            analyzer=json_obj['cv_analyzer'],
            # JSON turns the (min_n, max_n) tuple into a list; restore the tuple
            # that CountVectorizer documents for this parameter.
            ngram_range=tuple(json_obj['ngram_range'])
        )
        lr = LogisticRegression()
        lr.coef_ = np.array(json_obj['lr_coef'])
        lr.intercept_ = np.array(json_obj['lr_intercept'])
        lr.classes_ = np.array(json_obj['classes'])
        model.classes = json_obj['classes']
        model.lr = lr
        model.cv = cv
        return model

In [613]:
# Create an untrained Modeling instance.
m = Modeling()

In [616]:
# Train on every labeled row (the full frame, not just the training split).
m.training(is_article_df['path'], is_article_df['is_story'])

In [617]:
# Export the fitted parameters as a plain dict.
j = m.to_json()

In [634]:
# Write the exported parameters to the JSON file that a later cell reads back as `lr_obj`.
with open('../bin/news-homepages/newshomepages/bin/model_files/trained_lr_obj.json', 'w') as f:
    json.dump(j, f)

In [572]:
_white_spaces = re.compile(r"\s\s+")
ngram_range = (1, 5)

def _char_ngrams(text_document):
    """Tokenize text_document into a sequence of character n-grams"""
    # normalize white spaces
    text_document = _white_spaces.sub(" ", text_document)

    text_len = len(text_document)
    min_n, max_n = ngram_range
    if min_n == 1:
        # no need to do any slicing for unigrams
        # iterate through the string
        ngrams = list(text_document)
        min_n += 1
    else:
        ngrams = []

    # bind method outside of loop to reduce overhead
    ngrams_append = ngrams.append

    for n in range(min_n, min(max_n + 1, text_len + 1)):
        for i in range(text_len - n + 1):
            ngrams_append(text_document[i : i + n])
    return ngrams

In [580]:
# Character n-grams of the first held-out path, from the hand-rolled tokenizer.
n = _char_ngrams(test_df['path'].iloc[0])

In [591]:
# Dense count vector that the fitted vectorizer produces for the same path.
t2 = m.cv.transform([test_df['path'].iloc[0]]).toarray()[0]

In [605]:
# Vocabulary index of each hand-rolled n-gram, skipping n-grams the fitted
# vectorizer does not know.
[
    index
    for index in (m.cv.vocabulary_.get(gram) for gram in n)
    if index is not None
]

[1418,
 1870,
 1096,
 707,
 707,
 1846,
 290,
 1438,
 1388,
 1653,
 1765,
 1585,
 218,
 526,
 1131,
 884,
 1871,
 98,
 1099,
 1345,
 1258,
 857,
 1555,
 41,
 689,
 1549,
 1810,
 1604,
 506,
 724,
 839,
 1468,
 19,
 540,
 1250,
 708,
 41,
 665,
 1159,
 1413,
 1858,
 1237,
 293,
 1446,
 1391,
 1667,
 1767,
 1589,
 222,
 530,
 99,
 1351,
 1261,
 858,
 1559,
 56,
 1550,
 508,
 730,
 840,
 1469,
 25,
 545,
 1251,
 712,
 48,
 1415,
 1668,
 731,
 841,
 26,
 546,
 27]

In [566]:
# Model over the space-separated `path_words` strings: word-boundary-aware
# character n-grams (analyzer "char_wb", sizes 1-4, document frequency between
# 0.1% and 90%) feeding a cross-validated logistic regression.
word_pipe = Pipeline(steps=[
    (
        'cv',
        CountVectorizer(analyzer="char_wb", ngram_range=(1, 4), min_df=0.001, max_df=0.9),
    ),
    (
        'lr',
        LogisticRegressionCV(max_iter=5000),
    ),
])

In [567]:
# Fit the `path_words` pipeline on the training split.
word_pipe.fit(X=train_df['path_words'], y=train_df['is_story'])

In [568]:
# Predict on the held-out `path_words`; this overwrites the earlier `labels`.
labels = word_pipe.predict(test_df['path_words'])

In [569]:
# F1 of the `path_words` pipeline on the held-out split.
f1_score(test_df['is_story'], labels)

0.9145299145299146

In [776]:
# JavaScript helpers injected into the page before `executable_js` runs. They walk
# the DOM to find, for each link, the largest ancestor that does not also contain
# a link with a different href, plus small URL / text utilities.
# Compared with the first draft: the loop index `j` is declared with `var`
# (it used to leak as a global), `false` is passed positionally instead of
# `return_common=false` (which assigned a global), and logical `&&` replaces
# bitwise `&` in conditions.
function_js = '''
get_parents = function (node) {
  // Ancestor chain ordered top -> bottom; the node itself ends up last (twice).
  var nodes = [node]
  for (; node; node = node.parentNode) {
    nodes.unshift(node)
  }
  return nodes
}

function get_common_parent(node1, node2, return_common) {
  if (return_common === undefined)
    return_common = true

  var parents1 = get_parents(node1)
  var parents2 = get_parents(node2)

  if (parents1[0] != parents2[0]){
      throw "No common ancestor!"
    }

  // parents are in order of top -> bottom
  for (var i = 0; i < parents1.length; i++) {
    if (parents1[i] != parents2[i]){
      if (return_common)
        return parents1[i - 1] 
      else
        return parents1[i]
    }
  }
}

function is_smaller_child(child_candidate, parent_candidate){
    var child_parents = get_parents(child_candidate)
    var parent_parents = get_parents(parent_candidate)
    return child_parents.length > parent_parents.length
}

function get_most_parent_with_joining(a_href, as, a_counts){
    var same_links = a_counts[a_href]
    if (same_links.length == 1){
        // If this is the only time this link appears, get the uppermost parent
        // that isn't also the parent of any other link.
        var i = same_links[0]
        var a = as[i]
        var curr_parent = document
        for (var j = 0; j < as.length; j++){
          if (i != j){
            var common_not_parent = get_common_parent(a, as[j], false)
            if (is_smaller_child(common_not_parent, curr_parent)){
              curr_parent = common_not_parent  
            }
          }
        }
    } else {
        // Otherwise, get the greatest common parent for all the links.
        var all_instances_of_a = []
        a_counts[a_href].forEach(function(i){ all_instances_of_a.push(as[i]) })
        var curr_parent = all_instances_of_a[0]
        for (i=1; i < all_instances_of_a.length; i++){
            curr_parent = get_common_parent(curr_parent, all_instances_of_a[i])
        }
    }
    return curr_parent    
}

function get_highest_singular_parent(i, as, a_counts){
    var a = as[i]
    var curr_parent = document
    for (var j = 0; j < as.length; j++){
      if ((i != j) && (as[i].href != as[j].href)) {
        var common_not_parent = get_common_parent(a, as[j], false)
        if (is_smaller_child(common_not_parent, curr_parent)){
          curr_parent = common_not_parent  
        }
      }
    }
    return curr_parent  
}

function get_url_parts(url){
    url = new URL(url)
    var path = url.pathname
    path = path.split(/[-/:.]/).filter(function(d){return d != ''})
    return path.length
}

function get_text_of_node(node){
    var iter = document.createNodeIterator(node, NodeFilter.SHOW_TEXT)
    var textnode;
    var output_text = ''

    // print all text nodes
    while (textnode = iter.nextNode()) {
      output_text = output_text + ' ' + textnode.textContent
    }
    return output_text.trim()
}
'''
# Script run after `function_js`. It collects every <a> whose href is neither
# empty nor undefined, indexes the links by href in `a_counts`, computes one
# container per link with get_highest_singular_parent (stored in `a_top_nodes`),
# and sets a dotted blue border style on each container.
# NOTE(review): get_highest_singular_parent starts from `document`; if that value
# is ever returned unchanged, the setAttribute call would presumably fail -- confirm.
executable_js = '''
var as = document.querySelectorAll('a')
as = Array.from(as).filter(function(a) { return a.href !== ''}).filter(function(a){return a.href !== undefined; })
// as = as.filter(function(a){return get_url_parts(a.href) > 4})

var a_counts = {}
as.forEach(function(a, i){
    a_counts[a.href] = a_counts[a.href] || []
    a_counts[a.href].push(i)
})

var a_top_nodes = as.map(function(a, i){ 
    return get_highest_singular_parent(i, as, a_counts) 
    // return get_most_parent_with_joining(a, as, a_counts)
})
a_top_nodes.forEach(function(node){
    node.setAttribute('style', 'border: 4px dotted blue !important;')
})
'''

In [777]:
# Load the list of news sites; `sites` is not referenced again in the visible cells.
sites = pd.read_csv('../bin/news-homepages/newshomepages/sources/sites.csv')

In [760]:
# Read the in-browser model helpers and the exported model parameters as text.
# Context managers close both files instead of leaving the handles open.
with open('../bin/news-homepages/newshomepages/bin/js/model_utils.js') as js_file:
    model_utils_js = js_file.read()
with open('../bin/news-homepages/newshomepages/bin/model_files/trained_lr_obj.json') as model_file:
    lr_obj = model_file.read()

In [761]:
# Inject the model helper script into the current page.
await page.evaluate(model_utils_js)

0.5

In [762]:
# Build the in-page predictor by splicing the JSON text directly into the script
# as the constructor argument.
await page.evaluate('const lr = new LRPathPredictor(%s)' % lr_obj)

# Two sample URLs; the second assignment overwrites the first, so only the
# section-page URL is actually scored below.
t = "https://abcnews.go.com/US/42-magnitude-earthquake-strikes-malibu/story?id=96655478"
t = "https://abcnews.go.com/US/gunviolence"
# NOTE(review): the URL is interpolated inside double quotes without escaping.
await page.evaluate('lr.get_predictions("%s")' % t)

False

In [778]:
import json 
from playwright.async_api import async_playwright
import glob
import os 

In [779]:
# Start Playwright and launch a visible (non-headless) Chromium window.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)

In [780]:
# Open the browser page that the remaining cells drive through the global `page`.
page = await browser.new_page()

In [764]:
# NOTE(review): in file order this closes the page created just above, while the
# cells below still use `page`. The execution counts show it was run before that
# page was created, so on a top-to-bottom run this cell should be skipped or
# moved to the end.
await page.close()

In [781]:
# Saved test pages; glob does not guarantee any particular order.
html_files = glob.glob('../layout-parsing/data/test-html/*')

In [782]:
# Path of the second globbed test page, joined onto the current working directory
# so that it can be used in a file:// URL.
pwd = os.getcwd()
fp = os.path.join(pwd, html_files[1])

In [905]:
# await page.goto(f"file://{fp}")
await page.goto(f"https://nytimes.com")

<Response url='https://www.nytimes.com/' request=<Request url='https://www.nytimes.com/' method='GET'>>

In [906]:
# Define the helpers and run the container-detection script in a single evaluate
# call; this creates `a_top_nodes` in the page.
await page.evaluate(function_js + executable_js)

In [908]:
async def get_bounding_box_info():
    """Collect link/container geometry and the overall page size from the open page.

    Uses the notebook-global ``page``. The in-page script relies on
    ``a_top_nodes`` and ``get_text_of_node`` having been defined by an earlier
    ``page.evaluate`` call.

    Returns:
        A tuple ``(bounding_boxes, width, height)``:
          * ``bounding_boxes`` -- one dict per link found inside each container
            node (or the container itself when it is an ``<a>`` with no nested
            links), with keys ``url``, ``link_text``, ``x``, ``y``, ``width``,
            ``height`` and ``all_text``. The geometry and ``all_text`` come from
            the container node, not from the individual link.
          * ``width`` / ``height`` -- the maximum over several client / scroll /
            offset measurements of ``document.body`` and
            ``document.documentElement``.
    """
    # One record per link, carrying the bounding rectangle of its container.
    bounding_boxes = await page.evaluate('''
        function () {
            var all_links = []
            a_top_nodes.forEach(function(node){
                var links = Array.from(node.querySelectorAll('a')).map(function(a){ return a })
                if ((links.length == 0) & (node.nodeName === 'A')){
                    links = [node]
                }
                links = links.map(function(a){return {'url': a.href, 'link_text': get_text_of_node(a)}})

                links.forEach(function(a){
                    var b = node.getBoundingClientRect()
                    a['x'] = b['x']
                    a['y'] = b['y']
                    a['width'] = b['width']
                    a['height'] = b['height']
                    a['all_text'] = get_text_of_node(node)
                    all_links.push(a)
                })
            })
            return all_links
        }
    ''')

    # Page width: largest of the available width measurements.
    width = await page.evaluate('''
        Math.max(
            document.documentElement["clientWidth"],
            document.body["scrollWidth"],
            document.documentElement["scrollWidth"],
            document.body["offsetWidth"],
            document.documentElement["offsetWidth"]
        );
    ''')

    # Page height: largest of the available height measurements.
    height = await page.evaluate('''Math.max(
        document.documentElement["clientHeight"],
        document.body["scrollHeight"],
        document.documentElement["scrollHeight"],
        document.body["offsetHeight"],
        document.documentElement["offsetHeight"]
    );''')
    
    return bounding_boxes, width, height

In [909]:
# Scroll the window back to the origin (0, 0).
await page.evaluate('scroll(0, 000)')

In [912]:
# For every link container, list the <img> elements it contains with their
# `src`, `alt` and bounding rectangle (an empty list when it holds no image).
await page.evaluate('''a_top_nodes.map(function(a) {
    return Array.from(a.querySelectorAll('img')).map(function(img){
        var img_bb = img.getBoundingClientRect()
        return {
        'src': img.src, 
        'alt': img.alt,
        'x': img_bb['x'],
        'y': img_bb['y'],
        'width': img_bb['width'],
        'height': img_bb['height']        
        }
    })
} )''')

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [{'src': 'https://static01.nyt.com/images/2023/02/01/business/fed-rate-promo/fed-rate-promo-threeByTwoMediumAt2X-v5.png?format=pjpg&quality=75&auto=webp&disable=upscale',
   'alt': '',
   'x': 352.234375,
   'y': 529.59375,
   'width': 513.859375,
   'height': 342.5625}],
 [],
 [],
 [],
 [],
 [{'src': 'https://static01.nyt.com/images/2023/02/01/multimedia/01tyre-nichols-funeral-carousel-06-bjtc/01tyre-nichols-funeral-carousel-06-bjtc-square640-v2.jpg?quality=75&auto=webp',
   'alt': '',
   'x': 866.09375,
   'y': 1059.734375,
   'width': 516.703125,
   'height': 344.46875}],
 [{'src': 'https://static01.nyt.com/images/2023/02/01/multimedia/01tyre-nichols-funeral-carousel-06-bjtc/01tyre-

In [799]:
import sys

In [800]:
# Put the newshomepages package directory first on sys.path so that
# `hyperlinks_with_bounding_boxes` can be imported as a top-level module.
sys.path.insert(0,'../bin/news-homepages/newshomepages/')

In [802]:
import hyperlinks_with_bounding_boxes as bb

In [804]:
from importlib import reload

In [861]:
# Read the bundled scripts that get injected into the page. Context managers
# close both files instead of leaving the handles open.
with open('../bin/news-homepages/newshomepages/bin/js/psl.min.js') as psl_file:
    psl_js = psl_file.read()
with open('../bin/news-homepages/newshomepages/bin/js/utils.js') as util_file:
    util_js = util_file.read()

In [862]:
# Inject both scripts into the current page, psl.min.js first.
await page.evaluate(psl_js)
await page.evaluate(util_js)

[None]

In [897]:
# Re-import the module to pick up edits made to its source file.
reload(bb)

<module 'hyperlinks_with_bounding_boxes' from '/Users/alex/Projects/usc-research/newsworthiness/notebooks/../bin/news-homepages/newshomepages/hyperlinks_with_bounding_boxes.py'>

In [857]:
# Spot-check the in-page get_url_parts on a long hyphenated URL.
# NOTE(review): the recorded result is a boolean, whereas the get_url_parts in
# `function_js` returns a count -- presumably utils.js redefines it; confirm.
await page.evaluate('get_url_parts("https://nytimes.com/hello-there-there-there-there-there")')

True

In [864]:
# Show the JavaScript source that the module uses to find link containers.
print(bb.get_link_divs_js)


    var as = document.querySelectorAll('a')
    as = Array.from(as)
            .filter(function(a) { return a.href !== ''}).filter(function(a){return a.href !== undefined; })
            .map(function(a) {return {'node': a, 'href': a.href, 'is_long': get_url_parts(a.href) }} )
    
    var a_counts = {}
    as.forEach(function(a, i){
        a_counts[a.href] = a_counts[a.href] || []
        a_counts[a.href].push(i)
    })
    
    var a_top_nodes = as.map(function(a, i){ 
        return get_highest_singular_parent(i, as)
    })



In [867]:
# Run the module's link-container script in the page.
await page.evaluate(bb.get_link_divs_js)

In [870]:
# Run the module's spot-check script in the page.
# NOTE(review): its content is not shown here; presumably it highlights the
# detected containers -- confirm.
await page.evaluate(bb.js_to_spotcheck)

In [898]:
# Collect bounding-box info with the module's version of the function, which takes
# the page as an argument. This reuses the name `t`, which held a URL string earlier.
t = await bb.get_bounding_box_info(page)