### Detect countdown timers using heuristics based on segment updates

In [1]:
import os
import pickle
import re
import json
import sqlite3
import numpy as np
import pandas as pd
from os.path import dirname, abspath, join, isfile
from collections import Counter, OrderedDict

# Notebook display settings: show up to 200 characters of cell text and
# turn off MathJax rendering in HTML output.
pd.options.display.max_colwidth = 200
pd.options.display.html.use_mathjax = False

# Thresholds used by the countdown-timer heuristics in this notebook.
# Decreasing updates must outnumber increasing ones by more than this ratio.
TIMER_MIN_NEG_POS_UPDATE_RATIO = 5
# Minimum number of decreasing updates a timer candidate must show.
TIMER_MIN_NO_OF_NEG_UPDATES = 5



In [2]:
# Selects every segment that is not a BODY node and whose text contains at
# least one digit, joined with the URL of the visit it belongs to.
SEGMENTS_QUERY = '''
    SELECT sv.site_url, sv.visit_id, se.node_id,
        se.top, se.left, se.width, se.height, se.inner_text, se.time_stamp
        FROM SEGMENTS as se LEFT JOIN site_visits as sv ON se.visit_id = sv.visit_id
        WHERE se.node_name != 'BODY' and se.inner_text GLOB '*[0-9]*';
    '''

def load_segments(crawler_name, check_cache=True, dump_pickle=True):
    """Load the digit-containing segments recorded by one crawler.

    The result is read from ``<crawler_name>_segments.pickle`` in the current
    directory when that file exists and ``check_cache`` is true. Otherwise
    ``SEGMENTS_QUERY`` is run against ``data/final-crawl/<crawler_name>.sqlite``
    located two directory levels above the current working directory.

    Args:
        crawler_name: either "odin" or "webtap".
        check_cache: reuse an existing pickle instead of querying the database.
        dump_pickle: after querying the database, save the result to the
            pickle file.

    Returns:
        A pandas DataFrame: the query result, or the cached copy of it.
    """
    assert crawler_name in ["odin", "webtap"]
    pickle_file = "%s_segments.pickle" % crawler_name

    # load already pickled segments
    if check_cache and isfile(pickle_file):
        print("Will load segments from pickle %s" % pickle_file)
        # NOTE: unpickling can execute arbitrary code; only load pickles that
        # were produced locally by this notebook.
        return pd.read_pickle(pickle_file)

    final_crawl_dir = join(dirname(dirname(os.getcwd())), 'data', 'final-crawl')
    db_path = join(final_crawl_dir, "%s.sqlite" % crawler_name)
    con = sqlite3.connect(db_path)
    try:
        segments = pd.read_sql_query(SEGMENTS_QUERY, con)
    finally:
        # Release the database handle even when the query fails.
        con.close()
    if dump_pickle:
        segments.to_pickle(pickle_file)
    return segments

def preprocess_segments(segments):
    """Add the derived columns used by the timer analysis, in place.

    ``time_stamp`` is parsed with ``pd.to_datetime``; ``inner_processed`` is
    ``inner_text`` with every run of digits replaced by ``DPNUM``;
    ``inner_digits`` is ``inner_text`` with everything except digits removed.
    Nothing is returned.
    """
    def mask_numbers(text):
        # every run of digits becomes the DPNUM placeholder
        return re.sub(r'\d+', 'DPNUM', text)

    def keep_digits(text):
        # strip every character that is not a digit
        return re.sub(r'\D+', '', text)

    segments['time_stamp'] = pd.to_datetime(segments['time_stamp'])
    raw_text = segments['inner_text']
    segments['inner_processed'] = raw_text.map(mask_numbers)
    segments['inner_digits'] = raw_text.map(keep_digits)

In [3]:
def join_with_comma(series):
    """Return the strings in ``series`` joined by commas.

    An empty input gives an empty string.
    """
    # str.join is used instead of reduce(): reduce is not a builtin on
    # Python 3 (and the notebook's import cell does not import it), and
    # joining avoids building the result by repeated concatenation.
    return ",".join(series)

def differences(series):
    """Compute the change between each pair of consecutive elements.

    Elements are converted with ``int`` before subtracting, so the result has
    one entry fewer than ``series`` and is empty for fewer than two elements.
    """
    consecutive = zip(series[:-1], series[1:])
    return [int(after) - int(before) for before, after in consecutive]

def time_differences(series):
    """Compute the gap, in seconds, between each pair of consecutive elements.

    Each gap is ``(later - earlier).total_seconds()``; the result has one
    entry fewer than ``series``.
    """
    consecutive = zip(series[:-1], series[1:])
    return [(later - earlier).total_seconds() for earlier, later in consecutive]

def most_common(diffs):
    """Return the value that occurs most often in ``diffs`` (None if empty)."""
    if not diffs:
        return None
    [(value, _count)] = Counter(diffs).most_common(1)
    return value

def most_common_neg(diffs):
    """Return the most frequent value below zero in ``diffs``.

    None is returned when ``diffs`` is empty or holds no negative value.
    """
    if not diffs:
        return None
    negative_counts = Counter(x for x in diffs if x < 0)
    if not negative_counts:
        return None
    [(value, _count)] = negative_counts.most_common(1)
    return value

def num_most_common_neg(diffs):
    """Count how often the most frequent negative value appears in ``diffs``.

    Returns 0 when ``most_common_neg`` finds no negative value.
    """
    dominant_negative = most_common_neg(diffs)
    return diffs.count(dominant_negative) if dominant_negative else 0


def is_decreasing(series):
    """Heuristic to determine whether a series is decreasing.

    We expect at least TIMER_MIN_NO_OF_NEG_UPDATES decreasing updates, and the
    number of decreasing updates must be more than
    TIMER_MIN_NEG_POS_UPDATE_RATIO times the number of increasing ones.
    A series with no increasing update only has to meet the first condition.

    Returns:
        True when the series looks like a countdown, False otherwise
        (including when it has fewer than two elements).
    """
    diffs = differences(series)
    if not diffs:
        return False
    # NOTE(review): an earlier version of this count also tested
    # ``diff not in [59, 5, 9]`` under the comment "10->09, 00->59". A negative
    # diff can never equal one of those positive values, so the test had no
    # effect and has been dropped. If roll-overs such as 00->59 were meant to
    # be discounted, that would belong on the positive count -- TODO confirm.
    n_negs = sum(1 for diff in diffs if diff < 0)
    n_pos = sum(1 for diff in diffs if diff > 0)
    if n_negs < TIMER_MIN_NO_OF_NEG_UPDATES:
        return False  # too few decreasing updates
    if not n_pos:
        return True
    return float(n_negs) / n_pos > TIMER_MIN_NEG_POS_UPDATE_RATIO


def is_decreasing_mode(series):
    """Heuristic to determine whether a series is decreasing using mode.

    A series is rejected when it has
    - fewer than two elements,
    - fewer than 5 distinct values, or
    - fewer than TIMER_MIN_NO_OF_NEG_UPDATES decreasing updates.

    A series that passes those checks and has no increasing update at all is
    accepted. Otherwise all of the following must hold:
    - the most common difference is not positive,
    - the most common negative difference occurs at least 5 times,
    - there are more decreasing updates than increasing ones.
    """
    diffs = differences(series)
    if not diffs:
        return False
    # too few distinct values
    if len(set(series)) < 5:
        return False
    # NOTE(review): an earlier version of this count also tested
    # ``diff not in [59, 5, 9]`` under the comment "10->09, 00->59". A negative
    # diff can never equal one of those positive values, so the test had no
    # effect and has been dropped -- TODO confirm the intended roll-over rule.
    n_negs = sum(1 for diff in diffs if diff < 0)
    n_pos = sum(1 for diff in diffs if diff > 0)
    if n_negs < TIMER_MIN_NO_OF_NEG_UPDATES:
        return False  # too few decreasing updates
    if not n_pos:
        return True
    # a positive mode means increases dominate (a mode of 0 is let through)
    if most_common(diffs) > 0:
        return False
    # the most common negative difference must occur at least 5 times
    if num_most_common_neg(diffs) < 5:
        return False
    # decreasing updates must outnumber increasing ones
    return n_negs > n_pos


def num_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct_values = set(series)
    return len(distinct_values)


def ts_check(series):
    """Tell whether the values in ``series`` cover at least 5 distinct integers.

    Each value is truncated with ``int`` first; for timestamps expressed in
    seconds this means updates were seen in 5 or more different seconds.
    """
    # NOTE(review): presumably the values are numeric timestamps in seconds --
    # confirm against what the caller passes in.
    whole_seconds = {int(ts) for ts in series}
    return len(whole_seconds) >= 5


## Grouping

In [4]:
def detect_timers(segments, limit=None):
    """Group segments and flag the groups that behave like countdown timers.

    Rows are grouped by visit, position (``top``, ``left``) and digit-masked
    text. Each group is aggregated with the heuristics defined in this
    notebook, and the two-level column names are flattened by joining the
    levels with ``_``.

    Args:
        segments: preprocessed segments DataFrame.
        limit: when truthy, only the first ``limit`` rows are analysed.

    Returns:
        ``(timers, segments_grouped)``: the groups that pass both decreasing
        heuristics and the timestamp check, and all aggregated groups.
    """
    subset = segments.head(limit) if limit else segments
    group_keys = ['visit_id', 'top', 'left', 'inner_processed']
    aggregations = {'node_id': num_unique,
                    'time_stamp': ts_check,
                    'inner_digits': [is_decreasing, is_decreasing_mode], 'site_url': 'first'}
    segments_grouped = subset.groupby(group_keys, as_index=False).agg(aggregations)
    segments_grouped.columns = segments_grouped.columns.map('_'.join)
    is_timer = (segments_grouped.inner_digits_is_decreasing &
                segments_grouped.inner_digits_is_decreasing_mode &
                segments_grouped.time_stamp_ts_check)
    return segments_grouped[is_timer], segments_grouped

In [5]:
def dump_timer_urls(timers, crawler_name):
    """Write the distinct ``site_url_first`` values of ``timers`` to
    ``<crawler_name>_timer_urls.csv`` (tab-separated, without the index)."""
    unique_urls = pd.Series(timers.site_url_first.unique())
    out_path = "%s_timer_urls.csv" % crawler_name
    unique_urls.to_csv(out_path, sep='\t', index=False)
    

def get_timers(crawler_name, disable_cache=False):
    """Return ``(timers, grouped_segments)`` for one crawler.

    When ``<crawler_name>_grouped_segments.pickle`` exists and
    ``disable_cache`` is false, the grouped segments are read from it and the
    timer filter is re-applied. Otherwise the segments are loaded,
    preprocessed and analysed from scratch, the timer URLs are dumped to CSV
    and the grouped segments are pickled for the next run.
    """
    pickle_path = "%s_grouped_segments.pickle" % crawler_name
    use_cache = isfile(pickle_path) and not disable_cache

    if not use_cache:
        segments = load_segments(crawler_name)
        preprocess_segments(segments)
        timers, grouped_segments = detect_timers(segments)
        dump_timer_urls(timers, crawler_name)
        grouped_segments.to_pickle(pickle_path)
        return timers, grouped_segments

    print("Will load grouped segments from pickle %s" % pickle_path)
    grouped_segments = pd.read_pickle(pickle_path)
    # same filter as the fresh-detection path: both heuristics plus ts_check
    is_timer = (grouped_segments.inner_digits_is_decreasing &
                grouped_segments.inner_digits_is_decreasing_mode &
                grouped_segments.time_stamp_ts_check)
    return grouped_segments[is_timer], grouped_segments


## Run timer detection

In [6]:
# Detect timers separately for each crawler, then pool the timers and the
# grouped segments of both crawls.
webtap_timers, webtap_segments_grouped = get_timers("webtap")
odin_timers, odin_segments_grouped = get_timers("odin")
all_timers = pd.concat([webtap_timers, odin_timers])
all_segments_grouped = pd.concat([webtap_segments_grouped, odin_segments_grouped])

Will load grouped segments from pickle webtap_grouped_segments.pickle
Will load grouped segments from pickle odin_grouped_segments.pickle


In [7]:
# Number of distinct site URLs with at least one detected timer.
all_timers.site_url_first.nunique()

1618

In [9]:
# Write one tab-separated row (visit id, top, left, site URL) per detected
# timer, ordered by site URL, with no header row and no index column.
all_timers.sort_values(
    'site_url_first')[['visit_id_', 'top_', 'left_', 'site_url_first']].to_csv(
    "timer_coords.csv", sep='\t', index=False, header=False)

### Export timer URLs to CSV files for verification

In [10]:
# Sorted list of every distinct URL on which a timer was detected,
# written out one URL per row.
all_urls = sorted(all_timers.site_url_first.unique())
pd.Series(all_urls).to_csv("timer_urls.csv", index=False)

In [11]:
# Split the URL list into URL_CSV_CNT nearly equal parts and write each part
# to its own tab-separated file, numbered from 1 (timer_urls_1.csv, ...).
URL_CSV_CNT=8
for num, urls in enumerate(np.array_split(all_urls, URL_CSV_CNT)):
    pd.Series(urls).to_csv("timer_urls_%d.csv" % (num+1), sep='\t', index=False)

In [12]:
# Line counts of the exported URL files. The glob matches every file named
# timer_urls*.csv in the directory, including ones no cell shown here writes.
! wc -l  timer_urls*.csv

  1618 timer_urls.csv
   203 timer_urls_1.csv
   203 timer_urls_2.csv
   202 timer_urls_3.csv
   202 timer_urls_4.csv
   202 timer_urls_5.csv
   202 timer_urls_6.csv
   202 timer_urls_7.csv
   202 timer_urls_8.csv
  1836 timer_urls_v2.csv
  5072 total


In [16]:
## The effect of different approaches
# Distinct visits flagged by is_decreasing alone, by is_decreasing_mode alone,
# by both heuristics together, and by the final timer filter (all_timers).
# print() calls are used so the cell runs on Python 3 as well as Python 2.
print(all_segments_grouped[all_segments_grouped.inner_digits_is_decreasing].visit_id_.nunique())
print(all_segments_grouped[all_segments_grouped.inner_digits_is_decreasing_mode].visit_id_.nunique())
print(all_segments_grouped[all_segments_grouped.inner_digits_is_decreasing & all_segments_grouped.inner_digits_is_decreasing_mode].visit_id_.nunique())
print(all_timers.visit_id_.nunique())

1599
1651
1593
1589


In [18]:
# Preview the first rows of the detected timers.
# NOTE(review): the recorded output lists an
# inner_digits_is_decreasing_relaxed column that detect_timers does not
# produce -- presumably the cached pickle predates the current code; confirm.
all_timers.head()

Unnamed: 0,visit_id_,top_,left_,inner_processed_,time_stamp_ts_check,node_id_num_unique,site_url_first,inner_digits_is_decreasing,inner_digits_is_decreasing_mode,inner_digits_is_decreasing_relaxed
1827,41,145,290,Your order is reserved for DPNUM:DPNUM minutes!,True,1,https://www.anarchiststate.com/collections/top/products/what-is-truth-winter-jacket,True,True,True
1835,41,621,923,DPNUM\nHours\n\t\nDPNUM\nMinutes\n\t\nDPNUM\nDPNUM\nSeconds,True,1,https://www.anarchiststate.com/collections/top/products/what-is-truth-winter-jacket,True,True,True
6472,133,677,1214,Order in the next \nDPNUM\nhrs\nDPNUM\nmins\nDPNUM\nsec for free Friday delivery,True,1,https://www.printerbase.co.uk/canon-6431b001-pgi-550pgbk-xl-high-yield-black-pigment-ink-cartridge.html,True,True,True
7757,161,517,924,"Order within DPNUM hrs, DPNUM mins and DPNUM secs for delivery on Friday DPNUMth February",True,3,https://www.larizia.com/sale-c319/anya-hindmarch-circulus-mini-vere-silver-metallic-leather-barrel-bag-p90261,True,True,True
10700,207,322,268,Check out now for an extra DPNUM% off!\nUse the discount code: FAST. Expires in DPNUM:DPNUM minutes.,True,1,https://faradayscienceshop.com/collections/van-go-paint-by-number-kits/products/fall-van-go-paint-by-number-kit,True,True,True
