# Find *stories* in the New York Times data

In [None]:
import itertools
from lsh import minhash  # https://github.com/mattilyra/lsh
import datetime
from api_client import APIclient
import pandas as pd
import community
import networkx as nx
import numpy as np
from collections import defaultdict
import pickle

In [None]:
# Width, in characters, of the n-grams used by both the hasher and generate_shingles.
char_ngram = 4
# Number of bands each fingerprint is split into when hashing to buckets.
bands = 20
# Passed to MinHasher as its `seeds` argument.
seeds = 100
# Inclusive Jaccard-similarity bounds a pair must fall within to become a graph edge.
jaccard_min = 0.7
jaccard_max = 0.95
# Years are iterated with range(start_year, end_year), so end_year itself is excluded.
start_year = 1999
end_year = 2020
# Client used for every aggregate query in this notebook.
api_client = APIclient()

# Shared hasher used to fingerprint each document's text.
hasher = minhash.MinHasher(seeds=seeds, char_ngram=char_ngram, hashbytes=4)

def generate_shingles(text, ngram=None):
    """Return the set of all contiguous ``ngram``-length slices of ``text``.

    Args:
        text: A sliceable sequence (``str`` or ``bytes``).
        ngram: Shingle width. Defaults to the module-level ``char_ngram``.

    Returns:
        A set of slices. It is empty when ``text`` is shorter than ``ngram``.
    """
    if ngram is None:
        ngram = char_ngram
    # The last valid start index is len(text) - ngram, hence the "+ 1";
    # without it the final shingle is silently dropped.
    return {text[head:head + ngram] for head in range(len(text) - ngram + 1)}

def jaccard(set_a, set_b):
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two sets.

    Two empty sets carry no evidence of similarity, so the result is 0.0
    in that case rather than a ``ZeroDivisionError``.
    """
    union = set_a | set_b
    if not union:
        return 0.0
    return len(set_a & set_b) / len(union)

def clean_text(df):
    """Combine headline and snippet into UTF-8 bytes, indexed by ``_id``.

    The ``text`` column is written onto the frame that was passed in; the
    returned frame contains only that column, with ``_id`` as its index.
    """
    headline = df['headline'].astype(str)
    snippet = df['snippet'].astype(str)
    joined = headline + ' ' + snippet
    df['text'] = joined.apply(lambda value: value.encode('utf8'))
    return df[['_id', 'text']].set_index('_id')

# Get data from Mongo, generate fingerprint

In [None]:
# For each year: pull headline + snippet, build the text, fingerprint it and
# store the fingerprints (indexed by _id) in fingerprint/<year>.pkl.
for year in range(start_year, end_year):
    # Half-open window [Jan 1 of year, Jan 1 of year + 1). An upper bound of
    # datetime(year, 12, 31) is midnight at the *start* of Dec 31 and would
    # miss everything published later that day.
    res = api_client.aggregate(
            [
                {'$match': {'pub_date': {'$gte': datetime.datetime(year, 1, 1),
                                         '$lt': datetime.datetime(year + 1, 1, 1)}}},
                {
                    '$project':
                        {
                            'headline': '$headline.main',
                            'snippet': '$snippet',
                        }
                }
            ]
    )
    # Rows with a missing value in any projected field are discarded.
    df = pd.DataFrame(list(res)).dropna()
    df = clean_text(df)

    df['fingerprint'] = df['text'].apply(lambda t: hasher.fingerprint(t))
    df['fingerprint'].to_pickle('fingerprint/{}.pkl'.format(year))

#  Hash to buckets

In [None]:
# One bucket table per band: bins[band][bucket_id] -> set of document ids.
bins = [defaultdict(set) for _band in range(bands)]

def bins_gen(fingerprint):
    """Yield ``(band_index, band_values)`` for each of the ``bands`` slices of ``fingerprint``."""
    band_slices = np.array_split(fingerprint, bands)
    for band_index, band_values in enumerate(band_slices):
        yield band_index, band_values

def add_fingerprint(fingerprint, doc_id):
    """Register ``doc_id`` in the bucket that each band of ``fingerprint`` hashes to.

    Writes into the module-level ``bins`` tables.
    """
    for band_index, band_values in bins_gen(fingerprint):
        band_table = bins[band_index]
        # The band's values, as a tuple, are hashed to obtain the bucket key.
        band_table[hash(tuple(band_values))].add(doc_id)

In [None]:
# Bucket every year's fingerprints separately and store the tables in bins/<year>.pkl.
for year in range(start_year, end_year):
    # Start each year from empty tables so that leftovers from an earlier or
    # interrupted run can never leak into this year's pickle.
    bins = [defaultdict(set) for _ in range(bands)]

    df = pd.read_pickle('fingerprint/{}.pkl'.format(year))
    # Walk ids and fingerprints in lockstep instead of indexing row by row.
    for doc_id, fingerprint in zip(df.index.values, df.values):
        add_fingerprint(fingerprint, doc_id=doc_id)
    with open('bins/{}.pkl'.format(year), 'wb') as f:
        pickle.dump(bins, f)

    # Release this year's data before the next year is loaded.
    del df
    bins = [defaultdict(set) for _ in range(bands)]

# Find candidate pairs

In [None]:
# For each band, merge that band's buckets from every year into bins/bin_<band>.pkl.
for bin_i in range(bands):
    b = defaultdict(set)
    for year in range(start_year, end_year):
        with open('bins/{}.pkl'.format(year), 'rb') as f:
            # Uses its own name: rebinding and deleting the module-level `bins`
            # here would break any later call to add_fingerprint.
            year_band = pickle.load(f)[bin_i]
        for bucket_id, doc_ids in year_band.items():
            b[bucket_id].update(doc_ids)
        del year_band
    print(bin_i)
    with open('bins/bin_{}.pkl'.format(bin_i), 'wb') as f:
        pickle.dump(b, f)

In [None]:
# Every two documents sharing a bucket in at least one band become a candidate pair.
candidate_pairs = set()
for bin_i in range(bands):
    with open('bins/bin_{}.pkl'.format(bin_i), 'rb') as f:
        b = pickle.load(f)
    for doc_ids in b.values():
        if len(doc_ids) > 1:
            # Set iteration order is not guaranteed to agree between buckets,
            # so the same two documents could show up as both (a, b) and
            # (b, a). Ordering the members first makes each pair canonical.
            ordered_ids = sorted(doc_ids, key=str)
            candidate_pairs.update(itertools.combinations(ordered_ids, r=2))
    del b
    print(bin_i)
with open('candidate_pairs.pkl', 'wb') as f:
    pickle.dump(candidate_pairs, f)

# Compute Jaccard sim. Find communities

In [None]:
def get_text_by_id(id):
    """Fetch the document whose ``_id`` equals ``id`` and return it through ``clean_text``."""
    pipeline = [
        {'$match': {'_id': id}},
        {'$project': {'headline': '$headline.main', 'snippet': '$snippet'}},
    ]
    matches = list(api_client.aggregate(pipeline))
    return clean_text(pd.DataFrame(matches))

In [None]:
with open('candidate_pairs.pkl', 'rb') as f:
    candidate_pairs = pickle.load(f)

# Link two documents when their exact shingle-set Jaccard similarity lies
# within [jaccard_min, jaccard_max].
G = nx.Graph()
for docid_a, docid_b in candidate_pairs:
    shingles_a = generate_shingles(get_text_by_id(docid_a).iloc[0]['text'])
    shingles_b = generate_shingles(get_text_by_id(docid_b).iloc[0]['text'])
    jaccard_sim = jaccard(shingles_a, shingles_b)
    if jaccard_min <= jaccard_sim <= jaccard_max:
        G.add_edge(docid_a, docid_b, weight=jaccard_sim)

print('{} of actual pairs were found'.format(nx.number_of_edges(G)))

partition = community.best_partition(G)
# Group nodes by community id in a single pass rather than rescanning every
# node once per community.
members_by_community = defaultdict(list)
for node, community_id in partition.items():
    members_by_community[community_id].append(node)
comm = [members_by_community[community_id] for community_id in sorted(members_by_community)]

# Get Stories' full data and visualize

In [None]:
stories = []
for member_ids in comm:
    # Fetch date, headline and URL for every member, oldest first.
    pipeline = [
        {'$match': {'_id': {'$in': member_ids}}},
        {
            '$project':
                {
                    'pub_date': '$pub_date',
                    'headline': '$headline.main',
                    'url': '$web_url',
                }
        },
        {'$sort': {'pub_date': 1}},
    ]
    story = pd.DataFrame(list(api_client.aggregate(pipeline)))

    # Keep the community only when the whole-day part of the gap between its
    # first and last publication dates is greater than 1.
    first_date = story.iloc[0]['pub_date']
    last_date = story.iloc[-1]['pub_date']
    if (last_date - first_date).days <= 1:
        continue
    stories.append(story)

In [None]:
from bokeh.models import ColumnDataSource, TapTool, OpenURL
from bokeh.layouts import gridplot
from bokeh.plotting import figure, save, output_file

In [None]:
# Number of story figures written to a single HTML page.
stories_in_page = 40
# Running page counter; save_stories uses it in the file name and increments it.
files_count = 0

print('{} stories were found'.format(len(stories)))


def save_stories(grid):
    """Write ``grid`` to the next ``stories_<n>.html`` page and advance the page counter."""
    global files_count
    page_name = 'stories_{}.html'.format(files_count)
    output_file(page_name)
    files_count = files_count + 1
    layout = gridplot(grid)
    save(layout)


TOOLTIPS = """
    <div id="Tooltip">
        <div>
            <span style="font-size: 16px; font-weight: bold;">@headline</span>
        </div>
        <div>
            <span style="font-size: 10px;">Click to the article!</span>
        </div>
    </div>
"""


def _story_figure(story):
    """Build the timeline figure for one story.

    A constant ``y`` column is added to ``story`` so that all of its articles
    are drawn on one horizontal line; tapping a marker opens the row's ``url``.
    """
    story['y'] = 1
    fig = figure(plot_height=100, plot_width=1200, title=story.iloc[0]['headline'],
                 x_axis_type='datetime', tools='tap', tooltips=TOOLTIPS)
    for axis_grid in (fig.xgrid, fig.ygrid):
        axis_grid.grid_line_color = None
    fig.yaxis.visible = False
    fig.square('pub_date', 'y', size=10, source=ColumnDataSource(story), fill_alpha=0.5)
    fig.select(type=TapTool).callback = OpenURL(url='@url')
    return fig


grid = []
for i, story in enumerate(stories):
    # Flush the page collected so far each time another full page has been built.
    page_is_full = i > 0 and i % stories_in_page == 0
    if page_is_full:
        save_stories(grid)
        grid = []
    grid.append([_story_figure(story)])

# Write whatever is left on the final page.
save_stories(grid)