In [1]:
import re

import spacy
import requests
import re
import IPython
from daterangeparser import parse

In [2]:
response = requests.get('https://raw.githubusercontent.com/qualicen/timeline/master/history_of_germany.txt')
text = response.text
print('Loaded {} lines'.format(text.count('\n')))

Loaded 744 lines


In [3]:
nlp = spacy.load("en_core_web_sm")

def dep_subtree(token, dep):
    deps = [child.dep_ for child in token.children]
    child = next(filter(lambda c: c.dep_ == dep, token.children), None)
    if child != None:
        return ' '.join([c.text for c in child.subtree])
    else:
        return ''


# to remove citations, e.g. "[91]" as this makes problems with spaCy

p = re.compile(r'\[\d+\]')


def extract_events_spacy(line):
    line = p.sub('', line)
    events = []
    doc = nlp(line)
    for ent in filter(lambda e: e.label_ == 'DATE', doc.ents):
        try:
            (start, end) = parse(ent.text)
        except:

      # could not parse the dates, hence ignore it

            continue
        current = ent.root
        while current.dep_ != 'ROOT':
            current = current.head
        desc = ' '.join(filter(None, [
            dep_subtree(current, 'nsubj'),
            dep_subtree(current, 'nsubjpass'),
            dep_subtree(current, 'auxpass'),
            dep_subtree(current, 'amod'),
            dep_subtree(current, 'det'),
            current.text,
            dep_subtree(current, 'acl'),
            dep_subtree(current, 'dobj'),
            dep_subtree(current, 'attr'),
            dep_subtree(current, 'advmod'),
            ]))
        events = events + [(start, ent.text, desc)]
    return events

In [11]:
extract_events_spacy("The Protestant Reformation was the first successful challenge to the Catholic Church and began in 1521 as Luther was outlawed at the Diet of Worms after his refusal to repent. ")

[(datetime.datetime(1521, 1, 1, 0, 0),
  '1521',
  'The Protestant Reformation was the first successful challenge to the Catholic Church')]

In [4]:
timeline = extract_events_spacy("McDonald's Corporation is an American fast food company, founded in 1940 as a restaurant operated by Richard and Maurice McDonald, in San Bernardino, California, United States. They rechristened their business as a hamburger stand, and later turned the company into a franchise, with the Golden Arches logo being introduced in 1953 at a location in Phoenix, Arizona. In 1955, Ray Kroc, a businessman, joined the company as a franchise agent and proceeded to purchase the chain from the McDonald brothers. McDonald's had its original headquarters in Oak Brook, Illinois, but moved its global headquarters to Chicago in June 2018.[6][7][8][9] McDonald's is the world's largest restaurant chain by revenue,[10] serving over 69 million customers daily in over 100 countries[11] across 37,855 outlets as of 2018.[12][13] Although McDonald's is best known for its hamburgers, cheeseburgers and french fries, they feature chicken products, breakfast items, soft drinks, milkshakes, wraps, and desserts. In response to changing consumer tastes and a negative backlash because of the unhealthiness of their food,[14] the company has added to its menu salads, fish, smoothies, and fruit. The McDonald's Corporation revenues come from the rent, royalties, and fees paid by the franchisees, as well as sales in company-operated restaurants. According to two reports published in 2018, McDonald's is the world's second-largest private employer with 1.7 million employees (behind Walmart with 2.3 million employees).")

In [9]:
import datetime
import numpy as np

a = np.array([(datetime.datetime.utcnow() - date[0]).days for date in timeline])
dict(zip(a, timeline))

{29537: (datetime.datetime(1940, 1, 1, 0, 0),
  '1940',
  "McDonald 's Corporation is an American fast food company , founded in 1940 as a restaurant operated by Richard and Maurice McDonald , in San Bernardino , California , United States"),
 24788: (datetime.datetime(1953, 1, 1, 0, 0),
  '1953',
  'They rechristened their business'),
 24058: (datetime.datetime(1955, 1, 1, 0, 0),
  '1955',
  'Ray Kroc , a businessman , joined the company'),
 896: (datetime.datetime(2018, 6, 1, 0, 0),
  'June 2018',
  "McDonald 's had its original headquarters in Oak Brook , Illinois"),
 1047: (datetime.datetime(2018, 1, 1, 0, 0),
  '2018',
  "McDonald 's is the world 's second - largest private employer with 1.7 million employees ( behind Walmart")}

In [29]:
import numpy as np

def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

subarrays = rolling_window(a, 3)
stds = list(map(np.std, subarrays))
list(zip(subarrays, stds))

[(array([29537, 24788, 24058]), 2429.113738702977),
 (array([24788, 24058,   896]), 11094.737531320372),
 (array([24058,   896,  1047]), 10883.255058421937),
 (array([ 896, 1047, 1047]), 71.1820826394458)]

In [11]:
import numpy as np

def std(arrays):
    total = 0
    for arr in arrays:
        total+=np.std(np.array(arr))
    return (total)
def splitter(arr):
    for i in range(1, len(arr)):
        start = arr[0:i]
        end = arr[i:]
        if(len(start) < 2 or len(end) < 2):
            continue
        yield (start, end)
        for split in splitter(end):
            result = [start]
            result.extend(split)
            yield result
            
minimum = 999999999
best_split = None
for split in splitter(a):
    print(split)
    std_ = std(split)
    if(std_ < minimum):
        minimum = std_
        best_split = split
        
print("best_split")
print(best_split)
print("minimum")
print(minimum)

(array([29537, 24788]), array([24058,   896,  1047,  1047]))
[array([29537, 24788]), array([24058,   896]), array([1047, 1047])]
(array([29537, 24788, 24058]), array([ 896, 1047, 1047]))
(array([29537, 24788, 24058,   896]), array([1047, 1047]))
best_split
[29537 24788 24058   896  1047  1047]
minimum
0.0


In [13]:
std_

0.0