In [341]:
import pandas as pd
import numpy as np
import operator
from scipy.spatial.distance import euclidean

In [342]:
def remove_small_probabilities(df, threashold = 0.00055):
    return df[df['probability']>threashold]

def assign_square(df, square_size):
    df['square_lat'] = (df['lat']/10).astype(int)*10
    df['square_lng'] = (df['lng']/10).astype(int)*10
    return df

def calc_square_representative(df):
    representative = {}
    grouped = df.groupby(['square_lat', 'square_lng'])
    for g in grouped.groups: representative[g]=np.mean(grouped.get_group(g)['probability'])
    return representative

In [343]:
spreading_df = pd.read_csv('spreading.csv')
#if spreading_df['lng'][(spreading_df['lng']>200) & (spreading_df['lng']<300)].any(): print(True)

In [344]:
spreading_df = pd.read_csv('spreading.csv')
start = spreading_df.loc[0]
spreading_df.drop(0, axis=0, inplace=True)
spreading_df = remove_small_probabilities(spreading_df)
#normalize lat and lng
lat_dom = min(spreading_df['lat'])
lng_dom = min(spreading_df['lng'])
spreading_df['lat'] = spreading_df['lat']-min(spreading_df['lat'])
spreading_df['lng'] = spreading_df['lng']-min(spreading_df['lng'])
#maping years from 10 to 5
spreading_df = spreading_df[spreading_df['year']<10]
spreading_df['year'] = ((spreading_df['year']+2)/2).astype(int)-1

In [345]:
#calculate for each data point its square, and probability of its square 
SQUARE_SIZE= 10
spreading_df = assign_square(spreading_df, SQUARE_SIZE)
square_prob= calc_square_representative(spreading_df)
def map_prob_sqr(sqr_lat, sqr_lng):
    return square_prob[(sqr_lat, sqr_lng)]
spreading_df['square_prob'] = np.vectorize(map_prob_sqr)(spreading_df['square_lat'], spreading_df['square_lng'])

In [346]:
def get_closest_start(point, level_points):
    min_dist = 100000
    for node in level_points:
        if euclidean(np.array(node), np.array(point)) <min_dist:
            min_node = node
            min_dist = euclidean(np.array(node), np.array(point))
    return min_node

In [347]:
CHILD_THRESHOLD = 0.001
grouped_years = spreading_df.groupby(['year', 'square_lat', 'square_lng'])
tree = {(start['lat'], start['lng']): []}
levels =[[]]*6
levels[0] = [(start['lat'], start['lng'])]
spreading_df = pd.DataFrame(grouped_years.sum())
for g in grouped_years.groups:
    if (g[1], g[2]) in tree.keys(): continue
    if grouped_years.get_group(g)['probability'].any() >=CHILD_THRESHOLD: 
        g = list(g)
        g[1]+=lat_dom+5
        g[2]+=lng_dom+5
        parent = get_closest_start((g[1],g[2]),levels[g[0]])
        levels[g[0]+1].append((g[1], g[2]))
        #print(g, parent)
        tree[parent].append((g[1],g[2]))
        tree[(g[1],g[2])] = []
        

In [348]:
tree

{(-37, 24): [],
 (-37, 34): [],
 (-37, 44): [],
 (-37, 54): [],
 (-37, 64): [],
 (-37, 74): [],
 (-37, 84): [],
 (-37, 94): [],
 (-37, 134): [],
 (-37, 144): [],
 (-37, 334): [(-27, 324)],
 (-37, 344): [(-37, 354)],
 (-37, 354): [],
 (-27, 34): [],
 (-27, 44): [],
 (-27, 54): [],
 (-27, 64): [],
 (-27, 74): [],
 (-27, 84): [],
 (-27, 94): [],
 (-27, 324): [],
 (-27, 334): [],
 (-27, 344): [(-27, 354)],
 (-27, 354): [],
 (-17, 34): [],
 (-17, 44): [],
 (-17, 54): [],
 (-17, 64): [],
 (-17, 74): [],
 (-17, 84): [],
 (-17, 94): [],
 (-17, 104): [],
 (-11.0, 43.0): [(-37, 24),
  (-37, 34),
  (-27, 34),
  (-27, 44),
  (-27, 94),
  (-17, 34),
  (-17, 44),
  (-17, 54),
  (-17, 64),
  (-17, 74),
  (-17, 84),
  (-17, 94),
  (-17, 104),
  (-7, 34),
  (-7, 44),
  (-7, 54),
  (-7, 64),
  (-7, 74),
  (-7, 84),
  (-7, 94),
  (-7, 104),
  (-7, 114),
  (3, 44),
  (3, 54),
  (3, 64),
  (3, 74),
  (3, 84),
  (3, 94),
  (13, 44),
  (13, 54),
  (13, 64),
  (13, 74),
  (13, 84),
  (13, 94),
  (23, 54),
  (