In [112]:
import pandas as pd
import numpy as np
import operator
from scipy.spatial.distance import euclidean

In [113]:
def remove_small_probabilities(df, threashold = 0.00055):
    return df[df['probability']>threashold]

def assign_square(df, square_size):
    df['square_lat'] = (df['lat']/square_size).astype(int)*square_size
    df['square_lng'] = (df['lng']/square_size).astype(int)*square_size
    return df

def calc_square_representative(df):
    representative = {}
    grouped = df.groupby(['square_lat', 'square_lng'])
    for g in grouped.groups: representative[g]=np.mean(grouped.get_group(g)['probability'])
    return representative

In [114]:
spreading_df = pd.read_csv('data/spreading/spreading.csv')
start = spreading_df.loc[0]
spreading_df.drop(0, axis=0, inplace=True)
spreading_df = remove_small_probabilities(spreading_df)
#normalize lat and lng
lat_dom = min(spreading_df['lat'])
lng_dom = min(spreading_df['lng'])
spreading_df['lat'] = spreading_df['lat']-min(spreading_df['lat'])
spreading_df['lng'] = spreading_df['lng']-min(spreading_df['lng'])
#maping years from 10 to 5
spreading_df = spreading_df[spreading_df['year']<10]
spreading_df['year'] = ((spreading_df['year']+2)/2).astype(int)

In [115]:
#calculate for each data point its square, and probability of its square 
SQUARE_SIZE= 5
spreading_df = assign_square(spreading_df, SQUARE_SIZE)
square_prob= calc_square_representative(spreading_df)
def map_prob_sqr(sqr_lat, sqr_lng):
    return square_prob[(sqr_lat, sqr_lng)]
spreading_df['square_prob'] = np.vectorize(map_prob_sqr)(spreading_df['square_lat'], spreading_df['square_lng'])

In [116]:
def get_closest_start(point, level_points):
    min_dist = 100000
    for node in level_points:
        if euclidean(np.array([node[0], node[1]]), np.array(point)) <min_dist:
            min_node = node
            min_dist = euclidean(np.array([node[0], node[1]]), np.array(point))
    return min_node

In [133]:
CHILD_THRESHOLD = 0.001
grouped_years = spreading_df.groupby(['year', 'square_lat', 'square_lng'])
tree = {(start['lat'], start['lng'],0): []}
levels =[[]]*6
levels[0] = [(start['lat'], start['lng'], 0)]
spreading_df = pd.DataFrame(grouped_years.sum())
for g in grouped_years.groups:
    keys = list(tree.keys())
    flag = False
    for k in keys:
        if g[1]+lat_dom+int(SQUARE_SIZE/2)== k[0] and g[2]+lng_dom+int(SQUARE_SIZE/2)==k[1]: 
            flag=True
            break
    if flag==True:
        continue
    if grouped_years.get_group(g)['probability'].any() >=CHILD_THRESHOLD: 
        g = list(g)
        g[1]+=lat_dom+int(SQUARE_SIZE/2)
        g[2]+=lng_dom+int(SQUARE_SIZE/2)
        parent = get_closest_start((g[1],g[2]),levels[g[0]-1])
        levels[g[0]].append((g[1], g[2], g[0]))
        tree[parent].append((g[1],g[2],g[0]))
        tree[(g[1],g[2], g[0])] = []   

working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working
working


In [134]:
nodes_df = pd.DataFrame({'node_lng': [i[1] for i in tree.keys()], 'node_lat': [i[0] for i in tree.keys()], 'depth': [i[2] for i in tree.keys()]})
nodes_df.to_csv('data/spreading/spreading_tree_nodes.csv')
edges_df=pd.DataFrame(columns = ['parent_lat', 'parent_lng', 'child_lat', 'child_lng', 'depth'])
for p in tree.keys():
    edges_df = edges_df.append(pd.DataFrame({'parent_lat': p[0], 'parent_lng':p[1], 'depth': [i[2] for i in tree[p]],\
                                    'child_lat': [i[0] for i in tree[p]], 'child_lng': [i[1] for i in tree[p]]}))
edges_df.to_csv('data/spreading/spreading_tree_edges.csv')

In [135]:
tree

{(-40, 21, 1): [],
 (-40, 31, 1): [(-40, 36, 3)],
 (-40, 36, 3): [],
 (-40, 46, 3): [],
 (-40, 51, 4): [],
 (-40, 141, 5): [],
 (-40, 146, 4): [(-35, 131, 4), (-30, 341, 4), (-40, 141, 5)],
 (-35, 26, 1): [],
 (-35, 31, 1): [],
 (-35, 36, 1): [(-35, 41, 2)],
 (-35, 41, 2): [(-35, 46, 2)],
 (-35, 46, 2): [(-35, 51, 2), (-40, 46, 3)],
 (-35, 51, 2): [(-35, 61, 2), (-30, 51, 2), (-35, 56, 3), (-40, 51, 4)],
 (-35, 56, 3): [],
 (-35, 61, 2): [(-30, 61, 2), (-35, 66, 3)],
 (-35, 66, 3): [],
 (-35, 71, 3): [],
 (-35, 76, 3): [(-35, 81, 4)],
 (-35, 81, 4): [],
 (-35, 86, 4): [],
 (-35, 91, 4): [],
 (-35, 131, 4): [(-35, 136, 4)],
 (-35, 136, 4): [],
 (-35, 331, 5): [(-35, 336, 5), (-30, 326, 5), (-30, 331, 5)],
 (-35, 336, 5): [],
 (-35, 341, 5): [],
 (-35, 346, 5): [(-35, 351, 5)],
 (-35, 351, 5): [(-35, 356, 5)],
 (-35, 356, 5): [(-30, 356, 5)],
 (-30, 31, 1): [],
 (-30, 36, 1): [],
 (-30, 41, 1): [(-30, 46, 2)],
 (-30, 46, 2): [],
 (-30, 51, 2): [(-30, 56, 2), (-25, 51, 2)],
 (-30, 56, 2):

In [136]:
edges_df

Unnamed: 0,child_lat,child_lng,depth,parent_lat,parent_lng
0,-40,21,1,-11.0,43.0
1,-40,31,1,-11.0,43.0
2,-35,26,1,-11.0,43.0
3,-35,31,1,-11.0,43.0
4,-35,36,1,-11.0,43.0
5,-30,31,1,-11.0,43.0
6,-30,36,1,-11.0,43.0
7,-30,41,1,-11.0,43.0
8,-25,31,1,-11.0,43.0
9,-25,36,1,-11.0,43.0


In [137]:
nodes_df

Unnamed: 0,depth,node_lat,node_lng
0,0,-11.0,43.0
1,1,-40.0,21.0
2,1,-40.0,31.0
3,1,-35.0,26.0
4,1,-35.0,31.0
5,1,-35.0,36.0
6,1,-30.0,31.0
7,1,-30.0,36.0
8,1,-30.0,41.0
9,1,-25.0,31.0
