In [102]:
import pandas as pd
import numpy as np
import operator
from scipy.spatial.distance import euclidean

In [103]:
def remove_small_probabilities(df, threashold = 0.00055):
    return df[df['probability']>threashold]

def assign_square(df, square_size):
    df['square_lat'] = (df['lat']/square_size).astype(int)*square_size
    df['square_lng'] = (df['lng']/square_size).astype(int)*square_size
    return df

def calc_square_representative(df):
    representative = {}
    grouped = df.groupby(['square_lat', 'square_lng'])
    for g in grouped.groups: representative[g]=np.mean(grouped.get_group(g)['probability'])
    return representative

In [104]:
path = 'west_africa'
spreading_df = pd.read_csv('data/spreading/' + path+'.csv')
start = spreading_df.loc[0]
spreading_df.drop(0, axis=0, inplace=True)
spreading_df = remove_small_probabilities(spreading_df)
#normalize lat and lng
lat_dom = min(spreading_df['lat'])
lng_dom = min(spreading_df['lng'])
spreading_df['lat'] = spreading_df['lat']-min(spreading_df['lat'])
spreading_df['lng'] = spreading_df['lng']-min(spreading_df['lng'])
#maping years from 10 to 5
spreading_df = spreading_df[spreading_df['year']<10]
spreading_df['year'] = ((spreading_df['year']+2)/2).astype(int)

In [105]:
#calculate for each data point its square, and probability of its square 
SQUARE_SIZE= 5
spreading_df = assign_square(spreading_df, SQUARE_SIZE)
square_prob= calc_square_representative(spreading_df)
def map_prob_sqr(sqr_lat, sqr_lng):
    return square_prob[(sqr_lat, sqr_lng)]
spreading_df['square_prob'] = np.vectorize(map_prob_sqr)(spreading_df['square_lat'], spreading_df['square_lng'])

In [106]:
def get_closest_start(point, level_points):
    min_dist = 100000
    for node in level_points:
        if euclidean(np.array([node[0], node[1]]), np.array(point)) <min_dist:
            min_node = node
            min_dist = euclidean(np.array([node[0], node[1]]), np.array(point))
    return min_node

In [107]:
CHILD_THRESHOLD = 0.001
grouped_years = spreading_df.groupby(['year', 'square_lat', 'square_lng'])
tree = {(start['lat'], start['lng'],0): []}
levels =[[]]*6
levels[0] = [(start['lat'], start['lng'], 0)]
spreading_df = pd.DataFrame(grouped_years.sum())
for g in grouped_years.groups:
    keys = list(tree.keys())
    flag = False
    for k in keys:
        if g[1]+lat_dom+int(SQUARE_SIZE/2)== k[0] and g[2]+lng_dom+int(SQUARE_SIZE/2)==k[1]: 
            flag=True
            break
    if flag==True:
        continue
    if grouped_years.get_group(g)['probability'].any() >=CHILD_THRESHOLD: 
        g = list(g)
        g[1]+=lat_dom+int(SQUARE_SIZE/2)
        g[2]+=lng_dom+int(SQUARE_SIZE/2)
        parent = get_closest_start((g[1],g[2]),levels[g[0]-1])
        levels[g[0]].append((g[1], g[2], g[0]))
        tree[parent].append((g[1],g[2],g[0]))
        tree[(g[1],g[2], g[0])] = []   

In [108]:
nodes_df = pd.DataFrame({'node_lng': [i[1] for i in tree.keys()], 'node_lat': [i[0] for i in tree.keys()], 'depth': [i[2] for i in tree.keys()]})
nodes_df.to_csv('data/spreading/nodes/'+path+'_nodes.csv')
edges_df=pd.DataFrame(columns = ['parent_lat', 'parent_lng', 'child_lat', 'child_lng', 'depth'])
for p in tree.keys():
    edges_df = edges_df.append(pd.DataFrame({'parent_lat': p[0], 'parent_lng':p[1], 'depth': [i[2] for i in tree[p]],\
                                    'child_lat': [i[0] for i in tree[p]], 'child_lng': [i[1] for i in tree[p]]}))
edges_df.to_csv('data/spreading/edges/'+path+'_edges.csv')

In [109]:
tree

{(-31, 317, 2): [],
 (-31, 342, 5): [],
 (-31, 347, 4): [(-31, 342, 5), (-31, 352, 5)],
 (-31, 352, 5): [],
 (-26, 317, 1): [(-31, 317, 2)],
 (-26, 322, 1): [],
 (-21, 322, 1): [],
 (-16, 322, 1): [],
 (-16, 327, 1): [],
 (-16, 332, 1): [],
 (-11, 322, 1): [],
 (-11, 327, 1): [],
 (-11, 332, 1): [],
 (-11, 337, 1): [],
 (-11, 342, 1): [],
 (-11, 347, 1): [(-31, 347, 4)],
 (-11, 352, 1): [],
 (-6, 2, 1): [],
 (-6, 7, 1): [],
 (-6, 322, 1): [],
 (-6, 327, 1): [],
 (-6, 332, 1): [],
 (-6, 337, 1): [],
 (-6, 342, 1): [],
 (-6, 347, 1): [],
 (-6, 352, 1): [],
 (-6, 357, 1): [],
 (-1, 2, 1): [],
 (-1, 7, 1): [],
 (-1, 312, 1): [],
 (-1, 317, 1): [],
 (-1, 322, 1): [],
 (-1, 327, 1): [],
 (-1, 332, 1): [],
 (-1, 337, 1): [],
 (-1, 342, 1): [],
 (-1, 347, 1): [],
 (-1, 352, 1): [],
 (-1, 357, 1): [],
 (1.0, 357.0, 0): [(-26, 317, 1),
  (-26, 322, 1),
  (-21, 322, 1),
  (-16, 322, 1),
  (-16, 327, 1),
  (-16, 332, 1),
  (-11, 322, 1),
  (-11, 327, 1),
  (-11, 332, 1),
  (-11, 337, 1),
  (-11, 3

In [110]:
edges_df

Unnamed: 0,child_lat,child_lng,depth,parent_lat,parent_lng
0,-26,317,1,1.0,357.0
1,-26,322,1,1.0,357.0
2,-21,322,1,1.0,357.0
3,-16,322,1,1.0,357.0
4,-16,327,1,1.0,357.0
5,-16,332,1,1.0,357.0
6,-11,322,1,1.0,357.0
7,-11,327,1,1.0,357.0
8,-11,332,1,1.0,357.0
9,-11,337,1,1.0,357.0


In [111]:
nodes_df

Unnamed: 0,depth,node_lat,node_lng
0,0,1.0,357.0
1,1,-26.0,317.0
2,1,-26.0,322.0
3,1,-21.0,322.0
4,1,-16.0,322.0
5,1,-16.0,327.0
6,1,-16.0,332.0
7,1,-11.0,322.0
8,1,-11.0,327.0
9,1,-11.0,332.0
