In [151]:
import pandas as pd
import numpy as np
import operator
from scipy.spatial.distance import euclidean
import geopy.distance

In [152]:
def remove_small_probabilities(df, threashold = 0.00055):
    return df[df['probability']>threashold]

def assign_square(df, square_size):
    df['square_lat'] = (df['lat']/square_size).astype(int)*square_size
    df['square_lng'] = (df['lng']/square_size).astype(int)*square_size
    return df

def calc_square_representative(df):
    representative = {}
    grouped = df.groupby(['square_lat', 'square_lng'])
    for g in grouped.groups: representative[g]=np.mean(grouped.get_group(g)['probability'])
    return representative

In [153]:
path = 'north_america_west'
spreading_df = pd.read_csv('data/spreading/' + path+'.csv')
spreading_df = remove_small_probabilities(spreading_df)
spreading_df['lng'] = spreading_df['lng'].map(lambda d: d-360 if d>180 else d)
start = spreading_df.loc[0]
spreading_df.drop(0, axis=0, inplace=True)
#maping years from 10 to 5
spreading_df = spreading_df[spreading_df['year']<10]
spreading_df['year'] = ((spreading_df['year']*12+spreading_df['month'])/2).astype(int)

In [154]:
MAX_DIST_eucl = 100
def get_closest_start(point, level_points):
    min_dist = 10000000
    min_node= None
    for node in level_points:
        dist = geopy.distance.vincenty(([node[0], node[1]]), (point[0], point[1])).km
        if dist <min_dist:
            if point[1]<0 and node[1]>0 and euclidean(np.array([node[0], node[1]]), np.array(point)) >MAX_DIST_eucl: continue
            if point[1]>0 and node[1]<0 and euclidean(np.array([node[0], node[1]]), np.array(point)) >MAX_DIST_eucl: continue
            min_node = node
            min_dist = geopy.distance.vincenty(([node[0], node[1]]), (point[0], point[1])).km
    return min_node, min_dist

In [155]:
CHILD_THRESHOLD = 0.001
MAX_DIST = 1500
levels =[[] for _ in range(len(spreading_df['year'].value_counts())+1)]
grouped_years = spreading_df.groupby(['year', 'lat', 'lng'])
tree = {(start['lat'], start['lng'],0): []}
levels[0] = [(start['lat'], start['lng'], 0)]
for g in grouped_years.groups:
    keys = list(tree.keys())
    flag = False
    for k in keys:
        if g[1]== k[0] and g[2]==k[1]: 
            flag=True
            break
    if flag==True: continue
    if grouped_years.get_group(g)['probability'].any() >=CHILD_THRESHOLD: 
        i=1
        while levels[g[0]-i]==[]: i+=1
        parent,dist = get_closest_start((g[1],g[2]),levels[g[0]-i])
        if (parent==None):
            levels[int(g[0]-1)].append((g[1], -179, g[0]-1))   
            levels[int(g[0]-1)].append((g[1], 179, g[0]-1)) 
            tree[(g[1],-179, g[0]-1)] = []  
            tree[(g[1],179, g[0]-1)] = [] 
            parent,dist = get_closest_start((g[1],g[2]),levels[g[0]-1])
        if parent==None: 
            print(g)
            continue
        if dist>MAX_DIST:
            continue
        levels[int(g[0])].append((g[1], g[2], g[0]))
        tree[parent].append((g[1],g[2],g[0]))
        tree[(g[1],g[2], g[0])] = []  

In [156]:
edges_df=pd.DataFrame(columns = ['parent_lat', 'parent_lng', 'child_lat', 'child_lng', 'depth'])
for p in tree.keys():
    edges_df = edges_df.append(pd.DataFrame({'parent_lat': p[0], 'parent_lng':p[1], 'depth': [i[2] for i in tree[p]],\
                                    'child_lat': [i[0] for i in tree[p]], 'child_lng': [i[1] for i in tree[p]]}))
edges_df.reset_index(inplace=True, drop =True)
edges_df['parent_lng']=edges_df['parent_lng'].apply(lambda d: d+360 if d<0 else d)
edges_df['child_lng']=edges_df['child_lng'].map(lambda d: d+360 if d<0 else d)
#edges_df.sort_values(by=['parent_lng', 'child_lng'], inplace=True)
edges_df.to_csv('data/spreading/edges/'+path+'_edges.csv')

In [157]:
edges_df

Unnamed: 0,child_lat,child_lng,depth,parent_lat,parent_lng
0,19,249,1,21.0,249.0
1,19,250,1,21.0,249.0
2,19,251,1,21.0,249.0
3,20,249,1,21.0,249.0
4,20,250,1,21.0,249.0
5,20,251,1,21.0,249.0
6,21,246,1,21.0,249.0
7,21,247,1,21.0,249.0
8,21,248,1,21.0,249.0
9,21,250,1,21.0,249.0
