In [1]:
import pandas as pd
import pickle

from tqdm import tqdm
from IPython.display import clear_output
from _lib.helper import mkdir, get_filepaths
from _lib.settings import CITY_NAMES, HEX_RESOLUTIONS
from _lib.settings import DATA_AFTER_PREPARATION_DIR, DATA_TRIPS_AS_HEXES_DIR, DATA_TRIPS_AS_HEXES_GRAPH_DIR
from _lib.h3_helper import get_trips_as_hexes, get_trips_inside_city, get_city_polygon
from _lib.data_preparation import get_trip_start

# Converting points 2 hexes & Removing trips outside cities

In [2]:
mkdir(f'{DATA_TRIPS_AS_HEXES_DIR}/inside_city')
mkdir(f'{DATA_TRIPS_AS_HEXES_DIR}/city_shapes')

### Download City Shapes

In [3]:
for c_name_item in CITY_NAMES.items():
    shape = get_city_polygon(c_name_item[1])
    with open(f'{DATA_TRIPS_AS_HEXES_DIR}/city_shapes/{c_name_item[0]}.pickle', 'wb') as f:
        pickle.dump(shape, f)
    clear_output(wait=True)

2022-03-28 21:16:03 Configured OSMnx 1.1.2
2022-03-28 21:16:03 HTTP response caching is on
2022-03-28 21:16:03 Retrieved response from cache file "cache/13e27f1da8b13c78e02db470d9147fcf2deb630d.json"
2022-03-28 21:16:03 Created GeoDataFrame with 1 rows from 1 queries


  geometry = geometry[0]


In [4]:
df_logs = pd.DataFrame(columns=['cname', 'resolution', 'trips_count', 'trips_inside_city_count'])

for c_name in CITY_NAMES:
    df_points = pd.read_csv(f'{DATA_AFTER_PREPARATION_DIR}/{c_name}.csv', sep=';')
    df_points = get_trip_start(df_points)
    for resolution in HEX_RESOLUTIONS:
        print(c_name, resolution)
        df_hexes = get_trips_as_hexes(df_points, resolution)
        # df_hex.to_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/{c_name}_{resolution}.csv', index=False, sep=';')

        with open(f'{DATA_TRIPS_AS_HEXES_DIR}/city_shapes/{c_name}.pickle', 'rb') as f:
            city_shape = pickle.load(f)

        df_trips_inside_city = get_trips_inside_city(df_hexes, city_shape, resolution)
        df_trips_inside_city.to_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/inside_city/{c_name}_{resolution}.csv', index=False, sep=';')
        
        df_logs = df_logs.append({  
                                    'c_name': CITY_NAMES[c_name], 
                                    'resolution': resolution,
                                    'trips_count': df_hexes['tripid'].unique().size, 
                                    'trips_inside_city_count': df_trips_inside_city['tripid'].unique().size,
                                    'distinct_hex_count': df_trips_inside_city['hexid'].unique().size
                                }, ignore_index=True)
        clear_output(wait=True)

df_logs.to_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/_logs_trips_inside_cities_count.csv', sep=';')

sod 12


Converting distinct points 2 hexagons with resolution 12: 100%|██████████| 427132/427132 [00:01<00:00, 221344.12it/s]
Searching 4 distinct hexagons over trips: 100%|██████████| 452767/452767 [00:01<00:00, 367767.84it/s]


Hexagons count / Points count : 404421 / 452767
100%|██████████| 452766/452766 [00:00<00:00, 661628.63it/s]


In [5]:
df_logs

Unnamed: 0,cname,resolution,trips_count,trips_inside_city_count,c_name,distinct_hex_count
0,,8,3371,2352,"Amiens, France",59.0
1,,9,3371,2363,"Amiens, France",320.0
2,,10,3371,2286,"Amiens, France",1563.0
3,,11,3371,2287,"Amiens, France",6590.0
4,,12,3371,2364,"Amiens, France",26077.0
5,,8,90455,81480,"Wroclaw, Poland",409.0
6,,9,90455,81930,"Wroclaw, Poland",2550.0
7,,10,90455,81835,"Wroclaw, Poland",14138.0
8,,11,90455,81910,"Wroclaw, Poland",71330.0
9,,12,90455,82046,"Wroclaw, Poland",331540.0


### Joining datasets

In [6]:
fpaths = get_filepaths(f'{DATA_TRIPS_AS_HEXES_DIR}/inside_city', '.csv', list(CITY_NAMES.keys()))

for resolution in tqdm(HEX_RESOLUTIONS):
    print("Resolution:", resolution)
    df_joined = pd.DataFrame()
    for fpath in get_filepaths(f'{DATA_TRIPS_AS_HEXES_DIR}/inside_city', f'{resolution}.csv', list(CITY_NAMES.keys())):
        c_name = fpath.split('/')[-1].split('_')[0]
        print(CITY_NAMES[c_name])
        df = pd.read_csv(fpath, sep=';')
        df_joined = pd.concat([df_joined, df], ignore_index=True)
    df_joined.to_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/{resolution}.csv', sep=';', index=False)
    
    clear_output(wait=True)

100%|██████████| 5/5 [11:35<00:00, 139.11s/it]


### Save as graph

In [2]:
def save_as_graph(resolution):
    df = pd.read_csv(f'{DATA_TRIPS_AS_HEXES_DIR}/{resolution}.csv', sep=';')

    tqdm.pandas(desc='end')
    df['end'] = pd.concat([df['tripid'].shift(-1).rename('tripid0'),
                                df['tripid'].rename('tripid1')], axis=1
                                ).progress_apply(lambda row: False if row[0] == row[1] else True, axis=1, raw=True)
    df.loc[df.index[-1], 'end'] = True

    df_nodes = pd.concat(  
                        [df['hexid'].rename('n1'),
                        df['hexid'].shift(-1).rename('n2'),
                        df['end']], 
                        axis=1
                )
    df_nodes = df_nodes[~df_nodes['end']]
    df_nodes = df_nodes[['n1', 'n2']]

    df_n1_count = df_nodes.groupby(['n1']).size().to_frame(name='n1_count').reset_index()

    final_df = df_nodes.groupby(['n1', 'n2']).size().to_frame(name = 'edge_count').reset_index()
    final_df = final_df.merge(df_n1_count, on='n1', how='left')
    final_df['p'] = final_df['edge_count']/final_df['n1_count']

    final_df = final_df.round({'p': 3})
    final_df = final_df[['n1', 'n2', 'p']]

    final_df.to_csv(f'{DATA_TRIPS_AS_HEXES_GRAPH_DIR}/{resolution}.csv', sep=' ', index=False, header=False)

In [3]:
for resolution in tqdm(HEX_RESOLUTIONS):
    save_as_graph(resolution)

end: 100%|██████████| 2462219/2462219 [00:06<00:00, 355751.83it/s]
end: 100%|██████████| 6231963/6231963 [00:18<00:00, 344553.46it/s]
end: 100%|██████████| 16346220/16346220 [00:46<00:00, 351164.26it/s]
end: 100%|██████████| 37514153/37514153 [01:44<00:00, 360395.44it/s]
end: 100%|██████████| 62368917/62368917 [02:56<00:00, 354036.98it/s]
100%|██████████| 5/5 [09:27<00:00, 113.50s/it]
