In [None]:
# Find the nearest shape point for each record in the MO file, using Python's multiprocessing library

In [None]:
import pandas as pd
import multiprocessing as mp
from haversine import haversine, Unit
from math import sin,cos,sqrt,atan2,radians

# Rows returned by all worker tasks; collect_result appends to this list.
results_global = []

# Number of worker processes passed to mp.Pool.
machine_cores = 46

# Name of the input parquet directory; also used as the prefix of the
# partial and full output files.
filename = "MO_15101"

# Trace records to be matched, read from the parquet directory.
traces = pd.read_parquet("./" + filename + "/")

# Shape points that the traces are matched against.
shapes = pd.read_csv("shapes.csv")

# Total number of trace records; used to split the work into tasks.
outsize = len(traces)

# Column names of the output frames, in the same order as the fields of the
# tuples produced by get_shape.
columns = [
    "dt_avl",
    "id_avl",
    "line_id",
    "trace_y",
    "trace_x",
    "hour_avl",
    "hour_diff",
    "region",
    "time_variation",
    "trip_id",
    "direction",
    "route_id",
    "trip_head",
    "shape_id",
    "min_distance",
    "min_shape_sequence",
    "min_shape_coord_lat",
    "min_shape_coord_lon",
]

def get_shape(amount, index_start):
    """Match a contiguous run of trace records to their nearest shape point.

    Reads the module-level ``traces`` and ``shapes`` DataFrames. For each of
    the ``amount`` trace rows starting at row position ``index_start``, the
    rows of ``shapes`` whose ``shape_id`` equals the trace's ``shape_id`` are
    scanned, and the point with the smallest haversine distance (in metres)
    to the trace's (latitude, longitude) is kept.

    Args:
        amount: Number of consecutive trace rows to process.
        index_start: Zero-based row position of the first trace row.

    Returns:
        A set of 18-field tuples: dt_avl, id_avl, line_id, latitude,
        longitude, hour_avl, hour_diff, region, time_variation, trip_id,
        direction, route_id, trip_headsign, shape_id, followed by the
        minimum distance and the shape_pt_sequence, shape_pt_lat and
        shape_pt_lon of the nearest point. When no shape point carries the
        trace's shape_id, the last four fields are 999999999999, "", 0
        and 0.

        Because the result is a set, records that are identical in every
        field collapse into one and the row order is not preserved.
    """
    # Trace columns copied unchanged into each output tuple, in output order.
    copied_names = (
        "dt_avl", "id_avl", "line_id", "latitude", "longitude",
        "hour_avl", "hour_diff", "region", "time_variation", "trip_id",
        "direction", "route_id", "trip_headsign",
    )
    copied_columns = [traces[name] for name in copied_names]
    shape_id_column = traces["shape_id"]

    # shape_id -> [(lat, lon, sequence), ...]. Filled lazily so the shapes
    # table is filtered once per distinct shape_id instead of once per row.
    points_by_shape = {}

    results = set()

    for position in range(index_start, index_start + amount):
        # .iloc addresses rows by position, so the caller's offset arithmetic
        # holds whatever index labels the traces frame carries.
        copied = tuple(column.iloc[position] for column in copied_columns)
        trace_coord = (copied[3], copied[4])  # (latitude, longitude)
        shape_id = int(shape_id_column.iloc[position])

        points = points_by_shape.get(shape_id)
        if points is None:
            candidate_shapes = shapes.loc[shapes["shape_id"] == shape_id]
            points = [
                (shape["shape_pt_lat"], shape["shape_pt_lon"], shape["shape_pt_sequence"])
                for _, shape in candidate_shapes.iterrows()
            ]
            points_by_shape[shape_id] = points

        # Sentinel values reported when the shape has no points at all.
        min_distance = 999999999999
        min_shape_sequence = ""
        min_shape_coord_lat = 0
        min_shape_coord_lon = 0

        for shape_lat, shape_lon, shape_sequence in points:
            # haversine was chosen over geopy.distance because it is faster.
            distance = haversine((shape_lat, shape_lon), trace_coord, unit=Unit.METERS)

            # "<=" keeps the later point when two are equally close.
            if distance <= min_distance:
                min_distance = distance
                min_shape_sequence = shape_sequence
                min_shape_coord_lat = shape_lat
                min_shape_coord_lon = shape_lon

        results.add(copied + (
            shape_id,
            min_distance,
            min_shape_sequence,
            min_shape_coord_lat,
            min_shape_coord_lon,
        ))

    return results

# collect the intermediate results in distributed process
def collect_result(results):
    """Persist one task's rows and add them to the global result list.

    Used as the ``callback`` of ``pool.apply_async``. The rows are written to
    a snappy-compressed parquet file whose name starts with ``filename`` and
    are appended to the module-level ``results_global`` list.

    Args:
        results: Iterable of row tuples returned by ``get_shape``, with one
            field per entry of ``columns``.
    """
    rows = list(results)

    # A task that was given zero records returns nothing. A DataFrame built
    # from it has zero columns, so assigning the column names would raise
    # inside the pool callback; there is nothing to store, so stop here.
    if not rows:
        return

    df = pd.DataFrame(rows, columns=columns)

    # NOTE(review): the partial file name depends only on the row count, so
    # two tasks that return the same number of rows write to the same path.
    # Confirm whether that overwrite is intended.
    df.to_parquet(filename+"-partial-"+str(len(rows)),compression="snappy",index=False)

    # Extend in place instead of rebuilding the whole list on every callback.
    results_global.extend(rows)

    print("Tamanho do resultado global",len(results_global))
    

def _report_task_error(error):
    """Print an exception raised by a worker task so it is not lost silently."""
    print("get_shape task failed:", repr(error))


pool = mp.Pool(machine_cores)

# Number of records handed to each task. Clamped to at least 1 so that an
# input with fewer records than cores does not divide by zero below.
amounts = max(1, outsize // machine_cores)
# number_of_loops full-sized tasks, plus `residue` left-over records.
number_of_loops, residue = divmod(outsize, amounts)

# The first `residue` records do not fill a whole task; process them on their
# own. Skipped when there is no residue, so no empty task is submitted.
if residue:
    pool.apply_async(
        get_shape,
        args=(residue, 0),
        callback=collect_result,
        error_callback=_report_task_error,
    )

# Process the remaining records in parallel, `amounts` records per task,
# starting right after the residue.
for i in range(number_of_loops):
    pool.apply_async(
        get_shape,
        args=(amounts, (i * amounts) + residue),
        callback=collect_result,
        error_callback=_report_task_error,
    )

# No more tasks will be submitted; wait for the submitted ones to finish.
pool.close()
pool.join()

# Passing the column names to the constructor also works when no rows were
# collected, where assigning them afterwards would raise.
df = pd.DataFrame(results_global, columns=columns)
df.to_parquet(filename+"-full",compression="snappy",index=False)







