In [1]:
import ast
import keras
import pandas as pd
import numpy as np
import json
import os

Using TensorFlow backend.


In [3]:
# Demonstration of pad_sequences with maxlen=OFFSET. As the displayed output
# shows, sequences shorter than OFFSET are left-padded with zeros, and the
# 10-element sequence keeps only its last OFFSET entries.
OFFSET = 8
trajectory = [[23], [1,2,3,4,5,6], [1,2,3,4,0,0,0,0,0,0]]
keras.preprocessing.sequence.pad_sequences(trajectory, maxlen=OFFSET, dtype='int16')

array([[ 0,  0,  0,  0,  0,  0,  0, 23],
       [ 0,  0,  1,  2,  3,  4,  5,  6],
       [ 3,  4,  0,  0,  0,  0,  0,  0]], dtype=int16)

In [2]:
# Root of the project checkout. Defaults to the original local path; set the
# FOREST_PYTHON_BASE environment variable to run the notebook on another
# machine without editing this cell.
base_path = os.environ.get('FOREST_PYTHON_BASE', 'C:/Users/abdul/Desktop/forest_python/')
# Every input read by this notebook lives under <base_path>/data.
data_path = os.path.join(base_path, 'data')
# Destination of the X_<step>.pkl / Y_<step>.pkl files written by save().
pickle_folder = os.path.join(data_path, 'pickles')
trips_file_path = os.path.join(data_path, 'map_matched_all/')
# JSON file read by load_dictionary(), resolved relative to data_path.
mapped_dict_file_name = 'mapped_dict.json'
nodes_file_name = 'chengdu_nodes.json'

In [3]:
# Collect the data sub-folders for steps 1..step_count, in step order: an entry
# of data_path is kept when its name contains 'map_matched_csv_<step>'.
step_count = 6
root_dir = os.listdir(data_path)
root_folder = [
    os.path.join(data_path, entry)
    for step in range(1, step_count + 1)
    for entry in root_dir
    if f'map_matched_csv_{step}' in entry
]

In [4]:
def get_valid_files(folder):
    """Collect the paths of all entries found inside each sub-directory of `folder`.

    Args:
        folder: Directory whose immediate sub-directories are scanned.

    Returns:
        list[str]: One joined path per entry of each sub-directory, in
        `os.listdir` order. No filtering by extension or content is applied.
    """
    valid_files = []
    for gps_folder in os.listdir(folder):
        gps_folder_path = os.path.join(folder, gps_folder)
        # A stray file sitting directly under `folder` would make os.listdir
        # raise NotADirectoryError, so only descend into real directories.
        if not os.path.isdir(gps_folder_path):
            continue
        valid_files.extend(
            os.path.join(gps_folder_path, f) for f in os.listdir(gps_folder_path)
        )
    return valid_files

In [5]:
from tqdm.notebook import tqdm
def get_full_df(valid_files):
    """Read every CSV in `valid_files` and stack them into one DataFrame.

    Files that pandas cannot read are skipped (best-effort loading). A progress
    bar is shown while reading.

    Args:
        valid_files: Sequence of CSV file paths.

    Returns:
        pandas.DataFrame with the rows of all readable files in input order
        (original per-file indices are kept), or None when no file could be read.
    """
    frames = []
    for f in tqdm(valid_files, total=len(valid_files)):
        try:
            frames.append(pd.read_csv(f))
        except Exception:
            # Deliberate best-effort: an unreadable file is skipped. Unlike a
            # bare `except:`, this does not swallow KeyboardInterrupt.
            continue
    if not frames:
        return None
    # One concat at the end replaces the per-file DataFrame.append, which was
    # removed in pandas 2.0 and copied the accumulated frame on every iteration.
    return pd.concat(frames)

In [6]:
def load_dictionary():
    """Parse the JSON file `mapped_dict_file_name` located in `data_path` and return its content."""
    dictionary_path = os.path.join(data_path, mapped_dict_file_name)
    with open(dictionary_path, 'r') as handle:
        loaded = json.load(handle)
    return loaded

In [7]:
# Load the mapping once at module level; dict_mapper and make_vectors read this global.
mapped_dict = load_dictionary()

In [8]:
def dict_mapper(arr, mapping=None):
    """Translate the elements of `arr` through a string-keyed lookup table.

    Each element is converted with `str()` and looked up in `mapping`.
    Translation stops at the first element whose key is missing; the values
    mapped up to that point are returned.

    Args:
        arr: Iterable of elements to translate.
        mapping: Dict keyed by the string form of the elements. Defaults to the
            module-level `mapped_dict`.

    Returns:
        numpy.ndarray of dtype int16 holding the mapped values (possibly empty).
        NOTE(review): int16 cannot represent values above 32767 -- confirm the
        mapped ids stay within that range.
    """
    if mapping is None:
        mapping = mapped_dict
    ret_arr = []
    for e in arr:
        key = str(e)
        if key not in mapping:
            # Unknown element: keep only the prefix translated so far.
            break
        ret_arr.append(mapping[key])
    return np.array(ret_arr).astype('int16')

In [9]:
def make_vectors(_df, max_prior_len=8):
    """Build padded feature rows and mapped labels from a trips DataFrame.

    For every row, the 'priors' cell (a string holding a Python literal
    sequence) is parsed and translated with `dict_mapper`, and the
    'destination' cell is translated through `mapped_dict`. A row is dropped
    when its prior sequence has more than `max_prior_len` elements or when its
    destination is not a key of `mapped_dict`, so X and Y always stay aligned.

    Args:
        _df: DataFrame with 'destination' and 'priors' columns.
        max_prior_len: Longest prior sequence (before mapping) that is kept.

    Returns:
        (X, Y): X is the int16 array returned by `pad_sequences`, one row per
        kept trip, padded to the longest mapped prior; Y is a 1-D int16 array
        with the mapped destination of each kept trip.
    """
    # Read from the `_df` argument; the previous code read a global `df` and
    # ignored the DataFrame passed by the caller.
    destinations = _df['destination'].astype('string')
    priors = map(ast.literal_eval, _df['priors'])

    features = []
    labels = []
    for prior, destination in zip(priors, destinations):
        if len(prior) > max_prior_len:
            continue
        dest_key = str(destination)
        # Map labels row by row: translating the whole column with dict_mapper
        # stops at the first unknown destination and leaves Y shorter than X.
        if dest_key not in mapped_dict:
            continue
        # dict_mapper stops at the first unknown element, so the mapped prior
        # may be shorter than the raw one (or empty).
        features.append(dict_mapper(prior))
        labels.append(mapped_dict[dest_key])

    Y = np.array(labels).astype('int16')
    longest = max((len(row) for row in features), default=0)
    X = keras.preprocessing.sequence.pad_sequences(features, maxlen=longest, dtype='int16')
    return X, Y
    


In [10]:
import pickle
def save(x, y, step, folder=None):
    """Pickle `x` and `y` to `X_<step>.pkl` and `Y_<step>.pkl`.

    Args:
        x: Object written to X_<step>.pkl.
        y: Object written to Y_<step>.pkl.
        step: Value interpolated into both file names.
        folder: Destination directory; defaults to the module-level
            `pickle_folder`. It is created when it does not exist.
    """
    target = pickle_folder if folder is None else folder
    os.makedirs(target, exist_ok=True)
    for prefix, payload in (('X', x), ('Y', y)):
        # `with` closes (and flushes) the handle; the previous bare open() calls
        # left both files open.
        with open(os.path.join(target, f"{prefix}_{step}.pkl"), "wb") as fh:
            pickle.dump(payload, fh)

In [11]:
def generate_vectors():
    """Vectorize and pickle the trips of every folder in `root_folder`.

    Folders are numbered from 1 in list order; that number is the `step` passed
    to `save`. A folder from which no CSV could be read is reported and skipped.
    """
    for step, folder in enumerate(root_folder, start=1):
        df = get_full_df(get_valid_files(folder))
        if df is None:
            # get_full_df returns None when nothing could be loaded; passing
            # that to make_vectors would fail with an opaque TypeError.
            print(f'Step {step}: no readable CSV files under {folder}; skipping')
            continue
        x, y = make_vectors(df)
        save(x, y, step)

In [14]:
def generate_weight_dict(output_path='../weights.json'):
    """Count how often each mapped id occurs in the trip files and dump the counts as JSON.

    Every entry of every sub-folder of `<data_path>/map_matched` is opened as a
    JSON file. Its content is iterated as a sequence of trips, each trip as a
    sequence of points; each point is translated through the dictionary
    returned by `load_dictionary()` and tallied.

    Args:
        output_path: File the counts are written to (indent=2).

    Raises:
        KeyError: If a point has no entry in the loaded dictionary.
    """
    train_folder = os.path.join(data_path, 'map_matched')
    all_files = []
    for g in os.listdir(train_folder):
        g_path = os.path.join(train_folder, g)
        for t in os.listdir(g_path):
            all_files.append(os.path.join(g_path, t))

    weights = {}
    # Local name differs from the module-level `mapped_dict` to avoid shadowing it.
    mapping = load_dictionary()
    # Iterating through tqdm closes the progress bar even if a file fails to parse.
    for f in tqdm(all_files, total=len(all_files)):
        with open(f, 'r') as content:
            trips = json.load(content)
        for t in trips:
            for p in t:
                m_id = mapping[str(p)]
                weights[m_id] = weights.get(m_id, 0) + 1

    # `with` guarantees the output is flushed and closed; the previous bare
    # open() left the handle dangling.
    with open(output_path, 'w') as out:
        json.dump(weights, out, indent=2)

In [15]:
# Run the count over <data_path>/map_matched; the result is written to '../weights.json'.
generate_weight_dict()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1556.0), HTML(value='')))


