In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('data_train/all_samples.csv')
df["j_time"] = list(map(journey_time, df["time_entry"], df["time_exit"])) 
df.drop(["time_entry","time_exit"]
                   , axis=1, inplace=True)
df.set_index("trajectory_id", inplace=True)
df.head()

df_all = pd.read_csv('data_train/all_binary_traj.csv')
df.fillna(0,inplace=True)

In [17]:
def in_city(x_pred,y_pred):
    if (3750901.5068 <= x_pred <= 3770901.5069) and (-19268905.6133 <= y_pred <= -19208905.6133):
        return 1
    else:
        return 0
    
def pos_neg_traj(x):
    """
    Assigns pos or neg to trajectory rather
    than total value.
    """
    if np.sign(x) == 1:
        return 1
    else:
        return 0 

def all_tr(a):
    tr = []
    tr.append(0)
    for i in range(1,len(a)):
        x = pos_neg_traj(a[i-1] - a[i])
        tr.append(x)
    return tr

def odd_or_even(x):
    """
    Odd or Even number of journeys.
    """
    if x % 2 == 0: return 1 
    else: return 0
        
def journey_time(x,y):
    """
    Compute journey time in seconds.
    """
    x = pd.to_datetime(x)
    y = pd.to_datetime(y)
    return (y-x).total_seconds()

def sigmoid(x):
    e = np.exp(1)
    y = 1/(1+e**(-x))
    return y
scaler = MinMaxScaler(feature_range=(0,1))
df.fillna(0,inplace=True)

In [18]:
"""
Scale or binarize values to facilitate easier onehot encoding.
Add transformed values to existing dataframe.
"""
df["dpc"] = list(map(sigmoid,df["dist_pct_ch"]))
df["home"] = list(map(in_city, df["x_home"], df["y_home"]))
df["start_CC"] = list(map(in_city, df["x_entry"], df["y_entry"]))
df["net_tr_b"] = list(map(pos_neg_traj, df["net_tr"]))
df["prev_tr_b"] = list(map(pos_neg_traj, df["prev_tr"]))
df["net_tr_b"] = df_all["net_traj_b"].values
df["prev_tr_b"] = df_all["prev_traj_b"].values
df["odd_even_nj"] = list(map(odd_or_even, df["nj"]))
df["dist_scaled"] = scaler.fit_transform(df["dist"].values.reshape(-1, 1))
df["jt_scaled"] = scaler.fit_transform(df["j_time"].values.reshape(-1, 1))
df["final_loc"] = list(map(in_city, df["x_exit"], df["y_exit"]))

In [19]:
"""
Remove unnecessary real values from the dataframe.
"""
df.drop(["hash","vmax","vmin","vmean","x_exit","y_exit","x_entry","y_entry","prev_tr","net_tr",
         "x_home","y_home","nj","final"],axis=1, inplace=True)
df.head()

Unnamed: 0_level_0,dist,dist_pct_ch,j_time,dpc,home,start_CC,net_tr_b,prev_tr_b,odd_even_nj,dist_scaled,jt_scaled,final_loc
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
traj_0000a8602cf2def930488dee7cdad104_1_0,145261.881023,0.0,241.0,0.5,0,0,1,0,1,0.73987,0.009806,0
traj_0000a8602cf2def930488dee7cdad104_1_1,85266.551966,-0.413015,308.0,0.398189,0,0,1,1,1,0.434215,0.012532,0
traj_0000a8602cf2def930488dee7cdad104_1_2,56962.437581,-0.331949,593.0,0.417767,0,0,1,1,1,0.290016,0.024128,0
traj_0000a8602cf2def930488dee7cdad104_1_3,55736.470981,-0.021522,1173.0,0.49462,0,0,1,1,1,0.28377,0.047728,0
traj_0000a8602cf2def930488dee7cdad104_1_4,49342.931074,-0.11471,0.0,0.471354,0,0,1,1,1,0.251197,0.0,0


In [None]:
df.to_csv('data_train/all_binary_features.csv', index=True) #store these features.

In [22]:
"""
OneHot Encode these newly-binarized features and concatenate into 
a single DataFrame.
"""
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

from sklearn.preprocessing import OneHotEncoder, LabelEncoder 
ohe = OneHotEncoder()
home_feat_array = ohe.fit_transform(df[["home"]]).toarray()
home_features = pd.DataFrame(home_feat_array, columns=["Outer","Inner"], index=df.index)

start_ohe = OneHotEncoder()
start_feat_array = start_ohe.fit_transform(df[["start_CC"]]).toarray()
start_features = pd.DataFrame(home_feat_array, columns=["St_Outer","St_Inner"], index=df.index)

net_tr_ohe = OneHotEncoder()
net_tr_feat_array = net_tr_ohe.fit_transform(df[["net_tr_b"]]).toarray()
net_tr_features = pd.DataFrame(net_tr_feat_array, columns=["net_neg", "net_pos"], index=df.index)

prev_tr_ohe = OneHotEncoder()
prev_tr_feat_array = prev_tr_ohe.fit_transform(df[["prev_tr_b"]]).toarray()
prev_tr_features = pd.DataFrame(prev_tr_feat_array, columns=["prev_neg", "prev_pos"], index=df.index)

odd_even_ohe = OneHotEncoder()
odd_even_feat_array = odd_even_ohe.fit_transform(df[["prev_tr_b"]]).toarray()
odd_even_features = pd.DataFrame(odd_even_feat_array, columns=["odd", "even"], index=df.index)

df_ohe = pd.concat([df, home_features, start_features, net_tr_features, prev_tr_features, odd_even_features], axis=1)
df_ohe.drop(["home","start_CC","net_tr_b","prev_tr_b","odd_even_nj"],axis=1, inplace=True)

df_ohe.to_csv('data_train/one_hot_features.csv', index=True)
df_ohe.head()

Unnamed: 0_level_0,dist,dist_pct_ch,j_time,dpc,dist_scaled,jt_scaled,final_loc,Outer,Inner,St_Outer,St_Inner,net_neg,net_pos,prev_neg,prev_pos,odd,even
trajectory_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
traj_0000a8602cf2def930488dee7cdad104_1_0,145261.881023,0.0,241.0,0.5,0.73987,0.009806,0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
traj_0000a8602cf2def930488dee7cdad104_1_1,85266.551966,-0.413015,308.0,0.398189,0.434215,0.012532,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
traj_0000a8602cf2def930488dee7cdad104_1_2,56962.437581,-0.331949,593.0,0.417767,0.290016,0.024128,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
traj_0000a8602cf2def930488dee7cdad104_1_3,55736.470981,-0.021522,1173.0,0.49462,0.28377,0.047728,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
traj_0000a8602cf2def930488dee7cdad104_1_4,49342.931074,-0.11471,0.0,0.471354,0.251197,0.0,0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0


In [24]:
"""
Short example of OHE for multiple bin examples like those shown in 
the data_viz notebook.
"""
df = pd.read_csv("data_train/bin.csv")
df.drop(["Unnamed: 0"],axis=1,inplace=True)
df.head()

Unnamed: 0,home,start_CC,net_tr_b,prev_tr_b,time,d_bin,jt_bin
0,0,0,1,1,1,2,1
1,0,0,0,1,1,1,1
2,0,0,0,0,0,5,2
3,0,0,1,0,1,3,1
4,1,0,0,0,1,1,1


In [28]:
"""
Here each of the categorisations has a unique OHE label.
"""
ohe = OneHotEncoder()
feat_array = ohe.fit_transform(df[["d_bin"]]).toarray()
home_features = pd.DataFrame(feat_array, columns=[1,2,3,4,5,6,7], index=df.index)
home_features.head()

Unnamed: 0,1,2,3,4,5,6,7
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
ohe = OneHotEncoder()
feat_array = ohe.fit_transform(df[["jt_bin"]]).toarray()
j_features = pd.DataFrame(feat_array, columns=['j1','j2','j3','j4','j5','j6','j7','j8','j9','j10'], index=df.index)
j_features.head()

Unnamed: 0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
