### Feature Engineering

In [1]:
import pandas as pd

In [2]:
# Load the training data; the first CSV column is used as the DataFrame index.
df = pd.read_csv("dataset/data_train.csv", index_col=[0])

In [3]:
def to_sec(s):
    """Convert a colon-separated ``"H:M:S"`` string into a number of seconds.

    Parameters
    ----------
    s : str
        Clock time made of exactly three integer fields separated by ``:``.

    Returns
    -------
    int
        ``hours * 3600 + minutes * 60 + seconds``.

    Raises
    ------
    AssertionError
        If ``s`` is not a string, or it does not split into three fields.
    ValueError
        If any field cannot be parsed by ``int``.
    """
    assert isinstance(s, str)
    # Every field is parsed before the field count is checked, so a
    # non-numeric field surfaces as ValueError ahead of the length assertion.
    fields = [int(token) for token in s.split(':')]
    assert len(fields) == 3
    h, m, sec = fields
    # Horner form of h*3600 + m*60 + sec.
    return sec + 60 * (m + 60 * h)

In [4]:
# Convert both clock-time columns to seconds with `to_sec`, then store the
# difference (exit minus entry) as `time_delta`. Columns are added to `df`
# in the same order as before: entry seconds, exit seconds, delta.
for clock_col, secs_col in (('time_entry', 'time_entry_secs'),
                            ('time_exit', 'time_exit_secs')):
    df[secs_col] = df[clock_col].apply(to_sec)
df['time_delta'] = df['time_exit_secs'].sub(df['time_entry_secs'])

In [5]:
# Bounds of the rectangle whose centre is used as the reference point.
# These are the same four values the `inside_area` check in this notebook
# compares against.
AREA_X_MIN, AREA_X_MAX = 3750901.5068, 3770901.5068
AREA_Y_MIN, AREA_Y_MAX = -19268905.6133, -19208905.6133

# Midpoint of the rectangle on each axis.
center_of_x = (AREA_X_MIN + AREA_X_MAX) / 2
center_of_y = (AREA_Y_MIN + AREA_Y_MAX) / 2

# Signed offset of each entry point from the rectangle centre. Plain column
# arithmetic gives the same values as a per-row `apply(lambda ...)` without
# calling a Python function once per row.
df['x_entry_delta'] = df['x_entry'] - center_of_x
df['y_entry_delta'] = df['y_entry'] - center_of_y

In [6]:
df

Unnamed: 0,hash,trajectory_id,time_entry,time_exit,vmax,vmin,vmean,x_entry,y_entry,x_exit,y_exit,time_entry_secs,time_exit_secs,time_delta,x_entry_delta,y_entry_delta
0,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_0,07:04:31,07:08:32,,,,3.751014e+06,-1.909398e+07,3.750326e+06,-1.913634e+07,25471,25712,241,-9887.737394,144924.969304
1,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_1,07:20:34,07:25:42,,,,3.743937e+06,-1.932247e+07,3.744975e+06,-1.931966e+07,26434,26742,308,-16964.317424,-83561.934028
2,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_2,07:53:32,08:03:25,,,,3.744868e+06,-1.929356e+07,3.744816e+06,-1.929284e+07,28412,29005,593,-16033.686481,-54659.310214
3,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_3,08:17:50,08:37:23,,,,3.744880e+06,-1.929229e+07,3.744809e+06,-1.929049e+07,29870,31043,1173,-16021.441337,-53384.151345
4,0000a8602cf2def930488dee7cdad104_1,traj_0000a8602cf2def930488dee7cdad104_1_4,14:38:09,14:38:09,,,,3.744909e+06,-1.928558e+07,3.744909e+06,-1.928558e+07,52689,52689,0,-15992.498269,-46679.383523
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
814257,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_4,02:21:11,02:21:11,,,,3.744666e+06,-1.925679e+07,3.744666e+06,-1.925679e+07,8471,8471,0,-16235.174759,-17885.140197
814258,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_5,06:02:17,06:02:17,,,,3.744732e+06,-1.925614e+07,3.744732e+06,-1.925614e+07,21737,21737,0,-16169.496260,-17234.043974
814259,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_7,09:52:13,09:52:13,,,,3.744666e+06,-1.925679e+07,3.744666e+06,-1.925679e+07,35533,35533,0,-16235.174759,-17885.140197
814260,ffffc6359725f0e1feac9ef1872ab207_11,traj_ffffc6359725f0e1feac9ef1872ab207_11_8,14:20:26,14:27:15,,,,3.741043e+06,-1.929051e+07,3.741057e+06,-1.928936e+07,51626,52035,409,-19858.624185,-51603.948534


### Model Building

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split

# Columns fed to the model and the two coordinates it has to predict.
feature_columns = [
    'x_entry',
    'y_entry',
    'time_entry_secs',
    'time_exit_secs',
    'time_delta',
    'x_entry_delta',
    'y_entry_delta',
]
target_columns = ['x_exit', 'y_exit']

X = df[feature_columns]
y = df[target_columns]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=42,
)

In [8]:
from xgboost import XGBRegressor
# Wrap an XGBoost regressor in MultiOutputRegressor; the target `y` built
# earlier has two columns (x_exit, y_exit).
# NOTE(review): `eta` is presumably XGBoost's alias for the learning rate
# (the fitted repr shows eta=0.15 with learning_rate=None) — confirm against
# the installed xgboost version.
multi_xgb = MultiOutputRegressor(XGBRegressor(eta=0.15, gamma=0.0, max_depth=8))

In [9]:
# Train on the training portion (80%) of the split made by train_test_split.
multi_xgb.fit(X_train, y_train)

MultiOutputRegressor(estimator=XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, eta=0.15,
                                            gamma=0.0, gpu_id=None,
                                            importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=8,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=100, n_jobs=None,
                                            num_parallel_tree=None,
                                            random_state=None, reg_

In [10]:
# Predict exit coordinates for the held-out rows. The result is a 2-column
# array; the columns are labelled x/y exit predictions in the next cell.
predict_result = multi_xgb.predict(X_test)
predict_result

array([[  3757285.5, -19141234. ],
       [  3756430.8, -19186264. ],
       [  3759952. , -19339208. ],
       ...,
       [  3772874.5, -19165004. ],
       [  3746133. , -19243618. ],
       [  3762771.5, -19080538. ]], dtype=float32)

In [11]:
# Give the raw prediction array column names. NOTE: despite the `X_` prefix,
# this frame holds predicted targets (exit coordinates), not features.
# No index is passed, so the frame gets a default 0..n-1 index.
X_test_predict = pd.DataFrame(predict_result, columns=['x_exit_predict', 'y_exit_predict'])
X_test_predict

Unnamed: 0,x_exit_predict,y_exit_predict
0,3757285.50,-19141234.0
1,3756430.75,-19186264.0
2,3759952.00,-19339208.0
3,3759032.75,-19258592.0
4,3757796.25,-19241578.0
...,...,...
162848,3745812.50,-19288996.0
162849,3741392.25,-19334798.0
162850,3772874.50,-19165004.0
162851,3746133.00,-19243618.0


### Apply the rule that classifies exit points as inside or outside the target area

In [12]:
# Move the original row labels of y_test into an `index` column so the frame
# gets a fresh 0..n-1 index.
reset_y_test = y_test.reset_index()
reset_y_test

Unnamed: 0,index,x_exit,y_exit
0,440102,3.757411e+06,-1.913738e+07
1,438577,3.756545e+06,-1.918472e+07
2,712629,3.757173e+06,-1.923925e+07
3,678461,3.759116e+06,-1.925871e+07
4,160062,3.757721e+06,-1.924037e+07
...,...,...,...
162848,418621,3.744682e+06,-1.928853e+07
162849,169704,3.741382e+06,-1.933157e+07
162850,270469,3.775418e+06,-1.917646e+07
162851,203994,3.746226e+06,-1.924093e+07


In [13]:
# Place actual and predicted exit coordinates side by side. The column-wise
# concat aligns on the index, so this relies on both frames having the same
# default 0..n-1 index and on the predictions being in y_test row order.
temp = pd.concat([reset_y_test, X_test_predict], axis=1)
temp

Unnamed: 0,index,x_exit,y_exit,x_exit_predict,y_exit_predict
0,440102,3.757411e+06,-1.913738e+07,3757285.50,-19141234.0
1,438577,3.756545e+06,-1.918472e+07,3756430.75,-19186264.0
2,712629,3.757173e+06,-1.923925e+07,3759952.00,-19339208.0
3,678461,3.759116e+06,-1.925871e+07,3759032.75,-19258592.0
4,160062,3.757721e+06,-1.924037e+07,3757796.25,-19241578.0
...,...,...,...,...,...
162848,418621,3.744682e+06,-1.928853e+07,3745812.50,-19288996.0
162849,169704,3.741382e+06,-1.933157e+07,3741392.25,-19334798.0
162850,270469,3.775418e+06,-1.917646e+07,3772874.50,-19165004.0
162851,203994,3.746226e+06,-1.924093e+07,3746133.00,-19243618.0


In [14]:
# Bounds of the rectangular target area; all four sides are inclusive.
AREA_X_MIN, AREA_X_MAX = 3750901.5068, 3770901.5068
AREA_Y_MIN, AREA_Y_MAX = -19268905.6133, -19208905.6133


def inside_area(x, y):
    """Return whether the point ``(x, y)`` lies inside the target rectangle.

    Parameters
    ----------
    x, y : float
        Coordinates of the point to test.

    Returns
    -------
    bool
        True when ``AREA_X_MIN <= x <= AREA_X_MAX`` and
        ``AREA_Y_MIN <= y <= AREA_Y_MAX``. Any NaN coordinate compares as
        outside the rectangle.
    """
    return AREA_X_MIN <= x <= AREA_X_MAX and AREA_Y_MIN <= y <= AREA_Y_MAX


def is_in_area(args):
    """Return 1 if the ``(x, y)`` pair ``args`` is inside the area, else 0.

    ``args`` is unpacked positionally into ``inside_area``, so it can be a
    tuple, a list, or a two-element DataFrame row as passed by
    ``DataFrame.apply(..., axis=1)``.
    """
    return int(inside_area(*args))

In [15]:
# For the actual and then the predicted exit point of every row, store 1 if
# the point is inside the target rectangle and 0 otherwise.
for flag_column, xy_columns in (
    ('exit_in_rect', ['x_exit', 'y_exit']),
    ('exit_predict_in_rect', ['x_exit_predict', 'y_exit_predict']),
):
    temp[flag_column] = temp[xy_columns].apply(is_in_area, axis=1)

In [16]:
temp

Unnamed: 0,index,x_exit,y_exit,x_exit_predict,y_exit_predict,exit_in_rect,exit_predict_in_rect
0,440102,3.757411e+06,-1.913738e+07,3757285.50,-19141234.0,0,0
1,438577,3.756545e+06,-1.918472e+07,3756430.75,-19186264.0,0,0
2,712629,3.757173e+06,-1.923925e+07,3759952.00,-19339208.0,1,0
3,678461,3.759116e+06,-1.925871e+07,3759032.75,-19258592.0,1,1
4,160062,3.757721e+06,-1.924037e+07,3757796.25,-19241578.0,1,1
...,...,...,...,...,...,...,...
162848,418621,3.744682e+06,-1.928853e+07,3745812.50,-19288996.0,0,0
162849,169704,3.741382e+06,-1.933157e+07,3741392.25,-19334798.0,0,0
162850,270469,3.775418e+06,-1.917646e+07,3772874.50,-19165004.0,0,0
162851,203994,3.746226e+06,-1.924093e+07,3746133.00,-19243618.0,0,0


Berikut adalah confusion matrixnya.

In [17]:
from sklearn.metrics import classification_report, confusion_matrix
# First argument is the flag derived from the actual exit point, second the
# flag derived from the predicted exit point.
confusion_matrix(temp.exit_in_rect, temp.exit_predict_in_rect)

array([[103989,  10107],
       [  4066,  44691]])

Berikut adalah classification reportnya.

In [18]:
# Per-class precision/recall/F1 for the inside-rectangle flag (actual vs. predicted).
print(classification_report(temp.exit_in_rect, temp.exit_predict_in_rect))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94    114096
           1       0.82      0.92      0.86     48757

    accuracy                           0.91    162853
   macro avg       0.89      0.91      0.90    162853
weighted avg       0.92      0.91      0.91    162853



In [19]:
from sklearn.metrics import f1_score
# F1 with f1_score's default settings. NOTE(review): presumably the binary F1
# of class 1 (exit inside the rectangle) — the value matches the class-1 row
# of the classification report (0.86); confirm against the sklearn docs.
f1_score(temp.exit_in_rect, temp.exit_predict_in_rect)

0.8631355318429819

F1 score yang didapat oleh model ini untuk kelas 1 (titik keluar berada di dalam area) adalah 0.86

### Export Model

In [23]:
import pickle

filename = 'final_model.sav'

# Serialise the fitted model inside a context manager so the file handle is
# flushed and closed as soon as the dump finishes. Leaving the handle open
# (as before) can leave buffered bytes unwritten when the file is read back
# in the next section.
with open(filename, 'wb') as write:
    pickle.dump(multi_xgb, write)

### Pickle Testing

In [24]:
# Round-trip check: reload the pickled model and predict on the same held-out
# features used earlier.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load model files that come from a trusted source.
with open('final_model.sav', 'rb') as file:  
    pk_model = pickle.load(file)

y_predict = pk_model.predict(X_test)

In [25]:
y_predict

array([[  3757285.5, -19141234. ],
       [  3756430.8, -19186264. ],
       [  3759952. , -19339208. ],
       ...,
       [  3772874.5, -19165004. ],
       [  3746133. , -19243618. ],
       [  3762771.5, -19080538. ]], dtype=float32)

In [26]:
X_test

Unnamed: 0,x_entry,y_entry,time_entry_secs,time_exit_secs,time_delta,x_entry_delta,y_entry_delta
440102,3.757411e+06,-1.913738e+07,3918,3918,0,-3490.206258,101523.017713
438577,3.756545e+06,-1.918472e+07,38456,38456,0,-4356.271897,54189.035063
712629,3.760233e+06,-1.934043e+07,39615,39758,143,-668.257167,-101521.787985
678461,3.759116e+06,-1.925871e+07,56082,56082,0,-1785.904854,-19804.545536
160062,3.757721e+06,-1.924037e+07,54947,54947,0,-3180.738074,-1468.027020
...,...,...,...,...,...,...,...
418621,3.744592e+06,-1.928108e+07,53117,53998,881,-16309.758818,-42171.387617
169704,3.741382e+06,-1.933157e+07,45857,45857,0,-19519.099738,-92662.699009
270469,3.772844e+06,-1.916397e+07,44105,44421,316,11942.014750,74939.270105
203994,3.746226e+06,-1.924093e+07,57043,57043,0,-14675.588693,-2026.362021
