In [None]:
import pandas as pd
import sklearn
import time
import numpy as np
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

sklearn.set_config(transform_output="pandas")

In [None]:
# Load the preprocessed training and test tables.
data = pd.read_csv('processed_train.csv')
test = pd.read_csv('processed_test.csv')


def _split_call_type(frame, call_type):
    """Return ``(subset, train, val)`` for the rows of ``frame`` with this CALL_TYPE.

    The subset is split 80/20 with a fixed seed. Both partitions are given a
    fresh index via ``reset_index()``; since ``drop`` is not set, the previous
    row labels are kept as an extra column.
    """
    subset = frame[frame['CALL_TYPE'] == call_type]
    train_part, val_part = train_test_split(subset, test_size=0.2, random_state=42)
    return subset, train_part.reset_index(), val_part.reset_index()


# One independent train/validation split per call type.
A, A_train, A_val = _split_call_type(data, 'A')
B, B_train, B_val = _split_call_type(data, 'B')
C, C_train, C_val = _split_call_type(data, 'C')

In [None]:
# Shared target encoder for the three identifier columns.
target = TargetEncoder(cols=['ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID'])

# Fit the encoder on the call type A training rows and apply the learned
# mapping to the matching validation rows.
# NOTE(review): the encoding is fit on all of A_train with its own target
# before the cross-validated search runs on A_train, so CV scores may be
# optimistic - confirm this is acceptable.
a_travel_time = A_train['TRAVEL_TIME']
A_train = target.fit_transform(A_train, a_travel_time)
A_val = target.transform(A_val)

In [None]:
# Hyperparameter search space sampled by RandomizedSearchCV.
# n_estimators: 11 evenly spaced values from 500 to 1500.
# max_depth: 11 evenly spaced values from 50 to 150, plus None.
random_grid = dict(
    n_estimators=np.linspace(start=500, stop=1500, num=11).astype(int).tolist(),
    max_depth=np.linspace(50, 150, num=11).astype(int).tolist() + [None],
    min_samples_split=[14, 16, 18],
    min_samples_leaf=[3, 4, 5],
)

# Feature columns fed to each call-type model. All three models share the
# same trailing columns; A and B each add one extra identifier column in front.
_shared_features = ['TAXI_ID', 'QTRHR', 'WK', 'WKYR', 'HOLIDAY']

categorical_A = ['ORIGIN_CALL'] + _shared_features
categorical_B = ['ORIGIN_STAND'] + _shared_features
categorical_C = list(_shared_features)

In [None]:
# Tune a random forest for call type A with a randomized hyperparameter search.
# The forest is seeded as well as the search: previously only the search (and
# the train/validation split) used a fixed seed, so the fitted trees and the
# selected model changed from run to run.
# NOTE(review): both the forest and the search request 16 workers; confirm
# this nesting does not oversubscribe the machine.
rfA = RandomForestRegressor(
    bootstrap=True,
    max_features='sqrt',
    n_jobs=16,
    random_state=42,
)
rf_random = RandomizedSearchCV(
    estimator=rfA,
    param_distributions=random_grid,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=16,
)
rf_random.fit(A_train[categorical_A], A_train['TRAVEL_TIME'])
print('Finished fitting A')

In [None]:
# Keep the best estimator from the call type A search. `rf_random` is rebound
# by the B and C search cells, so this must run before those cells do.
best_rfA = rf_random.best_estimator_

In [None]:
# Validation RMSE for call type A. The square root is taken explicitly because
# the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(A_val['TRAVEL_TIME'].values, best_rfA.predict(A_val[categorical_A])))

In [None]:
# Re-fit the shared `target` encoder on the call type B training rows (this
# replaces the mapping it previously learned) and apply it to B's validation rows.
b_travel_time = B_train['TRAVEL_TIME']
B_train = target.fit_transform(B_train, b_travel_time)
B_val = target.transform(B_val)

In [None]:
# Tune a random forest for call type B with a randomized hyperparameter search.
# The forest is seeded as well as the search: previously only the search (and
# the train/validation split) used a fixed seed, so the fitted trees and the
# selected model changed from run to run.
# NOTE(review): both the forest and the search request 16 workers; confirm
# this nesting does not oversubscribe the machine.
rfB = RandomForestRegressor(
    bootstrap=True,
    max_features='sqrt',
    n_jobs=16,
    random_state=42,
)
rf_random = RandomizedSearchCV(
    estimator=rfB,
    param_distributions=random_grid,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=16,
)
rf_random.fit(B_train[categorical_B], B_train['TRAVEL_TIME'])
print('Finished fitting B')

In [None]:
# Keep the best estimator from the call type B search. `rf_random` is rebound
# by the C search cell, so this must run before that cell does.
best_rfB = rf_random.best_estimator_

In [None]:
# Validation RMSE for call type B. The square root is taken explicitly because
# the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(B_val['TRAVEL_TIME'].values, best_rfB.predict(B_val[categorical_B])))

In [None]:
# Re-fit the shared `target` encoder on the call type C training rows (this
# replaces the mapping it previously learned) and apply it to C's validation rows.
c_travel_time = C_train['TRAVEL_TIME']
C_train = target.fit_transform(C_train, c_travel_time)
C_val = target.transform(C_val)

In [None]:
# Tune a random forest for call type C with a randomized hyperparameter search.
# The forest is seeded as well as the search: previously only the search (and
# the train/validation split) used a fixed seed, so the fitted trees and the
# selected model changed from run to run.
# NOTE(review): both the forest and the search request 16 workers; confirm
# this nesting does not oversubscribe the machine.
rfC = RandomForestRegressor(
    bootstrap=True,
    max_features='sqrt',
    n_jobs=16,
    random_state=42,
)
rf_random = RandomizedSearchCV(
    estimator=rfC,
    param_distributions=random_grid,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=16,
)
rf_random.fit(C_train[categorical_C], C_train['TRAVEL_TIME'])
print('Finished fitting C')

In [None]:
# Keep the best estimator from the call type C search (the most recent
# object bound to `rf_random`).
best_rfC = rf_random.best_estimator_

In [None]:
# Validation RMSE for call type C. The square root is taken explicitly because
# the `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(C_val['TRAVEL_TIME'].values, best_rfC.predict(C_val[categorical_C])))

In [None]:
# Re-read the test table and carve out one frame per call type.
# reset_index() renumbers the rows and, since `drop` is not set, keeps the
# pre-filter row labels as an extra column.
test = pd.read_csv('processed_test.csv')
A_data_test, B_data_test, C_data_test = (
    test[test['CALL_TYPE'] == call_type].reset_index()
    for call_type in ('A', 'B', 'C')
)

In [None]:
def _encode_test_features(raw_pool, test_frame, feature_cols):
    """Return ``test_frame[feature_cols]`` with identifier columns target-encoded.

    The forests are fit on target-encoded ORIGIN_CALL / ORIGIN_STAND / TAXI_ID
    values, so the test rows must go through the same mapping before
    ``predict``. The notebook's shared ``target`` encoder cannot be reused
    here: it is refit for every call type, so it only holds the mapping of
    the last one fitted. Instead, a per-call-type encoder is rebuilt from the
    raw rows in ``raw_pool``, using the same 80/20 split and seed as the
    training cell so the mapping matches what the model was trained on.

    Parameters
    ----------
    raw_pool : DataFrame
        Un-encoded training rows of one call type (must contain TRAVEL_TIME).
    test_frame : DataFrame
        Un-encoded test rows of the same call type.
    feature_cols : list of str
        Columns the model was fit on, in training order.
    """
    encoded_cols = [
        col for col in feature_cols
        if col in ('ORIGIN_CALL', 'ORIGIN_STAND', 'TAXI_ID')
    ]
    # Must mirror the train/validation split parameters used for training.
    train_part, _ = train_test_split(raw_pool, test_size=0.2, random_state=42)
    train_part = train_part.reset_index(drop=True)

    encoder = TargetEncoder(cols=encoded_cols)
    encoder.fit(train_part[encoded_cols], train_part['TRAVEL_TIME'])
    encoded = encoder.transform(test_frame[encoded_cols])

    untouched = [col for col in feature_cols if col not in encoded_cols]
    # Restore the training column order expected by the fitted model.
    return pd.concat([encoded, test_frame[untouched]], axis=1)[feature_cols]


def _predict_call_type(model, raw_pool, test_frame, feature_cols):
    """Return a TRIP_ID / TRAVEL_TIME frame of ``model`` predictions for ``test_frame``."""
    features = _encode_test_features(raw_pool, test_frame, feature_cols)
    return pd.DataFrame({
        'TRIP_ID': test_frame['TRIP_ID'],
        'TRAVEL_TIME': model.predict(features),
    })


# A, B and C are the raw (un-encoded) per-call-type training subsets.
dfA = _predict_call_type(best_rfA, A, A_data_test, categorical_A)
dfB = _predict_call_type(best_rfB, B, B_data_test, categorical_B)
dfC = _predict_call_type(best_rfC, C, C_data_test, categorical_C)

In [None]:
# Stack the per-call-type prediction frames into one table with a fresh 0..n-1 index.
submission = pd.concat([dfA, dfB, dfC], ignore_index=True)

In [None]:
def extract_id(value):
    """Return the integer encoded in ``value`` after its first character."""
    numeric_part = value[1:]
    return int(numeric_part)
# Order the rows by the numeric part of TRIP_ID.
_trip_order = submission['TRIP_ID'].map(extract_id).argsort()
submission = submission.iloc[_trip_order]

In [None]:
# Display the submission frame as the cell output for a visual check.
submission

In [None]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('submission.csv', index=False)