In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import statsmodels.api as sm
import math
import json
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK
from functools import partial
from sklearn.cluster import KMeans

In [10]:
train_data = pd.read_csv('train.csv', encoding='utf-8')
train_data = shuffle(train_data)

In [11]:
test_data = pd.read_csv('test.csv', encoding='utf-8')

In [79]:
def encode_hour(hour):
    if 6 <= hour and hour < 10:
        return 0
    if 10 <= hour and hour < 14:
        return 1
    if 14 <= hour and hour < 18:
        return 2
    if 18 <= hour and hour < 22:
        return 3
    if 22 <= hour or hour < 6:
        return 4

In [82]:
train_data['hour_class'] = train_data['Hour'].apply(encode_hour)
test_data['hour_class'] = test_data['Hour'].apply(encode_hour)

In [15]:
direc_dic = {'E': [1, 0], 'NE': [0.7, 0.7], 'N': [0, 1], 'NW': [-0.7, 0.7], 
             'W': [-1, 0], 'SW': [-0.7, -0.7], 'S': [0, -1], 'SE': [0.7, -0.7] }
heading_coor_name = ['x', 'y']
# encoding direction
for data in [train_data, test_data]:
    for heading in ['EntryHeading', 'ExitHeading']:
        for i in [0, 1]:
            new_col = heading + '_' + heading_coor_name[i]
            data[new_col] = data[heading].apply(lambda x: direc_dic[x][i])

In [16]:
# encoding city
city_dic = {'Atlanta':0, 'Boston':1, 'Chicago':2, 'Philadelphia':3}
for data in [train_data, test_data]:
    data['city_code'] = data['City'].apply(lambda x: city_dic[x])

In [161]:
# unique id
min_id = min(train_data['IntersectionId'].unique())
max_id = max(train_data['IntersectionId'].unique())
train_data['UniqueId'] = train_data['IntersectionId']
test_data['UniqueId'] = test_data['IntersectionId']
for i, city in enumerate(['Atlanta', 'Boston', 'Philadelphia', 'Chicago']):
    train_city_df = train_data[train_data['City'] == city]
    test_city_df = test_data[test_data['City'] == city]
    train_data.loc[train_city_df.index, 'UniqueId'] = train_city_df['IntersectionId'] + i * (max_id - min_id)
    test_data.loc[test_city_df.index, 'UniqueId'] = test_city_df['IntersectionId'] + i * (max_id - min_id)

In [143]:
# handle latitude and longitude data
train_data['city_pos'] = 0
test_data['city_pos'] = 0

for city in ['Atlanta', 'Boston', 'Philadelphia', 'Chicago']:
    train_geo_df = train_data[train_data['City'] == city]
    test_geo_df = test_data[test_data['City'] == city]
    kmeans = KMeans(n_clusters=10).fit(train_geo_df[['Latitude', 'Longitude']])
    train_data.loc[train_geo_df.index, 'city_pos'] = kmeans.labels_ + city_dic[city] * 10 
    test_geo_pred = kmeans.predict(test_data.loc[test_geo_df.index][['Latitude', 'Longitude']])
    test_data.loc[test_geo_df.index, 'city_pos'] = test_geo_pred + city_dic[city] * 10 

#  fig = plt.figure(figsize=(16, 12))
#  colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'aqua', 'brown', 'darkblue']
#  print color
# for i in range(10):
#     x = np.array(geo_df[geo_df['cluster_n'] == i]['Latitude'])
#     y = np.array(geo_df[geo_df['cluster_n'] == i]['Longitude'])
#     plt.scatter(x, y, c=colors[i], alpha=0.5)
# plt.show()

In [105]:
def add_distance(df):
    df_center = pd.DataFrame({"Atlanta":[33.753746, -84.386330], 
                            "Boston":[42.361145, -71.057083], 
                                "Chicago":[41.881832, -87.623177], 
                                  "Philadelphia":[39.952583, -75.165222]})
    df["CenterDistance"] = df.apply(lambda row: math.sqrt((df_center[row.City][0] - row.Latitude) ** 2 +
                                                              (df_center[row.City][1] - row.Longitude) ** 2) , axis=1)
add_distance(train_data)
add_distance(test_data)

In [14]:
feature_cols = ['Latitude', 'Longitude', 'EntryHeading_x', 'EntryHeading_y',
                'ExitHeading_x', 'ExitHeading_y', 'city_code', 'Hour', 'Weekend', 
                'Month', 'city_pos', 'UniqueId', 'CenterDistance', 'hour_class']
target_cols = [
    'TotalTimeStopped_p20',
    'TotalTimeStopped_p50',
    'TotalTimeStopped_p80',
    'DistanceToFirstStop_p20',
    'DistanceToFirstStop_p50',
    'DistanceToFirstStop_p80',
]

In [18]:
categorical_features = ['UniqueId', 'city_code', 'Hour', 'Weekend', 'Month', 'city_pos', 'hour_class']

In [12]:
train_size = int(len(train_data) * 0.7)

In [15]:
X_train = train_data.iloc[:train_size][feature_cols]
y_train = train_data.iloc[:train_size][target_cols]
X_val = train_data.iloc[train_size:][feature_cols]
y_val = train_data.iloc[train_size:][target_cols]
X_test = test_data[feature_cols]

In [19]:
def lgb_model(params, lgb_train, lgb_val, model_name='', boost_round=2000, early_stop=20, verbose_eval=200):
    print('Starting training...')
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=boost_round,
                valid_sets=lgb_val,
                early_stopping_rounds=early_stop, verbose_eval=200)

    print('Saving model...')
    # save model to file
    if model_name != '':
        gbm.save_model('{}.txt'.format(model_name))
    # eval
#     print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
    return gbm

In [20]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train['TotalTimeStopped_p20'], 
                        categorical_feature=categorical_features, free_raw_data=False)
lgb_val = lgb.Dataset(X_val, y_val['TotalTimeStopped_p20'], reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'num_leaves': 50,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

In [21]:
def lgb_objective(params, lgb_train, lgb_val, X_val, y_val):
    gbm = lgb_model(params, lgb_train, lgb_val)
    y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)
    rmse_val = mean_squared_error(y_val, y_pred) ** 0.5
    return {'loss': rmse_val ,  'status': STATUS_OK}

In [22]:
# using hyperopt to tune parameters of LightGBM
def hyperopt_lgb(lgb_objective, lgb_train, lgb_val, X_val, y_val):
    space = {'objective': 'regression',
             'metric':'rmse',
             'boosting':'gbdt',
             'num_leaves': hp.choice('num_leaves', list(range(20, 180, 20))),
             'feature_fraction': hp.choice('feature_fraction', [.7, .8, .9, 1]),
             'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1),
             'learning_rate': hp.uniform('learning_rate', 0.03, 0.12),
            }
    fmin_objective = partial(lgb_objective,lgb_train=lgb_train, lgb_val=lgb_val, X_val=X_val, y_val=y_val)
    best_vals = fmin(fmin_objective, space, algo=tpe.suggest, max_evals=10)
    best_params = space_eval(space, best_vals)
    return best_params

In [None]:
# tune lightGBM parameters
for i, target in enumerate(target_cols):
    print("Target: {}".format(target))
    lgb_train = lgb.Dataset(X_train, y_train[target], 
                        categorical_feature=categorical_features, free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, y_val[target], reference=lgb_train)
    best_params = hyperopt_lgb(lgb_objective, lgb_train, lgb_val, X_val, y_val[target])
    file_name='{}_lgb_params_14features.txt'.format(target)
    with open(file_name, 'w') as f:
        f.write(json.dumps(best_params)) 
    print(best_params)

Target: TotalTimeStopped_p20
Starting training...


New categorical_feature is ['Hour', 'Month', 'UniqueId', 'Weekend', 'city_code', 'city_pos', 'hour_class']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds
[200]	valid_0's rmse: 6.04853
[400]	valid_0's rmse: 5.94412
[600]	valid_0's rmse: 5.88258
[800]	valid_0's rmse: 5.83812
[1000]	valid_0's rmse: 5.808
[1200]	valid_0's rmse: 5.77716
[1400]	valid_0's rmse: 5.74717
[1600]	valid_0's rmse: 5.73068
[1800]	valid_0's rmse: 5.7056
[2000]	valid_0's rmse: 5.69773
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 5.69773
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.75637
[400]	valid_0's rmse: 5.69289
[600]	valid_0's rmse: 5.64635
[800]	valid_0's rmse: 5.61988
[1000]	valid_0's rmse: 5.61149
[1200]	valid_0's rmse: 5.60394
Early stopping, best iteration is:
[1297]	valid_0's rmse: 5.59859
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.81335
[400]	valid_0's rmse: 5.75804
[600]	valid_0's rmse: 5.70824
[800]	valid_0's rmse: 5.68208
[1000]	valid_0's rmse: 5.66153
[1200]	valid_0's rmse: 5.64382
[1400]	valid_0's rmse: 5.63488
[1600]	valid_0's rmse: 5.62044
Early stopping, best iteration is:
[1634]	valid_0's rmse: 5.61957
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 6.35113
[400]	valid_0's rmse: 6.20911
[600]	valid_0's rmse: 6.14716
[800]	valid_0's rmse: 6.09525
[1000]	valid_0's rmse: 6.04999
[1200]	valid_0's rmse: 6.02103
[1400]	valid_0's rmse: 5.99663
[1600]	valid_0's rmse: 5.97911
[1800]	valid_0's rmse: 5.9598
[2000]	valid_0's rmse: 5.9433
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 5.9433
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.91721
[400]	valid_0's rmse: 5.82052
[600]	valid_0's rmse: 5.76799
[800]	valid_0's rmse: 5.67968
[1000]	valid_0's rmse: 5.65567
[1200]	valid_0's rmse: 5.63561
[1400]	valid_0's rmse: 5.61916
Early stopping, best iteration is:
[1485]	valid_0's rmse: 5.61529
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.80994
[400]	valid_0's rmse: 5.69372
[600]	valid_0's rmse: 5.64218
[800]	valid_0's rmse: 5.60405
[1000]	valid_0's rmse: 5.58852
[1200]	valid_0's rmse: 5.57816
Early stopping, best iteration is:
[1263]	valid_0's rmse: 5.5776
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.90853
[400]	valid_0's rmse: 5.7324
[600]	valid_0's rmse: 5.67969
[800]	valid_0's rmse: 5.65319
[1000]	valid_0's rmse: 5.6264
[1200]	valid_0's rmse: 5.61222
[1400]	valid_0's rmse: 5.60471
[1600]	valid_0's rmse: 5.59965
Early stopping, best iteration is:
[1728]	valid_0's rmse: 5.59597
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.80655
[400]	valid_0's rmse: 5.72578
[600]	valid_0's rmse: 5.68696
[800]	valid_0's rmse: 5.65313
[1000]	valid_0's rmse: 5.63961
Early stopping, best iteration is:
[1067]	valid_0's rmse: 5.63742
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 6.15079
[400]	valid_0's rmse: 6.03247
[600]	valid_0's rmse: 5.95248
[800]	valid_0's rmse: 5.90041
[1000]	valid_0's rmse: 5.86864
[1200]	valid_0's rmse: 5.83537
[1400]	valid_0's rmse: 5.80911
[1600]	valid_0's rmse: 5.78481
[1800]	valid_0's rmse: 5.77194
[2000]	valid_0's rmse: 5.74577
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 5.74577
Saving model...
Starting training...
Training until validation scores don't improve for 20 rounds




[200]	valid_0's rmse: 5.67093
Early stopping, best iteration is:
[331]	valid_0's rmse: 5.63544
Saving model...
{'bagging_fraction': 0.9311084425834824, 'boosting': 'gbdt', 'feature_fraction': 1, 'learning_rate': 0.040405727551738074, 'metric': 'rmse', 'num_leaves': 140, 'objective': 'regression'}
Target: TotalTimeStopped_p50
Starting training...


New categorical_feature is ['Hour', 'Month', 'UniqueId', 'Weekend', 'city_code', 'city_pos', 'hour_class']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds
[200]	valid_0's rmse: 10.403
[400]	valid_0's rmse: 10.1481
[600]	valid_0's rmse: 10.0542


In [177]:
submissions = []
for i, target in enumerate(target_cols):
    print("Target: {}".format(target))
    lgb_train = lgb.Dataset(X_train, y_train[target], 
                        categorical_feature=categorical_features, free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, y_val[target], reference=lgb_train)
    f = open('{}_lgb_params_new.txt'.format(target), 'r')
    dic_str = f.readline()
    params = json.loads(dic_str)
    f.close()
    gbm = lgb_model(params, lgb_train, lgb_val, target, boost_round=2000)
    print('Starting predicting...')
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    sub_target_id = list(test_data['RowId'].apply(lambda x: '{}_{}'.format(x, i)).values)
    sub_target_df = pd.DataFrame({'TargetId': sub_target_id, 'Target': y_pred})
    submissions.append(sub_target_df)

Target: TotalTimeStopped_p20
Starting training...


New categorical_feature is ['Hour', 'Month', 'UniqueId', 'Weekend', 'city_code', 'city_pos', 'hour_class']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 20 rounds
[200]	valid_0's rmse: 5.7159
[400]	valid_0's rmse: 5.64164
[600]	valid_0's rmse: 5.61648
[800]	valid_0's rmse: 5.59479
[1000]	valid_0's rmse: 5.57975
Early stopping, best iteration is:
[1163]	valid_0's rmse: 5.5708
Saving model...
Starting predicting...
Target: TotalTimeStopped_p50
Starting training...
Training until validation scores don't improve for 20 rounds
[200]	valid_0's rmse: 10.4649
[400]	valid_0's rmse: 10.1139
[600]	valid_0's rmse: 9.99422
[800]	valid_0's rmse: 9.93252
[1000]	valid_0's rmse: 9.87061
[1200]	valid_0's rmse: 9.81012
[1400]	valid_0's rmse: 9.74984
[1600]	valid_0's rmse: 9.71671
[1800]	valid_0's rmse: 9.69315
[2000]	valid_0's rmse: 9.6822
Did not meet early stopping. Best iteration is:
[1997]	valid_0's rmse: 9.68219
Saving model...
Starting predicting...
Target: TotalTimeStopped_p80
Starting training...
Training until validation scores don't improve for 20 rounds
[200]	valid_0's rmse: 17.4847
[400]	vali

In [178]:
pd.concat(submissions).to_csv('submission_lgb_fine_tuned_14features.csv', index=False)

In [179]:
train_data.to_csv('train_14_features.csv', index=False)
test_data.to_csv('test_14_features.csv', index=False)