In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import statsmodels.api as sm
import math
import json
import matplotlib.pyplot as plt
import xgboost as xgb
from catboost import Pool, CatBoostRegressor
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK
from functools import partial
from sklearn.cluster import KMeans

In [None]:
# Load the training set and shuffle its rows once.  The shuffle is seeded
# because the train/validation split made later is positional (first 80% of
# the rows), so an unseeded shuffle would give a different split on every run.
train_data = pd.read_csv('train.csv', encoding='utf-8')
train_data = shuffle(train_data, random_state=42)

In [None]:
# Load the test set (the rows that the submission files are predicted for).
test_data = pd.read_csv('test.csv', encoding='utf-8')

In [None]:
def encode_hour(hour):
    """Map an hour of the day to one of five coarse time-of-day classes.

    Returns 0 for [6, 10), 1 for [10, 14), 2 for [14, 18), 3 for [18, 22)
    and 4 for the overnight span (22 or later, or earlier than 6).  A value
    for which every comparison is false (e.g. NaN) yields ``None``.
    """
    # Overnight wraps around midnight, so handle it before the daytime buckets.
    if hour >= 22 or hour < 6:
        return 4
    # Remaining comparable values lie in [6, 22): pick the first 4-hour bucket
    # whose upper bound exceeds the hour.
    for bucket, upper_bound in enumerate((10, 14, 18, 22)):
        if hour < upper_bound:
            return bucket
    return None

In [None]:
# Derive the coarse time-of-day class from the raw hour for both frames.
for data in (train_data, test_data):
    data['hour_class'] = data['Hour'].apply(encode_hour)

In [None]:
# encoding direction
# Each compass heading becomes an (x, y) direction vector; diagonals use 0.7
# for both components.
direc_dic = {
    'E': [1, 0],
    'NE': [0.7, 0.7],
    'N': [0, 1],
    'NW': [-0.7, 0.7],
    'W': [-1, 0],
    'SW': [-0.7, -0.7],
    'S': [0, -1],
    'SE': [0.7, -0.7],
}
heading_coor_name = ['x', 'y']
for data in [train_data, test_data]:
    for heading in ['EntryHeading', 'ExitHeading']:
        # One new column per vector component: <heading>_x, then <heading>_y.
        for component, suffix in enumerate(heading_coor_name):
            data['{}_{}'.format(heading, suffix)] = data[heading].apply(
                lambda direction, component=component: direc_dic[direction][component])

In [None]:
# calculate "steering angle" feature: the dot product of the entry and exit
# heading vectors (entry_x * exit_x + entry_y * exit_y)
for data in [train_data, test_data]:
    entry_x, entry_y = data['EntryHeading_x'].values, data['EntryHeading_y'].values
    exit_x, exit_y = data['ExitHeading_x'].values, data['ExitHeading_y'].values
    data['steering_angle'] = entry_x * exit_x + entry_y * exit_y

In [None]:
# add "same_heading" feature: 1 when the exit heading equals the entry heading, else 0
for data in [train_data, test_data]:
    data['same_heading'] = data['EntryHeading'].eq(data['ExitHeading']).astype(int)

In [None]:
# encoding city: integer code per city name (alphabetical order)
city_dic = dict(zip(['Atlanta', 'Boston', 'Chicago', 'Philadelphia'], range(4)))
for data in (train_data, test_data):
    data['city_code'] = data['City'].apply(lambda city_name: city_dic[city_name])

In [None]:
# unique id
# Shift each city's IntersectionId by a multiple of `id_stride` so that ids
# from different cities cannot coincide (presumably the raw ids repeat across
# cities -- confirm against the data).
# The stride is the full width of the raw id range, max - min + 1, measured
# over train AND test.  A stride of only max - min maps the largest id of one
# city onto the smallest id of the next, and a train-only range misses ids
# that occur only in the test set.
all_intersection_ids = pd.concat([train_data['IntersectionId'], test_data['IntersectionId']])
min_id = all_intersection_ids.min()
max_id = all_intersection_ids.max()
id_stride = max_id - min_id + 1
train_data['UniqueId'] = train_data['IntersectionId']
test_data['UniqueId'] = test_data['IntersectionId']
for i, city in enumerate(['Atlanta', 'Boston', 'Philadelphia', 'Chicago']):
    for data in [train_data, test_data]:
        in_city = data['City'] == city
        data.loc[in_city, 'UniqueId'] = data.loc[in_city, 'IntersectionId'] + i * id_stride

In [None]:
# handle latitude and longitude data
# Cluster the intersections of each city into `n_zones` geographic zones with
# k-means on (Latitude, Longitude).  The model is fitted on the training rows
# only and then applied to the test rows.  Zone labels are offset by
# city code * n_zones so that zones of different cities get different values.
n_zones = 10
geo_cols = ['Latitude', 'Longitude']
train_data['city_pos'] = 0
test_data['city_pos'] = 0

for city in ['Atlanta', 'Boston', 'Philadelphia', 'Chicago']:
    train_in_city = train_data['City'] == city
    test_in_city = test_data['City'] == city
    # Seeded: k-means initialisation is random, and without a fixed seed the
    # zone labels (a model feature) would change from run to run.
    kmeans = KMeans(n_clusters=n_zones, random_state=42).fit(train_data.loc[train_in_city, geo_cols])
    train_data.loc[train_in_city, 'city_pos'] = kmeans.labels_ + city_dic[city] * n_zones
    # predict() rejects an empty input, so skip cities absent from the test set.
    if test_in_city.any():
        test_geo_pred = kmeans.predict(test_data.loc[test_in_city, geo_cols])
        test_data.loc[test_in_city, 'city_pos'] = test_geo_pred + city_dic[city] * n_zones

#  fig = plt.figure(figsize=(16, 12))
#  colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'aqua', 'brown', 'darkblue']
#  print color
# for i in range(10):
#     x = np.array(geo_df[geo_df['cluster_n'] == i]['Latitude'])
#     y = np.array(geo_df[geo_df['cluster_n'] == i]['Longitude'])
#     plt.scatter(x, y, c=colors[i], alpha=0.5)
# plt.show()

In [None]:
def add_distance(df):
    """Add a ``CenterDistance`` column to ``df`` in place.

    The value is the Euclidean distance, in raw latitude/longitude degrees,
    between each row's (``Latitude``, ``Longitude``) and a fixed reference
    point for the row's ``City``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``City``, ``Latitude`` and ``Longitude``.
        The frame is modified in place; nothing is returned.

    Raises
    ------
    KeyError
        If ``City`` holds a value that has no reference point below.
    """
    # (latitude, longitude) reference point per city.
    city_centers = {
        "Atlanta": (33.753746, -84.386330),
        "Boston": (42.361145, -71.057083),
        "Chicago": (41.881832, -87.623177),
        "Philadelphia": (39.952583, -75.165222),
    }
    # Fail loudly on an unknown city instead of silently producing NaN.
    unknown = [city for city in df["City"].unique() if city not in city_centers]
    if unknown:
        raise KeyError("no city center defined for: {}".format(unknown))

    # Vectorized: look up the centre coordinates per row, then compute the
    # distance for the whole column at once (also works for an empty frame).
    center_lat = df["City"].map({city: pos[0] for city, pos in city_centers.items()}).astype(float)
    center_lon = df["City"].map({city: pos[1] for city, pos in city_centers.items()}).astype(float)
    df["CenterDistance"] = np.sqrt((center_lat - df["Latitude"]) ** 2 +
                                   (center_lon - df["Longitude"]) ** 2)
# Add the CenterDistance column to both frames (in place).
for data in (train_data, test_data):
    add_distance(data)

In [None]:
# Encode entry/exit street names as integer codes.  Codes are assigned per
# city from one shared counter, so no code is reused across cities, and entry
# and exit columns share the same name -> code mapping.
street_cols = ['EntryStreetName', 'ExitStreetName']
street_code_cols = ['EntryStreetCode', 'ExitStreetCode']

# Create the code columns once, BEFORE the city loop.  Resetting them to 0
# inside the loop erases the codes already written for earlier cities, which
# leaves only the last city with non-zero codes.
for data in [train_data, test_data]:
    for code_col in street_code_cols:
        data[code_col] = 0

street_code_start = 0
for city in ['Atlanta', 'Boston', 'Chicago', 'Philadelphia']:
    # Every street name seen for this city: entry or exit, train or test.
    city_streets = set()
    for data in [train_data, test_data]:
        in_city = data['City'] == city
        for name_col in street_cols:
            city_streets.update(data.loc[in_city, name_col].unique())

    # Sort so the name -> code mapping is the same on every run (set order
    # depends on string hashing).  key=str keeps a missing name (NaN) from
    # breaking the comparison with real strings.
    city_streets = sorted(city_streets, key=str)
    city_streets_dic = dict(zip(city_streets, range(street_code_start, street_code_start + len(city_streets))))
    street_code_start += len(city_streets)

    for data in [train_data, test_data]:
        in_city = data['City'] == city
        for name_col, code_col in zip(street_cols, street_code_cols):
            data.loc[in_city, code_col] = data.loc[in_city, name_col].apply(lambda x: city_streets_dic[x])
    
#     print(len(train_city_streets), len(test_city_streets))
#     print(len(set(train_city_streets) ^ set(test_city_streets)))
#     print(len(city_streets))
#     print(sum(city_streets))

In [None]:
# encode road type
# Maps a street-name token to an integer road-type code; looked up by street_type().
# NOTE(review): 'St' maps to 0, the same value street_type() returns for missing or
# unrecognised names, while 'Street' maps to 15 -- unlike 'Avenue'/'Ave', which share
# one code.  Confirm this is intended.
street_encoding = {'Street': 15, 'St': 0, 'Avenue': 1, 'Ave': 1, 'Boulevard': 2, 'Road': 3,
                'Drive': 4, 'Lane': 5, 'Tunnel': 6, 'Highway': 7, 'Way': 8, 'Parkway': 9,
                'Parking': 10, 'Oval': 11, 'Square': 12, 'Place': 13, 'Bridge': 14}
def street_type(street_name, street_encoding):
    """Return the road-type code of a street name.

    The name is split on whitespace and the code of the first token that is a
    key of ``street_encoding`` is returned.  Missing names (NaN/None) and
    names without any known token yield 0.
    """
    if pd.isna(street_name):
        return 0
    known_codes = (street_encoding[token] for token in street_name.split() if token in street_encoding)
    return next(known_codes, 0)

In [None]:
# Road-type code of the entry and the exit street, for both frames.
for data in [train_data, test_data]:
    for side in ('Entry', 'Exit'):
        data[side + 'Street_type'] = data[side + 'StreetName'].apply(
            lambda name: street_type(name, street_encoding))

In [None]:
# encode path
# Each distinct Path value gets an integer code.  Codes are assigned per city
# from one shared counter, so no code is reused across cities.

# Create the column once, BEFORE the city loop.  Resetting it to 0 inside the
# loop erases the codes already written for earlier cities.
for data in [train_data, test_data]:
    data['path_code'] = 0

path_code_start = 0
for city in ['Atlanta', 'Boston', 'Chicago', 'Philadelphia']:
    train_paths = list(train_data.loc[train_data['City'] == city, 'Path'].unique())
    test_paths = list(test_data.loc[test_data['City'] == city, 'Path'].unique())
    # Sorted so the path -> code mapping is identical on every run (set order
    # depends on string hashing); key=str tolerates a missing value (NaN).
    city_paths = sorted(set(train_paths) | set(test_paths), key=str)
    city_paths_dic = dict(zip(city_paths, range(path_code_start, path_code_start + len(city_paths))))
    path_code_start += len(city_paths)

    for data in [train_data, test_data]:
        in_city = data['City'] == city
        data.loc[in_city, 'path_code'] = data.loc[in_city, 'Path'].apply(lambda x: city_paths_dic[x])



In [None]:
# Model inputs, in the column order used for every model below.
feature_cols = [
    'Latitude',
    'Longitude',
    'EntryHeading_x',
    'EntryHeading_y',
    'ExitHeading_x',
    'ExitHeading_y',
    'Hour',
    'Weekend',
    'Month',
    'UniqueId',
    'CenterDistance',
    'city_pos',
    'EntryStreetCode',
    'ExitStreetCode',
    'hour_class',
    'EntryStreet_type',
    'ExitStreet_type',
    'steering_angle',
    'same_heading',
]
# Prediction targets: the 20th/50th/80th percentile columns of each measure,
# grouped by measure.
target_cols = [
    '{}_p{}'.format(measure, percentile)
    for measure in ('TotalTimeStopped', 'DistanceToFirstStop')
    for percentile in (20, 50, 80)
]

In [None]:
# Feature columns that LightGBM is told to treat as categorical.
categorical_features = [
    'UniqueId',
    'Hour',
    'Weekend',
    'Month',
    'EntryStreetCode',
    'ExitStreetCode',
    'city_pos',
    'hour_class',
    'EntryStreet_type',
    'ExitStreet_type',
    'same_heading',
]

In [None]:
# Number of rows (the first 80% of the shuffled training frame) used for
# fitting; the remaining rows form the validation split.
train_size = int(len(train_data) * 0.8)

In [None]:
# Positional split of the shuffled frame: the first `train_size` rows are used
# for fitting, the rest for validation.
fit_rows = train_data.iloc[:train_size]
val_rows = train_data.iloc[train_size:]
X_train, y_train = fit_rows[feature_cols], fit_rows[target_cols]
X_val, y_val = val_rows[feature_cols], val_rows[target_cols]
X_test = test_data[feature_cols]

In [None]:
def lgb_model(params, lgb_train, lgb_val, model_name='', boost_round=2000, early_stop=20, verbose_eval=200):
    """Train a LightGBM booster with early stopping and optionally save it.

    Parameters
    ----------
    params : dict
        LightGBM training parameters, passed to ``lgb.train`` unchanged.
    lgb_train : lgb.Dataset
        Training data.
    lgb_val : lgb.Dataset
        Validation data, used as ``valid_sets`` for early stopping.
    model_name : str, optional
        When non-empty, the booster is saved to ``'<model_name>.txt'``.
    boost_round : int, optional
        Maximum number of boosting rounds.
    early_stop : int, optional
        Passed as ``early_stopping_rounds``.
    verbose_eval : int or bool, optional
        Passed to ``lgb.train`` as ``verbose_eval``.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    print('Starting training...')
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=boost_round,
                    valid_sets=lgb_val,
                    early_stopping_rounds=early_stop,
                    # Forward the caller's value; it used to be hard-coded to
                    # 200, which silently ignored the parameter.
                    verbose_eval=verbose_eval)

    # Only announce (and perform) the save when a model name was given.
    if model_name != '':
        print('Saving model...')
        gbm.save_model('{}.txt'.format(model_name))
    return gbm

In [None]:
# create dataset for lightgbm (first target only: TotalTimeStopped_p20)
first_target = 'TotalTimeStopped_p20'
lgb_train = lgb.Dataset(
    X_train,
    label=y_train[first_target],
    categorical_feature=categorical_features,
    free_raw_data=False,
)
# The validation set references the training set.
lgb_val = lgb.Dataset(X_val, label=y_val[first_target], reference=lgb_train)

# Baseline LightGBM configuration: gradient-boosted trees, regression
# objective, evaluated with RMSE.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'rmse'},
    num_leaves=50,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=1,
)

In [None]:
def lgb_objective(params, lgb_train, lgb_val, X_val, y_val):
    """Hyperopt objective: train with ``params`` and score on the validation set.

    Trains a booster through ``lgb_model``, predicts ``X_val`` at the
    booster's best iteration and returns a dict with the validation RMSE under
    ``'loss'`` and ``STATUS_OK`` under ``'status'``.
    """
    booster = lgb_model(params, lgb_train, lgb_val)
    predictions = booster.predict(X_val, num_iteration=booster.best_iteration)
    mse = mean_squared_error(y_val, predictions)
    return dict(loss=mse ** 0.5, status=STATUS_OK)

In [None]:
# using hyperopt to tune parameters of LightGBM
def hyperopt_lgb(lgb_objective, lgb_train, lgb_val, X_val, y_val, max_evals=10):
    """Search LightGBM hyper-parameters with hyperopt's TPE algorithm.

    Parameters
    ----------
    lgb_objective : callable
        Called as ``lgb_objective(params, lgb_train=..., lgb_val=...,
        X_val=..., y_val=...)``; its result is what ``fmin`` minimises.
    lgb_train, lgb_val, X_val, y_val
        Forwarded unchanged to ``lgb_objective`` on every evaluation.
    max_evals : int, optional
        Number of parameter sets to evaluate.  Defaults to 10, the value that
        used to be hard-coded.

    Returns
    -------
    dict
        The best point found, resolved to concrete values by ``space_eval``.
    """
    space = {'objective': 'regression',
             'metric': 'rmse',
             'boosting': 'gbdt',
             'num_leaves': hp.choice('num_leaves', list(range(20, 800, 20))),
             'feature_fraction': hp.choice('feature_fraction', [.7, .8, .9, 1]),
             'bagging_fraction': hp.uniform('bagging_fraction', 0.7, 1),
             'learning_rate': hp.uniform('learning_rate', 0.03, 0.12),
            }
    # Bind the data arguments so fmin only has to supply `params`.
    fmin_objective = partial(lgb_objective, lgb_train=lgb_train, lgb_val=lgb_val, X_val=X_val, y_val=y_val)
    best_vals = fmin(fmin_objective, space, algo=tpe.suggest, max_evals=max_evals)
    # fmin reports hp.choice results as list indices; space_eval turns them
    # back into the actual parameter values.
    best_params = space_eval(space, best_vals)
    return best_params

In [None]:
# tune lightGBM parameters: one hyperopt search per target; the best parameter
# set of each target is written to its own JSON text file.
for i, target in enumerate(target_cols):
    print("Target: {}".format(target))
    train_label, val_label = y_train[target], y_val[target]
    lgb_train = lgb.Dataset(X_train, label=train_label,
                            categorical_feature=categorical_features, free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, label=val_label, reference=lgb_train)
    best_params = hyperopt_lgb(lgb_objective, lgb_train, lgb_val, X_val, val_label)
    file_name = '{}_lgb_params_19features_new2.txt'.format(target)
    with open(file_name, 'w') as f:
        json.dump(best_params, f)
    print(best_params)

In [None]:
# Train one LightGBM model per target with its tuned parameters, predict the
# test set, and collect one submission frame per target.
submissions = []
models = []
for i, target in enumerate(target_cols):
    print("Target: {}".format(target))
    lgb_train = lgb.Dataset(X_train, y_train[target],
                            categorical_feature=categorical_features, free_raw_data=False)
    lgb_val = lgb.Dataset(X_val, y_val[target], reference=lgb_train)
    # Read the parameters from the '*_new2.txt' files, which is the name the
    # tuning loop in this notebook writes.  The '*_new.txt' name used here
    # before is not produced anywhere in the notebook.
    # The context manager closes the file even if parsing fails.
    with open('{}_lgb_params_19features_new2.txt'.format(target), 'r') as f:
        params = json.load(f)
    gbm = lgb_model(params, lgb_train, lgb_val, '{}_19features_new'.format(target), boost_round=2000)
    models.append(gbm)
    print('Starting predicting...')
    y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # TargetId is '<RowId>_<target index>'.
    sub_target_id = list(test_data['RowId'].apply(lambda x: '{}_{}'.format(x, i)).values)
    sub_target_df = pd.DataFrame({'TargetId': sub_target_id, 'Target': y_pred})
    submissions.append(sub_target_df)

In [None]:
# Stack the per-target prediction frames and write the LightGBM submission file.
pd.concat(submissions).to_csv('submission_lgb_19features_fine_tuned_new2.csv', index=False)

In [None]:
# Persist the training and test frames, including every engineered column.
train_data.to_csv('train_features.csv', index=False)
test_data.to_csv('test_features.csv', index=False)

In [None]:
# check feature importance of light GBM
# Reload each saved booster from disk and build a table of its gain-based
# feature importances, most important feature first.
models, feature_importances = [], []
for target in target_cols:
    model = lgb.Booster(model_file='{}_19features_new.txt'.format(target))
    importance_table = pd.DataFrame({
        'column': model.feature_name(),
        'importance': model.feature_importance(importance_type='gain'),
    })
    tmp_df = importance_table.sort_values(by='importance', ascending=False)
    models.append(model)
    feature_importances.append(tmp_df)

In [None]:
# Write one feature-importance CSV per target.
for target_name, importance_df in zip(target_cols, feature_importances):
    importance_df.to_csv('{}_12100811_feature_importance.csv'.format(target_name))

In [None]:
# Plot the feature importance of the first target's model.
# `tree_index` and `show_info` are arguments of lgb.plot_tree; plot_importance
# forwards unknown keyword arguments to matplotlib's barh, which rejects them,
# so they are dropped here.  Gain is used to match the importance tables
# computed with importance_type='gain' in this notebook.
# NOTE(review): if a single-tree plot was intended, use lgb.plot_tree instead.
ax = lgb.plot_importance(models[0], importance_type='gain', figsize=(20, 20))
plt.show()

In [None]:
# xgboost model
xgb_params = {
    'booster': 'gbtree',
    'objective': 'reg:squarederror',  # regression
    'gamma': 0.1,                  # controls post-pruning; larger is more conservative (typically 0.1 or 0.2)
    'max_depth': 10,               # depth of the trees; deeper trees overfit more easily
    'lambda': 1.,                  # L2 regularisation weight on model complexity; larger means less overfitting
    'subsample': 0.8,              # random subsampling of training rows
    'colsample_bytree': 0.7,       # column subsampling when building each tree
    'min_child_weight': 20,        # this key was listed twice (3, then 20); 20 was the value in effect
    'silent': 1,                   # 1 suppresses run-time messages; the original note recommends 0
    'eta': 0.05,                   # acts like a learning rate
    'seed': 1000,
    'nthread': 2,                  # number of CPU threads
    'eval_metric': 'rmse',         # evaluation metric
}

# Train one XGBoost model per target, save it, and predict the test set.
nround = 10
xgb_submissions = []
# Build the test matrix before the loop.  The loop predicts on it, and it was
# previously only defined by a later cell, so a fresh run failed here.
xgb_test = xgb.DMatrix(X_test)
# Every target is trained (the first one used to be skipped), so each target
# gets a prediction frame and a saved model.
for i, target in enumerate(target_cols):
    xgb_train = xgb.DMatrix(X_train, label=y_train[target])
    xgb_valid = xgb.DMatrix(X_val, label=y_val[target])

    watchlist = [(xgb_valid, 'valid')]
    xgb_model = xgb.train(xgb_params, xgb_train, nround, evals=watchlist,
                          verbose_eval=1, early_stopping_rounds=50)
    print("Best Iteration: {}".format(xgb_model.best_iteration))
    # Saved because a later cell loads '<target>_xgb.model' for every target.
    xgb_model.save_model('{}_xgb.model'.format(target))
    # prediction
    # best_iteration is 0-based while ntree_limit is a tree count, hence + 1.
    # Passing best_iteration directly drops the best tree, and a value of 0
    # would mean "use all trees".
    y_pred = xgb_model.predict(xgb_test, ntree_limit=xgb_model.best_iteration + 1)
    xgb_sub_target_id = list(test_data['RowId'].apply(lambda x: '{}_{}'.format(x, i)).values)
    xgb_sub_target_df = pd.DataFrame({'TargetId': xgb_sub_target_id, 'Target': y_pred})
    xgb_submissions.append(xgb_sub_target_df)
    

In [None]:
# Predict the test set with the XGBoost model saved for each target and
# collect one submission frame per target.
submissions = []
xgb_test = xgb.DMatrix(X_test)
for i, target in enumerate(target_cols):
    # Create an empty booster, then load the saved model into it.
    xgb_model = xgb.Booster({'nthread': 2})
    xgb_model.load_model('{}_xgb.model'.format(target))
    y_pred = xgb_model.predict(xgb_test)
    # TargetId is '<RowId>_<target index>'.
    sub_target_id = ['{}_{}'.format(row_id, i) for row_id in test_data['RowId']]
    sub_target_df = pd.DataFrame({'TargetId': sub_target_id, 'Target': y_pred})
    submissions.append(sub_target_df)

In [None]:
# Stack the per-target prediction frames and write the XGBoost submission file.
pd.concat(submissions).to_csv('submission_xgb_19features.csv', index=False)

In [None]:
# Column positions of every non-float feature; these are passed to CatBoost as
# the categorical features.  `np.float` was only an alias of the builtin
# `float` and has been removed from NumPy, so compare against `float`
# directly, which selects the same columns.
categorical_features_indices = np.where(X_train.dtypes != float)[0]

In [None]:
# CatBoost training configuration.
cat_params = {
    'iterations': 500,
    'learning_rate': 0.01,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    # This key was listed twice ('Silent', then 'Verbose'); the later value
    # was the one in effect, so it is the one kept.
    'logging_level': 'Verbose',
    'use_best_model': True,
    'od_type': 'Iter',
    'od_wait': 40,
    # NOTE(review): GPU training on devices '1:3' is specific to the machine
    # this was written on -- adjust before running elsewhere.
    'task_type': "GPU",
    'devices': '1:3',
    'loss_function': 'RMSE',
    'grow_policy': 'Lossguide',
}

# Train one CatBoost model per target and predict the test set.
cat_submissions = []
test_pool = Pool(X_test, cat_features=categorical_features_indices)
for i, target in enumerate(target_cols):
    train_pool = Pool(X_train, y_train[target], categorical_features_indices)
    val_pool = Pool(X_val, y_val[target], categorical_features_indices)
    # specify the training parameters
    cat_model = CatBoostRegressor(**cat_params)
    # train the model
    cat_model.fit(train_pool, eval_set=val_pool)
    # Predict with the CatBoost model that was just fitted.  The previous code
    # called `model.predict`, where `model` is a leftover LightGBM booster
    # from the feature-importance cell.
    preds = cat_model.predict(test_pool)
    cat_sub_target_id = list(test_data['RowId'].apply(lambda x: '{}_{}'.format(x, i)).values)
    # Store these predictions; `y_pred` here was a leftover from the XGBoost cells.
    cat_sub_target_df = pd.DataFrame({'TargetId': cat_sub_target_id, 'Target': preds})
    cat_submissions.append(cat_sub_target_df)