In [1]:
import time
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import log_loss

In [2]:
def timestamp_datetime(value):
    """Render a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    The timestamp is broken down with ``time.localtime``, so the result is
    expressed in the local time zone of the machine running the code.

    Parameters
    ----------
    value
        Seconds since the epoch, as accepted by ``time.localtime``.

    Returns
    -------
    str
        The formatted local date and time.
    """
    local_struct = time.localtime(value)
    return time.strftime('%Y-%m-%d %H:%M:%S', local_struct)


def convert_data(data):
    """Add time-derived and per-user activity features to a log frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``context_timestamp`` column (values accepted by
        ``timestamp_datetime``) and a ``user_id`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus, in this order:

        * ``time`` -- formatted timestamp string from ``timestamp_datetime``
        * ``day`` -- day of month parsed from ``time`` (characters 8-9)
        * ``hour`` -- hour of day parsed from ``time`` (characters 11-12)
        * ``user_query_day`` -- number of rows sharing the row's
          ``user_id`` and ``day``
        * ``user_query_day_hour`` -- number of rows sharing the row's
          ``user_id``, ``day`` and ``hour``

        The row order of the input is kept; the merges ignore the input
        index, so the result carries a default integer index.

    Notes
    -----
    The input frame is left untouched: all columns are added to a copy.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    data = data.copy()

    data['time'] = data['context_timestamp'].apply(timestamp_datetime)
    # 'time' looks like 'YYYY-MM-DD HH:MM:SS'; slice out the day and the hour.
    data['day'] = data['time'].apply(lambda stamp: int(stamp[8:10]))
    data['hour'] = data['time'].apply(lambda stamp: int(stamp[11:13]))

    # One count feature per key set: count the rows in each group, then
    # left-join the count back onto every row of that group.
    count_features = (
        (['user_id', 'day'], 'user_query_day'),
        (['user_id', 'day', 'hour'], 'user_query_day_hour'),
    )
    for keys, count_col in count_features:
        # Name the count column directly rather than renaming the implicit 0.
        counts = data.groupby(keys).size().reset_index(name=count_col)
        data = pd.merge(data, counts, how='left', on=keys)

    return data

In [4]:
if __name__ == "__main__":
    # Switch between offline validation (False) and producing an online
    # submission (True).
    online = True

    # Reads the space-separated file 'df.txt' from the working directory.
    data = pd.read_csv('df.txt', sep=' ')
    # Rebind instead of using inplace=True; the resulting frame is the same.
    data = data.drop_duplicates()
    data = convert_data(data)

    if online:
        # Online submission: train on everything, score the separate test file.
        train = data.copy()
        test = pd.read_csv('test.txt', sep=' ')
        test = convert_data(test)
    else:
        # Offline validation: hold out day 24, train on the days before it
        # (the original note lists the data as covering days 18-24).
        train = data.loc[data.day < 24].copy()
        # Copy the slice so prediction columns can be added to `test` later
        # without triggering pandas' SettingWithCopyWarning.
        test = data.loc[data.day == 24].copy()

    # Columns fed to the model; 'user_query_day', 'user_query_day_hour' and
    # 'hour' are the ones added by convert_data.
    features = [
        'item_id',
        'item_brand_id',
        'item_city_id',
        'item_price_level',
        'item_sales_level',
        'item_collected_level',
        'item_pv_level',
        'user_gender_id',
        'user_occupation_id',
        'user_age_level',
        'user_star_level',
        'user_query_day',
        'user_query_day_hour',
        'context_page_id',
        'hour',
        'shop_id',
        'shop_review_num_level',
        'shop_star_level',
        'shop_review_positive_rate',
        'shop_score_service',
        'shop_score_delivery',
        'shop_score_description',
    ]
    # Label column, kept as a one-element list.
    target = ['is_trade']

#     if online == False:
#         clf = lgb.LGBMClassifier(
#             num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
#         clf.fit(
#             train[features],
#             train[target],
#             feature_name=features,
#             categorical_feature=[
#                 'user_gender_id',
#             ])
#         test['lgb_predict'] = clf.predict_proba(test[features], )[:, 1]
#         print(log_loss(test[target], test['lgb_predict']))
#     else:
#         clf = lgb.LGBMClassifier(
#             num_leaves=63, max_depth=7, n_estimators=80, n_jobs=20)
#         clf.fit(
#             train[features],
#             train[target],
#             categorical_feature=[
#                 'user_gender_id',
#             ])
#         test['predicted_score'] = clf.predict_proba(test[features])[:, 1]
#         test[['instance_id', 'predicted_score']].to_csv(
#             'baseline.csv', index=False, sep=' ')  # save the online submission file

In [5]:
# Show the first five rows of `data` as the cell output.
data.head(n=5)

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade,time,day,hour,user_query_day,user_query_day_hour
0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,5002,1.0,1.0,1.0,0,2018-09-18 10:09:04,18,10,3,3
1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,5002,1.0,1.0,1.0,0,2018-09-18 12:00:32,18,12,8,5
2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,5002,1.0,1.0,1.0,0,2018-09-18 03:04:12,18,3,2,2
3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,5002,1.0,1.0,1.0,0,2018-09-18 06:17:50,18,6,1,1
4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,5002,1.0,1.0,1.0,0,2018-09-18 19:48:40,18,19,2,2
