In [1]:
import sys
import random
import numpy as np
import pandas as pd
import lightgbm as lgb
import gc
import polars as pl
pd.set_option('display.min_rows',None)
from tqdm import tqdm
import datetime
import time
from pandarallel import pandarallel
import random
from tqdm import tqdm_notebook
from sklearn.metrics import r2_score
pandarallel.initialize()
import warnings 
warnings.filterwarnings("ignore")

INFO: Pandarallel will run on 44 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [2]:
def sliding_window (df, end_date, day=7) :
    start_date = end_date - datetime.timedelta(day)
    df = df.filter(
        pl.col("date").is_between(start_date, end_date),
    )
    return df

In [3]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [4]:
def get_label (df_behaviors, end_date, day=7) :
    user_behaviors_label_windows = sliding_window(df_behaviors, 
                                                   end_date, 
                                                   day=7)
    user_history_hebaviors = sliding_window(df_behaviors, 
                                             end_date - datetime.timedelta(day), 
                                             day=777)
    out = user_behaviors_label_windows.groupby("did").agg(
        [
            pl.col("date_day").n_unique().alias('active_days'),  
            pl.col("vid").count().alias('watch_nums'), 
            pl.col("vts").sum().alias('watch_durations'), 
        ]
    )
    return out,user_history_hebaviors

In [5]:
%%time
vid_info = pl.read_csv('../data/vid_info.csv')
# vid_info = vid_info.lazy()

CPU times: total: 3.75 s
Wall time: 206 ms


In [6]:
%%time
user_behaviors = pl.read_csv('../data/user_behaviors.csv')
# user_behaviors = user_behaviors.lazy()

CPU times: total: 3min 3s
Wall time: 15.2 s


In [7]:
%%time
user_behaviors = user_behaviors.with_columns(
    [
        pl.col("timestamp")*1000
    ]
)

CPU times: total: 891 ms
Wall time: 878 ms


In [8]:
%%time
user_behaviors = user_behaviors.with_columns(
    [
        pl.col("timestamp").cast(pl.Datetime(time_unit="ms")).alias("date")
    ]
)

CPU times: total: 1.2 s
Wall time: 1.19 s


In [9]:
%%time
user_behaviors = user_behaviors.with_columns(
    [
        pl.col("date").dt.replace_time_zone("UTC")
    ]
)

CPU times: total: 20.1 s
Wall time: 20.1 s


In [10]:
%%time
user_behaviors = user_behaviors.with_columns(
    [
        pl.col("date").dt.convert_time_zone("Asia/Shanghai")
    ]
)

CPU times: total: 0 ns
Wall time: 999 µs


In [11]:
%%time
user_behaviors = user_behaviors.with_columns(
    [
        pl.col("date").dt.date().alias('date_day'),
        pl.col("date").dt.hour().alias('date_hour'),
        pl.col("date").dt.weekday().alias('date_weekday'),
        pl.col("date").dt.minute().alias('date_minute') % 10,
    ]
)

CPU times: total: 1min 1s
Wall time: 24.6 s


In [12]:
%%time
user_behaviors = user_behaviors.join(vid_info[['vid','cid']],on='vid',how='left')

CPU times: total: 35 s
Wall time: 3.78 s


In [13]:
user_behaviors = user_behaviors.drop("cid")

In [14]:
# vid 视频id
# cid 合集id
# Is_intact 视频类型 1正片,2短片,3预告片,4花絮,5番外,6前序,7彩蛋,8,分支剧情,9摘要
# online_time 上线时间
# serialno 集号
# series_id 系列id
# duration 视频时长
# stars 明星
# tags 标签
# img_url 视频封面图片地址
# classify_id 频道id

In [15]:
%%time
# 训练数据信息1
df_train_label_1, user_train_history_behaviors_1 = get_label(user_behaviors, user_behaviors['date'].max()-datetime.timedelta(7*1), day=7)

CPU times: total: 59.5 s
Wall time: 13.8 s


In [16]:
%%time
#生成验证集label
df_valid_label, user_valid_history_behaviors = get_label(user_behaviors, user_behaviors['date'].max(), day=7)

CPU times: total: 58.3 s
Wall time: 18.7 s


In [17]:
def make_number_feats (df, vid_info, days) :
    end_date = df['date'].max()
    # 截取需要的时间范围    
    df_temp = sliding_window(df, end_date, days)
    
    print(df_temp['date'].max(),df_temp['date'].min())
    
    df_temp = df_temp.join(vid_info, on=["vid"], how="left")
    
    df_feats = df_temp.groupby("did").agg(
    [
        ((pl.col("timestamp").max() - pl.col("timestamp").min())/3600).alias('timestamp_ptp_{}'.format(days)),
        
        pl.col("date_day").n_unique().alias('did_date_{}_nunique'.format(days)), 
        
        pl.col("cid").n_unique().alias('did_cid_{}_nunique'.format(days)), 
        
        pl.col("classify_id").n_unique().alias('did_classify_id_{}_nunique'.format(days)), 
        
        pl.col("vid").count().alias('did_vid_{}_count'.format(days)), 
        pl.col("vid").n_unique().alias('did_vid_{}_nunique'.format(days)),  
        
        (pl.col("vid").n_unique() / pl.col("vid").count()).alias('did_vid_{}_nunique_count'.format(days)), 
    ]
    )
    
    a = df_temp.groupby(["did","cid"]).agg(
    
        pl.col("vid").count().alias('cid_{}'.format(days)),
    
    )

    a = a.groupby("did", maintain_order=True).agg(pl.col("cid").take(0).alias('cid_like_{}'.format(days)))
    df_feats = df_feats.join(a, on=["did"], how="left")
    del a
                                                  
    a = df_temp.groupby(["did","classify_id"]).agg(
    
        pl.col("vid").count().alias('classify_id_{}'.format(days))
    
    )
    a = a.groupby("did", maintain_order=True).agg(pl.col("classify_id").take(0).alias('classify_id_like_{}'.format(days)))
    df_feats = df_feats.join(a, on=["did"], how="left")
    del a
    
    a = df_temp.groupby(["vid"]).agg(
    [
        pl.col("vid").count().alias('g_vid_{}'.format(days)), 
    ]
    )
    df_temp = df_temp.join(a, on=["vid"], how="left")
    del a
                                                  
    for key in [
        'vts',
        'g_vid_{}'.format(days)
               ]:
        a = df_temp.groupby("did").agg(
            [
                pl.col("{}".format(key)).mean().alias('did_{}_{}_mean'.format(key,days)), 
                pl.col("{}".format(key)).std().alias('did_{}_{}_std'.format(key,days)), 
                pl.col("{}".format(key)).min().alias('did_{}_{}_min'.format(key,days)), 
                pl.col("{}".format(key)).max().alias('did_{}_{}_max'.format(key,days)), 
                pl.col("{}".format(key)).sum().alias('did_{}_{}_sum'.format(key,days)), 
            ])
        df_feats = df_feats.join(a, on=["did"], how="left")
    return df_feats

In [18]:
# %%time 测试代码
# a = make_number_feats (user_train_history_behaviors_1, vid_info, 1)

In [19]:
def create_sample (user_history_behaviors, df_label) :
    #训练集特征窗口
    for days in [1,3,5,7,14,28]:
        print(days)
        df_feats = make_number_feats (user_history_behaviors, vid_info, days=days)
        df_label = df_label.join(df_feats, on='did', how='left') 
        del df_feats
    return df_label

In [20]:
%%time
df_train_data_1 = create_sample (user_train_history_behaviors_1, df_train_label_1)

1
2023-01-25 23:57:52+08:00 2023-01-24 23:57:52+08:00
3
2023-01-25 23:57:52+08:00 2023-01-22 23:57:52+08:00
5
2023-01-25 23:57:52+08:00 2023-01-20 23:57:52+08:00
7
2023-01-25 23:57:52+08:00 2023-01-18 23:57:52+08:00
14
2023-01-25 23:57:52+08:00 2023-01-11 23:57:52+08:00
28
2023-01-25 23:57:52+08:00 2022-12-28 23:57:52+08:00
CPU times: total: 27min 15s
Wall time: 2min 9s


In [21]:
%%time
# 获取用户全局信息
m_timestamp = user_train_history_behaviors_1['timestamp'].max()
global_df =  user_train_history_behaviors_1.groupby("did").agg(
    [
    ((m_timestamp - pl.col("timestamp").min())/3600).alias('activa_time'), 
    ]
)    
df_train_data_1 = df_train_data_1.join(global_df,on=['did'],how='left')

CPU times: total: 2min 9s
Wall time: 6.03 s


In [22]:
df_train_data_1.head()

did,active_days,watch_nums,watch_durations,timestamp_ptp_1,did_date_1_nunique,did_cid_1_nunique,did_classify_id_1_nunique,did_vid_1_count,did_vid_1_nunique,did_vid_1_nunique_count,cid_like_1,classify_id_like_1,did_vts_1_mean,did_vts_1_std,did_vts_1_min,did_vts_1_max,did_vts_1_sum,did_g_vid_1_1_mean,did_g_vid_1_1_std,did_g_vid_1_1_min,did_g_vid_1_1_max,did_g_vid_1_1_sum,timestamp_ptp_3,did_date_3_nunique,did_cid_3_nunique,did_classify_id_3_nunique,did_vid_3_count,did_vid_3_nunique,did_vid_3_nunique_count,cid_like_3,classify_id_like_3,did_vts_3_mean,did_vts_3_std,did_vts_3_min,did_vts_3_max,did_vts_3_sum,…,did_cid_14_nunique,did_classify_id_14_nunique,did_vid_14_count,did_vid_14_nunique,did_vid_14_nunique_count,cid_like_14,classify_id_like_14,did_vts_14_mean,did_vts_14_std,did_vts_14_min,did_vts_14_max,did_vts_14_sum,did_g_vid_14_14_mean,did_g_vid_14_14_std,did_g_vid_14_14_min,did_g_vid_14_14_max,did_g_vid_14_14_sum,timestamp_ptp_28,did_date_28_nunique,did_cid_28_nunique,did_classify_id_28_nunique,did_vid_28_count,did_vid_28_nunique,did_vid_28_nunique_count,cid_like_28,classify_id_like_28,did_vts_28_mean,did_vts_28_std,did_vts_28_min,did_vts_28_max,did_vts_28_sum,did_g_vid_28_28_mean,did_g_vid_28_28_std,did_g_vid_28_28_min,did_g_vid_28_28_max,did_g_vid_28_28_sum,activa_time
str,u32,u32,f64,f64,u32,u32,u32,u32,u32,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64,u32,u32,u32,u32,u32,f64,i64,i64,f64,f64,f64,f64,f64,…,u32,u32,u32,u32,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64,u32,u32,u32,u32,u32,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,f64
"""0fe03beb66df23…",2,4,53.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,12678.0,4.0,900.0,0.0,900.0,900.0,900.0,53427.0,0.0,53427.0,53427.0,53427.0,11342.222222,2.0,1.0,1.0,5.0,4.0,0.8,12678.0,4.0,2099.0,1541.720468,12.0,3412.0,10495.0,…,1,1,10,7,0.7,12678,4,1055.0,1505.803588,9.0,3412.0,10550.0,498674.1,119569.139268,282756,648408,4986741,639182.222222,11,2,1,16,13,0.8125,12678,4,696.1875,1264.577042,7.0,3412.0,11139.0,535132.5625,133377.099317,282756,720531,8562121,1254600.0
"""4d18c63083ded8…",3,20,10449.0,8568.888889,1.0,6.0,2.0,16.0,16.0,1.0,7534.0,4.0,715.375,743.961457,1.0,1859.0,11446.0,2350.4375,912.747937,4.0,3319.0,37607.0,8568.888889,1.0,6.0,2.0,16.0,16.0,1.0,12942.0,6.0,715.375,743.961457,1.0,1859.0,11446.0,…,8,3,26,26,1.0,9477,12,485.307692,652.384903,1.0,1859.0,12618.0,10108.653846,9304.970542,11,25778,262825,648675.555556,9,15,4,37,37,1.0,10190,12,616.108108,1217.206656,1.0,6776.0,22796.0,35184.513514,58508.997114,4,258880,1301827,1640800.0
"""a34e146e6dadc6…",1,1,6.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,6,1,13,13,1.0,5564,4,50.0,30.948883,4.0,106.0,650.0,149990.461538,235045.184612,577,648408,1949876,455111.111111,13,9,1,31,31,1.0,5877,4,949.580645,1435.32746,4.0,4900.0,29437.0,191217.967742,270069.524946,287,720531,5927757,1274900.0
"""c5789a35335bfb…",7,22,22493.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2152.0,4.0,1260.0,0.0,1260.0,1260.0,1260.0,762.0,0.0,762.0,762.0,762.0,40960.0,3.0,1.0,1.0,7.0,5.0,0.714286,2152.0,4.0,1515.571429,1135.17603,4.0,3088.0,10609.0,…,4,2,38,36,0.947368,12367,3,1860.289474,1387.996279,2.0,4440.0,70691.0,11747.421053,22539.644446,352,114878,446402,648000.0,26,11,2,68,66,0.970588,5877,3,1719.132353,1423.886817,2.0,5617.0,116901.0,50710.470588,153897.812958,267,720531,3448312,1644500.0
"""7b6aa195c0e757…",3,9,5099.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,9515.0,4.0,657.0,0.0,657.0,657.0,657.0,38715.0,0.0,38715.0,38715.0,38715.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,9515.0,4.0,657.0,0.0,657.0,657.0,657.0,…,1,1,2,2,1.0,9515,4,956.0,422.849855,657.0,1255.0,1912.0,200580.0,170415.562693,80078,321082,401160,299271.111111,2,1,1,2,2,1.0,9515,4,956.0,422.849855,657.0,1255.0,1912.0,322089.0,1424.113057,321082,323096,644178,1125100.0


In [23]:
del user_train_history_behaviors_1,df_train_label_1
gc.collect()

0

In [24]:
%%time
df_train_data = df_train_data_1.to_pandas()

CPU times: total: 5.77 s
Wall time: 1.02 s


In [25]:
del df_train_data_1
gc.collect()

0

In [26]:
# %%time
# df_train_data = reduce_mem_usage(df_train_data)

In [27]:
%%time
#构建验证集样本
df_valid_data = create_sample (user_valid_history_behaviors, df_valid_label)

1
2023-02-01 23:57:52+08:00 2023-01-31 23:57:52+08:00
3
2023-02-01 23:57:52+08:00 2023-01-29 23:57:52+08:00
5
2023-02-01 23:57:52+08:00 2023-01-27 23:57:52+08:00
7
2023-02-01 23:57:52+08:00 2023-01-25 23:57:52+08:00
14
2023-02-01 23:57:52+08:00 2023-01-18 23:57:52+08:00
28
2023-02-01 23:57:52+08:00 2023-01-04 23:57:52+08:00
CPU times: total: 36min 11s
Wall time: 2min 29s


In [28]:
%%time
# 获取用户全局信息
m_timestamp = user_valid_history_behaviors['timestamp'].max()
global_df =  user_valid_history_behaviors.groupby("did").agg(
    [
    (m_timestamp - pl.col("timestamp").min()/3600).alias('activa_time'), 
    ]
)    
df_valid_data = df_valid_data.join(global_df,on=['did'],how='left')

CPU times: total: 2min 3s
Wall time: 6.45 s


In [29]:
del user_valid_history_behaviors,df_valid_label
gc.collect()

0

In [30]:
%%time
df_valid_data = df_valid_data.to_pandas()

CPU times: total: 10.3 s
Wall time: 1.12 s


In [31]:
# %%time
# df_valid_data = reduce_mem_usage(df_valid_data)

In [32]:
%%time
#测试集特征
df_test_data = create_sample (user_behaviors, user_behaviors.unique(subset=['did'], keep='first')[['did']])

1
2023-02-08 23:57:52+08:00 2023-02-07 23:57:52+08:00
3
2023-02-08 23:57:52+08:00 2023-02-05 23:57:52+08:00
5
2023-02-08 23:57:52+08:00 2023-02-03 23:57:52+08:00
7
2023-02-08 23:57:52+08:00 2023-02-01 23:57:52+08:00
14
2023-02-08 23:57:52+08:00 2023-01-25 23:57:52+08:00
28
2023-02-08 23:57:52+08:00 2023-01-11 23:57:52+08:00
CPU times: total: 39min 1s
Wall time: 2min 49s


In [33]:
%%time
# 获取用户全局信息
m_timestamp = user_behaviors['timestamp'].max()
global_df =  user_behaviors.groupby("did").agg(
    [
    (m_timestamp - pl.col("timestamp").min()/3600).alias('activa_time'), 
    ]
)    
df_test_data = df_test_data.join(global_df,on=['did'],how='left')

CPU times: total: 2min 54s
Wall time: 8.62 s


In [34]:
gc.collect()

0

In [35]:
%%time
df_test_data = df_test_data.to_pandas()

CPU times: total: 7.47 s
Wall time: 1.41 s


In [36]:
# %%time
# df_test_data = reduce_mem_usage(df_test_data)

In [37]:
def train (df_train, df_valid, label, params, features) :
    train_label = df_train[label].values
    train_feat = df_train[features]

    valid_label = df_valid[label].values
    valid_feat = df_valid[features]
#     print (train_feat.columns )
    gc.collect()

    trn_data = lgb.Dataset(train_feat, label=train_label)#, categorical_feature=cat_cols)
    val_data = lgb.Dataset(valid_feat, label=valid_label)#, categorical_feature=cat_cols)
    clf = lgb.train(params,
                    trn_data,
                    #3000,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=50,
                    #categorical_feature=cat_cols, 
                    early_stopping_rounds=100)#, feval=custom_metric)#, feval=self_gauc)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    fold_importance_df = fold_importance_df.sort_values(by='importance', ascending=False)
    fold_importance_df.to_csv("{}_fold_importance_df.csv".format(label, index=None) )
    df_valid['{}_preds'.format(label)] = clf.predict(valid_feat, num_iteration=clf.best_iteration)
                              
    result = r2_score(df_valid[label], df_valid['{}_preds'.format(label)])
    
    return clf, result

In [38]:
useless_cols = ['did', 'active_days', 'active_days_preds', 
                'watch_nums', 'watch_nums_preds', 
                'watch_durations', 'watch_durations_preds']
features = df_train_data.columns[~df_train_data.columns.isin(useless_cols)].values
print (features)
print (len(features))
params = {
    'objective': 'regression', #定义的目标函数
    #'metric': {'auc', 'binary_logloss'}, rmse',
    'metric': {'rmse'},
    'boosting_type' : 'gbdt',

    'learning_rate': 0.05,
    'max_depth' : -1,
    'num_leaves' : 2 ** 8,

#     'feature_fraction' : 0.70,
#     'subsample' : 0.75,
    'seed' : 114,
    'num_iterations' : 3000,
    'nthread' : -1,
    'verbose' : -1,
    #'scale_pos_weight':200
}

['timestamp_ptp_1' 'did_date_1_nunique' 'did_cid_1_nunique'
 'did_classify_id_1_nunique' 'did_vid_1_count' 'did_vid_1_nunique'
 'did_vid_1_nunique_count' 'cid_like_1' 'classify_id_like_1'
 'did_vts_1_mean' 'did_vts_1_std' 'did_vts_1_min' 'did_vts_1_max'
 'did_vts_1_sum' 'did_g_vid_1_1_mean' 'did_g_vid_1_1_std'
 'did_g_vid_1_1_min' 'did_g_vid_1_1_max' 'did_g_vid_1_1_sum'
 'timestamp_ptp_3' 'did_date_3_nunique' 'did_cid_3_nunique'
 'did_classify_id_3_nunique' 'did_vid_3_count' 'did_vid_3_nunique'
 'did_vid_3_nunique_count' 'cid_like_3' 'classify_id_like_3'
 'did_vts_3_mean' 'did_vts_3_std' 'did_vts_3_min' 'did_vts_3_max'
 'did_vts_3_sum' 'did_g_vid_3_3_mean' 'did_g_vid_3_3_std'
 'did_g_vid_3_3_min' 'did_g_vid_3_3_max' 'did_g_vid_3_3_sum'
 'timestamp_ptp_5' 'did_date_5_nunique' 'did_cid_5_nunique'
 'did_classify_id_5_nunique' 'did_vid_5_count' 'did_vid_5_nunique'
 'did_vid_5_nunique_count' 'cid_like_5' 'classify_id_like_5'
 'did_vts_5_mean' 'did_vts_5_std' 'did_vts_5_min' 'did_vts_5_max'


In [39]:
#离线验证
active_days_valid_clf, active_days_valid_result = train (df_train_data, df_valid_data, 'active_days', params, features)
watch_nums_valid_clf, watch_nums_valid_result = train (df_train_data, df_valid_data, 'watch_nums', params, features)
watch_durations_valid_clf, watch_durations_valid_result = train (df_train_data, df_valid_data, 'watch_durations', params, features)

Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 1.54511	valid_1's rmse: 1.56146
[100]	training's rmse: 1.52732	valid_1's rmse: 1.57991
Early stopping, best iteration is:
[36]	training's rmse: 1.56017	valid_1's rmse: 1.55692
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 14.5074	valid_1's rmse: 14.7443
[100]	training's rmse: 14.2279	valid_1's rmse: 15.0113
Early stopping, best iteration is:
[21]	training's rmse: 15.0421	valid_1's rmse: 14.4879
Training until validation scores don't improve for 100 rounds
[50]	training's rmse: 13887.5	valid_1's rmse: 14299.6
[100]	training's rmse: 13571.1	valid_1's rmse: 14646.2
Early stopping, best iteration is:
[43]	training's rmse: 13967.2	valid_1's rmse: 14282.5


In [40]:
active_days_valid_result * 0.5 + watch_nums_valid_result * 0.25 + watch_durations_valid_result * 0.25

0.2821276082689113

In [41]:
active_days_valid_result
# 0.3081352281883971

0.3025921554594039

In [42]:
watch_nums_valid_result
# 0.1805256261514938

0.17068038424515808

In [43]:
watch_durations_valid_result
# 0.3531987557874068

0.3526457379116794

In [44]:
#预测留存
params['num_iterations'] = active_days_valid_clf.best_iteration
active_days_test_clf, _ = train (df_valid_data, df_valid_data, 'active_days', params, features)

params['num_iterations'] = watch_nums_valid_clf.best_iteration
watch_nums_test_clf, _ = train (df_valid_data, df_valid_data, 'watch_nums', params, features)

params['num_iterations'] = watch_durations_valid_clf.best_iteration
watch_durations_test_clf, _ = train (df_valid_data, df_valid_data, 'watch_durations', params, features)

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[36]	training's rmse: 1.49108	valid_1's rmse: 1.49108
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[21]	training's rmse: 14.0919	valid_1's rmse: 14.0919
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[43]	training's rmse: 13822.4	valid_1's rmse: 13822.4


In [45]:
#predict
df_test_data['active_days'] = active_days_test_clf.predict(df_test_data[features], num_iteration=active_days_test_clf.best_iteration)
df_test_data['watch_nums'] = watch_nums_test_clf.predict(df_test_data[features], num_iteration=watch_nums_test_clf.best_iteration)
df_test_data['watch_durations'] = watch_durations_test_clf.predict(df_test_data[features], num_iteration=watch_durations_test_clf.best_iteration)
df_test_answer = df_test_data[['did', 'active_days', 'watch_nums', 'watch_durations']]

In [46]:
df_test_answer.to_csv('../submit/df_test_answer_baseline_v1_001.csv', index=None)

In [47]:
df_test_answer['active_days'].min()

2.1324817181968707