In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

import gc
import time

from utils import *

# Folder that holds the CSV inputs and the intermediate feather files.
PATH = Path('data')
# Show what is currently present in the data folder.
list(PATH.iterdir())

[PosixPath('data/train_no_att_time.csv'),
 PosixPath('data/train_day7_3to16_nextclick.feather'),
 PosixPath('data/train_day8_3to16_FE.feather'),
 PosixPath('data/train_df_10mil_fe_v1.feather'),
 PosixPath('data/train_day8_3to16_nextclick.feather'),
 PosixPath('data/sample_submission.csv'),
 PosixPath('data/train_day9_3to16_FE.feather'),
 PosixPath('data/train_day9_3to16_nextclick.feather'),
 PosixPath('data/train_sample.csv'),
 PosixPath('data/train_day7_3to16_FE.feather'),
 PosixPath('data/val_df_10mil_fe_v1.feather'),
 PosixPath('data/train_day7_3to16.feather'),
 PosixPath('data/train_day9_3to16_todo.feather'),
 PosixPath('data/train_day9_3to16.feather'),
 PosixPath('data/mean_enc_df'),
 PosixPath('data/test.csv'),
 PosixPath('data/train_day8_3to16.feather')]

In [2]:
# Load the day-9 frame through the utils helper `get_feather`.
# NOTE(review): "3to16" in the file name presumably means hours 3-16 -- confirm.
day_x = get_feather('train_day9_3to16.feather',PATH)

In [3]:
# Sanity check: (rows, columns) of the loaded frame.
day_x.shape

(43546887, 19)

# Next click (time until the next click within each group)

In [4]:
# Column groups for which a "time to next click" feature is generated.
GROUP_BY_NEXT_CLICKS = [
    {'groupby': cols}
    for cols in (['ip'], ['ip', 'app'], ['ip', 'channel'])
]

for spec in GROUP_BY_NEXT_CLICKS:
    group_cols = spec['groupby']
    # Feature name = joined group columns + fixed suffix, e.g. "ip_app_next_click".
    new_fea = f"{'_'.join(group_cols)}_next_click"
    print(f">> Grouping by {spec['groupby']}, and saving time to next click in: {new_fea}")
    # time_till_next_click is a utils helper; its return value replaces day_x.
    day_x = time_till_next_click(day_x, group_cols, new_fea)

>> Grouping by ['ip'], and saving time to next click in: ip_next_click
>> Grouping by ['ip', 'app'], and saving time to next click in: ip_app_next_click
>> Grouping by ['ip', 'channel'], and saving time to next click in: ip_channel_next_click


In [5]:
# Run the frame through the utils helper `downcast_dtypes`.
# NOTE(review): presumably narrows numeric dtypes to save memory -- confirm in utils.
day_x= downcast_dtypes(day_x)

In [6]:
# Checkpoint: write the frame, now including the next-click columns, to feather.
day_x.to_feather(PATH/'train_day9_3to16_nextclick.feather')

# Cumcount (for each day) (might be discarded)

In [7]:
# Column combinations handed to cum_count, one call per entry.
CUMCOUNT_GROUP = [
    ['ip', 'os'],
    ['ip', 'app'],
    ['ip', 'device', 'os', 'app'],
    ['ip'],
    # TODO: decide which other column combinations belong here
]



In [8]:
# Call the utils helper `cum_count` once per column group; each result replaces day_x.
for spec in CUMCOUNT_GROUP:
    day_x = cum_count(spec,day_x)

In [9]:
# Inspect the column dtypes of the frame at this point.
day_x.dtypes

ip                                                 int32
app                                                int32
device                                             int32
os                                                 int32
channel                                            int32
click_time                                datetime64[ns]
is_attributed                                      int32
day                                                uint8
hour                                               uint8
os_device_mean_target_day8_alpha1                float32
ip_mean_target_day8_alpha1                       float32
device_mean_target_day8_alpha1                   float32
app_channel_mean_target_day8_alpha1              float32
app_os_mean_target_day8_alpha1                   float32
channel_device_mean_target_day8_alpha1           float32
app_mean_target_day8_alpha1                      float32
channel_os_mean_target_day8_alpha1               float32
app_device_mean_target_day8_alp

In [10]:
# Downcast via the utils helper, then checkpoint the frame to the "_todo" feather file.
day_x= downcast_dtypes(day_x)
day_x.to_feather(PATH/'train_day9_3to16_todo.feather')

In [13]:
# Peek at the last five rows, transposed so each column reads as a row.
day_x.tail().T

Unnamed: 0,43546882,43546883,43546884,43546885,43546886
ip,121312,46894,320126,189286,106485
app,12,3,1,12,11
device,1,1,1,1,1
os,10,19,13,37,19
channel,340,211,274,259,137
click_time,2017-11-09 16:00:00,2017-11-09 16:00:00,2017-11-09 16:00:00,2017-11-09 16:00:00,2017-11-09 16:00:00
is_attributed,0,0,0,0,0
day,9,9,9,9,9
hour,16,16,16,16,16
os_device_mean_target_day8_alpha1,0.00125083,0.00178335,0.0015165,0.00316955,0.00178335


In [14]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

7

# Group-by aggregation (for each day)

In [15]:
# Group-by specs, consumed one at a time by groupby_agg. Keys used below:
#   groupby  - columns to group on
#   select   - column handed to the aggregation (None for the 'size' specs)
#   agg      - aggregation name or callable
#   agg_name - label supplied alongside a callable aggregation
GROUPBY_AGGREGATIONS = [
    # --- Disabled specs, kept for reference ---
    # dict(groupby=['ip', 'day', 'hour'], select='channel', agg='count'),   # count per ip-day-hour
    # dict(groupby=['ip', 'app'], select='channel', agg='count'),           # count per ip-app
    # dict(groupby=['ip', 'app', 'channel'], select='hour', agg='mean'),    # mean hour per ip-app-channel

    # Count per ip-app-os
    dict(groupby=['ip', 'app', 'os'], select='channel', agg='count'),
    # Count per ip-app-day-hour
    dict(groupby=['ip', 'app', 'day', 'hour'], select='channel', agg='count'),

    # V2 - GroupBy features
    # Average clicks on an app per distinct ip; is it an app they return to?
    dict(
        groupby=['app'],
        select='ip',
        agg=lambda ips: float(len(ips)) / len(ips.unique()),
        agg_name='AvgViewPerDistinct',
    ),

    # How popular is the ip, app or channel?
    dict(groupby=['ip'], select='channel', agg='count'),
    dict(groupby=['app'], select='channel', agg='count'),
    dict(groupby=['channel'], select='app', agg='count'),

    # Group sizes
    dict(groupby=['ip', 'device', 'os'], select=None, agg='size'),
    dict(groupby=['ip', 'device', 'os', 'app'], select=None, agg='size'),
]

In [3]:
# day_x=get_feather('train_day7_3to16_todo.feather')

In [8]:
# day_x.to_feather(PATH/'train_day7_3to16_todo.feather')

In [16]:
# Apply all the groupby transformations, one spec at a time.
# groupby_agg (utils) returns a pair; only the first element is kept as the new day_x.
for spec in GROUPBY_AGGREGATIONS:
    day_x,_=groupby_agg(spec,day_x)

Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Grouping by ['app'], and aggregating ip with AvgViewPerDistinct
Grouping by ['ip'], and aggregating channel with count
Grouping by ['app'], and aggregating channel with count
Grouping by ['channel'], and aggregating app with count
Grouping by ['ip', 'device', 'os'], and aggregating None with size
Grouping by ['ip', 'device', 'os', 'app'], and aggregating None with size


In [17]:
# Downcast via the utils helper, then overwrite the "_todo" checkpoint with the group-by features added.
day_x = downcast_dtypes(day_x)
day_x.to_feather(PATH/'train_day9_3to16_todo.feather')

In [15]:
# Peek at the last five rows, transposed so each column reads as a row.
# NOTE(review): the saved output of this cell shows day-8 rows and an out-of-order
# execution count, so it predates the day-9 run -- re-run to refresh.
day_x.tail().T

Unnamed: 0,45439224,45439225,45439226,45439227,45439228
ip,43667,90891,322636,46769,186326
app,15,18,79,18,3
device,1,1,1,1,1
os,9,19,17,13,27
channel,140,134,213,107,137
click_time,2017-11-08 16:59:59,2017-11-08 16:59:59,2017-11-08 16:59:59,2017-11-08 16:59:59,2017-11-08 16:59:59
is_attributed,0,0,1,0,0
day,8,8,8,8,8
hour,16,16,16,16,16
app_device_mean_target_day7_alpha1,0.000169391,0.000630344,0.319629,0.000630344,0.000344286


In [16]:
# (rows, columns) after the group-by features.
# NOTE(review): the saved row count differs from the day-9 load earlier in the notebook;
# this output looks stale -- re-run to refresh.
day_x.shape

(45439229, 34)

# Time features

In [18]:
# Add time-based features via the utils helper `time_feature`; its return value replaces day_x.
day_x = time_feature(day_x)

In [13]:
# Summary statistics for the ip/device/os/app size column.
# Bracket access is used so the lookup cannot collide with a DataFrame attribute.
day_x['ip_device_os_app_size_None'].describe()

count    4.270792e+07
mean     1.274128e+02
std      7.053993e+02
min      1.000000e+00
25%      4.000000e+00
50%      1.200000e+01
75%      4.200000e+01
max      1.163100e+04
Name: ip_device_os_app_size_None, dtype: float64

In [21]:
# Write the frame at the end of this feature-engineering pass to the "_FE" feather file.
day_x.to_feather(PATH/'train_day9_3to16_FE.feather')

# Mean encoding (frequency) (lagging from previous day)

Feature: `prev_mean_target`. `mean_df` is saved so that it can be merged into the validation/test set.

TODO: 3to6_prev_mean_target, 7to11_prev_mean_target, 12to16_prev_mean_target