In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

import gc
import time

from utils import *

PATH = Path('data')
list(PATH.iterdir())

[PosixPath('data/train_10mil.feather'),
 PosixPath('data/val_df_1mil_fe_v1.feather'),
 PosixPath('data/train_df_1mil_fe_v1.feather'),
 PosixPath('data/train_df_10mil_fe_v1.feather'),
 PosixPath('data/train.csv'),
 PosixPath('data/sample_submission.csv'),
 PosixPath('data/train_50mil.feather'),
 PosixPath('data/train_sample.csv'),
 PosixPath('data/val_df_10mil_fe_v1.feather'),
 PosixPath('data/train_1mil.feather'),
 PosixPath('data/test.csv'),
 PosixPath('data/train_43mil.feather'),
 PosixPath('data/train_10mil_fe.feather')]

In [2]:
def mean_enc_smoothing(df,cols,targ,new_fea,glob_mean,alpha=0,df_val=None):
    group_means = df.groupby(cols)[targ].mean()
    n_group=df.groupby(cols).size()
    
    mean_df = (group_means*n_group + glob_mean*alpha)/ (n_group+alpha)
    mean_df= mean_df.reset_index()
    
    df = pd.merge(df,mean_df,'left',on=cols).rename(columns={0:new_fea})
    if not df_val is None:
        print(f'Generating {new_fea}...')
        df_val = pd.merge(df_val,mean_df,'left',on=cols).rename(columns={0:new_fea})
        df_val[new_fea].fillna(glob_mean,inplace=True)
        
    return df,df_val

def groupby_agg(spec,X_train,X_val=None):
    # Name of the aggregation we're applying
    agg_name = spec['agg_name'] if 'agg_name' in spec else spec['agg']
    
    # Info
    print(f"Grouping by {spec['groupby']}, and aggregating {spec['select']} with {agg_name}")
    
    # Unique list of features to select
    all_features = list(set(spec['groupby'] + [spec['select']]))
    
    # Name of new feature
    new_feature = '{}_{}_{}'.format('_'.join(spec['groupby']), agg_name, spec['select'])
    
    # Perform the groupby
    if spec['select']==None and spec['agg']=='size':
        gp = X_train.groupby(spec['groupby']).size().reset_index().rename(columns={0:new_feature})
    else:  
        gp = X_train[all_features]. \
            groupby(spec['groupby'])[spec['select']]. \
            agg(spec['agg']). \
            reset_index(). \
            rename(index=str, columns={spec['select']: new_feature})
        
    # Merge back to X_train
    X_train = X_train.merge(gp, on=spec['groupby'], how='left')
    if not X_val is None:
        print(f'Generating {new_feature} for validation set...')
        X_val = X_val.merge(gp, on=spec['groupby'], how='left')
        X_val[new_feature].fillna(0,inplace=True)
    
    return X_train,X_val

def cum_count(cols,df):
    new_fea = '_'.join(cols)+'_cumcount'
    df[new_fea]=df.groupby(cols).cumcount()+1
    return df

def time_till_next_click(df,cols,new_fea):
    df[new_fea]= df.groupby(cols).click_time.transform(lambda x: x.diff()).dt.seconds
    df[new_fea].fillna(-1,inplace=True)
    df[new_fea] = df[new_fea].astype(np.int32)
    return df
def time_feature(df):
    df['day'] = df['click_time'].dt.day.astype('uint8')
    df['hour'] = df['click_time'].dt.hour.astype('uint8')
    df['minute'] = df['click_time'].dt.minute.astype('uint8')
    df['second'] = df['click_time'].dt.second.astype('uint8')
#     df.drop('click_time',axis=1,inplace=True)
#     gc.collect()
    return df

def downcast_dtypes(df):
    '''
    Changes column types in the dataframe: 

        `float64` type to `float32`
        `int64`   type to `int32`
    '''

    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]

    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)

    return df

# Get test data

In [22]:
test_df = get_feather('test.feather',PATH)
test_df.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [23]:
test_df.tail()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
18790464,18790464,99442,9,1,13,127,2017-11-10 15:00:00
18790465,18790465,88046,23,1,37,153,2017-11-10 15:00:00
18790466,18790467,81398,18,1,17,265,2017-11-10 15:00:00
18790467,18790466,123236,27,1,13,122,2017-11-10 15:00:00
18790468,18790468,73516,12,2,27,265,2017-11-10 15:00:00


In [10]:
test_df = time_feature(test_df)

In [11]:
test_df['hour'].value_counts()

4     3344125
14    3261257
13    3212566
10    3127993
9     2984808
5     2858427
15        499
11        413
6         381
Name: hour, dtype: int64

# Get full data

In [None]:
# df = pd.read_csv(PATH/'train.csv')


# df = downcast_dtypes(df)

# df.click_time = pd.to_datetime(df.click_time)

# df['day'] = df['click_time'].dt.day.astype('uint8')

# df.drop(df[df.day==6].index,inplace=True)

# day7 = df.loc[df.day==9]

# day7['hour'] = day7['click_time'].dt.hour.astype('uint8')

# day7.day.value_counts()

# day7 = day7[(day7.hour >=3) & (day7.hour <=16)].copy()

# day7 = day7.reset_index().drop('index',axis=1)

# day7.to_feather(PATH/'train_day9_3to16.feather')

# gc.collect()

# Get train data

In [11]:
df = get_feather('train_50mil.feather',PATH)
df.click_time = pd.to_datetime(df.click_time)

4     4032691
5     3671741
6     3570940
13    3457523
14    3443283
12    3363917
3     3351149
11    3347741
10    3304199
7     3186240
1     3082862
2     3068887
15    3026111
9     2986204
8     2804701
0      301364
16        447
Name: hour, dtype: int64

## next click

In [6]:
GROUP_BY_NEXT_CLICKS = [
    {'groupby': ['ip']},
    {'groupby': ['ip', 'app']},
    {'groupby': ['ip', 'channel']},
    {'groupby': ['ip', 'os']},
]
for spec in GROUP_BY_NEXT_CLICKS:
    new_fea = '{}_next_click'.format('_'.join(spec['groupby']))
    # Run calculation
    print(f">> Grouping by {spec['groupby']}, and saving time to next click in: {new_fea}")
    df = time_till_next_click(df,spec['groupby'],new_fea)

>> Grouping by ['ip'], and saving time to next click in: ip_next_click
>> Grouping by ['ip', 'app'], and saving time to next click in: ip_app_next_click
>> Grouping by ['ip', 'channel'], and saving time to next click in: ip_channel_next_click
>> Grouping by ['ip', 'os'], and saving time to next click in: ip_os_next_click


In [7]:
df.tail()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,ip_next_click,ip_app_next_click,ip_channel_next_click,ip_os_next_click
9999995,121312,12,1,10,340,2017-11-09 16:00:00,0,0,82,2191,0
9999996,46894,3,1,19,211,2017-11-09 16:00:00,0,0,181,181,0
9999997,320126,1,1,13,274,2017-11-09 16:00:00,0,0,25,-1,0
9999998,189286,12,1,37,259,2017-11-09 16:00:00,0,1,4,4,1
9999999,106485,11,1,19,137,2017-11-09 16:00:00,0,5,6,6,55


In [8]:
df.to_feather(PATH/'train_10mil_fe.feather')

## time feature

In [9]:
df = time_feature(df)

## cum count

In [10]:
CUMCOUNT_GROUP=[
    ['ip','device','os'],
    ['ip','device','os','app']
    # TODO: what else can we put here
]

In [11]:
for each in CUMCOUNT_GROUP:
    df = cum_count(each,df)

## split train val

In [12]:
train_size = int(df.shape[0]*.8)
train_size

8000000

In [13]:
train_df = df.loc[:train_size-1,:].reset_index().drop('index',axis=1)
val_df = df.loc[train_size:,:].reset_index().drop('index',axis=1)
del df
gc.collect()

132

In [14]:
train_df.tail()

Unnamed: 0,ip,app,device,os,channel,is_attributed,ip_next_click,ip_app_next_click,ip_channel_next_click,ip_os_next_click,day,hour,minute,second,ip_device_os_cumcount,ip_device_os_app_cumcount
7999995,25071,3,1,13,137,0,6,8,19,14,9,15,18,36,751,102
7999996,32772,18,1,13,107,0,73,86,86,73,9,15,18,36,50,3
7999997,5023,12,1,37,205,0,3,101,2793,3,9,15,18,36,48,2
7999998,48212,2,1,49,237,0,0,7,12,79,9,15,18,36,169,22
7999999,190420,21,1,18,232,0,0,8081,8081,0,9,15,18,36,9,1


In [15]:
val_df.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,ip_next_click,ip_app_next_click,ip_channel_next_click,ip_os_next_click,day,hour,minute,second,ip_device_os_cumcount,ip_device_os_app_cumcount
0,180553,14,1,15,134,0,1,84,513,1,9,15,18,36,66,9
1,216547,3,1,47,480,0,0,0,-1,0,9,15,18,36,4,2
2,78833,3,1,19,280,0,0,7,12,0,9,15,18,36,109,33
3,77048,12,1,19,340,0,0,6,144,10,9,15,18,36,1485,203
4,119349,7,1,9,101,0,0,1,1,0,9,15,18,36,528,53


# Mean encoding with smoothing

In [16]:
ATTRIBUTION_CATEGORIES = [        
    # V1 Features #
    ###############
    ['ip'],['app'], ['device'], ['channel'],
    
    # V2 Features #
    ###############
    ['app', 'channel'],
    ['app', 'os'],
    ['app', 'device'],
    
    # TODO: group by ['ip',...]

    
]

In [17]:
glob_mean = train_df.is_attributed.mean()
glob_mean

0.00236425

In [18]:
%%time
for cols in ATTRIBUTION_CATEGORIES:
    new_fea = '_'.join(cols)+'_mean_target'
    train_df,val_df = mean_enc_smoothing(train_df,cols,'is_attributed',
                                   new_fea,glob_mean,alpha=1,df_val=val_df)

Generating ip_mean_target...
Generating app_mean_target...
Generating device_mean_target...
Generating channel_mean_target...
Generating app_channel_mean_target...
Generating app_os_mean_target...
Generating app_device_mean_target...
CPU times: user 22.5 s, sys: 18.1 s, total: 40.6 s
Wall time: 13.6 s


# Group-by aggregation

In [19]:
# Define all the groupby transformations
GROUPBY_AGGREGATIONS = [
    
    # V1 - GroupBy Features #
    #########################    
#     # Variance in day, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'day', 'agg': 'var'},
    # Variance in hour, for ip-app-os
    {'groupby': ['ip','app','os'], 'select': 'hour', 'agg': 'var'},
    # Variance in hour, for ip-day-channel
    {'groupby': ['ip','day','channel'], 'select': 'hour', 'agg': 'var'},
    
    # Count, for ip-day-hour
    {'groupby': ['ip','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app
    {'groupby': ['ip', 'app'], 'select': 'channel', 'agg': 'count'},        
    # Count, for ip-app-os
    {'groupby': ['ip', 'app', 'os'], 'select': 'channel', 'agg': 'count'},
    # Count, for ip-app-day-hour
    {'groupby': ['ip','app','day','hour'], 'select': 'channel', 'agg': 'count'},
    # Mean hour, for ip-app-channel
    {'groupby': ['ip','app','channel'], 'select': 'hour', 'agg': 'mean'}, 
    
    # V2 - GroupBy Features #
    #########################
    # Average clicks on app by distinct users; is it an app they return to?
    {'groupby': ['app'], 
     'select': 'ip', 
     'agg': lambda x: float(len(x)) / len(x.unique()), 
     'agg_name': 'AvgViewPerDistinct'
    },
    
    # How popular is the app or channel?
    {'groupby': ['ip'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['app'], 'select': 'channel', 'agg': 'count'},
    {'groupby': ['channel'], 'select': 'app', 'agg': 'count'},
    
    
    # Size calculation
    {'groupby': ['ip','device','os'], 'select': None, 'agg': 'size'},
    {'groupby': ['ip','device','os','app'], 'select': None, 'agg': 'size'}
]

# Apply all the groupby transformations
for spec in GROUPBY_AGGREGATIONS:
    train_df,val_df=groupby_agg(spec,train_df,val_df)

Grouping by ['ip', 'app', 'channel'], and aggregating day with var
Generating ip_app_channel_var_day for validation set...
Grouping by ['ip', 'app', 'os'], and aggregating hour with var
Generating ip_app_os_var_hour for validation set...
Grouping by ['ip', 'day', 'channel'], and aggregating hour with var
Generating ip_day_channel_var_hour for validation set...
Grouping by ['ip', 'day', 'hour'], and aggregating channel with count
Generating ip_day_hour_count_channel for validation set...
Grouping by ['ip', 'app'], and aggregating channel with count
Generating ip_app_count_channel for validation set...
Grouping by ['ip', 'app', 'os'], and aggregating channel with count
Generating ip_app_os_count_channel for validation set...
Grouping by ['ip', 'app', 'day', 'hour'], and aggregating channel with count
Generating ip_app_day_hour_count_channel for validation set...
Grouping by ['ip', 'app', 'channel'], and aggregating hour with mean
Generating ip_app_channel_mean_hour for validation set...


In [20]:
train_df.shape
val_df.shape

(8000000, 37)

(2000000, 37)

In [23]:
train_df.isnull().sum()

ip                                     0
app                                    0
device                                 0
os                                     0
channel                                0
is_attributed                          0
ip_next_click                          0
ip_app_next_click                      0
ip_channel_next_click                  0
ip_os_next_click                       0
day                                    0
hour                                   0
minute                                 0
second                                 0
ip_device_os_cumcount                  0
ip_device_os_app_cumcount              0
ip_mean_target                         0
app_mean_target                        0
device_mean_target                     0
channel_mean_target                    0
app_channel_mean_target                0
app_os_mean_target                     0
app_device_mean_target                 0
ip_app_channel_var_day           1198754
ip_app_os_var_ho

In [21]:
train_df = downcast_dtypes(train_df)
val_df = downcast_dtypes(val_df)

In [22]:
train_df.to_feather(PATH / 'train_df_10mil_fe_v1.feather')
val_df.to_feather(PATH / 'val_df_10mil_fe_v1.feather')