In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
import itertools
import pickle
from copy import copy
from datetime import datetime
from tqdm import tqdm_notebook

from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### data preprocessing

In [2]:
# Load the raw sessions table; each row is addressed by its session id.
train_df = pd.read_csv(
    './data/train_sessions.csv',
    index_col='session_id',
)

In [3]:
# Names of the ten timestamp columns and the ten site-id columns of a session.
time_columns = ['time%d' % slot for slot in range(1, 11)]
site_columns = ['site%d' % slot for slot in range(1, 11)]

# Parse every timestamp column into datetimes.
train_df[time_columns] = train_df[time_columns].apply(pd.to_datetime)

# Missing site ids are replaced by 0 so the columns can be stored as integers.
train_df[site_columns] = train_df[site_columns].fillna(value=0).astype('int')

In [4]:
# Order sessions by their first timestamp (ascending, row-wise: the pandas defaults).
train_df = train_df.sort_values(by='time1')

In [5]:
# Separate the label column from the feature columns.
y = train_df['target']
X = train_df.drop(columns=['target'])

X.shape, y.shape

((253561, 20), (253561,))

### pipeline classes and modification methods

In [6]:
# data selecting
class DataSelecting(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a configured set of columns.

    Parameters
    ----------
    column_names : label or list of labels
        Used unchanged as the key in ``X[column_names]`` inside ``transform``.
    """

    def __init__(self, column_names):
        self.column_names = column_names

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column names."""
        selected = X[self.column_names]
        return selected
    
# make sites sparse matrix (one-hot-encoding)
class SparseOneHotEncoding(BaseEstimator, TransformerMixin):
    """Encode a frame of non-negative integer ids as a sparse indicator matrix.

    Every input row becomes one CSR row, and each id found in that row contributes
    a 1 in the column whose index equals the id. Column 0 is dropped from the
    result (0 is the filler written by ``fillna(0)`` earlier in this notebook).

    ``fit`` records the id range so that ``transform`` always returns the same
    number of columns, and ``transform`` encodes the frame it is given.
    Previously ``transform`` ignored its argument and re-encoded the data seen in
    ``fit``, and assumed exactly 10 ids per row.
    """

    def fit(self, X, y=None):
        """Remember the largest id seen so the output width is stable.

        ``X`` must expose ``.values`` holding integer ids (e.g. a DataFrame).
        """
        # Kept because existing code may read this attribute.
        self.values_flatten = X.values.flatten()
        if self.values_flatten.shape[0]:
            self.n_columns_ = int(self.values_flatten.max()) + 1
        else:
            self.n_columns_ = 1
        return self

    def transform(self, X):
        """Return the indicator matrix for ``X`` with the width learned in ``fit``.

        Ids larger than any id seen in ``fit`` are dropped from the output.
        """
        values = X.values
        n_rows, row_width = values.shape
        flat_ids = values.flatten()

        # Ids unseen during fit would fall outside the fitted width; give them room
        # while building the matrix and cut them off in the final slice.
        build_width = self.n_columns_
        if flat_ids.shape[0]:
            build_width = max(build_width, int(flat_ids.max()) + 1)

        # Row i owns the flat positions [i * row_width, (i + 1) * row_width).
        row_pointers = np.arange(n_rows + 1) * row_width
        encoded = sparse.csr_matrix(
            (np.ones(flat_ids.shape[0], dtype=int), flat_ids, row_pointers),
            shape=(n_rows, build_width),
        )
        return encoded[:, 1:self.n_columns_]
    
# make one feature for all sites index
class AllSitesJoining(BaseEstimator, TransformerMixin):
    """Collapse each row into one space-separated string of its non-zero values."""

    def fit(self, X, y=None):
        """Stateless step: there is nothing to fit."""
        return self

    def transform(self, X):
        """Return, per row of ``X``, the non-zero entries joined by single spaces."""

        def join_nonzero(row):
            # Zeros are skipped; the remaining values are stringified through numpy.
            kept = [value for value in row.values if value != 0]
            return ' '.join(np.array(kept).astype('str'))

        return X.apply(join_nonzero, axis=1)
    
# insert binary feature
class BinaryFeatureInsertion(BaseEstimator, TransformerMixin):
    """Append one derived column to the right of a feature matrix.

    The new column is computed in ``fit`` from ``feature_column`` (given at
    construction time), not from the ``X`` passed to ``fit``/``transform``.
    ``transform`` stacks it horizontally onto ``X``, so both need the same
    number of rows.

    Parameters
    ----------
    feature_column : pandas Series
        Source values; ``modificate_method`` is applied to each element.
    new_feature_name : str
        Column name given to the derived feature.
    modificate_method : callable
        Maps one source value to the feature value.
    use_feature : bool, default True
        When False the step is a no-op and ``transform`` returns ``X`` untouched.
    """

    def __init__(self, feature_column, new_feature_name, modificate_method, use_feature=True):
        self.feature_column = feature_column
        self.new_feature_name = new_feature_name
        self.modificate_method = modificate_method
        self.use_feature = use_feature

    def fit(self, X, y=None):
        """Build the single-column frame that ``transform`` will append."""
        if self.use_feature:
            derived = self.feature_column.apply(self.modificate_method)
            self.feature_df = pd.DataFrame(derived.values.reshape(-1, 1),
                                           columns=[self.new_feature_name])
        return self

    def transform(self, X):
        """Return ``X`` with the derived column appended, as a CSC matrix."""
        if self.use_feature:
            stacked = sparse.hstack([X, self.feature_df])
            return sparse.csc_matrix(stacked)
        return X
    
# insert float feature
class FloatFeatureInsertion(BaseEstimator, TransformerMixin):
    """Append one derived float column (optionally scaled) to a feature matrix.

    The column is computed in ``fit`` from ``feature_column`` (given at
    construction time), not from the ``X`` passed to ``fit``/``transform``.

    Parameters
    ----------
    feature_column : pandas Series or DataFrame
        Source data. A Series is mapped element-wise; a DataFrame is mapped
        along ``method_axis``.
    new_feature_name : str
        Column name given to the derived feature.
    modificate_method : callable
        Function handed to ``apply``.
    method_axis : int, default 0
        ``axis`` argument for ``DataFrame.apply``; unused for a Series.
    scaler : object with ``fit_transform`` or None, default None
        When given, it is fitted on the derived column and its output is stored.
    use_feature : bool, default True
        When False the step is a no-op and ``transform`` returns ``X`` untouched.
    """

    def __init__(self, feature_column, new_feature_name, modificate_method, method_axis=0, scaler=None, use_feature=True):
        self.feature_column = feature_column
        self.new_feature_name = new_feature_name
        self.modificate_method = modificate_method
        self.method_axis = method_axis
        self.scaler = scaler
        self.use_feature = use_feature

    def fit(self, X, y=None):
        """Compute, cast to float64 and optionally scale the column to append."""
        if not self.use_feature:
            return self

        source = self.feature_column
        is_frame = len(source.shape) != 1
        if is_frame:
            derived = source.apply(self.modificate_method, axis=self.method_axis)
        else:
            derived = source.apply(self.modificate_method)

        values = derived.values.reshape(-1, 1).astype('float64')
        if self.scaler is not None:
            values = self.scaler.fit_transform(values)

        self.feature_df = pd.DataFrame(values, columns=[self.new_feature_name])
        return self

    def transform(self, X):
        """Return ``X`` with the derived column appended, as a CSC matrix."""
        if self.use_feature:
            stacked = sparse.hstack([X, self.feature_df])
            return sparse.csc_matrix(stacked)
        return X
        
    
# insert one-hot-encoded feature
class OHEFeatureInsertion(BaseEstimator, TransformerMixin):
    """Append the one-hot encoding of a derived categorical feature to a matrix.

    The feature is computed in ``fit`` from ``feature_column`` (given at
    construction time), not from the ``X`` passed to ``fit``/``transform``, and is
    expanded with ``pd.get_dummies``.

    Parameters
    ----------
    feature_column : pandas Series or DataFrame
        Source data. A Series is mapped element-wise; a DataFrame is mapped
        along ``method_axis``.
    new_feature_name : str
        Name of the intermediate column holding the derived values.
    modificate_method : callable
        Function handed to ``apply``.
    method_axis : int, default 0
        ``axis`` argument for ``DataFrame.apply``; unused for a Series.
    use_feature : bool, default True
        When False the step is a no-op and ``transform`` returns ``X`` untouched.
    """

    def __init__(self, feature_column, new_feature_name, modificate_method, method_axis=0, use_feature=True):
        self.feature_column = feature_column
        self.new_feature_name = new_feature_name
        self.modificate_method = modificate_method
        self.method_axis = method_axis
        self.use_feature = use_feature

    def fit(self, X, y=None):
        """Derive the categorical values and store their dummy-encoded frame."""
        if not self.use_feature:
            return self

        source = self.feature_column
        is_frame = len(source.shape) != 1
        if is_frame:
            derived = source.apply(self.modificate_method, axis=self.method_axis)
        else:
            derived = source.apply(self.modificate_method)

        labelled = pd.DataFrame(derived.values.reshape(-1, 1),
                                columns=[self.new_feature_name])
        self.ohe_feature_df = pd.get_dummies(labelled[self.new_feature_name])
        return self

    def transform(self, X):
        """Return ``X`` with the dummy columns appended, as a CSC matrix."""
        if self.use_feature:
            stacked = sparse.hstack([X, self.ohe_feature_df])
            return sparse.csc_matrix(stacked)
        return X

In [7]:
def start_year_month_mod(dt):
    """Encode the year and month of ``dt`` as the integer ``YYYYMM``."""
    return dt.year * 100 + dt.month


def start_hour_mod(dt):
    """Return the ``hour`` attribute of ``dt``."""
    return dt.hour


def dayofweek_mod(dt):
    """Return the ``dayofweek`` attribute of ``dt``."""
    return dt.dayofweek

def session_length_mod(row):
    """Count the non-null entries of ``row``.

    Parameters
    ----------
    row : pandas Series
        One record, e.g. the timestamp columns of a session.

    Returns
    -------
    int
        Number of entries that are not null (0 for an empty row).
    """
    # A vectorised null check replaces ``row[col]`` with an integer key: on a Series
    # labelled with strings that lookup relies on a deprecated positional fallback.
    return int(row.notnull().sum())

def daytime_mod(dt):
    """Bucket the hour of ``dt`` into one of three parts of the day.

    Returns 1 for hours up to and including 11, 2 for hours 12 through 14,
    and 3 for everything else.
    """
    hour = dt.hour
    if hour <= 11:
        return 1
    return 2 if 12 <= hour <= 14 else 3

def session_duration_mod(row):
    """Return the whole seconds between the first entry and the last non-null entry.

    Parameters
    ----------
    row : pandas Series of timestamps
        One session; trailing entries may be null.

    Returns
    -------
    int or None
        Elapsed seconds, or ``None`` when every entry is null or the first entry
        (the session start) is null.
    """
    present = row.dropna()
    # Without any timestamp, or without a start, no duration can be computed.
    if present.empty or pd.isnull(row.iloc[0]):
        return None

    # ``.iloc`` makes the positional intent explicit; ``row[0]`` on a Series labelled
    # with strings relies on a deprecated positional fallback.
    elapsed = present.iloc[-1] - row.iloc[0]

    # ``total_seconds()`` includes the day component, which ``.seconds`` drops.
    return int(elapsed.total_seconds())

### feature processing pipeline

In [8]:
# Timestamp inputs shared by the engineered-feature steps below.
session_start_times = X['time1']
session_time_frame = X[time_columns]

pipeline_steps = [
    # Site ids: select the site columns, join each session into one string,
    # then vectorise the strings with TF-IDF over 1- to 3-grams.
    ('selecting', DataSelecting(site_columns)),
    ('all_sites_joining', AllSitesJoining()),
    ('sparse_tf_idf', TfidfVectorizer(ngram_range=(1, 3), max_features=100000)),

    # Features derived from the first timestamp of the session (one-hot encoded).
    ('start_year_month_insertion', OHEFeatureInsertion(
        feature_column=session_start_times,
        new_feature_name='start_year_month',
        modificate_method=start_year_month_mod,
    )),
    ('dayofweek_insertion', OHEFeatureInsertion(
        feature_column=session_start_times,
        new_feature_name='dayofweek',
        modificate_method=dayofweek_mod,
    )),
    ('start_hour_insertion', OHEFeatureInsertion(
        feature_column=session_start_times,
        new_feature_name='start_hour',
        modificate_method=start_hour_mod,
    )),
    ('daytime_insertion', OHEFeatureInsertion(
        feature_column=session_start_times,
        new_feature_name='daytime',
        modificate_method=daytime_mod,
    )),

    # Features derived row-wise from all timestamp columns of the session.
    ('session_length_insertion', OHEFeatureInsertion(
        feature_column=session_time_frame,
        new_feature_name='session_length',
        modificate_method=session_length_mod,
        method_axis=1,
    )),
    ('session_duration_insertion', FloatFeatureInsertion(
        feature_column=session_time_frame,
        new_feature_name='session_duration',
        modificate_method=session_duration_mod,
        method_axis=1,
        scaler=MinMaxScaler(feature_range=(0, 1)),
    )),
]

feat_processing_pipeline = Pipeline(steps=pipeline_steps)

In [9]:
def get_score(X_transformed, y_original, holdout_size=0.1, C=1.0, random_state=14):
    """Fit a logistic regression on the leading rows and score it on the trailing rows.

    The split is positional and unshuffled: the last ``holdout_size`` fraction of
    rows is held out, so with time-sorted input the holdout is the most recent data.

    Parameters
    ----------
    X_transformed : matrix supporting ``.shape`` and 2-D slicing
        Feature matrix, one row per sample.
    y_original : array-like
        Labels aligned row by row with ``X_transformed``.
    holdout_size : float, default 0.1
        Fraction of rows, strictly between 0 and 1, used for evaluation.
    C : float, default 1.0
        Inverse regularisation strength passed to ``LogisticRegression``.
    random_state : int, default 14
        Seed passed to ``LogisticRegression``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the holdout rows.

    Raises
    ------
    ValueError
        If ``holdout_size`` is outside (0, 1) or the row counts of the two
        inputs differ.
    """
    if not 0 < holdout_size < 1:
        raise ValueError('holdout_size must be strictly between 0 and 1, got {}'.format(holdout_size))

    n_rows = X_transformed.shape[0]
    # A plain array makes the label slices positional regardless of any Series index.
    y_values = np.asarray(y_original)
    if y_values.shape[0] != n_rows:
        raise ValueError('X_transformed has {} rows but y_original has {} labels'.format(
            n_rows, y_values.shape[0]))

    holdout_idx = int(round(n_rows * (1 - holdout_size)))

    X_train, X_holdout = X_transformed[:holdout_idx, :], X_transformed[holdout_idx:, :]
    y_train, y_holdout = y_values[:holdout_idx], y_values[holdout_idx:]

    lr = LogisticRegression(C=C, class_weight='balanced', random_state=random_state)
    lr.fit(X_train, y_train)

    return roc_auc_score(y_holdout, lr.predict_proba(X_holdout)[:, 1])

### fit

In [10]:
# One ``use_feature`` switch per engineered-feature step of the pipeline.
param_names = [
    'start_year_month_insertion__use_feature',
    'dayofweek_insertion__use_feature',
    'start_hour_insertion__use_feature',
    'daytime_insertion__use_feature',
    'session_length_insertion__use_feature',
    'session_duration_insertion__use_feature',
]

# Every on/off combination of those switches, starting with all True.
param_mask = [
    combination
    for combination in itertools.product((True, False), repeat=len(param_names))
]

In [11]:
# features_bruteforce = {}

# for mask in tqdm_notebook(param_mask):
#     current_args = dict(zip(param_names, mask))
#     feat_processing_pipeline.set_params(**current_args)
#     score = get_score(feat_processing_pipeline.fit_transform(X), y)
    
#     features_bruteforce[mask] = score
    
#     print('mask = {} score = {}'.format(mask, score))

In [12]:
# with open('./data/features_bruteforce_dic.pkl', 'wb') as features_bruteforce_file:
#     pickle.dump(features_bruteforce, features_bruteforce_file)

In [13]:
# Reload the cached brute-force scores instead of recomputing them.
# NOTE: pickle.load can execute code embedded in the file, so only load a file
# that this notebook itself produced.
with open('./data/features_bruteforce_dic.pkl', mode='rb') as cached_scores_file:
    features_bruteforce = pickle.load(cached_scores_file)

In [14]:
# (mask, score) pairs ordered from the highest score to the lowest.
sorted(features_bruteforce.items(), key=lambda mask_score: mask_score[1], reverse=True)

[((True, True, True, False, True, True), 0.9845310167673869),
 ((True, True, True, True, True, True), 0.9845289033327053),
 ((True, True, True, False, True, False), 0.9842276634107696),
 ((True, True, True, True, True, False), 0.9842263047741885),
 ((True, True, True, False, False, True), 0.9839531433415938),
 ((True, True, True, True, False, True), 0.9839498222299513),
 ((True, True, True, False, False, False), 0.9836756040800159),
 ((True, True, True, True, False, False), 0.9836676032201499),
 ((True, True, False, True, True, True), 0.9782879307179881),
 ((True, True, False, True, True, False), 0.9781016465467685),
 ((True, True, False, True, False, True), 0.9773939478476781),
 ((True, True, False, True, False, False), 0.9771913600374863),
 ((False, True, True, False, True, True), 0.976379801119758),
 ((False, True, True, True, True, True), 0.9763742156138139),
 ((False, True, True, True, True, False), 0.9759816451217098),
 ((False, True, True, False, True, False), 0.9759787768889275