In [1]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
import pickle
import seaborn as sns

from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    The output has two columns: a 1-based row id (header ``index_label``)
    and the prediction (header ``target``), in the order the predictions
    were given.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. Lists, NumPy arrays and pandas Series
        are accepted.
    out_file : str, path or writable text buffer
        Destination handed to ``DataFrame.to_csv``.
    target : str
        Header of the prediction column.
    index_label : str
        Header of the id column.
    """
    # Coerce to an ndarray first: a plain list has no ``.shape``, and a
    # pandas Series would be re-aligned against the new 1..n index below
    # (silently producing NaN) instead of being taken positionally.
    labels = np.asarray(predicted_labels)
    row_ids = np.arange(1, labels.shape[0] + 1)
    predicted_df = pd.DataFrame(labels, index=row_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


# Add hours features to matrix
def add_hours_features(df, X_sparse, time_cols=None):
    """Append time-of-day indicator columns to a feature matrix.

    Twenty columns are appended to ``X_sparse``, in this order:

    * 17 indicators for the hour of ``time1`` (session start), one per
      hour 7, 8, ..., 23. Sessions starting at hours 0-6 get all zeros.
    * 3 indicators for the hour of the latest timestamp in the session:
      7-11, 12-18 and 19-23.

    Parameters
    ----------
    df : pandas.DataFrame
        Sessions; must contain ``time1`` and every column in ``time_cols``
        as datetime values.
    X_sparse : scipy sparse matrix
        Existing features, one row per row of ``df``.
    time_cols : list of str, optional
        Timestamp columns used to find the end of the session. Defaults to
        ``time1`` ... ``time10``.

    Returns
    -------
    scipy sparse matrix with ``X_sparse.shape[1] + 20`` columns.
    """
    if time_cols is None:
        # Same ten columns the notebook keeps in its global ``times`` list;
        # spelled out here so the function does not depend on cell order.
        time_cols = ['time%s' % i for i in range(1, 11)]

    # Latest timestamp in each row; missing (NaT) entries are skipped.
    close_hour = df[time_cols].max(axis=1).dt.hour.values
    start_hour = df['time1'].dt.hour.values

    # Build the indicators by direct comparison rather than get_dummies:
    # get_dummies only creates columns for hours present in *this* frame,
    # so a missing hour used to raise KeyError (and could make train and
    # test matrices disagree).
    hour_onehot = start_hour[:, None] == np.arange(7, 24)

    c_morning = (close_hour >= 7) & (close_hour <= 11)
    c_day = (close_hour >= 12) & (close_hour <= 18)
    c_evening = (close_hour >= 19) & (close_hour <= 23)

    dense_block = np.hstack([hour_onehot,
                             c_morning[:, None],
                             c_day[:, None],
                             c_evening[:, None]]).astype(int)

    return hstack([X_sparse, dense_block])


# Add additional features
def add_new_features(df, X_sparse, time_cols=None):
    """Append a long-session flag and weekday indicators to a feature matrix.

    Seven columns are appended to ``X_sparse``, in this order:

    * 1 flag set when the session duration (latest minus earliest
      timestamp, in seconds) is above the 80th percentile of durations
      in ``df``.
    * 6 indicators for the weekday of ``time1``, one per
      ``dt.weekday`` value 1..6. Weekday 0 gets all zeros.

    NOTE: the 80th-percentile threshold is computed from the frame passed
    in, so calling this separately on train and test data uses a different
    threshold for each.

    Parameters
    ----------
    df : pandas.DataFrame
        Sessions; must contain ``time1`` and every column in ``time_cols``
        as datetime values.
    X_sparse : scipy sparse matrix
        Existing features, one row per row of ``df``.
    time_cols : list of str, optional
        Timestamp columns spanning the session. Defaults to
        ``time1`` ... ``time10``.

    Returns
    -------
    scipy sparse matrix with ``X_sparse.shape[1] + 7`` columns.
    """
    if time_cols is None:
        # Same ten columns the notebook keeps in its global ``times`` list;
        # spelled out here so the function does not depend on cell order.
        time_cols = ['time%s' % i for i in range(1, 11)]

    stamps = df[time_cols]
    # min/max skip missing (NaT) timestamps.
    seconds = (stamps.max(axis=1) - stamps.min(axis=1)) / np.timedelta64(1, 's')
    # The former extra test "<= quantile(1.0)" compared against the maximum
    # and was always true, so only the lower bound is kept.
    long_session = (seconds > seconds.quantile(0.80)).values

    # Direct comparison instead of get_dummies: get_dummies only creates
    # columns for weekdays present in *this* frame, so a missing weekday
    # used to raise KeyError.
    weekday = df['time1'].dt.weekday.values
    day_onehot = weekday[:, None] == np.arange(1, 7)

    dense_block = np.hstack([long_session[:, None], day_onehot]).astype(int)

    return hstack([X_sparse, dense_block])

In [3]:
# Names of the ten timestamp columns: time1, ..., time10
times = ['time{}'.format(i) for i in range(1, 11)]

# Load both session tables, indexed by session_id
train_df = pd.read_csv('data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('data/test_sessions.csv', index_col='session_id')

# Parse the timestamp columns into datetime values
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Put the training sessions in chronological order of their first visit
# (the test frame keeps its original row order)
train_df = train_df.sort_values(by='time1')

In [4]:
# Names of the ten site-id columns: site1, ..., site10
sites = ['site%s' % i for i in range(1, 11)]

# Write every session as one line of space-separated integer site ids
# (missing visits become 0), without an index column or a header row,
# so the files can be read back as plain text documents.
train_df[sites].fillna(0).astype('int').to_csv('data/train_sessions_text.txt',
                                               sep=' ', index=False, header=False)
test_df[sites].fillna(0).astype('int').to_csv('data/test_sessions_text.txt',
                                              sep=' ', index=False, header=False)

In [5]:
%%time
# Count 1- to 3-grams over each line of the session text files, keeping at
# most 50,000 features, then re-weight the counts with TF-IDF.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-digit site ids (and the 0 used for
# missing visits) are presumably dropped — confirm this is intended.
cv = CountVectorizer(ngram_range=(1, 3), max_features=50_000)
vectorizer = make_pipeline(cv, TfidfTransformer())

# The pipeline is fitted on the training file only; the test file is
# transformed with the already-fitted vocabulary and IDF weights.
with open('data/train_sessions_text.txt') as inp_train_file:
    X_train = vectorizer.fit_transform(inp_train_file)
with open('data/test_sessions_text.txt') as inp_test_file:
    X_test = vectorizer.transform(inp_test_file)
X_train.shape, X_test.shape

Wall time: 8.85 s


In [6]:
# Target column as integers. It is read from the time-sorted train_df, the
# same frame the training text file was written from, so rows line up with
# X_train.
y_train = train_df['target'].astype('int')

In [7]:
# Time-ordered cross-validation with 10 splits
time_split = TimeSeriesSplit(n_splits=10)
# Show the (train, validation) index shapes of every split
print([(train_idx.shape, valid_idx.shape)
       for train_idx, valid_idx in time_split.split(X_train)])

# Base classifier; C is tuned later by grid search
logit = LogisticRegression(C=1, random_state=17)

[((23051,), (23051,)), ((46102,), (23051,)), ((69153,), (23051,)), ((92204,), (23051,)), ((115255,), (23051,)), ((138306,), (23051,)), ((161357,), (23051,)), ((184408,), (23051,)), ((207459,), (23051,)), ((230510,), (23051,))]


In [8]:
# Replace missing site ids with 0 and store the site columns as unsigned
# 16-bit integers (overwrites the columns in both frames in place).
# NOTE(review): uint16 only holds values up to 65535 — assumes every site id
# fits in that range; confirm against the data.
train_df[sites] = train_df[sites].fillna(0).astype(np.uint16)
test_df[sites] = test_df[sites].fillna(0).astype(np.uint16)

In [9]:
# Создаём параметры времени
%time
X_train_new = add_hours_features(train_df, X_train)
X_test_new = add_hours_features(test_df, X_test)
X_train_new.shape, X_test_new.shape

Wall time: 0 ns


((253561, 50020), (82797, 50020))

In [10]:
# Append the additional features (long-session flag and weekday indicators)
# to the matrices that already carry the hour features.
# Note: add_new_features derives its duration quantile from the frame it is
# given, so the train and test calls each use their own threshold.
X_train_newest = add_new_features(train_df, X_train_new)
X_test_newest = add_new_features(test_df, X_test_new)
X_train_newest.shape, X_test_newest.shape

((253561, 50027), (82797, 50027))

In [11]:
# Set up a grid search over the logistic regression parameter C:
# 10 log-spaced candidates from 1e-2 to 1e2, scored by ROC AUC on the
# time-ordered splits.
c_values = np.logspace(-2, 2, 10)
logit_grid_searcher = GridSearchCV(
    estimator=logit,
    param_grid={'C': c_values},
    scoring='roc_auc',
    cv=time_split,
    n_jobs=-1,
    verbose=1,
)

In [12]:
%%time
# Run the grid search: every C candidate is fitted on every split of
# time_split, using the full feature matrix (n-grams + hour + extra features).
logit_grid_searcher.fit(X_train_newest, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.9min finished


Wall time: 1min 58s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [13]:
# Best cross-validated ROC AUC found by the grid search and the C that gave it
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9065735045421652, {'C': 4.6415888336127775})

In [296]:
# Predict the probability of the positive class for every test session with
# the fitted grid search, then write the submission file.
# NOTE(review): this cell's execution count (296) is far out of sequence with
# the cells above — re-run the notebook top to bottom to confirm the result
# reproduces. The 'data/alice-all/' directory is not created anywhere in the
# visible cells; presumably it must already exist.
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_newest)[:, 1]
write_to_submission_file(logit_test_pred3, 'data/alice-all/FINAL_subm.csv') # 0.94242 (presumably the leaderboard score — confirm)