# Read train and test dasets

In [46]:
import pandas as pd
pd.set_option('max_columns', 100)

train = pd.read_csv('train_sessions.csv')
test = pd.read_csv('test_sessions.csv')

# Analyze data types in train dataset

In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
session_id    253561 non-null int64
site1         253561 non-null int64
time1         253561 non-null object
site2         250098 non-null float64
time2         250098 non-null object
site3         246919 non-null float64
time3         246919 non-null object
site4         244321 non-null float64
time4         244321 non-null object
site5         241829 non-null float64
time5         241829 non-null object
site6         239495 non-null float64
time6         239495 non-null object
site7         237297 non-null float64
time7         237297 non-null object
site8         235224 non-null float64
time8         235224 non-null object
site9         233084 non-null float64
time9         233084 non-null object
site10        231052 non-null float64
time10        231052 non-null object
target        253561 non-null int64
dtypes: float64(9), int64(3), object(10)
memory usage: 32.9+ MB


# Analyze data types in test dataset

In [48]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82797 entries, 0 to 82796
Data columns (total 21 columns):
session_id    82797 non-null int64
site1         82797 non-null int64
time1         82797 non-null object
site2         81308 non-null float64
time2         81308 non-null object
site3         80075 non-null float64
time3         80075 non-null object
site4         79182 non-null float64
time4         79182 non-null object
site5         78341 non-null float64
time5         78341 non-null object
site6         77566 non-null float64
time6         77566 non-null object
site7         76840 non-null float64
time7         76840 non-null object
site8         76151 non-null float64
time8         76151 non-null object
site9         75484 non-null float64
time9         75484 non-null object
site10        74806 non-null float64
time10        74806 non-null object
dtypes: float64(9), int64(2), object(10)
memory usage: 10.1+ MB


# Fill null data in train dataset

In [49]:
train = train.fillna(0)
test = test.fillna(0)

# Convert time columns to datetime

In [50]:
time_columns = ['time1', 'time2', 'time3', 'time4', 'time5', 'time6', 'time7', 'time8', 'time9', 'time10']

for column in time_columns:
    train[column] = pd.to_datetime(train[column])
    test[column] = pd.to_datetime(test[column])

# Add columns with deltas in seconds

In [51]:
for index, column in enumerate(time_columns[1:], 1):
    train['delta{}'.format(index)] = (train[time_columns[index]]-train['time1']).dt.total_seconds().astype(int)
    test['delta{}'.format(index)] = (test[time_columns[index]]-test['time1']).dt.total_seconds().astype(int)

# Convert all negative delta values to -1

In [59]:
delta_columns = ['delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'delta6', 'delta7', 'delta8', 'delta9']

for col in delta_columns:
    train.loc[train[col] < 0, col] = -1
    test.loc[test[col] < 0, col] = -1

# Convert all site columns to int

In [61]:
site_columns = ['site1', 'site2', 'site3', 'site4', 'site5', 'site6', 'site7', 'site8', 'site9', 'site10']
train[site_columns] = train[site_columns].astype(int)
test[site_columns] = test[site_columns].astype(int)

# Train Logistic Regression model

In [100]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

pipe_logit = Pipeline([('scaler', StandardScaler()), ('logit', LogisticRegression(solver='lbfgs', 
                                                                                  max_iter=100))])
features = train.columns.drop(time_columns).drop('session_id').drop('target')
X = train[features]
y = train['target']

solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
C_list = np.logspace(-7, 3, 22)
class_weight_list = [None, 'balanced']
param_grid_logit = {'logit__C': C_list, 'logit__solver': solver_list, 
                    'logit__class_weight': class_weight_list}

grid_logit = GridSearchCV(pipe_logit, param_grid_logit, return_train_score=True, cv=4, n_jobs=-1)
grid_logit.fit(X, y)
grid_logit.best_params_, grid_logit.best_score_

Wall time: 7min 51s


({'logit__C': 1e-07,
  'logit__class_weight': None,
  'logit__solver': 'newton-cg'},
 0.990941035900128)