# Read train and test dasets

In [3]:
import pandas as pd
pd.set_option('max_columns', 100)

train = pd.read_csv('train_sessions.csv')
test = pd.read_csv('test_sessions.csv')

# Analyze data types in train dataset

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
session_id    253561 non-null int64
site1         253561 non-null int64
time1         253561 non-null object
site2         250098 non-null float64
time2         250098 non-null object
site3         246919 non-null float64
time3         246919 non-null object
site4         244321 non-null float64
time4         244321 non-null object
site5         241829 non-null float64
time5         241829 non-null object
site6         239495 non-null float64
time6         239495 non-null object
site7         237297 non-null float64
time7         237297 non-null object
site8         235224 non-null float64
time8         235224 non-null object
site9         233084 non-null float64
time9         233084 non-null object
site10        231052 non-null float64
time10        231052 non-null object
target        253561 non-null int64
dtypes: float64(9), int64(3), object(10)
memory usage: 42.6+ MB


# Analyze data types in test dataset

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82797 entries, 0 to 82796
Data columns (total 21 columns):
session_id    82797 non-null int64
site1         82797 non-null int64
time1         82797 non-null object
site2         81308 non-null float64
time2         81308 non-null object
site3         80075 non-null float64
time3         80075 non-null object
site4         79182 non-null float64
time4         79182 non-null object
site5         78341 non-null float64
time5         78341 non-null object
site6         77566 non-null float64
time6         77566 non-null object
site7         76840 non-null float64
time7         76840 non-null object
site8         76151 non-null float64
time8         76151 non-null object
site9         75484 non-null float64
time9         75484 non-null object
site10        74806 non-null float64
time10        74806 non-null object
dtypes: float64(9), int64(2), object(10)
memory usage: 13.3+ MB


# Fill null data in train dataset

In [6]:
train = train.fillna(0)
test = test.fillna(0)

# Convert time columns to datetime

In [7]:
time_columns = ['time1', 'time2', 'time3', 'time4', 'time5', 'time6', 'time7', 'time8', 'time9', 'time10']

for column in time_columns:
    train[column] = pd.to_datetime(train[column])
    test[column] = pd.to_datetime(test[column])

# Add columns with deltas in seconds

In [8]:
for index, column in enumerate(time_columns[1:], 1):
    train['delta{}'.format(index)] = (train[time_columns[index]]-train['time1']).dt.total_seconds().astype(int)
    test['delta{}'.format(index)] = (test[time_columns[index]]-test['time1']).dt.total_seconds().astype(int)

# Convert all negative delta values to -1

In [9]:
delta_columns = ['delta1', 'delta2', 'delta3', 'delta4', 'delta5', 'delta6', 'delta7', 'delta8', 'delta9']

for col in delta_columns:
    train.loc[train[col] < 0, col] = -1
    test.loc[test[col] < 0, col] = -1

# Convert all site columns to int

In [10]:
site_columns = ['site1', 'site2', 'site3', 'site4', 'site5', 'site6', 'site7', 'site8', 'site9', 'site10']
train[site_columns] = train[site_columns].astype(int)
test[site_columns] = test[site_columns].astype(int)

# Search for a best Hyperparameters of Logistic Regression model

In [82]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

pipe_logit = Pipeline([('scaler', StandardScaler()), ('logit', LogisticRegression(max_iter=100))])
features = train.columns.drop(time_columns).drop('session_id').drop('target')
X = train[features]
y = train['target']

solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'saga']
C_list = np.logspace(-4, 3, 8)
class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:85}, {0:1, 1:95}, {0:1, 1:100}]
param_grid_logit = {'logit__C': C_list, 'logit__solver': solver_list, 'logit__class_weight': class_weight_list}

grid_logit = GridSearchCV(pipe_logit, param_grid_logit, return_train_score=True, cv=4, n_jobs=-1, 
                          scoring = 'roc_auc')
grid_logit.fit(X, y)
print((grid_logit.best_params_, grid_logit.best_score_))

({'logit__C': 0.0001, 'logit__class_weight': {0: 1, 1: 50}, 'logit__solver': 'lbfgs'}, 0.6087699053636322)
CPU times: user 4.08 s, sys: 926 ms, total: 5 s
Wall time: 5min 18s


# Search for a best Hyperparameters of Random Forest model

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

n_estimators_list = [50, 100, 200]
criterion_list = ['gini', 'entropy']
min_samples_split_list = [2, 4, 6]
class_weight_list = ['balanced', {0:1, 1:50}, {0:1, 1:75}, {0:1, 1:100}]
param_grid_rf = {'criterion': criterion_list, 'min_samples_split': min_samples_split_list, 
                 'class_weight': class_weight_list, 'n_estimators': n_estimators_list}

rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid_rf, return_train_score=True, cv=4, n_jobs=-1, scoring = 'roc_auc', 
                       verbose=10)
grid_rf.fit(X, y)
print((grid_rf.best_params_, grid_rf.best_score_))

Fitting 4 folds for each of 72 candidates, totalling 288 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 15.8min


# Predict test target with optimized Logistic Regression

In [81]:
from sklearn import preprocessing

x_test = test[features]
x_scaled_train = preprocessing.scale(x_train)
x_scaled_test = preprocessing.scale(x_test)

logit = LogisticRegression(solver='lbfgs', max_iter=2000, C=0.001, class_weight={0: 1, 1: 85})
logit.fit(x_scaled_train, y_train)
prediction = logit.predict(x_scaled_test)

submission = pd.DataFrame()
submission['session_id'] = test['session_id'].copy()
submission['target'] = prediction
submission['target'].value_counts()

0    82042
1      755
Name: target, dtype: int64