In [1]:
# Import libraries and set desired options
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
# Render figures as SVG so that they look sharper
%config InlineBackend.figure_format = 'svg' 

# Increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 8, 5

In [2]:
# Load the training and test sessions; adjust the paths to your environment if needed.
# The ten timestamp columns (time1 ... time10) are parsed as datetimes on load.
times = ['time{}'.format(i) for i in range(1, 11)]

# The training sessions are ordered by the 'time1' column right after loading
train_df = pd.read_csv(
    'C:\\Anaconda\\Menu\\train_sessions.csv',
    index_col='session_id',
    parse_dates=times,
).sort_values(by='time1')
test_df = pd.read_csv(
    'C:\\Anaconda\\Menu\\test_sessions.csv',
    index_col='session_id',
    parse_dates=times,
)

# Peek at the first training rows (the trailing ';' suppresses the cell output)
train_df.head();

In [3]:
# Site id columns site1 ... site10: NA-values become 0 and the ids are stored
# as unsigned 16-bit integers
sites = ['site{}'.format(i) for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('uint16')
test_df[sites] = test_df[sites].fillna(0).astype('uint16')

# Load the websites dictionary from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open(r"C:\\Anaconda\\Menu\\//site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Turn the dictionary into a one-column frame: the dictionary values form the
# index and the dictionary keys fill the 'site' column
sites_dict = pd.DataFrame({'site': list(site_dict.keys())},
                          index=list(site_dict.values()))
print(u'Websites total:', len(sites_dict))
sites_dict.head();

Websites total: 48371


In [4]:
# Labels of the training sessions
y_train = train_df['target']

# Number of training rows: rows before this position in the united frame are
# training rows, rows from this position on are test rows
idx_split = len(train_df)

# United frame: training features (target column removed) followed by the test features
full_df = pd.concat([train_df.drop(columns='target'), test_df])

In [5]:
# Keep only the site id columns (site1 ... site10) of every session
full_sites = full_df.loc[:, sites]
full_sites.head();

In [6]:
# Flatten the per-session site ids row by row into one long sequence of indices
sites_flatten = full_sites.values.flatten()

# Build the sparse session-by-site matrix with the
# csr_matrix((data, indices, indptr)) constructor:
#   data    - a 1 for every entry of the flattened sequence,
#   indices - the site id of each entry, used as its column index,
#   indptr  - row boundaries advancing in steps of 10, i.e. ten entries per row
#             (one row per session, matching the ten site columns).
# The trailing [:, 1:] discards column 0, which is the id that NA-values were
# filled with when the site columns were converted to integers.
full_sites_sparse = csr_matrix(([1] * sites_flatten.shape[0],
                                sites_flatten,
                             range(0, sites_flatten.shape[0]  + 10, 10)))[:, 1:]

In [7]:
def get_auc_lr_valid(X, y, C=0.17, seed=17, ratio = 0.9):
    """Fit a logistic regression on the head of the data and score it on the tail.

    The first ``ratio`` share of the rows (in their given order, no shuffling)
    is used for training and the remaining rows form the validation set.

    Parameters
    ----------
    X : matrix supporting ``X[a:b, :]`` row slicing (e.g. a scipy CSR matrix)
        Feature matrix, one row per sample.
    y : array-like
        Binary labels, one per row of ``X`` and in the same order.
    C : float, default 0.17
        Inverse regularization strength passed to ``LogisticRegression``.
    seed : int, default 17
        ``random_state`` passed to ``LogisticRegression``.
    ratio : float, default 0.9
        Share of the rows used for training.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation rows.
    """
    # Work on a plain array of labels so that the split below is always
    # positional and never depends on how pandas interprets integer slices
    # for the index of ``y``.
    y = np.asarray(y)

    # Position separating the training rows from the validation rows
    idx = int(round(X.shape[0] * ratio))

    # Classifier training on the first part of the data
    lr = LogisticRegression(C=C, random_state=seed, solver='liblinear')
    lr.fit(X[:idx, :], y[:idx])

    # Positive-class probabilities for the validation rows
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]

    return roc_auc_score(y[idx:], y_pred)

In [8]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to a CSV submission file.

    The predictions are stored in a single column named ``target``; the rows
    are numbered 1..n and that numbering is written as the first column under
    the header ``index_label``.

    Parameters
    ----------
    predicted_labels : numpy.ndarray
        Predicted values, one per row of the submission.
    out_file : str or file-like
        Destination passed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the row-number column.
    """
    n_rows = predicted_labels.shape[0]
    # Row numbers start at 1, not 0
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [9]:
# Dataframe for new features, sharing the index of the united frame
full_new_feat = pd.DataFrame(index=full_df.index)

# Add start_month feature: year and month of 'time1' encoded as YYYYMM
# (100 * year + month), stored as float64.
# The vectorized .dt accessor is used instead of a row-by-row Python lambda.
full_new_feat['start_month'] = (
    100 * full_df['time1'].dt.year + full_df['time1'].dt.month
).astype('float64')

In [10]:
# Standardize start_month and append it as one extra column to the sparse
# site matrix (training rows only)
tmp = StandardScaler().fit_transform(full_new_feat[['start_month']])
X_train = hstack([full_sites_sparse[:idx_split], tmp[:idx_split]]).tocsr()

# Validation ROC AUC for sites + start_month
print(get_auc_lr_valid(X_train, y_train));
X_train.shape

0.9206172044452174


(253561, 48372)

In [11]:
# Fit on all training rows; random_state=17 keeps the run repeatable
lr = LogisticRegression(C=0.17, random_state=17, solver='liblinear')
lr.fit(X_train, y_train)

# Test matrix built from the same columns as X_train: sites + standardized start_month
X_test = hstack([full_sites_sparse[idx_split:], tmp[idx_split:]]).tocsr()
# Predicted probability of the positive class for every test session
y_test = lr.predict_proba(X_test)[:, 1]

# Store the predictions in a file that can be submitted
write_to_submission_file(y_test, 'baseline_2.csv')

In [12]:
# Add start_hour feature: hour of day of 'time1', stored as float64.
# The vectorized .dt accessor is used instead of a row-by-row Python lambda.
full_new_feat['start_hour'] = full_df['time1'].dt.hour.astype('float64')
full_new_feat.head();

In [21]:
# Standardize start_hour and append it as one extra column to the sparse
# site matrix (training rows only)
hr = StandardScaler().fit_transform(full_new_feat[['start_hour']])
X_train = hstack([full_sites_sparse[:idx_split], hr[:idx_split]]).tocsr()

# Validation ROC AUC for sites + start_hour
print(get_auc_lr_valid(X_train, y_train))
X_train.shape

0.9611880552294827


(253561, 48372)

In [14]:
# Fit on all training rows; random_state=17 keeps the run repeatable
lr = LogisticRegression(C=0.17, random_state=17, solver='liblinear')
lr.fit(X_train, y_train)

# Test matrix built from the same columns as X_train: sites + standardized start_hour
X_test = hstack([full_sites_sparse[idx_split:], hr[idx_split:]]).tocsr()
# Predicted probability of the positive class for every test session
y_test = lr.predict_proba(X_test)[:, 1]

# Store the predictions in a file that can be submitted.
# NOTE(review): 'baseline_2.csv' is also the file name used for the start_month
# submission, so that file gets overwritten here - confirm this is intended.
write_to_submission_file(y_test, 'baseline_2.csv')

In [13]:
# Add day-part indicator features (0/1, int64) derived from the hour of 'time1':
#   morning  7..11, day 12..18, evening 19..23, night 0..6 (bounds inclusive).
# The hour is extracted once with the vectorized .dt accessor and compared with
# Series.between (inclusive on both ends by default) instead of running four
# row-by-row Python lambdas.
session_hour = full_df['time1'].dt.hour
full_new_feat['morning'] = session_hour.between(7, 11).astype('int64')
full_new_feat['day'] = session_hour.between(12, 18).astype('int64')
full_new_feat['evening'] = session_hour.between(19, 23).astype('int64')
full_new_feat['night'] = session_hour.between(0, 6).astype('int64')
full_new_feat.head();

In [16]:
# Pull every day-part indicator out as a single-column 2-D array
morn_ = full_new_feat[['morning']].values
day_ = full_new_feat[['day']].values
even_ = full_new_feat[['evening']].values
night_ = full_new_feat[['night']].values

# Training matrix: sites + standardized start_month (tmp) + the four indicators
X_train = hstack([
    full_sites_sparse[:idx_split],
    tmp[:idx_split],
    morn_[:idx_split],
    day_[:idx_split],
    even_[:idx_split],
    night_[:idx_split],
]).tocsr()

# Validation ROC AUC for this feature set
print(get_auc_lr_valid(X_train, y_train))

0.9511044054847857


In [17]:
# Fit on all training rows; random_state=17 keeps the run repeatable
lr = LogisticRegression(C=0.17, random_state=17, solver='liblinear')
lr.fit(X_train, y_train)

# Test matrix built from the same columns as X_train:
# sites + standardized start_month (tmp) + the four day-part indicators
X_test = hstack([
    full_sites_sparse[idx_split:],
    tmp[idx_split:],
    morn_[idx_split:],
    day_[idx_split:],
    even_[idx_split:],
    night_[idx_split:],
]).tocsr()
# Predicted probability of the positive class for every test session
y_test = lr.predict_proba(X_test)[:, 1]

# Store the predictions in a file that can be submitted
write_to_submission_file(y_test, 'baseline_3.csv')