In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack
# !pip install eli5
import eli5
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html

In [2]:
PATH_TO_DATA = 'data/input/raw/'
SEED = 17

In [3]:
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train,
                       index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                      index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

In [4]:
X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = prepare_sparse_features(
    path_to_train=os.path.join(PATH_TO_DATA, 'train_sessions.csv'),
    path_to_test=os.path.join(PATH_TO_DATA, 'test_sessions.csv'),
    path_to_site_dict=os.path.join(PATH_TO_DATA, 'site_dic.pkl'),
    vectorizer_params={'ngram_range': (1, 5), 
                       'max_features': 50000,
                       'tokenizer': lambda s: s.split()}
)

In [5]:
time_split = TimeSeriesSplit(n_splits=10)
logit = LogisticRegression(C=1, random_state=SEED, solver='liblinear')

In [6]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv("data/output/3/" + out_file, index_label=index_label)

In [7]:
def train_and_predict(model, X_train, y_train, X_test, site_feature_names=vectorizer.get_feature_names(), 
                      new_feature_names=None, cv=time_split, scoring='roc_auc',
                      top_n_features_to_show=30, submission_file_name='submission.csv'):
    
    
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, 
                            scoring=scoring, n_jobs=4)
    print('CV scores', cv_scores)
    print('CV mean: {}, CV std: {}'.format(cv_scores.mean(), cv_scores.std()))
    model.fit(X_train, y_train)
    
    if new_feature_names:
        all_feature_names = site_feature_names + new_feature_names 
    else: 
        all_feature_names = site_feature_names
    
    display_html(eli5.show_weights(estimator=model, 
                  feature_names=all_feature_names, top=top_n_features_to_show))
    
    if new_feature_names:
        print('New feature weights:')
    
        print(pd.DataFrame({'feature': new_feature_names, 
                        'coef': model.coef_.flatten()[-len(new_feature_names):]}))
    
    test_pred = model.predict_proba(X_test)[:, 1]
    write_to_submission_file(test_pred, submission_file_name) 
    
    return cv_scores



In [8]:
def add_time_features(times, X_sparse, add_hour=True):
    hour = times['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
    night = ((hour >= 0) & (hour <=6)).astype('int').values.reshape(-1, 1)
    
    objects_to_hstack = [X_sparse, morning, day, evening, night]
    feature_names = ['morning', 'day', 'evening', 'night']
    
    if add_hour:
        # we'll do it right and scale hour dividing by 24
        objects_to_hstack.append(hour.values.reshape(-1, 1) / 24)
        feature_names.append('hour')
        
    X = hstack(objects_to_hstack)
    return X, feature_names

In [9]:
def add_day_month(times, X_sparse):
    day_of_week = times['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    month = times['time1'].apply(lambda t: t.month).values.reshape(-1, 1) 
    # linear trend: time in a form YYYYMM, we'll divide by 1e5 to scale this feature 
    year_month = times['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5
    
    objects_to_hstack = [X_sparse, day_of_week, month, year_month]
    feature_names = ['day_of_week', 'month', 'year_month']
        
    X = hstack(objects_to_hstack)
    return X, feature_names

In [10]:
def baseline_features():
    # add morning, evening etc, hour
    X_train_with_times, new_feat_names = add_time_features(train_times, X_train_sites, add_hour=False)
    X_test_with_times, _ = add_time_features(test_times, X_test_sites, add_hour=False)

    #add session duration
    train_durations = (train_times.max(axis=1) - train_times.min(axis=1)).astype('timedelta64[ms]').astype(int)
    test_durations = (test_times.max(axis=1) - test_times.min(axis=1)).astype('timedelta64[ms]').astype(int)

    scaler = StandardScaler()
    train_dur_scaled = scaler.fit_transform(train_durations.values.reshape(-1, 1))
    test_dur_scaled = scaler.transform(test_durations.values.reshape(-1, 1))

    X_train_with_time_correct = hstack([X_train_with_times, train_dur_scaled])
    X_test_with_time_correct = hstack([X_test_with_times, test_dur_scaled])

    #add day of month
    X_train_final, more_feat_names = add_day_month(train_times, X_train_with_time_correct)
    X_test_final, _ = add_day_month(test_times, X_test_with_time_correct)

    return X_train_final, X_test_final, new_feat_names + ['sess_duration'] + more_feat_names

In [11]:
X_train_final, X_test_final, final_feat_names = baseline_features()

In [12]:
cv_scores6 = train_and_predict(model=logit, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_final, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=final_feat_names,
                               cv=time_split, submission_file_name='subm6.csv')



CV scores [0.76714843 0.81596889 0.90539886 0.96307585 0.91634694 0.95853688
 0.9272257  0.95188144 0.95895278 0.96919629]
CV mean: 0.9133732054530315, CV std: 0.06505407075192628


Weight?,Feature
+5.161,youwatch.org
+5.041,vk.com
+5.017,www.express.co.uk
+4.986,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.687,www.info-jeunes.net
+4.442,www.melty.fr
+4.385,fr.glee.wikia.com
+4.332,www.audienceinsights.net
+4.020,www.banque-chalus.fr
+3.960,api.bing.com


New feature weights:
         feature      coef
0        morning -1.660682
1            day  2.059842
2        evening -1.737586
3          night  0.000000
4  sess_duration -0.262895
5    day_of_week -0.367673
6          month  0.108554
7     year_month -2.753967


In [31]:
def my_day_month(times, X_sparse):
    day_of_week = times['time1'].apply(lambda t: t.isoweekday()).values.reshape(-1, 1)
    month = times['time1'].apply(lambda t: t.month).values.reshape(-1, 1) 
    # linear trend: time in a form YYYYMM, we'll divide by 1e5 to scale this feature 
    year_month = times['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5
    
    objects_to_hstack = [X_sparse, day_of_week, month, year_month]
    feature_names = ['day_of_week', 'month', 'year_month']
        
    X = hstack(objects_to_hstack)
    return X, feature_names

def my_time_features(times, X_sparse):
    hour = times['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
    night = ((hour >= 0) & (hour <=6)).astype('int').values.reshape(-1, 1)
    
    objects_to_hstack = [X_sparse, morning, day, evening, night]
    feature_names = ['morning', 'day', 'evening', 'night']
        
    X = hstack(objects_to_hstack)
    return X, feature_names

def my_features():
    # add morning, evening etc, hour
    X_train_with_times, new_feat_names = my_time_features(train_times, X_train_sites)
    X_test_with_times, _ = my_time_features(test_times, X_test_sites)

    #add session duration
    train_durations = (train_times.max(axis=1) - train_times.min(axis=1)).astype('timedelta64[ms]').astype(int)
    test_durations = (test_times.max(axis=1) - test_times.min(axis=1)).astype('timedelta64[ms]').astype(int)

    scaler = StandardScaler()
    train_dur_scaled = scaler.fit_transform(train_durations.values.reshape(-1, 1))
    test_dur_scaled = scaler.transform(test_durations.values.reshape(-1, 1))

    X_train_with_time_correct = hstack([X_train_with_times, train_dur_scaled])
    X_test_with_time_correct = hstack([X_test_with_times, test_dur_scaled])

    #add day of month
    X_train_final, more_feat_names = my_day_month(train_times, X_train_with_time_correct)
    X_test_final, _ = my_day_month(test_times, X_test_with_time_correct)

    return X_train_final, X_test_final, new_feat_names + ['sess_duration'] + more_feat_names

X_train_final, X_test_final, final_feat_names = my_features()

cv_scores_my1 = train_and_predict(model=logit, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_final, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=final_feat_names,
                               cv=time_split, submission_file_name='my1.csv')

print(cv_scores_my1 > cv_scores6)



CV scores [0.84541868 0.83916717 0.77587758 0.96154748 0.91378225 0.95413981
 0.92754742 0.94777949 0.95880997 0.9689245 ]
CV mean: 0.9092994344817136, CV std: 0.06275462818295814


Weight?,Feature
+5.165,youwatch.org
+5.044,vk.com
+5.013,www.express.co.uk
+4.987,cid-ed6c3e6a5c6608a4.users.storage.live.com
+4.681,www.info-jeunes.net
+4.449,www.melty.fr
+4.382,fr.glee.wikia.com
+4.333,www.audienceinsights.net
+4.010,www.banque-chalus.fr
+3.961,api.bing.com


New feature weights:
         feature      coef
0        morning -1.632765
1            day  2.091094
2        evening -1.728116
3          night  0.000000
4  sess_duration -0.263637
5    day_of_week -0.374772
6          month  0.109075
7     year_month -2.615216
8     is_weekend  0.157321
[ True  True False False False False  True False False False]


In [13]:
(
c_values = np.logspace(-2, 2, 20)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=2)

logit_grid_searcher.fit(X_train_final, y_train); 

print(logit_grid_searcher.best_score_, logit_grid_searcher.best_params_)
final_model = logit_grid_searcher.best_estimator_

SyntaxError: '(' was never closed (3308948854.py, line 1)

In [None]:
cv_scores7 = train_and_predict(model=final_model, X_train=X_train_final, y_train=y_train, 
                               X_test=X_test_final, 
                               site_feature_names=vectorizer.get_feature_names(),
                               new_feature_names=new_feat_names + ['sess_duration'] + more_feat_names,
                               cv=time_split, submission_file_name='final.csv')