In [2]:

import os
import pickle
import numpy as np
import pandas as pd
import time
from contextlib import contextmanager
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression


#PATH_TO_DATA = 'catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'

In [2]:
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [3]:
PATH_TO_DATA = 'catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2'
AUTHOR = '2' # change here to <name>_<surname>
# it's a nice practice to define most of hyperparams here
SEED = 17
N_JOBS = 4
NUM_TIME_SPLITS = 10    # for time-based cross-validation
SITE_NGRAMS = (1, 5)    # site ngrams for "bag of sites"
MAX_FEATURES = 50000    # max features for "bag of sites"
BEST_LOGIT_C = 5.45559  # precomputed tuned C for logistic regression
 

# nice way to report running times
@contextmanager
def timer(name):
    t0 = time.time()
    yield
    print(f'[{name}] done in {time.time() - t0:.0f} s')

In [4]:

 
def prepare_sparse_features(path_to_train, path_to_test, path_to_site_dict,
                           vectorizer_params):
    times = ['time%s' % i for i in range(1, 11)]
    train_df = pd.read_csv(path_to_train,
                       index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test,
                      index_col='session_id', parse_dates=times)

    # Sort the data by time
    train_df = train_df.sort_values(by='time1')
    
    # read site -> id mapping provided by competition organizers 
    with open(path_to_site_dict, 'rb') as f:
        site2id = pickle.load(f)
    # create an inverse id _> site mapping
    id2site = {v:k for (k, v) in site2id.items()}
    # we treat site with id 0 as "unknown"
    id2site[0] = 'unknown'
    
    # Transform data into format which can be fed into TfidfVectorizer
    # This time we prefer to represent sessions with site names, not site ids. 
    # It's less efficient but thus it'll be more convenient to interpret model weights.
    sites = ['site%s' % i for i in range(1, 11)]
    train_sessions = train_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    test_sessions = test_df[sites].fillna(0).astype('int').apply(lambda row: 
                                                     ' '.join([id2site[i] for i in row]), axis=1).tolist()
    # we'll tell TfidfVectorizer that we'd like to split data by whitespaces only 
    # so that it doesn't split by dots (we wouldn't like to have 'mail.google.com' 
    # to be split into 'mail', 'google' and 'com')
    vectorizer = TfidfVectorizer(**vectorizer_params)
    X_train = vectorizer.fit_transform(train_sessions)
    X_test = vectorizer.transform(test_sessions)
    y_train = train_df['target'].astype('int').values
    
    # we'll need site visit times for further feature engineering
    train_times, test_times = train_df[times], test_df[times]
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times


In [5]:

def add_features(times, X_sparse):
    hour = times['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int').values.reshape(-1, 1)
    day = ((hour >= 12) & (hour <= 18)).astype('int').values.reshape(-1, 1)
    evening = ((hour >= 19) & (hour <= 23)).astype('int').values.reshape(-1, 1)
   # night = ((hour >= 0) & (hour <= 6)).astype('int').values.reshape(-1, 1)
    sess_duration = (times.max(axis=1) - times.min(axis=1)).astype('timedelta64[s]')\
           .astype('int').values.reshape(-1, 1)
    day_of_week = times['time1'].apply(lambda t: t.weekday()).values.reshape(-1, 1)
    month = times['time1'].apply(lambda t: t.month).values.reshape(-1, 1) 
    year_month = times['time1'].apply(lambda t: 100 * t.year + t.month).values.reshape(-1, 1) / 1e5
   # day_of_week = times['time1'].apply(lambda x: int(x.strftime("%u"))).values.reshape(-1, 1)   
   # day_of_year = times['time1'].apply(lambda x: int(x.strftime("%j"))).values.reshape(-1, 1) 
 
    X = hstack([X_sparse, morning, day, evening,  sess_duration, day_of_week, month, year_month])
    return X




In [10]:
times = ['time%s' % i for i in range(1, 11)]

In [11]:
%%time
with timer('Building sparse site features'):
    X_train_sites, X_test_sites, y_train, vectorizer, train_times, test_times = \
        prepare_sparse_features(
            path_to_train=os.path.join('train_sessions.csv'),
            path_to_test=os.path.join('test_sessions.csv'),
            path_to_site_dict=os.path.join('site_dic.pkl'),
            vectorizer_params={'ngram_range': SITE_NGRAMS,
                               'max_features': MAX_FEATURES,
                               'tokenizer': lambda s: s.split()})

FileNotFoundError: [Errno 2] File b'train_sessions.csv' does not exist: b'train_sessions.csv'

In [7]:

with timer('Building additional features'):
    X_train_final = add_features(train_times, X_train_sites)
    X_test_final = add_features(test_times, X_test_sites)


with timer('Cross-validation'):
    time_split = TimeSeriesSplit(n_splits=NUM_TIME_SPLITS)
    logit = LogisticRegression(random_state=SEED, solver='liblinear')

    # I've done cross-validation locally, and do not reproduce these heavy computations here,
    # but this is the vest C that I've found
    c_values = [BEST_LOGIT_C]

    logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=N_JOBS, cv=time_split, verbose=1)
    logit_grid_searcher.fit(X_train_final, y_train)
    print('CV score', logit_grid_searcher.best_score_)


with timer('Test prediction and submission'):
    test_pred = logit_grid_searcher.predict_proba(X_test_final)[:, 1]
    pred_df = pd.DataFrame(test_pred, index=np.arange(1, test_pred.shape[0] + 1),
                       columns=['target'])
    pred_df.to_csv(f'submission_alice_{2}.csv', index_label='session_id')

NameError: name 'train_times' is not defined

In [5]:
a=np.array([[10.402336 ],
       [13.373717 ],
       [13.535698 ],
       [14.358599 ],
       [16.900347 ],
       [13.31413  ],
       [14.02681  ],
       [11.538004 ],
       [ 9.363679 ],
       [12.214071 ],
       [14.851657 ],
       [ 9.97743  ],
       [15.277494 ],
       [11.034542 ],
       [20.586723 ],
       [10.359436 ],
       [15.498475 ],
       [13.765459 ],
       [13.684841 ],
       [11.031345 ],
       [12.966791 ],
       [14.456987 ],
       [14.951693 ],
       [12.305708 ],
       [12.347288 ],
       [14.80225  ],
       [11.382761 ],
       [15.523981 ],
       [ 8.176135 ],
       [ 8.909668 ],
       [19.19622  ],
       [13.292833 ],
       [15.642754 ],
       [20.539385 ],
       [13.74423  ],
       [19.68315  ],
       [14.456618 ],
       [14.382275 ],
       [15.061501 ],
       [10.54332  ],
       [13.596794 ],
       [15.406761 ],
       [ 9.772394 ],
       [13.140322 ],
       [14.474539 ],
       [ 8.425964 ],
       [ 8.254733 ],
       [12.544536 ],
       [ 9.005592 ],
       [11.073776 ],
       [13.37965  ],
       [11.735427 ],
       [ 8.92501  ],
       [13.409327 ],
       [16.044352 ],
       [12.598143 ],
       [25.062616 ],
       [13.6831045],
       [ 9.882993 ],
       [15.484062 ],
       [15.883386 ],
       [15.487326 ],
       [10.454522 ],
       [10.139215 ],
       [15.061453 ],
       [11.625309 ],
       [12.548214 ],
       [ 8.770439 ],
       [12.794693 ],
       [14.781465 ],
       [10.825588 ],
       [20.409966 ],
       [17.623434 ],
       [14.408533 ],
       [11.315624 ],
       [14.74848  ],
       [11.048444 ],
       [20.71388  ],
       [ 9.032481 ],
       [12.828813 ],
       [ 9.969187 ],
       [12.60347  ],
       [12.792591 ],
       [10.918411 ],
       [12.49885  ],
       [ 8.377663 ],
       [11.889959 ],
       [15.180435 ],
       [14.394958 ],
       [11.175678 ],
       [ 9.266188 ],
       [11.466753 ],
       [12.528255 ],
       [11.332331 ],
       [13.108851 ],
       [10.400122 ],
       [10.941018 ],
       [12.14134  ],
       [13.823926 ],
       [19.309576 ],
       [12.992845 ],
       [ 7.1230855],
       [11.281725 ],
       [14.7510605],
       [12.283346 ],
       [12.222406 ],
       [10.166099 ],
       [10.9234495],
       [15.674833 ],
       [13.734234 ],
       [16.343096 ],
       [11.262772 ],
       [14.223561 ],
       [14.495502 ],
       [19.212019 ],
       [12.075214 ],
       [11.836252 ],
       [21.611807 ],
       [20.05071  ],
       [14.8509   ],
       [10.471229 ],
       [17.850819 ],
       [13.799343 ],
       [12.738356 ],
       [15.425364 ],
       [12.704123 ],
       [ 9.825504 ],
       [14.453867 ],
       [13.489315 ],
       [ 9.369244 ],
       [10.706642 ],
       [11.309019 ],
       [11.196518 ],
       [17.926126 ],
       [ 8.293824 ],
       [13.54254  ],
       [ 9.781413 ],
       [11.48018  ],
       [10.9808235],
       [16.905903 ],
       [11.617441 ],
       [14.524174 ],
       [15.188513 ],
       [14.56705  ],
       [10.947817 ],
       [16.209415 ],
       [11.829873 ],
       [15.617416 ],
       [ 9.954979 ],
       [ 9.081682 ],
       [16.200315 ],
       [10.025169 ],
       [14.799329 ],
       [16.24043  ],
       [12.824568 ],
       [11.2865505],
       [16.526327 ],
       [ 7.4518127],
       [17.960539 ],
       [ 9.947145 ],
       [12.531158 ],
       [ 8.223735 ],
       [13.589927 ],
       [12.401165 ],
       [15.035765 ],
       [13.029221 ],
       [ 9.454211 ],
       [12.062712 ],
       [18.149824 ],
       [12.235938 ],
       [15.543008 ],
       [ 7.5089173],
       [11.155765 ],
       [11.366596 ],
       [16.855505 ],
       [14.035114 ],
       [19.39398  ],
       [12.5623455],
       [13.111071 ],
       [20.707424 ],
       [ 9.298101 ],
       [11.844376 ],
       [11.531582 ],
       [13.861862 ],
       [11.749945 ],
       [13.556944 ],
       [ 8.173612 ],
       [18.713099 ],
       [10.457923 ],
       [15.274833 ],
       [14.188934 ],
       [ 8.826002 ],
       [17.637304 ],
       [15.689093 ],
       [17.157372 ],
       [15.493042 ],
       [ 9.457833 ],
       [17.76438  ],
       [ 9.231807 ],
       [ 8.682282 ],
       [19.372587 ],
       [13.759415 ],
       [16.295332 ],
       [14.065436 ],
       [17.985487 ],
       [ 9.032961 ],
       [16.93157  ],
       [15.490112 ],
       [18.59852  ],
       [13.062677 ],
       [13.818713 ],
       [13.056262 ],
       [13.807166 ],
       [15.676165 ],
       [13.381652 ],
       [16.163877 ],
       [11.822947 ],
       [13.50061  ],
       [15.227438 ],
       [20.867071 ],
       [11.843841 ],
       [15.7488785],
       [13.147612 ],
       [16.329386 ],
       [ 8.981106 ],
       [13.776291 ],
       [10.456162 ],
       [10.268742 ],
       [21.002909 ],
       [ 9.330502 ],
       [11.133322 ],
       [11.412185 ],
       [11.570679 ],
       [12.250421 ],
       [28.80337  ],
       [13.863974 ],
       [13.390383 ],
       [13.347866 ],
       [10.7967825],
       [14.771093 ],
       [11.586869 ],
       [ 9.381312 ],
       [13.551388 ],
       [14.184276 ],
       [21.045977 ],
       [12.519368 ],
       [ 9.374829 ],
       [14.188831 ],
       [25.637793 ],
       [12.38537  ],
       [15.007738 ],
       [18.577095 ],
       [14.281345 ],
       [ 9.768796 ],
       [16.892084 ],
       [11.082037 ],
       [10.8636675],
       [12.677074 ],
       [18.437399 ],
       [11.696723 ],
       [13.15107  ],
       [14.498066 ],
       [15.387747 ],
       [14.729658 ],
       [11.672194 ],
       [13.73123  ],
       [ 9.168466 ],
       [12.330165 ],
       [13.779219 ],
       [11.477919 ],
       [16.637167 ],
       [11.80629  ],
       [17.8286   ],
       [11.851812 ],
       [13.283805 ],
       [12.301764 ],
       [12.906961 ],
       [19.657402 ],
       [12.271288 ],
       [10.1127   ],
       [12.804491 ],
       [11.109789 ],
       [14.197566 ],
       [18.590342 ],
       [14.10619  ],
       [10.450213 ],
       [13.435233 ],
       [ 9.6999   ],
       [12.71826  ],
       [12.004345 ],
       [14.461774 ],
       [12.922203 ],
       [16.939016 ],
       [12.99585  ],
       [20.047915 ],
       [18.463392 ],
       [12.934393 ],
       [13.429909 ],
       [14.566083 ],
       [13.399691 ],
       [16.589607 ],
       [10.499078 ],
       [ 9.449633 ],
       [13.366922 ],
       [12.293608 ],
       [13.016384 ],
       [12.240256 ],
       [14.282314 ],
       [13.655951 ],
       [11.330044 ],
       [ 8.29335  ],
       [ 8.513496 ],
       [13.327071 ],
       [11.35002  ],
       [13.75655  ],
       [13.143845 ]])

In [6]:
a.shape

(316, 1)

In [7]:
df1 =  pd.read_csv("/content/drive/My Drive/Colab Notebooks/vgg/geekhub-2020-age-estimation-challenge/sample_submission.csv")
df1['age']=a

FileNotFoundError: [Errno 2] File /content/drive/My Drive/Colab Notebooks/vgg/geekhub-2020-age-estimation-challenge/sample_submission.csv does not exist: '/content/drive/My Drive/Colab Notebooks/vgg/geekhub-2020-age-estimation-challenge/sample_submission.csv'