In [1]:
# This file is a modified version of the original:
# https://www.kaggle.com/mark4h/jowilder-2nd-place-solution-2-generate-features

In [2]:
from cffi import FFI
import json
import numba
import numpy as np
import pandas as pd
import inspect
ffi = FFI()

In [3]:
# Kaggle dataset locations used by the original notebook (kept for reference):
# PREPROCESSED_DIR = '/kaggle/input/jowilder-2nd-place-solution-0-preprocess-data/'
# FEATURE_GEN_CODE_DIR = '/kaggle/input/jowilder-2nd-place-solution-1-features-code/'

# Local layout, relative to the working directory. Both values must keep their
# trailing '/' because later cells build file paths by plain string concatenation.
PREPROCESSED_DIR = 'input/'
FEATURE_GEN_CODE_DIR = 'modules/'

In [4]:
import sys
# Extend the import search path before the imports below; they must run after this
# line. Presumably both modules live in FEATURE_GEN_CODE_DIR - verify locally.
sys.path.append(FEATURE_GEN_CODE_DIR)

# Feature-generation backends called from generate_features():
#   JoWilder_C_features     -> accessed through its `lib` attribute (fill_history, add_features_batch)
#   JoWilder_numba_features -> process_single
import JoWilder_C_features
import JoWilder_numba_features

In [5]:
%%time
# NOTE(review): unpickling can execute arbitrary code - only load files produced by a
# trusted preprocessing run.
# NOTE(review): train_labels_df is not referenced again in the visible cells of this
# notebook; confirm it is needed before paying for the load.
train_labels_df = pd.read_pickle(PREPROCESSED_DIR + 'preprocessed_train_labels_df.pkl')
train_df = pd.read_pickle(PREPROCESSED_DIR + 'preprocessed_train_df.pkl')

CPU times: user 6.71 s, sys: 5.35 s, total: 12.1 s
Wall time: 12.2 s


In [6]:
# Parse the metadata file that sits next to the feature-generation code.
with open(FEATURE_GEN_CODE_DIR + 'FEATURES_GENERATION_INFO.json', 'r') as info_file:
    FEATURES_GENERATION_INFO = json.load(info_file)

# Width of the output feature matrix built by generate_features().
NUM_FEATURES = FEATURES_GENERATION_INFO['NUM_FEATURES']
# Length of the per-session history buffer allocated by generate_features().
HISTORY_LEN = FEATURES_GENERATION_INFO['HISTORY_LEN']

# Show both values as the cell output.
(NUM_FEATURES, HISTORY_LEN)

(1590, 3183)

In [7]:
@numba.jit(nopython=True, nogil=True, error_model='numpy', parallel=False)
def get_splits(a):
    """Return the boundaries of the runs of equal consecutive values in ``a``.

    The result ``m`` is a 1-D integer index array such that run ``i`` occupies
    ``a[m[i]:m[i + 1]]``; it always starts with ``0`` and ends with ``len(a)``,
    so the number of runs is ``m.shape[0] - 1``.

    Example: ``[7, 7, 9, 9, 9, 4]`` -> ``[0, 2, 5, 6]``.

    An empty input yields ``[0]`` (zero runs).  The previous implementation
    returned ``[0, 1]`` in that case, i.e. a phantom run pointing past the end
    of the array.
    """
    n = a.shape[0]

    # boundary[i] is True when a new run starts at index i; the extra trailing
    # slot (index n) closes the last run.
    boundary = np.zeros(n + 1, dtype=np.bool_)
    boundary[0] = True
    boundary[n] = True  # same slot as boundary[0] when n == 0
    if n > 1:
        boundary[1:n] = a[1:] != a[:-1]

    return np.nonzero(boundary)[0]

In [8]:
def generate_features(
    session_id,
    level_group,
    elapsed_time,
    event_name,
    name,
    level,
    hover_duration,
    session_weekday,
    building,
    room,
    fqids,
    session_event_index,
    room_coor_x,
    room_coor_y,
    screen_coor_x,
    screen_coor_y,
    text_length,
    text_numerical,
    text_fqid_numerical,
    room_fqid_numerical,
    page,
    session_hour,
):
    """Compute the feature rows for every (session, question) pair of an event log.

    Every argument is a 1-D NumPy array with one entry per event, all of the same
    length.  Sessions are detected as runs of equal consecutive ``session_id``
    values and, inside a session, level groups as runs of equal consecutive
    ``level_group`` values.  The i-th run inside a session must carry the value
    ``i`` (0, 1, 2), so the events have to be ordered accordingly.

    For each level group of each session, in this order:

    1. ``JoWilder_numba_features.process_single`` receives the event slices, the
       output rows of that level group and the session's history buffer;
    2. ``JoWilder_C_features.lib.fill_history`` receives pointers to the same
       events and the history buffer;
    3. ``JoWilder_C_features.lib.add_features_batch`` receives a pointer to the
       output rows and the history buffer.

    Returns
    -------
    tuple ``(out_session_id, out_level_group, out_question_number, out)`` with
    ``n = number_of_sessions * 18`` rows:

    * ``out_session_id``      int64, shape ``(n,)`` - session id of each row;
    * ``out_level_group``     int32, shape ``(n,)`` - level group of each row,
      ``-1`` for level groups that do not occur in the session;
    * ``out_question_number`` int32, shape ``(n,)`` - question index 0..17,
      ``-1`` for level groups that do not occur in the session;
    * ``out``                 float32, shape ``(n, NUM_FEATURES)`` - initialised
      to NaN and filled by the three calls above.

    Raises
    ------
    AssertionError
        If the level groups of a session are not the consecutive values
        0, 1, 2 (or a prefix of them), or if a session has more than three.

    Notes
    -----
    * The function runs sequentially.  It is not compiled by numba, so the
      ``numba.prange`` used previously behaved exactly like ``range``.
    * An empty event log returns empty outputs instead of failing.
    """
    num_questions_per_session = 18
    # Question numbers of level group g are
    # questions_splits_per_level[g] .. questions_splits_per_level[g + 1] - 1,
    # i.e. 3, 10 and 5 questions for level groups 0, 1 and 2.
    questions_splits_per_level = np.array([0, 3, 13, 18])

    if session_id.shape[0] == 0:
        # No events -> no sessions; return correctly typed empty outputs.
        return (
            np.empty(0, dtype=np.int64),
            np.empty(0, dtype=np.int32),
            np.empty(0, dtype=np.int32),
            np.empty((0, NUM_FEATURES), dtype=np.float32),
        )

    # session_splits[i]:session_splits[i + 1] is the event range of session i.
    session_splits = get_splits(session_id)
    num_sessions = session_splits.shape[0] - 1

    n = num_sessions * num_questions_per_session

    out_session_id = np.full(n, -1, dtype=np.int64)
    out_level_group = np.full(n, -1, dtype=np.int32)
    out_question_number = np.full(n, -1, dtype=np.int32)
    out = np.full((n, NUM_FEATURES), np.nan, dtype=np.float32)

    # ffi.from_buffer exposes the memory of a NumPy array to C without copying
    # or converting it: the bytes are reinterpreted as the given C type.
    # NOTE(review): each array must therefore already use the matching element
    # type (presumably int64 for 'long[]' on this platform, float64 for
    # 'double[]') - verify against the preprocessing step.
    # `pointer + s` below addresses element s of the corresponding array.
    x_et_pointer = ffi.from_buffer('long[]', elapsed_time)
    x_en_pointer = ffi.from_buffer('long[]', event_name)
    x_n_pointer = ffi.from_buffer('long[]', name)
    x_hover_duration_pointer = ffi.from_buffer('double[]', hover_duration)
    x_b_pointer = ffi.from_buffer('long[]', building)
    x_r_pointer = ffi.from_buffer('long[]', room)
    x_fqids_pointer = ffi.from_buffer('long[]', fqids)
    x_l_pointer = ffi.from_buffer('long[]', level)
    x_index_pointer = ffi.from_buffer('long[]', session_event_index)
    x_rc_x_pointer = ffi.from_buffer('double[]', room_coor_x)
    x_rc_y_pointer = ffi.from_buffer('double[]', room_coor_y)
    x_sc_x_pointer = ffi.from_buffer('double[]', screen_coor_x)
    x_sc_y_pointer = ffi.from_buffer('double[]', screen_coor_y)
    x_text_numerical_pointer = ffi.from_buffer('long[]', text_numerical)
    x_text_fqid_numerical_pointer = ffi.from_buffer('long[]', text_fqid_numerical)
    x_room_fqid_numerical_pointer = ffi.from_buffer('long[]', room_fqid_numerical)
    x_page_pointer = ffi.from_buffer('double[]', page)

    out_pointer = ffi.from_buffer('float[]', out)

    for session_index in range(num_sessions):

        # Fresh NaN-filled history buffer for this session.  It is shared by the
        # level groups of the session only, so it is not kept once the session
        # is done (keeping all of them costs num_sessions * HISTORY_LEN * 8 bytes).
        hist = np.full(HISTORY_LEN, np.nan, dtype=np.float64)
        hist_pointer = ffi.from_buffer('double[]', hist)

        # First output row of this session.
        session_out_s = session_index * num_questions_per_session

        # Event range [session_s, session_e) of this session.
        session_s = session_splits[session_index]
        session_e = session_splits[session_index + 1]

        # Boundaries of the level-group runs, relative to session_s.
        level_group_splits = get_splits(level_group[session_s:session_e])
        num_level_groups = level_group_splits.shape[0] - 1

        out_session_id[session_out_s:session_out_s + num_questions_per_session] = session_id[session_s]

        for level_group_index in range(num_level_groups):

            # Event range [s, e) of this level group in the full arrays.
            s = session_s + level_group_splits[level_group_index]
            e = session_s + level_group_splits[level_group_index + 1]

            assert level_group[s] == level_group_index, \
                "level groups of a session must appear as 0, 1, 2 in order"

            if level_group_index >= 3:
                assert False, "a session has more than three level groups"
                # Only reached when assertions are disabled (python -O).
                break

            questions_start_number = questions_splits_per_level[level_group_index]
            questions_end_number = questions_splits_per_level[level_group_index + 1]

            # Output row range [out_s, out_e) of this level group.
            out_s = session_out_s + questions_start_number
            out_e = session_out_s + questions_end_number

            x_et = elapsed_time[s:e]
            x_en = event_name[s:e]
            x_n = name[s:e]
            x_hover_duration = hover_duration[s:e]
            x_session_weekday = session_weekday[s:e]
            x_b = building[s:e]
            x_r = room[s:e]
            x_fqids = fqids[s:e]
            x_l = level[s:e]
            x_index = session_event_index[s:e]
            x_rc_x = room_coor_x[s:e]
            x_rc_y = room_coor_y[s:e]
            x_sc_x = screen_coor_x[s:e]
            x_sc_y = screen_coor_y[s:e]
            x_tl = text_length[s:e]
            x_text_numerical = text_numerical[s:e]
            x_text_fqid_numerical = text_fqid_numerical[s:e]
            x_room_fqid_numerical = room_fqid_numerical[s:e]
            x_page = page[s:e]
            x_hour = session_hour[s:e]

            out_question_number[out_s:out_e] = np.arange(questions_start_number, questions_end_number)
            out_level_group[out_s:out_e] = level_group_index

            # out[out_s:out_e] is a view, so writes made by the callee land in `out`.
            JoWilder_numba_features.process_single(
                level_group_index,
                x_et,
                x_en,
                x_n,
                x_hover_duration,
                x_session_weekday,
                x_b,
                x_r,
                x_fqids,
                x_l,
                x_index,
                x_rc_x,
                x_rc_y,
                x_sc_x,
                x_sc_y,
                x_tl,
                x_text_numerical,
                x_text_fqid_numerical,
                x_room_fqid_numerical,
                x_page,
                x_hour,
                out[out_s:out_e],
                hist,
            )

            number_of_events = x_et.shape[0]

            JoWilder_C_features.lib.fill_history(
                level_group_index,
                x_l_pointer + s,
                hist_pointer,
                x_index_pointer + s,
                x_text_numerical_pointer + s,
                x_en_pointer + s,
                x_n_pointer + s,
                x_fqids_pointer + s,
                x_et_pointer + s,
                x_rc_x_pointer + s,
                x_rc_y_pointer + s,
                x_sc_x_pointer + s,
                x_sc_y_pointer + s,
                x_b_pointer + s,
                x_r_pointer + s,
                x_room_fqid_numerical_pointer + s,
                x_text_fqid_numerical_pointer + s,
                x_page_pointer + s,
                x_hover_duration_pointer + s,
                number_of_events,
            )

            # `out` is row-major with NUM_FEATURES floats per row, so row out_s
            # starts at flat offset out_s * NUM_FEATURES.
            JoWilder_C_features.lib.add_features_batch(
                level_group_index,
                out_pointer + (out_s * NUM_FEATURES),
                hist_pointer,
                (out_e - out_s)
            )

    return out_session_id, out_level_group, out_question_number, out

In [9]:
train_df['text'].head(3)

0                     undefined
1        Just talking to Teddy.
2    I gotta run to my meeting!
Name: text, dtype: object

In [10]:
%%time
# Adds a column to train_df in place (re-running the cell simply overwrites it).
# The result is float64, presumably because some rows have missing text and
# .str.len() yields NaN for them - TODO confirm.
train_df['text_length'] = train_df['text'].str.len()

CPU times: user 2.76 s, sys: 99.2 ms, total: 2.86 s
Wall time: 2.85 s


In [11]:
train_df['text_length'].head(3)

0     9.0
1    22.0
2    26.0
Name: text_length, dtype: float64

In [12]:
%%time
# Columns are listed in the positional order of generate_features' parameters;
# each one is handed over as its underlying array via `.values`.
X_session_id, X_out_level_group, X_out_question_number, X_out = generate_features(
    *(
        train_df[column].values
        for column in (
            'session_id',                    # session_id
            'numerical_level_group',         # level_group
            'elapsed_time',                  # elapsed_time
            'numerical_event_name',          # event_name
            'numerical_name',                # name
            'level',                         # level
            'hover_duration',                # hover_duration
            'session_weekday',               # session_weekday
            'numerical_room_fqid_building',  # building
            'numerical_room_fqid_room',      # room
            'numerical_fqid',                # fqids
            'index',                         # session_event_index
            'room_coor_x',                   # room_coor_x
            'room_coor_y',                   # room_coor_y
            'screen_coor_x',                 # screen_coor_x
            'screen_coor_y',                 # screen_coor_y
            'text_length',                   # text_length
            'numerical_text',                # text_numerical
            'numerical_text_fqid',           # text_fqid_numerical
            'numerical_room_fqid',           # room_fqid_numerical
            'page',                          # page
            'session_hour',                  # session_hour
        )
    )
)

CPU times: user 7.74 s, sys: 688 ms, total: 8.42 s
Wall time: 8.57 s


In [13]:
# Sanity check: generate_features allocates 18 rows per session and NUM_FEATURES columns.
print(X_out.shape)

(424116, 1590)


In [14]:
# Write the four aligned outputs of generate_features to the working directory,
# one .npy file per array (row i of every file describes the same session/question).
np.save(file='X_session_id.npy', arr=X_session_id)
np.save(file='X_out_level_group.npy', arr=X_out_level_group)
np.save(file='X_out_question_number.npy', arr=X_out_question_number)
np.save(file='X_out.npy', arr=X_out)

In [11]:
!ls -lh

total 2.6G
-rw-r--r-- 1 root root 2.6G Jul  4 09:51 X_out.npy
-rw-r--r-- 1 root root 1.7M Jul  4 09:51 X_out_level_group.npy
-rw-r--r-- 1 root root 1.7M Jul  4 09:51 X_out_question_number.npy
-rw-r--r-- 1 root root 3.3M Jul  4 09:51 X_session_id.npy
---------- 1 root root  20K Jul  4 09:51 __notebook__.ipynb
