In [1]:
# This file is a modified version of the original:
# https://www.kaggle.com/mark4h/jowilder-2nd-place-solution-0-preprocess-data

In [2]:
import json
import numba
import numpy as np
import pandas as pd

from numba.pycc import CC

In [3]:
DATA_DIR = 'input/'

In [4]:
%%time
train_df = pd.read_parquet(DATA_DIR + 'train.parquet')
train_labels_df = pd.read_parquet(DATA_DIR + 'train_labels.parquet')

CPU times: user 9.36 s, sys: 4.76 s, total: 14.1 s
Wall time: 3.94 s


In [5]:
train_df['room_fqid'].head(3)

0    tunic.historicalsociety.closet
1    tunic.historicalsociety.closet
2    tunic.historicalsociety.closet
Name: room_fqid, dtype: object

In [6]:
%%time
# Split every room_fqid once and reuse the parts for both derived columns;
# splitting the full column is the expensive step of this cell.
# The values look like '<game>.<building>.<room>', e.g.
# 'tunic.historicalsociety.closet' -> building 'historicalsociety', room 'closet'.
room_fqid_parts = train_df['room_fqid'].str.split('.')
train_df['room_fqid_building'] = room_fqid_parts.str[1]
train_df['room_fqid_room'] = room_fqid_parts.str[2]

CPU times: user 1min 4s, sys: 3.29 s, total: 1min 7s
Wall time: 1min 7s


In [7]:
train_df[['room_fqid_building', 'room_fqid_room']].head(3)

Unnamed: 0,room_fqid_building,room_fqid_room
0,historicalsociety,closet
1,historicalsociety,closet
2,historicalsociety,closet


### Define Preprocessing Info

In [8]:
# Integer code for each `level_group` label, numbered in ascending level order.
level_group_map = {
    group: code for code, group in enumerate(('0-4', '5-12', '13-22'))
}

In [6]:
# Known values of the `event_name` column, kept in alphabetical order so that
# each name's integer code is simply its position in the list.
EVENT_NAMES = [
    'checkpoint',
    'cutscene_click',
    'map_click',
    'map_hover',
    'navigate_click',
    'notebook_click',
    'notification_click',
    'object_click',
    'object_hover',
    'observation_click',
    'person_click',
]
EVENT_NAMES.sort()

# event name -> integer code (index in EVENT_NAMES)
event_name_map = dict(zip(EVENT_NAMES, range(len(EVENT_NAMES))))

In [7]:
EVENT_NAMES[10]

'person_click'

In [10]:
# Integer code for each value of the `name` column (codes follow listing order).
name_map = {
    label: code
    for code, label in enumerate(('basic', 'undefined', 'close', 'open', 'prev', 'next'))
}

In [11]:
# Known `room_fqid` values (presumably obtained from
# sorted(train_df['room_fqid'].unique()) -- TODO confirm against the data).
ROOM_FQIDS = sorted((
    'tunic.capitol_0.hall',
    'tunic.capitol_1.hall',
    'tunic.capitol_2.hall',
    'tunic.drycleaner.frontdesk',
    'tunic.flaghouse.entry',
    'tunic.historicalsociety.basement',
    'tunic.historicalsociety.cage',
    'tunic.historicalsociety.closet',
    'tunic.historicalsociety.closet_dirty',
    'tunic.historicalsociety.collection',
    'tunic.historicalsociety.collection_flag',
    'tunic.historicalsociety.entry',
    'tunic.historicalsociety.frontdesk',
    'tunic.historicalsociety.stacks',
    'tunic.humanecology.frontdesk',
    'tunic.kohlcenter.halloffame',
    'tunic.library.frontdesk',
    'tunic.library.microfiche',
    'tunic.wildlife.center',
))

# room_fqid -> position in the sorted list, e.g.
#   'tunic.capitol_0.hall' -> 0, 'tunic.capitol_1.hall' -> 1, ...,
#   'tunic.wildlife.center' -> 18
room_fqids_map = dict(zip(ROOM_FQIDS, range(len(ROOM_FQIDS))))

In [12]:
# Building names: the second dot-separated component of the known room_fqid values.
BUILDINGS = [
    'capitol_0',
    'capitol_1',
    'capitol_2',
    'drycleaner',
    'flaghouse',
    'historicalsociety',
    'humanecology',
    'kohlcenter',
    'library',
    'wildlife',
]
BUILDINGS.sort()

# building name -> integer code (index in BUILDINGS)
building_map = {name: code for code, name in enumerate(BUILDINGS)}

In [13]:
# Room names: the third dot-separated component of the known room_fqid values.
ROOMS = sorted((
    'basement',
    'cage',
    'center',
    'closet',
    'closet_dirty',
    'collection',
    'collection_flag',
    'entry',
    'frontdesk',
    'hall',
    'halloffame',
    'microfiche',
    'stacks',
))

# room name -> integer code (index in ROOMS)
rooms_map = dict(zip(ROOMS, range(len(ROOMS))))

In [2]:
# Known values of the `fqid` column; '' stands in for a missing fqid.
# The literal is listed alphabetically and sorted again for safety, because the
# integer code of each value is its position in the sorted list.
FQIDS = sorted((
    '',
    'archivist',
    'archivist_glasses',
    'block',
    'block_0',
    'block_1',
    'block_badge',
    'block_badge_2',
    'block_magnify',
    'block_nelson',
    'block_tocollection',
    'block_tomap1',
    'block_tomap2',
    'boss',
    'businesscards',
    'businesscards.card_0.next',
    'businesscards.card_1.next',
    'businesscards.card_bingo.bingo',
    'businesscards.card_bingo.next',
    'ch3start',
    'chap1_finale',
    'chap1_finale_c',
    'chap2_finale',
    'chap2_finale_c',
    'chap4_finale_c',
    'coffee',
    'colorbook',
    'confrontation',
    'crane_ranger',
    'cs',
    'directory',
    'directory.closeup.archivist',
    'door_block_clean',
    'door_block_talk',
    'doorblock',
    'expert',
    'flag_girl',
    'fox',
    'glasses',
    'gramps',
    'groupconvo',
    'groupconvo_flag',
    'intro',
    'janitor',
    'journals',
    'journals.hub.topics',
    'journals.pic_0.next',
    'journals.pic_1.next',
    'journals.pic_2.bingo',
    'journals.pic_2.next',
    'journals_flag',
    'journals_flag.hub.topics',
    'journals_flag.hub.topics_old',
    'journals_flag.pic_0.bingo',
    'journals_flag.pic_0.next',
    'journals_flag.pic_0_old.next',
    'journals_flag.pic_1.bingo',
    'journals_flag.pic_1.next',
    'journals_flag.pic_1_old.next',
    'journals_flag.pic_2.bingo',
    'journals_flag.pic_2.next',
    'journals_flag.pic_2_old.next',
    'key',
    'lockeddoor',
    'logbook',
    'logbook.page.bingo',
    'magnify',
    'need_glasses',
    'notebook',
    'outtolunch',
    'photo',
    'plaque',
    'plaque.face.date',
    'reader',
    'reader.paper0.next',
    'reader.paper0.prev',
    'reader.paper1.next',
    'reader.paper1.prev',
    'reader.paper2.bingo',
    'reader.paper2.next',
    'reader.paper2.prev',
    'reader_flag',
    'reader_flag.paper0.next',
    'reader_flag.paper0.prev',
    'reader_flag.paper1.next',
    'reader_flag.paper1.prev',
    'reader_flag.paper2.bingo',
    'reader_flag.paper2.next',
    'reader_flag.paper2.prev',
    'remove_cup',
    'report',
    'retirement_letter',
    'savedteddy',
    'seescratches',
    'teddy',
    'tobasement',
    'tocage',
    'tocloset',
    'tocloset_dirty',
    'tocollection',
    'tocollectionflag',
    'toentry',
    'tofrontdesk',
    'togrampa',
    'tohallway',
    'tomap',
    'tomicrofiche',
    'tostacks',
    'tracks',
    'tracks.hub.deer',
    'trigger_coffee',
    'trigger_scarf',
    'tunic',
    'tunic.capitol_0',
    'tunic.capitol_1',
    'tunic.capitol_2',
    'tunic.drycleaner',
    'tunic.flaghouse',
    'tunic.historicalsociety',
    'tunic.hub.slip',
    'tunic.humanecology',
    'tunic.kohlcenter',
    'tunic.library',
    'tunic.wildlife',
    'unlockdoor',
    'wells',
    'wellsbadge',
    'what_happened',
    'worker',
))

# fqid -> integer code (index in FQIDS)
fqids_map = dict(zip(FQIDS, range(len(FQIDS))))

In [5]:
fqids_map['worker']

128

In [15]:
train_df['text'].head(3)

0                        undefined
1    Whatcha doing over there, Jo?
2           Just talking to Teddy.
Name: text, dtype: object

In [16]:
TEXT = sorted(train_df['text'].fillna("").unique().tolist())

text_map = {r: i for i, r in enumerate(TEXT)}

In [17]:
train_df['text_fqid'].head(3)

0                 tunic.historicalsociety.closet.intro
1    tunic.historicalsociety.closet.gramps.intro_0_...
2    tunic.historicalsociety.closet.gramps.intro_0_...
Name: text_fqid, dtype: object

In [18]:
train_df['text_fqid'][1]

'tunic.historicalsociety.closet.gramps.intro_0_cs_0'

In [19]:
TEXT_FQID = sorted(train_df['text_fqid'].fillna("").unique().tolist())

text_fqid_map = {r: i for i, r in enumerate(TEXT_FQID)}

In [20]:
len(text_fqid_map)

127

In [21]:
preprocess_info = {
    'EVENT_NAMES': EVENT_NAMES,
    'event_name_map': event_name_map,
    'name_map': name_map,
    'level_group_map': level_group_map,
    'BUILDINGS': BUILDINGS,
    'building_map': building_map,
    'ROOMS': ROOMS,
    'rooms_map': rooms_map,
    'FQIDS': FQIDS,
    'fqids_map': fqids_map,
    'TEXT': TEXT,
    'text_map': text_map,
    'TEXT_FQID': TEXT_FQID,
    'text_fqid_map': text_fqid_map,
    'ROOM_FQIDS': ROOM_FQIDS,
    'room_fqids_map': room_fqids_map,
}

In [22]:
# Persist every lookup list and mapping to preprocess_info.json.
with open('preprocess_info.json', "w") as f:
    json.dump(preprocess_info, f)

### Create Preprocessing Functions

In [23]:
# while Numba’s main use case is Just-in-Time compilation, it also provides a facility for Ahead-of-Time compilation (AOT).


In [24]:
# generate an extension module named JoWilder_preprocess_functions. Depending on your platform, the actual filename may be - 
# - JoWilder_preprocess_functions.so, JoWilder_preprocess_functions.pyd, JoWilder_preprocess_functions.cpython-34m.so, etc.
cc = CC('JoWilder_preprocess_functions')

In [25]:
# Decorator notes:
# - @cc.export(name, signature) registers the function for ahead-of-time
#   compilation into the extension module. 'Tuple((i8, i8, i8, i8))(i8)' means
#   one 64-bit integer argument and a tuple of four 64-bit integers returned.
# - @numba.jit marks the function for Numba's JIT compiler:
#     nopython=True  -> compile in "nopython" mode,
#     nogil=True     -> release the GIL while the compiled code runs,
#     parallel=False -> no automatic parallelization.

@cc.export('session_id_parser', 'Tuple((i8, i8, i8, i8))(i8)')
@numba.jit(nopython=True, nogil=True, parallel=False)
def session_id_parser(session_id):
    '''
    Decode year, month, weekday and hour from a numeric session_id.

    The id is read as decimal digit groups, from the right:
    2 digits random suffix, 3 digits milliseconds, 2 digits seconds,
    2 digits minutes, 2 digits hour, 2 digits weekday, 2 digits month,
    2 digits year. The month group is incremented by one before it is
    returned (presumably it is stored zero-based -- TODO confirm).

    Returns (session_year, session_month, session_weekday, session_hour).
    '''
    # Drop the nine trailing digits that are not returned:
    # random suffix (2) + milliseconds (3) + seconds (2) + minutes (2).
    remaining = session_id // 1000000000

    session_hour = remaining % 100
    remaining = remaining // 100

    session_weekday = remaining % 100
    remaining = remaining // 100

    session_month = remaining % 100 + 1
    remaining = remaining // 100

    session_year = remaining % 100

    return session_year, session_month, session_weekday, session_hour

In [26]:
@numba.jit(nopython=True, nogil=True, error_model='numpy', parallel=False)
def get_splits(a):
    '''
    Return the boundaries of the runs of equal consecutive values in ``a``.

    Every element (except the first) is compared with its predecessor; the
    positions where the value changes are run starts. Index 0 and ``len(a)``
    are always included, so consecutive pairs of the result delimit the runs.

    Example: a = [5, 5, 7, 7, 7, 9] -> [0, 2, 5, 6]
    (runs a[0:2], a[2:5] and a[5:6]).

    Usage e.g. to find where each "session id" starts in a column in which
    the ids of one session are stored contiguously.
    '''
    edge = np.array([True])
    # True wherever an element differs from the one before it
    changed = a[1:] != a[:-1]
    boundary_mask = np.concatenate((edge, changed, edge))
    return np.where(boundary_mask)[0]

In [27]:
def generate_lookup_mapper_function(lookup, function_name, default_value_str, output_type_str):
    '''
    Generate the source code of a Numba function that maps strings to integers.

    The generated function takes a 2D uint32 array with one string per row and
    one unicode code point per column, zero padded on the right (i.e. a numpy
    fixed-width ``<U..`` array viewed as uint32 and reshaped to
    ``(n_strings, n_chars)``). It returns one integer per row.

    The generated body is a nest of ``if``/``elif`` tests on single characters:
    at every level the first character position at which the remaining
    candidate keys differ is tested, until one key is left.

    Caveats of the generated function:
    - Only the characters needed to tell the known keys apart are tested, so an
      input that is not a key but agrees with a key on every tested position is
      given that key's value. The default value is produced only when a tested
      character matches none of the branches.
    - The input must have at least as many columns as the highest tested
      character position; the generated code indexes ``v[position]`` directly.

    Parameters
    ----------
    lookup : dict
        Maps each known string to its integer value. Needs at least two keys.
    function_name : str
        Name of the generated function (also used as its ``cc.export`` name).
    default_value_str : str
        Source text of the value assigned to rows that match no branch.
    output_type_str : str
        Source text of the numpy dtype of the output; only 'np.int64' is supported.

    Returns
    -------
    str
        Python source of the decorated function. It references ``cc``,
        ``numba`` and ``np``, which must exist in the namespace where the
        source is executed.

    Raises
    ------
    ValueError
        If ``lookup`` has fewer than two keys, or if two keys cannot be told
        apart after conversion to a fixed-width numpy string array.
    KeyError
        If ``output_type_str`` is not a supported type.
    '''
    lookup_keys_list = sorted(lookup)
    if len(lookup_keys_list) < 2:
        raise ValueError(
            "lookup needs at least two keys to build the character tests, "
            f"got {len(lookup_keys_list)}"
        )

    # Numba signature type matching the requested numpy output dtype.
    cc_type = {'np.int64':'i8'}[output_type_str]

    # One row per key, one uint32 code point per character; shorter keys are
    # zero padded up to the length of the longest key.
    key_codes = np.array(lookup_keys_list)
    key_codes = key_codes.view(np.uint32).reshape((key_codes.shape[0], -1))
    num_chars = key_codes.shape[1]

    src = [f"""
@cc.export('{function_name}', '{cc_type}[::1](u4[:, ::1])')
@numba.jit(nopython=True, nogil=True)
def {function_name}(data):

  n = data.shape[0]
  out = np.full(n, {default_value_str}, dtype={output_type_str})

  for i in range(n):

    v = data[i]
"""]

    def run_boundaries(column):
        # Start index of every run of equal consecutive values, plus len(column).
        changed = column[1:] != column[:-1]
        return np.flatnonzero(np.concatenate(([True], changed, [True])))

    def emit_branches(i, s, e, num_indents):
        # Append the tests that separate keys s..e-1, starting at character i.
        # The keys are sorted, so keys sharing a character form contiguous runs.

        # Skip positions where all candidates carry the same character. This is
        # a loop rather than recursion so that long shared prefixes do not
        # consume the recursion limit.
        while True:
            if i >= num_chars:
                raise ValueError(
                    f"keys {lookup_keys_list[s:e]!r} cannot be told apart "
                    "character by character"
                )
            splits = run_boundaries(key_codes[s:e, i]) + s
            if splits.shape[0] > 2:
                break
            i += 1

        indent = "  " * num_indents

        for j in range(splits.shape[0] - 1):
            group_start = int(splits[j])
            group_end = int(splits[j + 1])

            keyword = "if" if j == 0 else "elif"
            # int() keeps numpy scalar formatting out of the generated source.
            char_code = int(key_codes[group_start, i])
            src.append(f"{indent}{keyword} v[{i}] == {char_code}:")

            if group_end - group_start == 1:
                # A single key is left: emit its value.
                output_value = lookup[lookup_keys_list[group_start]]
                src.append(f"{indent}  out[i] = {output_value}")
            else:
                # Several keys share this character: test later characters.
                emit_branches(i + 1, group_start, group_end, num_indents + 1)

    # Two indent units (4 spaces) place the first tests inside the generated
    # `for i in range(n):` loop.
    emit_branches(0, 0, key_codes.shape[0], 2)

    src.append("  return out")

    return "\n".join(src)

In [28]:
# Generate and define one `<lookup_name>_to_number` function per lookup table.
for lookup_name in ['fqids', 'room_fqids', 'name', 'event_name', 'text', 'text_fqid', 'rooms', 'building']:

    lookup_table = preprocess_info[f'{lookup_name}_map']

    # One past the largest code in the table, e.g. 129 for fqids_map whose
    # values run from 0 to 128; used as the value for input that matches no branch.
    max_value = 1 + max(lookup_table.values())

    print(lookup_name + '_map', max_value)

    func_src = generate_lookup_mapper_function(
        lookup_table,
        function_name=f'{lookup_name}_to_number',
        default_value_str=str(max_value),
        output_type_str='np.int64',
    )

    # Execute the generated source at notebook level so the function becomes a
    # global; its @cc.export decorator also registers it with `cc`.
    exec(func_src)
    #break # my_ break

fqids_map 129
room_fqids_map 19
name_map 6
event_name_map 11
text_map 598
text_fqid_map 127
rooms_map 13
building_map 10


In [29]:
# The complete generated function is available in the file "utils/func_src.py".

# func_src  =>
# @cc.export('fqids_to_number', 'i8[::1](u4[:, ::1])')
# @numba.jit(nopython=True, nogil=True)
# def fqids_to_number(data):

#   n = data.shape[0]
#   out = np.full(n, 129, dtype=np.int64) # return a new array of shape as 'n' and filled with 129.

#   for i in range(n):

#     v = data[i]

#     if v[0] == 0: # v[0] is the unicode of 0th character of a string from fqids_map keys.
#       out[i] = 0 # 0 is the value of key present in fqids_map dict.
#     elif v[0] == 97:
#       if v[9] == 0:
#         out[i] = 1
#       elif v[9] == 95:
#         out[i] = 2
#     elif v[0] == 98:
#     ...
#     elif v[0] == 119:
#       if v[1] == 101:
#         if v[5] == 0:
#           out[i] = 125
#         elif v[5] == 98:
#           out[i] = 126
#       elif v[1] == 104:
#         out[i] = 127
#       elif v[1] == 111:
#         out[i] = 128
#   return out

In [30]:
@cc.export('get_building_and_room', 'Tuple((i8[::1], i8[::1]))(u4[:, ::1])')
@numba.jit(nopython=True, nogil=True, parallel=False)
def get_building_and_room(room_fqid):
    '''
    Split encoded room_fqid strings into a building code and a room code.

    ``room_fqid`` is a 2D uint32 array with one string per row and one
    unicode code point per column. The first six characters of each row are
    skipped (every known room_fqid starts with 'tunic.'); the text up to the
    next '.' is passed to ``building_to_number`` and the text after that '.'
    to ``rooms_to_number``.

    Returns (building codes, room codes), two int64 arrays with one entry per row.

    NOTE(review): if a row has no '.' after the prefix, the room slice is
    empty -- verify how ``rooms_to_number`` behaves for such input.
    '''
    num_rows, num_chars = room_fqid.shape

    building_codes = np.empty(num_rows, dtype=np.int64)
    room_codes = np.empty(num_rows, dtype=np.int64)

    for row in range(num_rows):

        chars = room_fqid[row]

        # Position of the '.' (code point 46) that ends the building name;
        # stays at the row width when no '.' is found.
        building_end = num_chars
        for pos in range(6, num_chars):
            if chars[pos] == 46:
                building_end = pos
                break

        # The lookup functions work on 2D input, so pass one-row slices.
        building_codes[row:row + 1] = building_to_number(room_fqid[row:(row + 1), 6:building_end])
        room_codes[row:row + 1] = rooms_to_number(room_fqid[row:(row + 1), (building_end + 1):])

    return building_codes, room_codes

In [31]:
@cc.export('calculate_text_length', 'f8[::1](u4[:, ::1])')
@numba.jit(nopython=True, nogil=True)
def calculate_text_length(data):
    '''
    Return the length of every encoded string in ``data`` as float64.

    ``data`` is a 2D uint32 array with one string per row and one unicode
    code point per column, zero padded on the right. The length of a row is
    the position of its first zero; a row without any zero has the full row
    width, and an empty string (zero in the first column) yields NaN.
    '''
    num_rows, num_chars = data.shape

    # Start from the full width: correct for rows that contain no padding.
    lengths = np.full(num_rows, num_chars, dtype=np.float64)

    for row in range(num_rows):
        chars = data[row]
        for pos in range(num_chars):
            if chars[pos] != 0:
                continue
            # First padding character found.
            if pos == 0:
                lengths[row] = np.nan
            else:
                lengths[row] = pos
            break

    return lengths

In [32]:
cc.verbose = False
cc.compile()

### Perform Preprocessing

In [133]:
train_df['session_id'].head(3)

0    20090312431273200
1    20090312431273200
2    20090312431273200
Name: session_id, dtype: int64

In [136]:
len(train_df['session_id'][0].astype(str))

17

In [119]:
train_df['session_id_str'] = train_df['session_id'].astype(str)

# (column, first digit, one past last digit, dtype) of every field encoded in
# the session_id string.
# NOTE(review): the fixed positions assume every id has the same number of
# digits (17 in the sample shown above) -- confirm for the full data set.
SESSION_ID_FIELDS = [
    ("session_year", 0, 2, np.uint8),
    ("session_month", 2, 4, np.uint8),
    ("session_weekday", 4, 6, np.uint8),
    ("session_hour", 6, 8, np.uint8),
    ("session_minute", 8, 10, np.uint8),
    ("session_second", 10, 12, np.uint8),
    ("session_ms", 12, 15, np.uint16),
    ("session_rnd", 15, 17, np.uint8),
]

for field_name, first_digit, end_digit, field_dtype in SESSION_ID_FIELDS:
    train_df[field_name] = train_df["session_id_str"].str[first_digit:end_digit].astype(field_dtype)

# The month digits are shifted up by one.
train_df["session_month"] += 1

In [120]:
# Label ids look like '<session_id>_q<question number>': split each id once
# and reuse the parts for both derived columns.
label_id_parts = train_labels_df['session_id'].str.split("_")
train_labels_df['session_id_base'] = label_id_parts.str[0]
# Drop the leading character of the second part and keep the number.
train_labels_df['session_id_question'] = label_id_parts.str[1].str[1:].astype(int)
# Same digit positions as used for the session_id of the training events.
train_labels_df["session_year"] = train_labels_df["session_id_base"].str[:2].astype(np.uint8)
train_labels_df["session_month"] = train_labels_df["session_id_base"].str[2:4].astype(np.uint8) + 1
train_labels_df["session_weekday"] = train_labels_df["session_id_base"].str[4:6].astype(np.uint8)

In [121]:
train_df['numerical_level_group'] = train_df['level_group'].map(level_group_map)

In [122]:
%%time
# temp = train_df['event_name'].values.astype('<U18')
# train_df['numerical_event_name'] = JoWilder_preprocess_functions.event_name_to_number(temp.view(np.uint32).reshape((temp.shape[0], -1)))

train_df['numerical_event_name'] = train_df['event_name'].map(event_name_map)

CPU times: user 927 ms, sys: 40.2 ms, total: 967 ms
Wall time: 965 ms


In [123]:
%%time
# temp = train_df['name'].values.astype('<U9')
# train_df['numerical_name'] = JoWilder_preprocess_functions.name_to_number(temp.view(np.uint32).reshape((temp.shape[0], -1)))

train_df['numerical_name'] = train_df['name'].map(name_map)

CPU times: user 846 ms, sys: 36.1 ms, total: 882 ms
Wall time: 878 ms


In [124]:
train_df['numerical_room_fqid'] = train_df['room_fqid'].map(room_fqids_map)

In [125]:
train_df['numerical_room_fqid_building'] = train_df['room_fqid_building'].map(building_map)

In [126]:
train_df['numerical_room_fqid_room'] = train_df['room_fqid_room'].map(rooms_map)

In [35]:
train_df['fqid'].head(4)

0     intro
1    gramps
2    gramps
3    gramps
Name: fqid, dtype: object

In [44]:
# temp = train_df['fqid'].values.astype('<U30')
### temp[0:5] => ['intro' 'gramps' 'gramps' 'gramps' 'gramps']
### temp.shape => (26296946,)
### temp.view(np.uint32).shape => (788908380,)
### temp.view(np.uint32).reshape((temp.shape[0], -1)).shape => (26296946, 30)
#train_df['numerical_fqid'] = JoWilder_preprocess_functions.fqids_to_number(temp.view(np.uint32).reshape((temp.shape[0], -1)))

train_df['numerical_fqid'] = train_df['fqid'].fillna('').map(fqids_map)

In [39]:
train_df['numerical_fqid'].head(3)

0    42
1    39
2    39
Name: numerical_fqid, dtype: int64

In [128]:
train_df['numerical_text'] = train_df['text'].fillna('').map(text_map)

In [129]:
train_df['numerical_text_fqid'] = train_df['text_fqid'].fillna('').map(text_fqid_map)

### Save Preprocessed Data

In [130]:
# Sort the events by session year, month and weekday, then by session, level
# group and elapsed time; the original `index` column breaks remaining ties.
train_df.sort_values(['session_year', 'session_month', 'session_weekday', 'session_id', 'numerical_level_group', 'elapsed_time', 'index'], inplace=True)
# Sort the labels by the same session date fields, then by session and question number.
train_labels_df.sort_values(['session_year', 'session_month', 'session_weekday', 'session_id_base', 'session_id_question'], inplace=True)

In [131]:
train_df.reset_index(inplace=True, drop=True)
train_labels_df.reset_index(inplace=True, drop=True)

In [132]:
train_df.to_pickle('preprocessed_train_df.pkl')
train_labels_df.to_pickle('preprocessed_train_labels_df.pkl')