In [1]:
import sys

sys.path.append('/kaggle/input/predict-student-performance-from-game-play')
import jo_wilder_310

In [2]:
from cffi import FFI
import json
import lightgbm as lgb
import numpy as np
import pandas as pd

# Shared cffi handle; generate_features() uses it to expose NumPy buffers as
# C array pointers for the compiled JoWilder_C_features library.
ffi = FFI()



In [3]:
# Directory holding preprocess_info.json.
PREPROCESSED_DIR = '/kaggle/input/student-utils/student-utils/student-utils/'
# Directory added to sys.path for the JoWilder_* helper modules; also holds
# FEATURES_GENERATION_INFO.json.
FEATURE_GEN_CODE_DIR = '/kaggle/input/student-utils/student-utils/student-utils/'
# Directory holding the LightGBM model dump and FEATURE_MASK.npy.
# Note the trailing slash: file names are appended to it directly.
MODEL_DIR = '/kaggle/input/student-utils/'

In [4]:
# Use `_imp` to list the extension-module suffixes this Python version can import:
#import _imp
#_imp.extension_suffixes()
# ['.cpython-310-x86_64-linux-gnu.so', '.abi3.so', '.so']

# If your .so file's suffix is not in that list, remove the CPython-specific tag
# from the file name and keep only the plain `.so` extension.

In [5]:
sys.path.append(FEATURE_GEN_CODE_DIR)

import JoWilder_numba_features
import JoWilder_C_features

In [6]:
# sys.path.append(PREPROCESSED_DIR)

import JoWilder_preprocess_functions

In [7]:
# Read the preprocessing metadata stored as JSON under PREPROCESSED_DIR.
with open(PREPROCESSED_DIR + "preprocess_info.json", "r") as info_file:
    preprocess_info = json.load(info_file)

In [8]:
# Maps a `level_group` value from the test data to the integer index used to
# look up the per-level question tables.
level_group_map = preprocess_info['level_group_map']

# Number of questions predicted for each level group, indexed by level-group index.
questions_per_level = np.array([3, 10, 5])
# Cumulative offsets derived from the counts: level group i covers question
# numbers [splits[i], splits[i + 1]).
questions_splits_per_level = np.concatenate(([0], np.cumsum(questions_per_level)))

In [9]:
# Read the feature-generation metadata stored as JSON next to the helper modules.
with open(FEATURE_GEN_CODE_DIR + 'FEATURES_GENERATION_INFO.json', 'r') as info_file:
    FEATURES_GENERATION_INFO = json.load(info_file)

# Column count of the feature matrix built in generate_features().
NUM_FEATURES = FEATURES_GENERATION_INFO['NUM_FEATURES']
# Length of the per-session history buffer allocated in the inference loop.
HISTORY_LEN = FEATURES_GENERATION_INFO['HISTORY_LEN']

In [10]:
# Decision threshold: a model output above this value is submitted as correct (1).
# Reading it from MODEL_INFO.json is disabled; the value is hard-coded instead.
# with open(MODEL_DIR + 'MODEL_INFO.json', 'r') as f:
#     MODEL_INFO = json.loads(f.read())

THRESHOLD = 0.63  # was: MODEL_INFO['THRESHOLD']

In [11]:
# Echo the loaded configuration values as the cell output.
NUM_FEATURES, HISTORY_LEN, THRESHOLD

(1590, 3183, 0.63)

In [12]:
# Load the trained LightGBM booster from its text dump.
# MODEL_DIR already ends with '/', so the file name is appended directly (as is
# done for the other artifacts) instead of inserting a second separator.
model = lgb.Booster(model_file=MODEL_DIR + 'model_0_0.txt')

In [13]:
def generate_features(
    session_id,
    level_group_index,
    elapsed_time,
    event_name,
    name,
    level,
    hover_duration,
    session_weekday,
    building,
    room,
    fqids,
    session_event_index,
    room_coor_x,
    room_coor_y,
    screen_coor_x,
    screen_coor_y,
    text_length,
    text_numerical,
    text_fqid_numerical,
    room_fqid_numerical,
    page,
    session_hour,
    hist,
):
    """Build the feature matrix for the questions of one level group.

    A NaN-filled float32 matrix is allocated and handed, in this order, to
    ``JoWilder_numba_features.process_single``,
    ``JoWilder_C_features.lib.fill_history`` (through ``hist``) and
    ``JoWilder_C_features.lib.add_features_batch``. Those routines are
    expected to write into the buffers in place; nothing they return is used.

    Parameters
    ----------
    session_id
        Not used by this function; kept so existing callers keep working.
    level_group_index : int
        Index into ``questions_per_level`` / ``questions_splits_per_level``.
    elapsed_time, event_name, name, level, building, room, fqids,
    session_event_index, text_length, text_numerical, text_fqid_numerical,
    room_fqid_numerical : numpy.ndarray
        Per-event arrays. They are exposed to the C library as ``long[]``.
        cffi reinterprets the raw buffer without converting it, so the
        element size must match a C ``long`` (presumably int64 on the Kaggle
        Linux image -- TODO confirm for the preprocess outputs).
    hover_duration, room_coor_x, room_coor_y, screen_coor_x, screen_coor_y,
    page : numpy.ndarray
        Per-event arrays exposed to the C library as ``double[]``.
    session_weekday, session_hour : numpy.ndarray
        Per-event arrays consumed only by the numba routine. No C pointer is
        created for them: the C calls below take no such argument.
    hist : numpy.ndarray
        Per-session history buffer, exposed as ``double[]`` and passed to all
        three routines. The caller keeps it between level groups.

    Returns
    -------
    features : numpy.ndarray
        float32 array of shape
        ``(questions_per_level[level_group_index], NUM_FEATURES)``.
    question_number : numpy.ndarray
        ``np.arange`` over the question numbers of this level group.
    """
    num_questions_in_level = questions_per_level[level_group_index]
    question_number = np.arange(
        questions_splits_per_level[level_group_index],
        questions_splits_per_level[level_group_index + 1],
    )

    # Start from NaN so any slot the feature routines leave untouched stays NaN.
    features = np.full((num_questions_in_level, NUM_FEATURES), np.nan, dtype=np.float32)

    # Wrap the NumPy buffers as C array pointers for the compiled library.
    # The wrappers share memory with the arrays, so writes on either side are
    # visible to the other.
    hist_pointer = ffi.from_buffer('double[]', hist)
    features_pointer = ffi.from_buffer('float[]', features)

    x_et_pointer = ffi.from_buffer('long[]', elapsed_time)
    x_en_pointer = ffi.from_buffer('long[]', event_name)
    x_n_pointer = ffi.from_buffer('long[]', name)
    x_b_pointer = ffi.from_buffer('long[]', building)
    x_r_pointer = ffi.from_buffer('long[]', room)
    x_fqids_pointer = ffi.from_buffer('long[]', fqids)
    x_l_pointer = ffi.from_buffer('long[]', level)
    x_index_pointer = ffi.from_buffer('long[]', session_event_index)
    x_text_numerical_pointer = ffi.from_buffer('long[]', text_numerical)
    x_text_fqid_numerical_pointer = ffi.from_buffer('long[]', text_fqid_numerical)
    x_room_fqid_numerical_pointer = ffi.from_buffer('long[]', room_fqid_numerical)

    x_hover_duration_pointer = ffi.from_buffer('double[]', hover_duration)
    x_rc_x_pointer = ffi.from_buffer('double[]', room_coor_x)
    x_rc_y_pointer = ffi.from_buffer('double[]', room_coor_y)
    x_sc_x_pointer = ffi.from_buffer('double[]', screen_coor_x)
    x_sc_y_pointer = ffi.from_buffer('double[]', screen_coor_y)
    x_page_pointer = ffi.from_buffer('double[]', page)

    # Step 1: numba routine, fed the NumPy arrays directly.
    JoWilder_numba_features.process_single(
        level_group_index,
        elapsed_time,
        event_name,
        name,
        hover_duration,
        session_weekday,
        building,
        room,
        fqids,
        level,
        session_event_index,
        room_coor_x,
        room_coor_y,
        screen_coor_x,
        screen_coor_y,
        text_length,
        text_numerical,
        text_fqid_numerical,
        room_fqid_numerical,
        page,
        session_hour,
        features,
        hist,
    )

    number_of_events = elapsed_time.shape[0]

    # Step 2: C routine that receives this batch's events and the history buffer.
    JoWilder_C_features.lib.fill_history(
        level_group_index,
        x_l_pointer,
        hist_pointer,
        x_index_pointer,
        x_text_numerical_pointer,
        x_en_pointer,
        x_n_pointer,
        x_fqids_pointer,
        x_et_pointer,
        x_rc_x_pointer,
        x_rc_y_pointer,
        x_sc_x_pointer,
        x_sc_y_pointer,
        x_b_pointer,
        x_r_pointer,
        x_room_fqid_numerical_pointer,
        x_text_fqid_numerical_pointer,
        x_page_pointer,
        x_hover_duration_pointer,
        number_of_events,
    )

    # Step 3: C routine that receives the feature matrix and the history buffer.
    JoWilder_C_features.lib.add_features_batch(
        level_group_index,
        features_pointer,
        hist_pointer,
        num_questions_in_level
    )

    return features, question_number

In [14]:
# Column selector applied to the generated feature matrix before prediction.
FEATURES_MASK = np.load(MODEL_DIR + 'FEATURE_MASK.npy')

In [15]:
def _as_codepoints(strings, width):
    """Return `strings` as a 2-D uint32 array with one row per string.

    The values are first cast to fixed-width little-endian unicode
    (`<U{width}`), then the same memory is viewed as uint32 and reshaped to
    one row per input string.
    """
    fixed = strings.astype(f'<U{width}')
    return fixed.view(np.uint32).reshape((fixed.shape[0], -1))


env = jo_wilder_310.make_env()
iter_test = env.iter_test()

# One history buffer per session_id, reused across the batches of that session.
history = {}

for (test_df, sample_submission) in iter_test:
    # Sort into a new frame instead of mutating the frame owned by the API.
    test_df = test_df.sort_values(by=['elapsed_time', 'index'])

    original_session_order = sample_submission['session_id'].values.copy()

    # Order the submission rows by the question number embedded in
    # `session_id` (the digits after the first character of the part following
    # '_'). Rows of the prediction vector are presumed to follow ascending
    # question number, so the two line up positionally.
    # Built with non-mutating operations so the later column assignment does
    # not act on a slice of the API's frame (SettingWithCopyWarning).
    question_ids = sample_submission['session_id'].str.split('_').str[1].str[1:].astype(int)
    sample_submission = (
        sample_submission[['session_id', 'correct']]
        .assign(questions=question_ids)
        .sort_values(by='questions')
        .drop(columns='questions')
    )

    #############################

    # Only the first row is read: this assumes every batch holds a single
    # session and a single level group.
    level_group = test_df['level_group'].values[0]
    session_id = test_df['session_id'].values[0]

    hist = history.get(session_id)
    if hist is None:
        # First batch of this session: start from an all-NaN history buffer.
        hist = np.full(HISTORY_LEN, np.nan, dtype=np.float64)
        history[session_id] = hist

    level_group_index = level_group_map[level_group]

    elapsed_time = test_df['elapsed_time'].values
    index = test_df['index'].values

    n = elapsed_time.shape[0]

    # String columns are encoded to numbers by the preprocess helpers, which
    # take the strings as fixed-width uint32 matrices.
    numerical_event_name = JoWilder_preprocess_functions.event_name_to_number(
        _as_codepoints(test_df['event_name'].values, 18)
    )
    numerical_name = JoWilder_preprocess_functions.name_to_number(
        _as_codepoints(test_df['name'].values, 9)
    )

    level = test_df['level'].values
    hover_duration = test_df['hover_duration'].values

    numerical_fqid = JoWilder_preprocess_functions.fqids_to_number(
        _as_codepoints(test_df['fqid'].fillna("").values, 30)
    )

    room_coor_x = test_df['room_coor_x'].values
    room_coor_y = test_df['room_coor_y'].values
    screen_coor_x = test_df['screen_coor_x'].values
    screen_coor_y = test_df['screen_coor_y'].values

    _, _, session_weekday, session_hour = JoWilder_preprocess_functions.session_id_parser(session_id)

    # Broadcast the per-session scalars to one value per event.
    session_weekday = np.full(n, session_weekday, dtype=np.uint8)
    session_hour = np.full(n, session_hour, dtype=np.uint8)

    # The same encoded text matrix feeds both the text encoder and the length helper.
    text_codes = _as_codepoints(test_df['text'].fillna("").values, 89)
    numerical_text = JoWilder_preprocess_functions.text_to_number(text_codes)
    text_length = JoWilder_preprocess_functions.calculate_text_length(text_codes)

    numerical_text_fqid = JoWilder_preprocess_functions.text_fqid_to_number(
        _as_codepoints(test_df['text_fqid'].fillna("").values, 71)
    )

    # The same encoded room_fqid matrix feeds both room helpers.
    room_fqid_codes = _as_codepoints(test_df['room_fqid'].fillna("").values, 39)
    numerical_room_fqid = JoWilder_preprocess_functions.room_fqids_to_number(room_fqid_codes)
    numerical_room_fqid_building, numerical_room_fqid_room = (
        JoWilder_preprocess_functions.get_building_and_room(room_fqid_codes)
    )

    page = test_df['page'].values

    features, question_number = generate_features(
        session_id,
        level_group_index,
        elapsed_time,
        numerical_event_name,
        numerical_name,
        level,
        hover_duration,
        session_weekday,
        numerical_room_fqid_building,
        numerical_room_fqid_room,
        numerical_fqid,
        index,
        room_coor_x,
        room_coor_y,
        screen_coor_x,
        screen_coor_y,
        text_length,
        numerical_text,
        numerical_text_fqid,
        numerical_room_fqid,
        page,
        session_hour,
        hist,
    )

    # Keep only the selected feature columns.
    features = features[:, FEATURES_MASK]

    p = model.predict(features)

    # Positional assignment: row i of the question-sorted submission receives
    # prediction i.
    sample_submission = sample_submission.assign(correct=(p > THRESHOLD).astype(int))

    #############################

    # Restore the row order the API handed over before submitting.
    sample_submission = sample_submission.set_index('session_id').reindex(original_session_order).reset_index()

    assert np.all(sample_submission['session_id'].values == original_session_order)

    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
