In [None]:
# Original import order is kept on purpose: `from fx import *` is a star import,
# so moving it relative to the explicit imports could change which names win.
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from fx import *
from joblib import load
import pandas as pd
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import gc
import os

In [None]:
# Directory that holds the pickled models. Set the AICH_MODEL_DIR environment
# variable to point somewhere else (e.g. a repo-relative 'models' folder);
# the default is the original author's local OneDrive location.
MODEL_DIR = os.environ.get(
    'AICH_MODEL_DIR',
    '/Users/ra/Library/CloudStorage/OneDrive-HochschuleLuzern/AICH/models',
)
# m1 is the first-stage model ('pred_worn'), m2 the second-stage model ('pred_awake').
M1_PATH = os.path.join(MODEL_DIR, 'AAAA.pkl')
M2_PATH = os.path.join(MODEL_DIR, 'BBBB.pkl')


def load_model(name: str) -> RandomForestClassifier:
    """Deserialize the model stored in the file at ``name`` and return it.

    The file is opened in binary mode and passed to ``load`` (imported from
    joblib). The return annotation is not enforced: whatever object the file
    contains is returned as-is.

    joblib deserialization is pickle-based and can execute arbitrary code, so
    only point this at model files from a trusted source.
    """
    with open(name, 'rb') as model_file:
        model = load(model_file)
    return model

# Load both models once, before the batch loop.
# m1's predictions are stored as 'pred_worn'; m2 is then run only on the rows
# where pred_worn == 1 and its predictions are stored as 'pred_awake'.
m1 = load_model(M1_PATH)
m2 = load_model(M2_PATH)

In [None]:
from warnings import filterwarnings
# Suppresses every warning category for the rest of the kernel session,
# including ones raised by pandas/sklearn inside the prediction loop.
filterwarnings('ignore')

# First argument handed to methodFromJonas; presumably the number of rows per batch — TODO confirm.
BATCH_SIZE = 1_000_000
# NOTE(review): the name says "test" but the path points at train_series.parquet — confirm this is intended.
TEST_DATA_PATH = '../data/train_series.parquet'
# Not referenced anywhere in the visible cells.
DEBUG = False
# Empty accumulator with the submission columns; per-batch results are appended to it in the loop below.
submission_df = pd.DataFrame(columns=['series_id','step','event','score'])


# Build the series_id -> integer lookup once per kernel session; re-running the
# cell reuses the existing mapping instead of re-reading the parquet file.
# After changing TEST_DATA_PATH, `del series_id_mapping` (or restart the kernel)
# so the mapping is rebuilt from the new file.
if 'series_id_mapping' not in vars():
    # Read only the series_id column to keep memory usage down.
    series_id_table = ds.dataset(TEST_DATA_PATH).to_table(columns=['series_id'])
    unique_series_ids = series_id_table.to_pandas()['series_id'].unique()
    # Release the full column before the prediction loop starts.
    del series_id_table
    gc.collect()
    # Bind the name only after the read succeeded. Creating the dict up front
    # would leave an empty mapping behind if the read failed, and the guard
    # above would then skip the rebuild on the next run.
    series_id_mapping = {
        series_id: idx for idx, series_id in enumerate(unique_series_ids)
    }
    del unique_series_ids


# Two-stage prediction over the data file, one batch at a time:
#   1. m1 predicts 'pred_worn' for every row,
#   2. m2 predicts 'pred_awake' for the rows where pred_worn == 1,
#   3. heuristic_function turns the predictions into submission rows.
# A failing batch is skipped (best effort), but the failure is recorded and
# reported instead of being silently discarded.

# Inverse lookup (integer -> original series_id); loop-invariant, so built once.
reverse_mapped = {idx: series_id for series_id, idx in series_id_mapping.items()}
SUBMISSION_COLUMNS = ['series_id', 'step', 'event', 'score']
batch_results = []   # per-batch submission rows, concatenated once after the loop
failed_batches = []  # (batch index, exception) for every batch that was skipped

for batch_idx, batch in enumerate(methodFromJonas(BATCH_SIZE, TEST_DATA_PATH)):
    try:
        # map series_id to int
        batch['series_id'] = batch['series_id'].map(series_id_mapping)
        # predict wearable_on
        batch['pred_worn'] = m1.predict(batch)
        # Keep only the rows predicted as worn. .copy() makes this an independent
        # frame so the column assignments below do not write into a slice.
        batch = batch[batch['pred_worn'] == 1].copy()
        if batch.empty:
            # Nothing left for the second stage in this batch.
            continue
        # predict awake
        # NOTE(review): batch still carries the 'pred_worn' column here, so m2
        # receives one more column than m1 did — confirm m2 was trained that way.
        batch['pred_awake'] = m2.predict(batch)
        # undo mapping
        batch['series_id'] = batch['series_id'].map(reverse_mapped)
        # use heuristic function
        pre_sub = heuristic_function(batch)
        batch_results.append(pre_sub[SUBMISSION_COLUMNS])
    except Exception as exc:
        # Printed rather than warned: warnings are globally silenced in this notebook.
        failed_batches.append((batch_idx, exc))
        print(f'batch {batch_idx} skipped: {exc!r}')

# Single concat instead of growing the frame inside the loop.
if batch_results:
    submission_df = pd.concat([submission_df, *batch_results])
if failed_batches:
    print(f'{len(failed_batches)} batch(es) failed and are missing from submission_df')


In [None]:
# Replace the concatenated per-batch index with a fresh 0..n-1 index and expose
# it as the 'row_id' column. Any existing 'row_id' column is dropped first so
# re-running this cell does not fail with "cannot insert row_id, already exists".
submission_df = (
    submission_df
    .drop(columns='row_id', errors='ignore')
    .reset_index(drop=True)
    .reset_index(names="row_id")
)

In [None]:
# Preview the first rows of the submission frame as a sanity check before writing it out.
submission_df.head()

In [None]:
# Write the submission to 'submission.csv' in the current working directory:
# the DataFrame index is omitted, encoding is UTF-8, and lines end with '\n'
# regardless of platform.
submission_df.to_csv('submission.csv', index=False, encoding='utf-8', lineterminator='\n')