In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from joblib import load
import polars as pl
import kaggle_evaluation.jane_street_inference_server
import os
import gc
import lightgbm as lgb

In [None]:
# Custom R2 metric for CatBoost
class r2_cbt(object):
    """Weighted R² against a zero baseline, shaped as a CatBoost custom metric.

    ``evaluate`` accumulates two running sums over the samples it is given and
    ``get_final_error`` combines them as
    ``1 - sum(w * (pred - y)**2) / sum(w * y**2)``.

    NOTE(review): nothing else in this notebook references the class; it is
    presumably kept so that a model pickled with this metric can be loaded —
    confirm before removing.
    """

    def is_max_optimal(self):
        """Tell the caller that larger metric values are better."""
        return True

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted squared-error and squared-target sums.

        Args:
            approxes: Sequence holding exactly one indexable container of
                predictions.
            target: Indexable container of true values, same length as the
                predictions.
            weight: Indexable container of per-sample weights, or None to
                weight every sample by 1.0.

        Returns:
            Tuple ``(error_sum, weight_sum)`` where ``error_sum`` is
            ``sum(w * (pred - y)**2)`` and ``weight_sum`` is ``sum(w * y**2)``.
        """
        assert len(approxes) == 1
        raw_preds = approxes[0]
        assert len(target) == len(raw_preds)

        unweighted = weight is None
        residual_total = 0.0
        target_total = 0.0

        # Index-based access on purpose: the containers are only assumed to
        # support len() and [] here.
        for pos in range(len(raw_preds)):
            if unweighted:
                sample_w = 1.0
            else:
                sample_w = weight[pos]
            y_true = target[pos]
            target_total += sample_w * (y_true ** 2)
            residual_total += sample_w * ((raw_preds[pos] - y_true) ** 2)

        return residual_total, target_total

    def get_final_error(self, error, weight):
        """Turn the two accumulated sums into the final score.

        The tiny constant in the denominator guards against division by zero
        when the weighted squared-target sum is exactly 0.
        """
        return 1 - error / (weight + 1e-38)

In [None]:
# Alternative checkpoint (LightGBM with lag features), kept for reference:
# model = load('/kaggle/input/lgbm_v3_lags/scikitlearn/lgbm_v3_lags/1/model_lags2.pkl')

# NOTE: joblib's load() unpickles the file, which can execute arbitrary code —
# only point this at artifacts you trust.
MODEL_PATH = '/kaggle/input/catboost_v3/other/catboost_lagone_v2/1/model_catboost_lag_32_18.pkl'
model = load(MODEL_PATH)

In [None]:
class CONFIG:
    """Static settings shared by the inference cells."""

    seed = 42
    target_col = "responder_6"
    # Model inputs: feature_00 … feature_78 followed by the single lag column.
    # Earlier variants, kept for reference:
    #   ["symbol_id", "time_id"] + the 79 features + responder_{0..8}_lag_1
    #   the 79 features + responder_{0..8}_lag_1
    feature_cols = [*map("feature_{:02d}".format, range(79)), "responder_6_lag_1"]


# Column order handed to model.predict(): identifier columns first, then the features.
xgb_feature_cols = ["date_id", "time_id", "symbol_id", *CONFIG.feature_cols]

In [None]:
# Raw lags frame from the most recent batch that carried one (kept across calls).
lags_ : pl.DataFrame | None = None
# Join-ready lag table derived from `lags_`; reused by batches that arrive without lags.
_lag_features_ : pl.DataFrame | None = None


def _latest_lag_per_symbol(lags: pl.DataFrame) -> pl.DataFrame:
    """Keep the last row per (date_id, symbol_id) and only the columns the join needs."""
    return (
        lags
        .group_by(["date_id", "symbol_id"], maintain_order=True)
        .last()  # pick up last record of previous date
        .select("responder_6_lag_1", "date_id", "symbol_id")
    )


def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Score one batch of test rows and return clipped `responder_6` predictions.

    Args:
        test: Batch to score. Must contain `row_id` plus every column in
            `xgb_feature_cols` except `responder_6_lag_1`, which is added here.
        lags: Lagged responders for the batch's date, or None when the batch
            carries none. NOTE(review): per the competition template the
            gateway only passes lags on the first batch of a date
            (time_id == 0) — confirm against the evaluation API docs.

    Returns:
        Polars DataFrame with exactly the columns `row_id` and `responder_6`
        (Float64, clipped to [-5, 5]) and one row per row of `test`.
    """
    global lags_, _lag_features_
    if lags is not None:
        lags_ = lags
        _lag_features_ = _latest_lag_per_symbol(lags)

    # Remember the incoming batch size so the final check is against the
    # caller's frame, not the joined one.
    n_rows = len(test)

    if _lag_features_ is not None:
        # Use the cached lags for every batch, not only the one that delivered
        # them; previously batches with lags=None silently fell back to 0.0
        # for the lag feature. Symbols without a matching lag row stay null
        # after the left join.
        test = test.join(_lag_features_, on=["date_id", "symbol_id"], how="left")
    else:
        # No lags have been seen yet in this session: use a neutral placeholder.
        test = test.with_columns(pl.lit(0.0).alias('responder_6_lag_1'))

    preds = np.asarray(model.predict(test[xgb_feature_cols].to_pandas()), dtype=np.float64)
    print("predict> preds.shape =", preds.shape)

    # row_id is taken from the joined frame so it stays aligned with `preds`.
    predictions = test.select('row_id').with_columns(
        pl.Series(
            name='responder_6',
            values=np.clip(preds, a_min=-5, a_max=5),
            dtype=pl.Float64,
        )
    )

    # The predict function must return a DataFrame
    assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
    # with columns 'row_id', 'responder_6'
    assert list(predictions.columns) == ['row_id', 'responder_6']
    # and as many rows as the test data.
    assert len(predictions) == n_rows

    return predictions

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# Kaggle sets this variable during the scored rerun; otherwise replay the
# bundled sample files through a local gateway.
is_competition_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')

if not is_competition_rerun:
    local_inputs = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_inputs)
else:
    inference_server.serve()

In [None]:
# lags_ : pl.DataFrame | None = None

# def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
#     global lags_

#     # If lags is passed, group it and join it onto test
#     if lags is not None:
#         # Take the last value for each group
#             lags = lags.group_by(["date_id", "symbol_id"], maintain_order=True).last() 
#             test = test.join(lags, on=["symbol_id"], how="left", suffix = '_lags')
#     else:
#         # Add dummy zero-filled lag columns if lags is not passed
#         test = test.with_columns(
#             (pl.lit(0.0).alias(f'responder_{idx}_lag_1') for idx in range(9))
#         )
    
#     print(f"predict> test.shape =", test.shape)
    
#     test_ = test.drop('row_id', 'is_scored', 'weight', 'time_id_lags', 'date_id_lags').to_numpy().astype(np.float32)

    
#     preds = model.predict(test_)
#     print(f"predict> preds.shape =", preds.shape)
    
#     # Build the final DataFrame with row_id and the predictions
#     predictions = \
#         pl.DataFrame(test.select('row_id').with_columns(
#             pl.Series(
#                 name='responder_6', 
#                 values=preds,
#                 dtype=pl.Float32
#             )
#         ))
    
#     print(predictions)
    
#     # Sanity checks on the result
#     assert isinstance(predictions, (pl.DataFrame, pd.DataFrame)), "Predictions must be a DataFrame (polars or pandas)"
#     assert list(predictions.columns) == ['row_id', 'responder_6'], "Predictions must have columns ['row_id', 'responder_6']"
#     assert len(predictions) == len(test), "Predictions must have the same number of rows as the test DataFrame"

#     return predictions

In [None]:
# # Global variables
# model = None  # Global model instance


# def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
#     global model
#     if model is None:
#         model = load('/kaggle/input/lgbm_v1/scikitlearn/default_lgbm_v1/1/model.pkl')  # Load the model from file

#     # 
#     test_ = test.drop('row_id', 'is_scored', 'weight').to_numpy()
#     predictions = test.select(
#         'row_id',
#         pl.lit(model.predict(test_)-0.03).alias('responder_6'))

#     if isinstance(predictions, pl.DataFrame):
#         assert predictions.columns == ['row_id', 'responder_6']
#     elif isinstance(predictions, pd.DataFrame):
#         assert (predictions.columns == ['row_id', 'responder_6']).all()
#     else:
#         raise TypeError('The predict function must return a DataFrame')
#     # Confirm has as many rows as the test data.
#     assert len(predictions) == len(test)
#     return predictions

In [None]:
# path = '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet/date_id=0/part-0.parquet'
# test_ = pl.scan_parquet(path)
# test_.columns

In [None]:
# # Global variables
# model = None  # Global model instance
# lags_ : pl.DataFrame | None = None


# def predict(test: pl.DataFrame, lags: pl.DataFrame) -> pl.DataFrame | pd.DataFrame:
#     global model
#     if model is None:
#         model = load('/kaggle/input/lgbm_v3_lags/scikitlearn/lgbm_v3_lags/1/model_lags2.pkl')  # Load the model from file

#     global lags_
#     lags_ = lags  #[lags.columns[3:]]
#     # In test there are 3700 rows, in lags there are 39
#     lags_ = pl.concat([lags_1, test.shift(1).dropna()], how="vertical")
        
#     test_ = test.join(lags_, on = ['symbol_id', 'date_id'])
#     test_ = test_.drop('row_id', 'is_scored', 'weight','date_id_right','time_id_right')

    
#     predictions = test.select(
#         'row_id',
#         pl.lit(model.predict(test_.to_numpy().astype('float32'))).alias('responder_6'))

#     if isinstance(predictions, pl.DataFrame):
#         assert predictions.columns == ['row_id', 'responder_6']
#     elif isinstance(predictions, pd.DataFrame):
#         assert (predictions.columns == ['row_id', 'responder_6']).all()
#     else:
#         raise TypeError('The predict function must return a DataFrame')
#     # Confirm has as many rows as the test data.
#     assert len(predictions) == len(test)
#     return predictions

In [None]:
# lags_ : pl.DataFrame | None = None
    
# def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
#     global lags_
#     if lags is not None:
#         lags_ = lags
        
        
#     model = load('/kaggle/input/lgbm_v3_lags/scikitlearn/lgbm_v3_lags/1/model_lags2.pkl')
#     predictions = test.select(
#         'row_id',
#         pl.lit(0.0).alias('responder_6'),
#     )
#     symbol_ids = test.select('symbol_id').to_numpy()[:, 0]

#     if not lags is None:
#         lags = lags.group_by(["date_id", "symbol_id"], maintain_order=True).last() # pick up last record of previous date
#         test = test.join(lags, on=["date_id", "symbol_id"],  how="left")
    
        
#     else:
#         test = test.with_columns(
#             ( pl.lit(0.0).alias(f'responder_{idx}_lag_1') for idx in range(9) )
#         )
    
#     #preds = np.zeros((test.shape[0],))
#     test = test.drop('row_id', 'is_scored', 'weight','time_id_right')     
#     predictions = test.select(
#         'row_id',
#         pl.lit(model.predict(test.to_numpy().astype('float32'))).alias('responder_6'))


#     # The predict function must return a DataFrame
#     assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
#     # with columns 'row_id', 'responder_6'
#     assert list(predictions.columns) == ['row_id', 'responder_6']
#     # and as many rows as the test data.
#     assert len(predictions) == len(test)

#     return predictions


In [None]:
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# alltraindata = pl.scan_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet")

# symbol_median = alltraindata.filter(pl.col("time_id")==967).group_by("symbol_id").agg(pl.col("responder_6").median()).sort("symbol_id").collect()


In [None]:
# lags_ : pl.DataFrame | None = None

# # Replace this function with your inference code.
# # You can return either a Pandas or Polars dataframe, though Polars is recommended.
# # Each batch of predictions (except the very first) must be returned within 10 minutes of the batch features being provided.
# def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
#     """Make a prediction."""
#     # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
#     # Use them as extra features, if you like.
#     global lags_
#     if lags is not None:
#         lags_ = lags
        
#     preds = test.select("row_id","symbol_id").join(symbol_median,on="symbol_id",how="left")
        
#     predictions = preds.select("row_id","responder_6")

#     # The predict function must return a DataFrame
#     assert isinstance(predictions, pl.DataFrame | pd.DataFrame)
#     # with columns 'row_id', 'responder_6'
#     assert predictions.columns == ['row_id', 'responder_6']
#     # and as many rows as the test data.
#     assert len(predictions) == len(test)

#     return predictions

In [None]:
# inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

# if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
#     inference_server.serve()
# else:
#     inference_server.run_local_gateway(
#         (
#             '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
#             '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
#         )
#     )