In [1]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from utils.common import get_root_path, read_csv_list, evaluate
from sklearn.model_selection import train_test_split
from config.features import ACTIVE_FEATURES, ALL_FEATURES, get_feature_names

In [2]:
# Baseline gradient-boosted regressor with the library's default hyperparameters.
# The seed is pinned explicitly so a fresh "Restart Kernel -> Run All" always
# builds the estimator with the same random state.
model = XGBRegressor(random_state=42)

In [3]:
jf_game_id = 'dhp'

# Inclusive ranges of the integer `date` token embedded in the CSV file names
# (e.g. dhp_6h_801.csv). Training uses 801..807, testing uses 808.
TRAIN_DATES = range(801, 807 + 1)
TEST_DATES = range(808, 808 + 1)


def _existing_csv_paths(game_id, dates):
    """Return the ``<game_id>_6h_<date>.csv`` paths that exist on disk.

    Files live under ``<root>/processed_data/<game_id>/``. Dates whose file is
    missing are skipped, preserving the order of ``dates``.

    Args:
        game_id: Game identifier used both as sub-directory and file prefix.
        dates: Iterable of date tokens to look for.

    Returns:
        List of existing CSV paths.

    Raises:
        FileNotFoundError: If none of the requested files exists, so the
            problem surfaces here rather than inside the reader or the model.
    """
    # Join every component separately so the path uses one separator style.
    data_dir = os.path.join(get_root_path(), 'processed_data', game_id)
    candidates = [os.path.join(data_dir, f'{game_id}_6h_{date}.csv') for date in dates]
    found = [path for path in candidates if os.path.exists(path)]
    if not found:
        raise FileNotFoundError(
            f'No {game_id}_6h_<date>.csv files found in {data_dir} for dates {list(dates)}'
        )
    return found


train_csv_list = _existing_csv_paths(jf_game_id, TRAIN_DATES)
train_df = read_csv_list(train_csv_list)
test_csv_list = _existing_csv_paths(jf_game_id, TEST_DATES)
test_df = read_csv_list(test_csv_list)

2024-10-07 20:14:58,947 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\dhp/dhp_6h_801.csv | sample number: 557325
  new_df = pd.read_csv(csv_file, usecols=columns, sep=sep)
2024-10-07 20:15:12,212 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\dhp/dhp_6h_802.csv | sample number: 635971
2024-10-07 20:15:29,450 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\dhp/dhp_6h_803.csv | sample number: 835078
  new_df = pd.read_csv(csv_file, usecols=columns, sep=sep)
2024-10-07 20:15:46,038 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\dhp/dhp_6h_804.csv | sample number: 789380
  new_df = pd.read_csv(csv_file, usecols=columns, sep=sep)
2024-10-07 20:15:56,830 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\dhp/dhp_6h_805.csv | sample number: 522957
  new_df = pd.read_csv(csv_file, usecols=columns, sep=sep)
2024-10-07 20:16:09,065 - common.py - INFO - Read dataframe from

In [4]:
# Hashed id columns appended to both feature sets.
HASH_FEATURE_NAMES = ['o2_game_id_hash', 'media_type_hash', 'media_id_hash']


def _tabular_feature_names(feature_groups):
    """Build the model's column list from one feature-group mapping.

    The result is the names derived from the 'sparse' group, followed by the
    'dense' entries, followed by HASH_FEATURE_NAMES. The 'seq' group is not
    included (the original expressions had that term commented out).

    Args:
        feature_groups: Mapping with 'sparse' and 'dense' entries, such as
            ACTIVE_FEATURES or ALL_FEATURES.

    Returns:
        A new list of column names.
    """
    return (
        get_feature_names(feature_groups['sparse'])
        + feature_groups['dense']
        + HASH_FEATURE_NAMES
    )


active_feature_names = _tabular_feature_names(ACTIVE_FEATURES)
all_feature_names = _tabular_feature_names(ALL_FEATURES)

In [5]:
def label_cast(x, label_max_cast):
    """Clamp a label value to the interval [0, label_max_cast].

    Args:
        x: The raw label value.
        label_max_cast: Upper cap for the label.

    Returns:
        0 when ``x`` is negative, ``label_max_cast`` when ``x`` exceeds it,
        and ``x`` itself otherwise.
    """
    # Negative labels are floored first; this check takes precedence over the cap.
    if x < 0:
        return 0
    return label_max_cast if x > label_max_cast else x

In [6]:
# Feature matrices and targets built from the full feature set (all_feature_names).
# NOTE(review): a later cell assigns X_train / X_test / y_train / y_test again
# (active features, ltv3 label without the ltv_6h subtraction). On a top-to-bottom
# run those later values are what the fit cell receives - confirm which variant
# is the intended one.
LABEL_MAX_CAST = 2000  # upper cap applied to the regression target

X_train = train_df[all_feature_names].to_numpy()
X_test = test_df[all_feature_names].to_numpy()

# Target: ltv3 minus ltv_6h, clamped to [0, LABEL_MAX_CAST].
# Series.clip performs the same clamping as mapping label_cast over every row,
# but vectorised instead of one Python call per sample.
y_train = (train_df['ltv3'] - train_df['ltv_6h']).clip(lower=0, upper=LABEL_MAX_CAST).to_numpy()
y_test = (test_df['ltv3'] - test_df['ltv_6h']).clip(lower=0, upper=LABEL_MAX_CAST).to_numpy()

In [9]:
# Feature matrices and targets built from the active feature subset
# (active_feature_names).
# NOTE(review): this overwrites the X_train / X_test / y_train / y_test assigned
# by the earlier cell that uses all_feature_names and an (ltv3 - ltv_6h) label.
# The target here is plain ltv3, so the two cells define different tasks -
# confirm which one the fit / evaluate cells are meant to use.
LABEL_MAX_CAST = 2000  # upper cap applied to the regression target

X_train = train_df[active_feature_names].to_numpy()
X_test = test_df[active_feature_names].to_numpy()

# Target: ltv3 clamped to [0, LABEL_MAX_CAST].
# Series.clip performs the same clamping as mapping label_cast over every row,
# but vectorised instead of one Python call per sample.
y_train = train_df['ltv3'].clip(lower=0, upper=LABEL_MAX_CAST).to_numpy()
y_test = test_df['ltv3'].clip(lower=0, upper=LABEL_MAX_CAST).to_numpy()

In [7]:
# X_train / y_train are already NumPy arrays, so they are passed straight through;
# wrapping them in np.array() again would only make another full copy in memory.
model.fit(X_train, y_train)

In [8]:
# X_test is already a NumPy array; no extra np.array() copy is needed.
y_pred = model.predict(X_test)

In [9]:
y_pred

array([0.01125324, 0.01125324, 0.04382861, ..., 0.01125324, 0.03921516,
       0.10487259], dtype=float32)

In [10]:
# Assemble the frame that is passed to evaluate():
#   ltv3      - the test label (y_test)
#   pltv3     - the model's prediction (y_pred)
#   is_pay    - True where the label is strictly positive
#   pay_probs - the same regression output as pltv3; no separate classifier is
#               trained in this notebook.
# NOTE(review): pay_probs is therefore a raw regression score, presumably not a
# calibrated probability, which is presumably why the *_by_p and *_by_pltv
# metrics reported below are identical - confirm against evaluate().
predict_df = pd.DataFrame({'ltv3': np.array(y_test), 'pltv3':y_pred, 'is_pay': np.array(y_test)>0., 'pay_probs': y_pred})

In [11]:
# Score the predictions with the project helper imported from utils.common.
# The displayed result is a dict of metrics (auc, gini, rmse, mae, top-k
# precision/recall, ...).
eval_result = evaluate(predict_df)

In [12]:
predict_df

Unnamed: 0,ltv3,pltv3,is_pay,pay_probs
0,0.0,0.011253,False,0.011253
1,0.0,0.011253,False,0.011253
2,0.0,0.043829,False,0.043829
3,0.0,0.011253,False,0.011253
4,0.0,0.011253,False,0.011253
...,...,...,...,...
546023,0.0,0.072288,False,0.072288
546024,0.0,0.011253,False,0.011253
546025,0.0,0.011253,False,0.011253
546026,0.0,0.039215,False,0.039215


In [13]:
eval_result

{'auc': 0.8239939505664438,
 'gini_by_pltv': 0.6853361886832359,
 'gini_by_p': 0.6853361886832359,
 'rmse': 12.136140727774805,
 'nrmse': 0.006068070363887403,
 'mae': 0.31773406774666263,
 'nmae': 0.0001588670338733313,
 'top_0.01_p_by_pltv': 0.1064102564102564,
 'top_0.01_r_by_pltv': 0.25549692172383465,
 'top_0.03_p_by_pltv': 0.053357753357753356,
 'top_0.03_r_by_pltv': 0.3843447669305189,
 'top_0.05_p_by_pltv': 0.03871653053001722,
 'top_0.05_r_by_pltv': 0.4648197009674582,
 'top_0.1_p_by_pltv': 0.02448628255375261,
 'top_0.1_r_by_pltv': 0.5879507475813545,
 'top_0.01_p_by_p': 0.1064102564102564,
 'top_0.01_r_by_p': 0.25549692172383465,
 'top_0.03_p_by_p': 0.053357753357753356,
 'top_0.03_r_by_p': 0.3843447669305189,
 'top_0.05_p_by_p': 0.03871653053001722,
 'top_0.05_r_by_p': 0.4648197009674582,
 'top_0.1_p_by_p': 0.02448628255375261,
 'top_0.1_r_by_p': 0.5879507475813545,
 'top_amount_recall_by_pltv': {},
 'top_0.01_amount_recall_by_pltv': 57190.34,
 'top_0.05_amount_recall_by_pl

In [1]:
!pip install tune-sklearn ray[tune]

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting tune-sklearn
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d9/48/05dd77618ccd92c5439f23bf472501378f081b163aea5ee60af51ec4c1d7/tune_sklearn-0.5.0-py3-none-any.whl (42 kB)
     ---------------------------------------- 0.0/42.2 kB ? eta -:--:--
     ---------------------------------------- 42.2/42.2 kB 1.0 MB/s eta 0:00:00
Collecting ray[tune]
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/08/91/967506d22c3ef98cc6bcbd58bef04a3bfd8d15fe89979ac24de3a32b0a5f/ray-2.37.0-cp312-cp312-win_amd64.whl (24.9 MB)
     ---------------------------------------- 0.0/24.9 MB ? eta -:--:--
      --------------------------------------- 0.3/24.9 MB 20.5 MB/s eta 0:00:02
     - -------------------------------------- 0.7/24.9 MB 11.2 MB/s eta 0:00:03
     - -------------------------------------- 1.1/24.9 MB 9.0 MB/s eta 0:00:03
     --- ------------------------------------ 2.0/24.9 MB 11.4 MB/s eta 0:00:03
     ---- --