In [1]:
import os
import numpy as np
import pandas as pd
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor
from utils.common import get_root_path, read_csv_list, evaluate
from sklearn.model_selection import train_test_split
from config.features import ACTIVE_FEATURES, ALL_FEATURES, get_feature_names

In [2]:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
jf_game_id = 'wm'


def _existing_game_csvs(game_id, first_date, last_date):
    """Return the per-date 6h CSV paths for ``game_id`` that exist on disk.

    One candidate path is built for every integer date code from
    ``first_date`` to ``last_date`` (inclusive) under
    ``<root>/processed_data``; candidates with no file behind them are dropped.
    """
    candidates = (
        os.path.join(get_root_path(), 'processed_data', f'{game_id}/{game_id}_6h_{date}.csv')
        for date in range(first_date, last_date + 1)
    )
    return [path for path in candidates if os.path.exists(path)]


# Training window: date codes 805..825 (presumably MMDD-style day stamps -- TODO confirm).
train_csv_list = _existing_game_csvs(jf_game_id, 805, 825)
train_df = read_csv_list(train_csv_list)

# Held-out window: the date codes that directly follow, 826..829.
test_csv_list = _existing_game_csvs(jf_game_id, 826, 829)
test_df = read_csv_list(test_csv_list)

2024-10-07 09:28:16,328 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_805.csv | sample number: 31973
2024-10-07 09:28:16,629 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_806.csv | sample number: 32935
2024-10-07 09:28:16,915 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_807.csv | sample number: 30966
2024-10-07 09:28:17,206 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_808.csv | sample number: 31255
2024-10-07 09:28:17,481 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_809.csv | sample number: 28882
2024-10-07 09:28:17,866 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_810.csv | sample number: 37241
2024-10-07 09:28:18,178 - common.py - INFO - Read dataframe from csv D:/codes/www\processed_data\wm/wm_6h_811.csv | sample number: 33562
2024-10-07 09:28:18,481 - common.py - INF

In [4]:
# Hashed id columns that are appended to both feature sets.
hash_feature_names = ['o2_game_id_hash', 'media_type_hash', 'media_id_hash']

# Each list is: sparse feature names + dense feature names + hashed id columns.
# The 'seq' feature group is currently left out of both lists
# (it would be added with get_feature_names(<FEATURES>['seq'])).
active_feature_names = (
    get_feature_names(ACTIVE_FEATURES['sparse'])
    + ACTIVE_FEATURES['dense']
    + hash_feature_names
)
all_feature_names = (
    get_feature_names(ALL_FEATURES['sparse'])
    + ALL_FEATURES['dense']
    + hash_feature_names
)

In [5]:
def label_cast(x, label_max_cast):
    """Clamp a single label value into the range ``[0, label_max_cast]``.

    Args:
        x: The raw label value.
        label_max_cast: Upper bound applied to the label.

    Returns:
        ``0`` when ``x`` is negative, ``label_max_cast`` when ``x`` exceeds it,
        otherwise ``x`` unchanged. A value for which both comparisons are
        false (e.g. NaN) is returned as-is.
    """
    # The negative check comes first so it wins if both conditions hold.
    if x < 0:
        return 0
    if x > label_max_cast:
        return label_max_cast
    return x

In [64]:
# Variant using every feature column, with the label ltv3 - ltv_6h.
# NOTE(review): the cell that follows reassigns X_train / X_test / y_train /
# y_test, so on a top-to-bottom run these values are replaced before the model
# is fit. This cell's execution count is also out of sequence.
LABEL_MAX_CAST = 2000  # labels are clamped into [0, LABEL_MAX_CAST]

X_train = np.array(train_df[all_feature_names])
X_test = np.array(test_df[all_feature_names])

# Series.clip clamps the whole column in one vectorized call instead of
# invoking a Python function once per row; NaN values pass through unchanged.
y_train = np.array((train_df['ltv3'] - train_df['ltv_6h']).clip(lower=0, upper=LABEL_MAX_CAST))
y_test = np.array((test_df['ltv3'] - test_df['ltv_6h']).clip(lower=0, upper=LABEL_MAX_CAST))

In [6]:
# Feature matrices restricted to the active feature columns, with the label ltv3.
LABEL_MAX_CAST = 2000  # labels are clamped into [0, LABEL_MAX_CAST]

X_train = np.array(train_df[active_feature_names])
X_test = np.array(test_df[active_feature_names])

# Series.clip clamps the whole column in one vectorized call instead of
# invoking a Python function once per row; NaN values pass through unchanged.
y_train = np.array(train_df['ltv3'].clip(lower=0, upper=LABEL_MAX_CAST))
y_test = np.array(test_df['ltv3'].clip(lower=0, upper=LABEL_MAX_CAST))

In [78]:
# Optional log(y + 1) transform of both label arrays. It is commented out, so
# this cell currently has no effect and the model is fit on the raw clamped labels.
# y_train = np.log(y_train + 1)
# y_test = np.log(y_test + 1)

In [7]:
# Fit a linear-regression baseline on the training matrix.
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model

In [8]:
# Score the held-out matrix. np.asarray converts array-likes but does not copy
# an input that is already an ndarray, avoiding a needless duplicate of the matrix.
y_pred = model.predict(np.asarray(X_test))

In [9]:
# Assemble the frame passed to evaluate(): actual label, predicted label,
# a paid/not-paid flag, and a score column.
predict_df = pd.DataFrame(
    {
        'ltv3': np.array(y_test),
        'pltv3': y_pred,
        # A row counts as paying when its (clamped) label is strictly positive.
        'is_pay': np.array(y_test) > 0.,
        # NOTE(review): this reuses the regression output as the pay score; it
        # is not a probability, and it is identical to 'pltv3'.
        'pay_probs': y_pred,
    }
)

In [10]:
# Compute the evaluation metrics for the prediction frame using the project's
# evaluate() helper (imported from utils.common).
eval_result = evaluate(predict_df)

In [11]:
# Display the metrics returned by evaluate().
# NOTE(review): in the captured output, rmse (~1.19e9) and mae (~7.4e6) are far
# larger than the 2000 label cap, which suggests some extreme predictions from
# the linear model -- worth investigating before trusting these numbers.
eval_result

{'auc': 0.5761883637527261,
 'gini_by_pltv': 0.29109511022912304,
 'gini_by_p': 0.29109511022912304,
 'rmse': 1194427054.852617,
 'nrmse': 597213.5274263085,
 'mae': 7449312.2178664375,
 'nmae': 3724.6561089332185,
 'top_0.01_p_by_pltv': 0.44163424124513617,
 'top_0.01_r_by_pltv': 0.036606998871149814,
 'top_0.03_p_by_pltv': 0.26904376012965964,
 'top_0.03_r_by_pltv': 0.066924689566199,
 'top_0.05_p_by_pltv': 0.2339556592765461,
 'top_0.05_r_by_pltv': 0.09700048379293662,
 'top_0.1_p_by_pltv': 0.1939906651108518,
 'top_0.1_r_by_pltv': 0.16086115142718915,
 'top_0.01_p_by_p': 0.44163424124513617,
 'top_0.01_r_by_p': 0.036606998871149814,
 'top_0.03_p_by_p': 0.26904376012965964,
 'top_0.03_r_by_p': 0.066924689566199,
 'top_0.05_p_by_p': 0.2339556592765461,
 'top_0.05_r_by_p': 0.09700048379293662,
 'top_0.1_p_by_p': 0.1939906651108518,
 'top_0.1_r_by_p': 0.16086115142718915,
 'top_amount_recall_by_pltv': {},
 'top_0.01_amount_recall_by_pltv': 86026.0,
 'top_0.05_amount_recall_by_pltv': 18