In [1]:
import pandas as pd
from attribute_general import AttributeGeneral
from sklearn.metrics import roc_auc_score, classification_report
import warnings

# Suppress every warning issued through the `warnings` module for the rest of
# the session. NOTE(review): this is a blanket filter, so deprecation and
# numeric warnings from the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Load the input table; the path is relative to the notebook's working directory.
df = pd.read_csv('data/new_device.csv')

# Input columns, in the order they are passed on to the model helper.
features = [
    'source', 'package', 'os', 'language', 'country',
    'language_group', 'country_group',
    'first_token_trade_coin_pair', 'first_token_trade_chain_pair',
    'first_nft_trade_token_title', 'first_connect_dapp',
    'first_otc_fiat_code', 'airdrop_name',
    'invite_code', 'red_packet_code',
    'amt_usd', 'total_trade', 'future_total_trade',
    'lately_login_ip', 'wallet_type',
    'level_1_user_source', 'level_2_user_source', 'level_3_user_source',
]

# Every feature except the three columns named below is declared categorical.
# Deriving the list from `features` keeps both lists in the same order.
cat_features = [
    name for name in features
    if name not in ('amt_usd', 'total_trade', 'future_total_trade')
]

# Column name passed as the prediction target.
target = 'target'

In [4]:
def count_func(query, time_col, time_set, df=df):
    """Count the rows of ``df`` for one value of ``time_col`` that satisfy ``query``.

    Parameters
    ----------
    query : str
        Expression understood by ``DataFrame.query``. An empty string means
        "no extra filter", so every row with the requested ``time_col`` value
        is counted.
    time_col : str
        Column compared against ``time_set``.
    time_set : object
        Value that ``time_col`` must equal for a row to be considered.
    df : pandas.DataFrame, optional
        Frame to count in. The default is bound to the notebook-level ``df``
        when this function is defined, so re-run this cell after reloading
        ``df`` if the default should follow the new frame.

    Returns
    -------
    int
        Number of matching rows.

    Raises
    ------
    ValueError
        If filtering or evaluating ``query`` fails. The offending query is
        printed and included in the message, and the original exception is
        chained as the cause.
    """
    try:
        # Restrict to the requested time value once, then apply the optional query.
        subset = df[df[time_col] == time_set]
        if query != '':
            subset = subset.query(query)
        return subset.shape[0]
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate untouched instead of being turned into a ValueError.
        print(query)
        raise ValueError(f'count_func failed for query: {query!r}') from exc

In [5]:
# Build the project helper from the loaded frame, the feature lists and the
# target name, then run its training routine.
# NOTE(review): judging by the cell output, train() runs a hyperparameter
# search and ends up with an LGBMClassifier — confirm in attribute_general.
t = AttributeGeneral(df, features, cat_features, target)
t.train()

[I 2024-05-09 14:17:44,929] A new study created in memory with name: no-name-a9957a60-8c94-49f2-ab20-e7f7e16dd253
[I 2024-05-09 14:17:57,647] Trial 7 finished with value: -0.7294885683755757 and parameters: {'boosting': 'dart', 'max_depth': 89, 'learning_rate': 0.27003516760423796, 'n_estimators': 195, 'num_leaves': 33}. Best is trial 7 with value: -0.7294885683755757.
[I 2024-05-09 14:18:00,055] Trial 4 finished with value: -0.6802001161205999 and parameters: {'boosting': 'goss', 'max_depth': 9, 'learning_rate': 0.03227180570322546, 'n_estimators': 103, 'num_leaves': 241}. Best is trial 7 with value: -0.7294885683755757.
[I 2024-05-09 14:18:51,420] Trial 1 finished with value: -0.8621947442238493 and parameters: {'boosting': 'gbdt', 'max_depth': 60, 'learning_rate': 0.2167047810645713, 'n_estimators': 395, 'num_leaves': 98}. Best is trial 1 with value: -0.8621947442238493.
[I 2024-05-09 14:18:53,937] Trial 5 finished with value: -0.7814033839979708 and parameters: {'boosting': 'goss',

Best Hyperparameters:  {'boosting': 'gbdt', 'max_depth': 61, 'learning_rate': 0.29574531346893984, 'n_estimators': 382, 'num_leaves': 178}


LGBMClassifier(boosting='gbdt', learning_rate=0.29574531346893984, max_depth=61,
               n_estimators=382, num_leaves=178)

In [6]:
# Keep a handle on the selected estimator; it is reused for the metrics below.
model = t.best_model

# Both metrics are computed on the training split (t.X_train / t.y_train) only.
print(
    'Training classification report: \n',
    classification_report(t.y_train, model.predict(t.X_train)),
)
print(
    'Training auc: \n',
    roc_auc_score(t.y_train, model.predict_proba(t.X_train)[:, 1]),
)

# Tabular description of the fitted model as produced by the helper.
tree_df = t.get_model_visualization()

Training classification report: 
               precision    recall  f1-score   support

           0       0.75      0.81      0.78     15633
           1       0.78      0.71      0.74     14734

    accuracy                           0.76     30367
   macro avg       0.76      0.76      0.76     30367
weighted avg       0.76      0.76      0.76     30367

Training auc: 
 0.8742222611044457


In [7]:
# Preview the first rows of the tree table.
tree_df.head()

Unnamed: 0,node_index,split_feature_indx,split_feature,threshold,decision_type,left_child,right_child,leaf_value,thres_vis,condition,conditions
0,0.0,4.0,country,2||5||6||10||26||30||33||36||45||56||59||69||7...,==,1.0,2.0,,"[中国, 乌兹别克斯坦, 乌干达, 亚美尼亚, 匈牙利, 卡塔尔, 印度, 厄瓜多尔, 喀麦...","country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",
1,1.0,13.0,invite_code,106||644||804||1633||2375,==,3.0,4.0,,"[3EWgQ7, BbdVb1, EbTSQz, VGm2rw, im4hvK]","invite_code in [""3EWgQ7"", ""BbdVb1"", ""EbTSQz"", ...","country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙..."
2,2.0,12.0,airdrop_name,25||36||47||59||60||66,==,5.0,6.0,,"[Morph-BWB, Scroll-BWB, WOOFi-BWB, bitlayer ga...","airdrop_name in [""Morph-BWB"", ""Scroll-BWB"", ""W...","~(country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""..."
3,3.0,,,,,,,0.356226,,,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙..."
4,4.0,17.0,future_total_trade,168.57912,<=,9.0,10.0,,,future_total_trade <= 168.57912000000002,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙..."


In [10]:
# For each row of the tree table, count how many rows of the input data with
# the given 'device_create_date' satisfy that row's accumulated conditions.
for day in ('2024-04-27', '2024-04-28'):
    tree_df[day] = tree_df['conditions'].apply(
        lambda cond, day=day: count_func(cond, 'device_create_date', day)
    )

# The row whose condition string is empty is counted without any extra filter,
# so its two counts are the per-day totals used to normalise every other row.
root_rows = tree_df[tree_df['conditions'] == '']
total_delta = root_rows['2024-04-28'].values[0] - root_rows['2024-04-27'].values[0]

# Share of the overall day-over-day change attributable to each row.
tree_df['ratio'] = (tree_df['2024-04-28'] - tree_df['2024-04-27']) / total_delta

In [12]:
# Preview the first ten rows, now including the per-day counts and the ratio column.
tree_df.head(10)

Unnamed: 0,node_index,split_feature_indx,split_feature,threshold,decision_type,left_child,right_child,leaf_value,thres_vis,condition,conditions,2024-04-27,2024-04-28,ratio
0,0.0,4.0,country,2||5||6||10||26||30||33||36||45||56||59||69||7...,==,1.0,2.0,,"[中国, 乌兹别克斯坦, 乌干达, 亚美尼亚, 匈牙利, 卡塔尔, 印度, 厄瓜多尔, 喀麦...","country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",,15633,14734,1.0
1,1.0,13.0,invite_code,106||644||804||1633||2375,==,3.0,4.0,,"[3EWgQ7, BbdVb1, EbTSQz, VGm2rw, im4hvK]","invite_code in [""3EWgQ7"", ""BbdVb1"", ""EbTSQz"", ...","country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",3490,4489,-1.111235
2,2.0,12.0,airdrop_name,25||36||47||59||60||66,==,5.0,6.0,,"[Morph-BWB, Scroll-BWB, WOOFi-BWB, bitlayer ga...","airdrop_name in [""Morph-BWB"", ""Scroll-BWB"", ""W...","~(country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""...",12143,10245,2.111235
3,3.0,,,,,,,0.356226,,,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",3,101,-0.10901
4,4.0,17.0,future_total_trade,168.57912,<=,9.0,10.0,,,future_total_trade <= 168.57912000000002,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",3487,4388,-1.002225
5,5.0,21.0,level_2_user_source,65||66,==,11.0,12.0,,"[bitlayer galxe, bwb]","level_2_user_source in [""bitlayer galxe"", ""bwb""]","~(country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""...",2980,1937,1.160178
6,6.0,4.0,country,3||4||9||12||13||15||31||32||40||42||58||63||6...,==,13.0,14.0,,"[丹麦, 乌克兰, 也门, 伊拉克, 伊朗, 保加利亚, 卢旺达, 卢森堡, 台湾, 哈萨克...","country in [""丹麦"", ""乌克兰"", ""也门"", ""伊拉克"", ""伊朗"", ""保...","~(country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""...",9163,8308,0.951057
7,9.0,16.0,total_trade,1717.296466,<=,19.0,20.0,,,total_trade <= 1717.296466,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",3459,4388,-1.03337
8,10.0,,,,,,,-0.63371,,,"country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""匈牙...",28,0,0.031146
9,11.0,4.0,country,13||15||31||46||50||62||82||132||138||145||148...,==,23.0,24.0,,"[伊朗, 保加利亚, 卢旺达, 土耳其, 埃及, 奥地利, 德国, 立陶宛, 美国, 荷兰,...","country in [""伊朗"", ""保加利亚"", ""卢旺达"", ""土耳其"", ""埃及"", ...","~(country in [""中国"", ""乌兹别克斯坦"", ""乌干达"", ""亚美尼亚"", ""...",2669,1897,0.858732


In [13]:
import lightgbm as lgb

# Draw the first tree (tree_index=0) with create_tree_digraph.
graph = lgb.create_tree_digraph(
    model,
    tree_index=0,
    name='Tree0',
    show_info=['split_gain', 'internal_value', 'internal_count', 'leaf_count'],
)

# Render the graph to a file and open it in the default viewer.
graph.render(view=True)

'Tree0.gv.pdf'