In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

#Read the data
df = pd.read_csv('../data/app_use_info_label.csv', sep=',', header=0)
df2 = pd.read_csv('../data/user_portrait.csv', sep=',', header=0)
df3 = pd.read_csv('../data/user_trajectory2.csv', sep=',', header=0)
#Merge data
df_all = pd.merge(df, df2, left_on='msisdn', right_on='userid', how='left').drop(columns = ['userid'])
df_all = df_all.merge(df3, on = 'msisdn', how = 'left')

In [3]:
#Date feature processing
def get_time_feature(df, col):
    
    df_copy = df.copy()
    prefix = col + "_"
    df_copy['new_'+col] = df_copy[col].astype(str)
    
    col = 'new_'+col
    df_copy[col] = pd.to_datetime(df_copy[col])
    df_copy[prefix + 'year'] = df_copy[col].dt.year
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    # df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    # df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 6
    # df_copy[prefix + 'quarter'] = df_copy[col].dt.quarter
    # df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    # df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    del df_copy[col]
    
    return df_copy   
    
df_all = get_time_feature(df_all, 'stime')
df_all = get_time_feature(df_all, 'end_time')

In [4]:
#label encoding
from sklearn import preprocessing
 
enc=preprocessing.LabelEncoder() 
enc=enc.fit(df_all['app_class_1']) 
df_all['app_class_1']=enc.transform(df_all['app_class_1'])

enc2=preprocessing.LabelEncoder() 
enc2=enc2.fit(df_all['app_class_2']) 
df_all['app_class_2']=enc2.transform(df_all['app_class_2'])

In [None]:
import matplotlib.pyplot as plt # 导入作图包
import seaborn as sns    # 导入作图包

plt.figure(figsize=(20,20),dpi=600)
mask = np.triu(np.ones_like(df_all.fillna(0).corr().round(2), dtype=np.bool))
ax = sns.heatmap(df_all.fillna(0).corr().round(2), linewidths=.5, cmap='YlGnBu',annot=True,vmax=2.0,mask=mask)

In [5]:
#特征衍生
df_all['up_flow']= df_all['up_flow']/1024
df_all['down_flow']= df_all['down_flow']/1024

df_all['app1_upflow_mean'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['up_flow'].mean())
df_all['app2_upflow_mean'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['up_flow'].mean())
df_all['app1_upflow_max'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['up_flow'].max())
df_all['app2_upflow_max'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['up_flow'].max())
df_all['app1_upflow_min'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['up_flow'].min())
df_all['app2_upflow_min'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['up_flow'].min())

df_all['app1_downflow_mean'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['down_flow'].mean())
df_all['app2_downflow_mean'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['down_flow'].mean())
df_all['app1_downflow_max'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['down_flow'].max())
df_all['app2_downflow_max'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['down_flow'].max())
df_all['app1_downflow_min'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['down_flow'].min())
df_all['app2_downflow_min'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['down_flow'].min())

df_all['app1_consume_mean'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['consume'].mean())
df_all['app2_consume_mean'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['consume'].mean())
df_all['app1_consume_max'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['consume'].max())
df_all['app2_consume_max'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['consume'].max())
df_all['app1_consume_min'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['consume'].min())
df_all['app2_consume_min'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['consume'].min())

df_all['app1_age_mean'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['age'].mean())
df_all['app2_age_mean'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['age'].mean())
df_all['app1_age_max'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['age'].max())
df_all['app2_age_max'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['age'].max())
df_all['app1_age_min'] = df_all['app_class_1'].map(df_all.groupby('app_class_1')['age'].min())
df_all['app2_age_min'] = df_all['app_class_2'].map(df_all.groupby('app_class_2')['age'].min())

df_all['up-down'] = df_all['up_flow']-df_all['down_flow']
df_all['up/down'] = df_all['up_flow']/df_all['down_flow']
df_all['up+down'] = df_all['up_flow']+df_all['down_flow']

In [6]:
#分离训练集，测试集
from sklearn.model_selection import train_test_split

df_all=df_all.replace([np.inf, -np.inf], 0)
df_all.fillna(0,inplace=True)
train_df,test_df = train_test_split(df_all,train_size=0.8,shuffle=True,random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
feature_cols = [cols for cols in df_all if cols not in ['msisdn','label','end_time','stime','times_month']]
len(feature_cols)

53

In [8]:
#构建模型
import optuna
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from pytorch_tabnet import tab_model
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import torch
import warnings

warnings.filterwarnings('ignore')

# # 标准化
# def scale(train_features,test_features):
#     scaler=StandardScaler()
#     scaler.fit(train_features)
#     train_features=pd.DataFrame(scaler.transform(train_features),columns=test_features.keys())
#     test_features=pd.DataFrame(scaler.transform(test_features),columns=test_features.keys())
#     return train_features,test_features

# 模型定义
def ml_model(clf,train_x,train_y,test_x=[],test_y=[]):
    seeds=[888]
    train_oof = np.zeros([train_x.shape[0],3])
    feat_imp_df = pd.DataFrame()
    feat_imp_df['feature'] = train_x.columns
    feat_imp_df['imp'] = 0
    #标准化
    scaler=StandardScaler()
    scaler.fit(train_x)
    train_x = scaler.transform(train_x)
    if (len(test_x)!=0) and (len(test_x)!=0):
        test_oof = np.zeros([test_x.shape[0],3])
        test_x = scaler.transform(test_x)
    for seed in seeds:
        print('Seed:',seed)
        folds = 5
        kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
        acc_scores_val = []
        acc_scores_test = []
        for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
            trn_x, trn_y, val_x, val_y = train_x[train_index], train_y[train_index], train_x[valid_index], \
                                        train_y[valid_index] 
            if clf == 'xgb':
                print("|  XGB  Fold  {}  Training Start           |".format(str(i + 1)))
                xgb_params = {'booster': 'gbtree','objective': 'multi:softprob','eval_metric':'mlogloss','num_class':3,
                    'n_estimators':500,'max_depth': 8,'lambda': 10,'subsample': 0.7,'colsample_bytree': 0.8,'eta': 0.1,
                    'colsample_bylevel': 0.7,'tree_method': 'hist','seed': seed,'nthread': 16}
                #训练模型
                model = xgb.XGBClassifier(*xgb_params)
                model.fit(trn_x,trn_y,eval_set=[(trn_x, trn_y),(val_x,val_y)],early_stopping_rounds=50,verbose=100)
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('XGB_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('XGB_result :',classification_report(test_y, np.argmax(test_pred, axis=1)))
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
                
                #保存训练集结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
                
                #模型特征重要性
                feat_imp_df['imp'] += model.feature_importances_ / folds/ len(seeds)
                feat_imp_df = feat_imp_df.sort_values(by='imp', ascending=False).reset_index(drop=True)
                feat_imp_df['rank'] = range(feat_imp_df.shape[0])
            if clf == 'lgb':
                lgb_params = {'boosting_type': 'gbdt','n_estimators':500,'min_child_weight': 4,'num_leaves': 64,
                    'feature_fraction': 0.8,'bagging_fraction': 0.8,'bagging_freq': 4,'learning_rate': 0.02,
                    'seed': seed,'nthread': 32,'n_jobs':8,'verbose': -1}
                print("|  LGB  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = lgb.LGBMClassifier(**lgb_params)
                model.fit(trn_x,trn_y)
                
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Cat_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('Cat_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
                #保存训练集结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
            if clf == 'cat':
                print("|  CAT  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = CatBoostClassifier(verbose=False)
                model.fit(trn_x,trn_y)
                    
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Cat_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('Cat_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
                    
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
            if clf == 'dt':
                print("|  DT  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = DecisionTreeClassifier()
                model.fit(trn_x,trn_y)
                    
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('DT_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('DT_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
            if clf == 'Ada':
                print("|  ADA  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = AdaBoostClassifier()
                model.fit(trn_x,trn_y)
                    
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Ada_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('Ada_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
            if clf == 'rf':
                print("|  RF  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = RandomForestClassifier()
                model.fit(trn_x,trn_y)
                    
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('RF_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('RF_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)    
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
            if clf == 'Gnb':
                print("|  GNB  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = GaussianNB()
                model.fit(trn_x,trn_y)
                    
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Gnb_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                if (len(test_x)!=0) and (len(test_x)!=0):
                    print('************ Test_Result ************')
                    test_pred  = model.predict_proba(test_x)
                    acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                    acc_scores_test.append(acc_score_tset)
                    print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                    print('Gnb_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                    #保存测试集结果
                    test_oof += test_pred / kf.n_splits / len(seeds)
            if clf == 'svm':
                print("|  SVM  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = SVC(kernel='rbf', C=1, gamma='auto', probability=True,max_iter=1000)
                model.fit(trn_x,trn_y)
                
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Svm_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                print('************ Test_Result ************')
                test_pred  = model.predict_proba(test_x)
                acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                acc_scores_test.append(acc_score_tset)
                print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                print('Svm_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
                test_oof += test_pred / kf.n_splits / len(seeds)   
            if clf == 'knn':
                print("|  KNN  Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = KNeighborsClassifier()
                model.fit(trn_x,trn_y)
                
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Knn_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                print('************ Test_Result ************')
                test_pred  = model.predict_proba(test_x)
                acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                acc_scores_test.append(acc_score_tset)
                print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                print('Knn_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
                test_oof += test_pred / kf.n_splits / len(seeds)    
            if clf == 'tabnet':
                print(f"     Tab_model  Fold {i+1}  Training Starting       ")
                if torch.cuda.is_available():
                    print("Using GPU")
                    device = "cuda"
                else:
                    print("Using CPU")
                    device = "cpu"
                    
                torch.manual_seed(seed)
                np.random.seed(seed)
                model = tab_model.TabNetClassifier()

                model.fit(trn_x, trn_y,eval_set=[(val_x, val_y)],eval_metric=['accuracy'])
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Tabnet_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                print('************ Test_Result ************')
                test_pred  = model.predict_proba(test_x)
                acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                acc_scores_test.append(acc_score_tset)
                print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                print('Tabnet_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
                test_oof += test_pred / kf.n_splits / len(seeds)  
            if clf == 'Mlp':
                print("|  MLP Fold  {}  Training Start           |".format(str(i + 1)))
                #训练模型
                model = MLPClassifier()
                model.fit(trn_x,trn_y)
                
                #验证集
                print('************ Val_Result ************')
                val_pred  = model.predict_proba(val_x)
                acc_score_val = accuracy_score(val_y, np.argmax(val_pred, axis=1))
                acc_scores_val.append(acc_score_val)
                print('AVG_acc :',sum(acc_scores_val)/len(acc_scores_val))
                print('Mlp_result :',classification_report(val_y, np.argmax(val_pred, axis=1)))
                #测试集
                print('************ Test_Result ************')
                test_pred  = model.predict_proba(test_x)
                acc_score_tset = accuracy_score(test_y, np.argmax(test_pred, axis=1))
                acc_scores_test.append(acc_score_tset)
                print('AVG_acc :',sum(acc_scores_test)/len(acc_scores_test))
                print('Mlp_result :',classification_report(test_y, np.argmax(test_pred, axis=1))) 
                #保存结果
                train_oof[valid_index] = val_pred / kf.n_splits / len(seeds)
                test_oof += test_pred / kf.n_splits / len(seeds)  
        if (len(test_x)!=0) and (len(test_x)!=0):
            return train_oof,test_oof,model,scaler
        else:
            return train_oof,model,scaler

# # 训练 XGB模型
# xgb_train_oof_1,xgb_test_oof_1, xgb_model_1,scaler_1 = ml_model('xgb',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# # 训练 LGB模型
# lgb_train_oof_1,lgb_test_oof_1,lgb_model_1,scaler_1 = ml_model('lgb',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# # 训练 CAT模型
# cat_train_oof_1,cat_test_oof_1,cat_model_1,scaler_1 = ml_model('cat',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# # 训练 SVM模型
# svm_train_oof_1,svm_test_oof_1,svm_model_1,scaler_1 = ml_model('svm',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# # 训练 Tabnet模型
# tab_train_oof_1,tab_test_oof_1,tab_model_1,scaler_1 = ml_model('tabnet',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 DT模型
dt_train_oof_1,dt_test_oof_1, dt_model_1,scaler_1 = ml_model('dt',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 Ada模型
Ada_train_oof_1,Ada_test_oof_1,Ada_model_1,scaler_1 = ml_model('Ada',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 RF模型
rf_train_oof_1,rf_test_oof_1,rf_model_1,scaler_1 = ml_model('rf',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 Gnb模型
Gnb_train_oof_1,Gnb_test_oof_1,Gnb_model_1,scaler_1 = ml_model('Gnb',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 Knn模型
knn_train_oof_1,knn_test_oof_1,knn_model_1,scaler_1 = ml_model('knn',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

# 训练 Mlp模型
Mlp_train_oof_1,Mlp_test_oof_1,Mlp_model_1,scaler_1 = ml_model('Mlp',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

Seed: 888
|  dt  Fold  1  Training Start           |
************ Val_Result ************
AVG_acc : 0.7224797745659486
DT_result :               precision    recall  f1-score   support

           0       0.84      0.84      0.84      5910
           1       0.61      0.61      0.61      3685
           2       0.53      0.53      0.53      1406

    accuracy                           0.72     11001
   macro avg       0.66      0.66      0.66     11001
weighted avg       0.72      0.72      0.72     11001

************ Test_Result ************
AVG_acc : 0.7233655734128427
DT_result :               precision    recall  f1-score   support

           0       0.85      0.83      0.84      7458
           1       0.60      0.62      0.61      4501
           2       0.54      0.52      0.53      1792

    accuracy                           0.72     13751
   macro avg       0.66      0.66      0.66     13751
weighted avg       0.72      0.72      0.72     13751

|  dt  Fold  2  Training Sta

In [18]:
#训练集困难样本处理
df_pre = pd.DataFrame()
df_pre['xgb_pre'] = np.argmax(xgb_train_oof_1,axis=1)
df_pre['lgb_pre'] = np.argmax(lgb_train_oof_1,axis=1)
df_pre['cat_pre'] = np.argmax(cat_train_oof_1,axis=1)
df_pre['label'] = train_df['label']

grade_list = []
for row in df_pre.itertuples():
    grade = 0
    if getattr(row,'xgb_pre') == getattr(row,'label'):
        grade += 1
    if getattr(row,'lgb_pre') == getattr(row,'label'):
        grade += 1
    if getattr(row,'cat_pre') == getattr(row,'label'):
        grade += 1
    grade_list.append(grade)

#困难样本识别
df_pre['grade'] = grade_list
train_hard_index = df_pre.loc[(df_pre['grade']==0)].index

train_hard_df = train_df.loc[train_hard_index].reset_index(drop=True)
xgb_train_hard_oof, xgb_train_hard_model,scaler_2 = ml_model('xgb',train_hard_df[feature_cols],train_hard_df['label'])
lgb_train_hard_oof, lgb_train_hard_model,scaler_2 = ml_model('lgb',train_hard_df[feature_cols],train_hard_df['label'])
cat_train_hard_oof, cat_train_hard_model,scaler_2 = ml_model('cat',train_hard_df[feature_cols],train_hard_df['label'])

#训练集替换困难样本结果
xgb_train_oof_2 = np.argmax(xgb_train_oof_1,axis=1)
xgb_train_oof_2[train_hard_index]=np.argmax(xgb_train_hard_oof,axis=1)

lgb_train_oof_2 = np.argmax(lgb_train_oof_1,axis=1)
lgb_train_oof_2[train_hard_index]=np.argmax(lgb_train_hard_oof,axis=1)

cat_train_oof_2 = np.argmax(cat_train_oof_1,axis=1)
cat_train_oof_2[train_hard_index]=np.argmax(cat_train_hard_oof,axis=1)

Seed: 888
|  XGB  Fold  1  Training Start           |
[0]	validation_0-mlogloss:0.96997	validation_1-mlogloss:0.98364
[99]	validation_0-mlogloss:0.20272	validation_1-mlogloss:0.62778
************ Val_Result ************
AVG_acc : 0.7344444444444445
XGB_result :               precision    recall  f1-score   support

           0       0.66      0.68      0.67       550
           1       0.84      0.84      0.84       772
           2       0.65      0.62      0.64       478

    accuracy                           0.73      1800
   macro avg       0.72      0.71      0.72      1800
weighted avg       0.73      0.73      0.73      1800

|  XGB  Fold  2  Training Start           |
[0]	validation_0-mlogloss:0.97274	validation_1-mlogloss:0.98713
[99]	validation_0-mlogloss:0.20816	validation_1-mlogloss:0.62838
************ Val_Result ************
AVG_acc : 0.7380555555555556
XGB_result :               precision    recall  f1-score   support

           0       0.70      0.65      0.68       

In [19]:
#测试集困难样本处理
df_pre = pd.DataFrame()
df_pre['xgb_pre'] = np.argmax(xgb_test_oof_1,axis=1)
df_pre['lgb_pre'] = np.argmax(lgb_test_oof_1,axis=1)
df_pre['cat_pre'] = np.argmax(cat_test_oof_1,axis=1).flatten()
df_pre['label'] = test_df['label']

grade_list = []
for row in df_pre.itertuples():
    grade = 0
    if getattr(row,'xgb_pre') == getattr(row,'label'):
        grade += 1
    if getattr(row,'lgb_pre') == getattr(row,'label'):
        grade += 1
    if getattr(row,'cat_pre') == getattr(row,'label'):
        grade += 1
    grade_list.append(grade)

#测试集替换困难样本结果
df_pre['grade'] = grade_list
test_hard_index = df_pre.loc[(df_pre['grade']==0)].index
test_hard_df = test_df.loc[test_hard_index].reset_index(drop=True)

print('Test_hard result :')
xgb_test_hard_oof = xgb_train_hard_model.predict(scaler_2.transform(test_hard_df[feature_cols]))
print('xgb_test_hard result:',accuracy_score(xgb_test_hard_oof,test_hard_df['label']))
lgb_test_hard_oof = lgb_train_hard_model.predict(scaler_2.transform(test_hard_df[feature_cols]))
print('lgb_test_hard result:',accuracy_score(lgb_test_hard_oof,test_hard_df['label']))
cat_test_hard_oof = cat_train_hard_model.predict(scaler_2.transform(test_hard_df[feature_cols]))
print('cat_test_hard result:',accuracy_score(cat_test_hard_oof,test_hard_df['label']))


xgb_test_oof_2 = np.argmax(xgb_test_oof_1,axis=1)
xgb_test_oof_2[test_hard_index]=xgb_test_hard_oof

lgb_test_oof_2 = np.argmax(lgb_test_oof_1,axis=1)
lgb_test_oof_2[test_hard_index]=lgb_test_hard_oof

cat_test_oof_2 = np.argmax(cat_test_oof_1,axis=1)
cat_test_oof_2[test_hard_index]=cat_test_hard_oof.flatten()

Test_hard result :
xgb_test_hard result: 0.7598797250859106
lgb_test_hard result: 0.7800687285223368
cat_test_hard result: 0.7736254295532646


In [20]:

#xgb预测结果作为新特征(替换困难样本)
train_df['xgb_pre'] = xgb_train_oof_2
test_df['xgb_pre'] = xgb_test_oof_2
#lgb预测结果作为新特征(替换困难样本)
train_df['lgb_pre'] = lgb_train_oof_2
test_df['lgb_pre'] = lgb_test_oof_2
#cat预测结果作为新特征(替换困难样本)
train_df['cat_pre'] = cat_train_oof_2
test_df['cat_pre'] = cat_test_oof_2

#训练特征
feature_cols = [cols for cols in train_df if cols not in ['msisdn','label','end_time','stime','times_month']]
# 训练tabnet模型
final_tab_train_oof,final_tab_test_oof,final_tab_model,scaler = ml_model('tabnet',train_df[feature_cols], train_df['label'],test_df[feature_cols], test_df['label'])

Seed: 888
     Tab_model  Fold 1  Training Starting       
Using CPU
epoch 0  | loss: 0.54512 | val_0_accuracy: 0.91737 |  0:00:03s
epoch 1  | loss: 0.28215 | val_0_accuracy: 0.92328 |  0:00:07s
epoch 2  | loss: 0.2617  | val_0_accuracy: 0.92237 |  0:00:10s
epoch 3  | loss: 0.24948 | val_0_accuracy: 0.92383 |  0:00:14s
epoch 4  | loss: 0.24828 | val_0_accuracy: 0.9231  |  0:00:18s
epoch 5  | loss: 0.24792 | val_0_accuracy: 0.92292 |  0:00:21s
epoch 6  | loss: 0.24348 | val_0_accuracy: 0.92264 |  0:00:25s
epoch 7  | loss: 0.24308 | val_0_accuracy: 0.92283 |  0:00:28s
epoch 8  | loss: 0.24088 | val_0_accuracy: 0.92364 |  0:00:32s
epoch 9  | loss: 0.24043 | val_0_accuracy: 0.92373 |  0:00:35s
epoch 10 | loss: 0.23849 | val_0_accuracy: 0.92455 |  0:00:39s
epoch 11 | loss: 0.23753 | val_0_accuracy: 0.92446 |  0:00:43s
epoch 12 | loss: 0.23354 | val_0_accuracy: 0.92473 |  0:00:46s
epoch 13 | loss: 0.23128 | val_0_accuracy: 0.92383 |  0:00:50s
epoch 14 | loss: 0.23101 | val_0_accuracy: 0.9242

In [None]:
import joblib
#模型保存
def save_model(mdoel,save_path):
    # save_path = save_path + '/' +  mdoel_name +'.pkl'
    joblib.dump(mdoel, save_path)
#模型保存路径
xgb_model_path = './model/xgb.pkl'
lgb_model_path = './model/lgb.pkl'
cat_model_path = './model/cat.pkl'
svm_model_path = './model/svm.pkl'
tabnet_model_path = './model/tabnet.pkl'
final_tab_model_path = './model/final_tab.pkl'
#保存
save_model(xgb_model,xgb_model_path)
save_model(lgb_model, lgb_model_path)
save_model(cat_model,  cat_model_path)
save_model(svm_model,svm_model_path)
save_model(tab_model, tabnet_model_path)
save_model(final_tab_model,  final_tab_model_path)