In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
import lightgbm as lgb

In [2]:
import os
os.chdir("..")

In [3]:
current_directory = os.getcwd()
print("Current Directory:", current_directory)

Current Directory: c:\Users\cool8\Documents\GitHub\fraud-detection-E.SUN


In [4]:
# 加載數據
data = pd.read_csv('datasets/new_train.csv')

In [5]:

# 處理類別型特徵：轉換為數值型
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])
# # 處理 NaN 值：填充或刪除
# imputer = SimpleImputer(strategy='median')
# data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)


In [None]:
drop_column = ['chid','cano','mchno','acqic','new_scity']
data.drop('txkey',axis=1,inplace=True)
data.drop(drop_column,axis=1,inplace=True)

In [11]:
data = data.sort_values(['locdt','hrs_loctm'])

In [30]:
tscv = TimeSeriesSplit(n_splits=8)
X = data.drop('label', axis=1)
y = data['label']
for x , y in tscv.split(X):
    print('Train locdt:','0 ~',int(data.iloc[x[-1],:].locdt),', Test locdt:',int(data.iloc[x[-1],:].locdt)+1,'~',int(data.iloc[y[-1],:].locdt))


Train locdt: 0 ~ 5 , Test locdt: 6 ~ 12
Train locdt: 0 ~ 12 , Test locdt: 13 ~ 18
Train locdt: 0 ~ 18 , Test locdt: 19 ~ 25
Train locdt: 0 ~ 25 , Test locdt: 26 ~ 31
Train locdt: 0 ~ 31 , Test locdt: 32 ~ 37
Train locdt: 0 ~ 37 , Test locdt: 38 ~ 43
Train locdt: 0 ~ 43 , Test locdt: 44 ~ 49
Train locdt: 0 ~ 49 , Test locdt: 50 ~ 55


### XGB model

In [None]:
# tscv = TimeSeriesSplit(n_splits=8)
# all_f1_scores = []

# # 分割特徵和標籤
# X = data.drop('label', axis=1)
# y = data['label']


# # 分割數據集
# for train_index, test_index in tscv.split(X):
#     X_train_resampled, X_test_resampled = X.iloc[train_index], X.iloc[test_index]
#     y_train_resampled, y_test_resampled = y.iloc[train_index], y.iloc[test_index]

#     # 過採樣
#     # smote = SMOTE(random_state=42)
#     # X_resampled, y_resampled = smote.fit_resample(X, y)

#     # 欠採樣
#     undersampler = RandomUnderSampler(sampling_strategy=0.2,random_state=42) # Ratio = #label=1/#label=0
#     X_resampled, y_resampled = undersampler.fit_resample(X_train_resampled, y_train_resampled)

#     # XGBoost模型
#     # 调整正负样本的权重
#     scale_pos_weight = 1 / np.mean(y_resampled)
#     xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42, scale_pos_weight=scale_pos_weight)
#     xgb_model.fit(X_resampled, y_resampled)
#     y_pred = xgb_model.predict(X_test_resampled)

#     # 计算F1-score
#     f1 = f1_score(y_test_resampled, y_pred, average='binary')  # binary表示只计算正类别的F1-score
#     print("F1-score:", f1)

#     # 记录F1-score
#     all_f1_scores.append(f1)

# # 计算平均F1-score
# average_f1_score = np.mean(all_f1_scores)
# print("Average F1-score:", average_f1_score)

In [41]:
tscv = TimeSeriesSplit(n_splits=6)
X = data.drop('label', axis=1)
y = data['label']
for x , y in tscv.split(X):
    print('Train locdt:','0 ~',int(data.iloc[x[-1],:].locdt),', Test locdt:',int(data.iloc[x[-1],:].locdt)+1,'~',int(data.iloc[y[-1],:].locdt))


Train locdt: 0 ~ 7 , Test locdt: 8 ~ 15
Train locdt: 0 ~ 15 , Test locdt: 16 ~ 24
Train locdt: 0 ~ 24 , Test locdt: 25 ~ 32
Train locdt: 0 ~ 32 , Test locdt: 33 ~ 39
Train locdt: 0 ~ 39 , Test locdt: 40 ~ 47
Train locdt: 0 ~ 47 , Test locdt: 48 ~ 55


In [49]:
data_abroad = data[data.new_stocn!=0]

In [50]:
data_abroad

Unnamed: 0,locdt,chid,cano,contp,etymd,mchno,acqic,mcc,conam,ecfg,...,chid_cumcount7,conam_log1p,flam1_log1p,csmam_log1p,flam1avg7_log1p_cano,flam1avg7_log1p_mcc,cano_ratio,flam1conam_diff_log1p,flam1_diff_avg7log1p_cano,flam1_diff_avg7log1p_mcc
1622716,0,238393,543598,5,5.0,17564,1826,398.0,898.00,1,...,1,6.801283,6.801283,6.801283,4.786296,5.655011,1.00,0.000000,2.014987,1.146272
4171899,0,67381,263727,5,5.0,17564,1826,398.0,390.00,1,...,1,5.968708,5.971262,5.971262,4.869189,5.604414,1.00,-0.002554,1.102073,0.366848
1108300,0,448883,453412,5,5.0,17564,1826,398.0,200.00,1,...,1,5.303305,5.298317,5.298317,5.505332,6.503824,1.00,0.004988,-0.207014,-1.205506
8588609,0,186579,425923,4,5.0,136969,7614,471.0,999.00,1,...,3,6.907755,6.912743,6.912743,3.688879,6.277165,1.00,-0.004988,3.223863,0.635577
5312850,0,131785,142548,5,5.0,17564,1826,398.0,1483.00,1,...,1,7.302496,7.303170,7.303170,4.510860,5.532831,1.00,-0.000674,2.792311,1.770339
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5746040,55,447783,468364,5,8.0,36927,5035,369.0,1928.15,1,...,4,7.564835,7.564238,7.564238,4.189655,5.633887,0.75,0.000596,3.374584,1.930352
8130889,55,308625,18767,4,5.0,151442,7671,318.0,300.00,1,...,1,5.707110,5.707110,5.707110,4.174737,6.950962,1.00,0.000000,1.532373,-1.243852
2069562,55,218029,568049,5,8.0,76730,1826,379.0,0.00,1,...,2,0.000000,0.000000,0.000000,5.273000,5.655645,1.00,0.000000,-5.273000,-5.655645
6228950,55,218029,568049,4,1.0,35555,4410,353.0,0.00,0,...,3,0.000000,0.000000,0.000000,5.002842,5.396752,1.00,0.000000,-5.002842,-5.396752


In [47]:
# 分割特徵和標籤
X = data_abroad.drop('label', axis=1)
y = data_abroad['label']

# 定义LightGBM模型
lgb_model = lgb.LGBMClassifier(objective='binary', random_state=42)

# 定义参数网格
param_grid = {
    'classifier__num_leaves': [50, 100],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__max_depth': [7, 10],
    'classifier__min_child_samples': [50, 100],
    'classifier__subsample': [0.9, 1.0],
    'classifier__colsample_bytree': [0.8],
    'classifier__scale_pos_weight': [1, 10],  # (1 - 0.0037) / 0.0037 ≈ 268.7027
    'classifier__reg_alpha': [0, 10],
    'classifier__reg_lambda': [0, 10],
}

# 创建RandomUnderSampler对象
undersampler = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

# 创建Pipeline，将欠采样与模型一起包装
pipeline = Pipeline(steps=[('undersampler', undersampler), ('classifier', lgb_model)])

# 创建GridSearchCV对象
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=tscv, verbose=1, n_jobs=-1)
# 执行Grid Search
grid_search.fit(X, y)

# 输出最佳参数
print("Best parameters found: ", grid_search.best_params_)
print("Best F1-score found: {:.4f}".format(grid_search.best_score_))


Fitting 6 folds for each of 256 candidates, totalling 1536 fits


MemoryError: Unable to allocate 107. MiB for an array with shape (19, 737452) and data type float64

### Visualization

In [None]:
import matplotlib.pyplot as plt

# 假设 xgb_model 是你训练好的 XGBoost 模型
# xgb_model = ...

# 获取特征的重要性得分
feature_importance = xgb_model.feature_importances_

# 获取特征名字
feature_names = X.columns

# 将特征名字和重要性得分对应起来
feature_importance_dict = dict(zip(feature_names, feature_importance))

# 排序特征重要性，从高到低
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

# 打印特征重要性
for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance}")

# 可视化特征重要性
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_feature_importance)), [importance for feature, importance in sorted_feature_importance], align='center')
plt.yticks(range(len(sorted_feature_importance)), [feature for feature, importance in sorted_feature_importance])
plt.xlabel('Feature Importance')
plt.title('XGBoost Feature Importance')
plt.show()

public test

In [None]:
# 加載數據
public = pd.read_csv('datasets/new_public.csv')
txkey_public = public['txkey']

In [None]:
# 處理類別型特徵：轉換為數值型
label_encoders = {}
categorical_columns = public.select_dtypes(include=['object']).columns
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    public[col] = label_encoders[col].fit_transform(public[col])

In [None]:
drop_column = ['chid','cano','mchno','acqic','new_scity']
public.drop(drop_column,axis=1,inplace=True)

In [61]:
new_predictions = xgb_model.predict(public)
new_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Create a DataFrame from "txkey" and "new_predictions"
result_df = pd.DataFrame({'txkey': txkey_public, 'pred': new_predictions})

# Convert "txkey" to string (if it's not already)
result_df['txkey'] = result_df['txkey'].astype(str)

# Export the DataFrame to a CSV file
result_df.to_csv('datasets/public_prediction.csv', index=False)


In [62]:
# Get unique values from the 'new_predictions' array
unique_values = np.unique(new_predictions)

# Now, 'unique_values' contains the unique values in the 'new_predictions' array
print(unique_values)


[0 1]


In [63]:
value_counts = np.bincount(new_predictions)
# Now, 'value_counts' contains the counts of each unique value
print(value_counts)

[598618   1564]


保存

In [2]:
import joblib

# 保存模型
joblib.dump(xgb_model, 'xgb_model.pkl')

# 保存 LabelEncoders 和 Imputer
for col, le in label_encoders.items():
    joblib.dump(le, f'label_encoder_{col}.pkl')
joblib.dump(imputer, 'imputer.pkl')


['imputer.pkl']

In [3]:
# 加載模型
xgb_model = joblib.load('xgb_model.pkl')

# 加載 LabelEncoders 和 Imputer
label_encoders_loaded = {}
for col in categorical_columns:
    label_encoders_loaded[col] = joblib.load(f'label_encoder_{col}.pkl')
imputer_loaded = joblib.load('imputer.pkl')


In [9]:
import pandas as pd
import joblib

def preprocess_and_predict(new_data, model_path, imputer_path, label_encoders_paths):
    # Load the trained model and preprocessing components
    model = joblib.load(model_path)
    imputer = joblib.load(imputer_path)
    label_encoders = {col: joblib.load(le_path) for col, le_path in label_encoders_paths.items()}
    
    # Apply label encodings
    for col, le in label_encoders.items():
        if col in new_data:
            new_data[col] = le.transform(new_data[col].astype(str))
    
    # Apply imputation
    new_data_preprocessed = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)
    
    # Predict using the model
    predictions = model.predict(new_data_preprocessed)
    
    return predictions

# Paths to the saved model and preprocessing components
model_path = '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/xgb_model.pkl'  # Update with the actual path
imputer_path = '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/imputer.pkl'  # Update with the actual path
label_encoders_paths = {
    'txkey': '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_txkey.pkl',  # Update with the actual paths
    'chid': '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_chid.pkl',
    'mchno':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_mchno.pkl',
    'cano':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_cano.pkl',
    'acqic':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_acqic.pkl'
}

# Load new data
new_data = pd.read_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/public_processed.csv')  # Update with the actual path

# Predict on the new data
predictions = preprocess_and_predict(new_data, model_path, imputer_path, label_encoders_paths)
print(predictions)


ValueError: y contains previously unseen labels: 'a2c1209018e4e52e04f6fabb48f05f1b8bc09dc838ff6cb19906377fab414587'