In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# 加載數據
data = pd.read_csv('/datasets/dataset_1st/training.csv')

# 處理類別型特徵：轉換為數值型
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns

for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# 處理 NaN 值：填充或刪除
imputer = SimpleImputer(strategy='median')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# 分割特徵和標籤
X = data.drop('label', axis=1)
y = data['label']

# 應用 SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 分割數據集
X_train_resampled, X_test_resampled, y_train_resampled, y_test_resampled = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42)

# 訓練 XGBoost 模型
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model.fit(X_train_resampled, y_train_resampled)

# 評估模型
y_pred = xgb_model.predict(X_test_resampled)

# 計算並顯示分類報告和混淆矩陣
class_report = classification_report(y_test_resampled, y_pred)
conf_matrix = confusion_matrix(y_test_resampled, y_pred)
print("Classification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

# 計算並顯示其他模型指標
accuracy = accuracy_score(y_test_resampled, y_pred)
precision = precision_score(y_test_resampled, y_pred, average='macro')
recall = recall_score(y_test_resampled, y_pred, average='macro')
f1 = f1_score(y_test_resampled, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Classification Report:
               precision    recall  f1-score   support

         0.0       0.99      0.99      0.99   1731445
         1.0       0.99      0.99      0.99   1731154

    accuracy                           0.99   3462599
   macro avg       0.99      0.99      0.99   3462599
weighted avg       0.99      0.99      0.99   3462599

Confusion Matrix:
 [[1720840   10605]
 [  10214 1720940]]
Accuracy: 0.9939874643295398
Precision: 0.9939874802209477
Recall: 0.9939874737770551
F1 Score: 0.993987464324525


public test

In [57]:
new_data = pd.read_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/public_processed.csv')
# Add a new column "label" with all values set to 0
new_data['label'] = 0

# Display the DataFrame
new_data.head()

Unnamed: 0,txkey,locdt,loctm,chid,cano,contp,etymd,mchno,acqic,mcc,...,stocn,scity,stscd,ovrlt,flbmk,hcefg,csmcu,csmam,flg_3dsmk,label
0,a2c1209018e4e52e04f6fabb48f05f1b8bc09dc838ff6c...,56,105157,d6d145d0d058e3f63b9ec421cf9e1543b0c3e059d18c2c...,9c454e0cf3680075b0382394ca59b00ee9435fd829da29...,4,1.0,8be206f97d10b57a46ea0ef5527155a0f9b48d16255b70...,2a608b081c09492bd2bc96d7def5371c4bc9cabf324a98...,272.0,...,0.0,15759.0,,0,0,6.0,70.0,1349,0,0
1,16c4880500059e01553789be11bbb50753b7acaae7b95b...,56,185520,b599fb83473599fa2df4a04136a668c28df7914293f1da...,9f097cba6a70333c906904ebaad438356cf75b6abf5dd4...,5,8.0,dd280b4f35a02701142ba7de47abb9736deca3eed86474...,46f63555d29f177290dfe20e989146fb033e453f1d5ff3...,320.0,...,0.0,13451.0,,0,0,6.0,70.0,306,0,0
2,623c56be3bee87724e3d119c271d9ed098eeda84233183...,59,102312,8de197129150d454df1639e9743c1aa11b1c1122a2010c...,439691cb167b272661c2eb9073124f5d422f446b0f92fb...,5,1.0,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,9b2d5b4678781e53038e91ea5324530a03f27dc1d0e5f6...,494.0,...,0.0,15759.0,,0,0,,,0,0,0
3,250da12187059cf6e3a3066656a2919d08ceb8207efd55...,58,230138,f9cbad08a76f1dd9736a4fc51255eb55637ec0a417a406...,992679b728784192c17e90939918cbfae32ad1222fad26...,5,5.0,bd2b3523b4ecd652445b75ea7583c92f6f02af3971251b...,70dac5b1dfde828f556a93321a649df72891db35430588...,272.0,...,42.0,12381.0,,0,0,6.0,70.0,8339,0,0
4,4b268e0da036f44cbbb056ddfac6a28ea336d9cf299843...,59,101937,27d235e691a425098f291105f78f9877e05bb75e1f132b...,763fd2a91420f9b632c378ca82ff9e318851fa1964cbba...,5,5.0,156987f72b8993b8c5a898fa76f3b8e6454e10e18aa238...,89b1654aaef0b65b1868004cfdce2dcbb67f15b7fba535...,217.0,...,126.0,14786.0,,0,0,6.0,68.0,5,0,0


In [58]:
label_encoders = {}
categorical_columns = new_data.select_dtypes(include=['object']).columns

categorical_columns

Index(['txkey', 'chid', 'cano', 'mchno', 'acqic'], dtype='object')

In [59]:
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    new_data[col] = label_encoders[col].fit_transform(new_data[col])

In [60]:
tran_data = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)
tran_data = tran_data.drop(columns=['label'])
tran_data

Unnamed: 0,txkey,locdt,loctm,chid,cano,contp,etymd,mchno,acqic,mcc,...,flam1,stocn,scity,stscd,ovrlt,flbmk,hcefg,csmcu,csmam,flg_3dsmk
0,381905.0,56.0,105157.0,183578.0,146248.0,4.0,1.0,28463.0,460.0,272.0,...,1349.0,0.0,15759.0,1.0,0.0,0.0,6.0,70.0,1349.0,0.0
1,53137.0,56.0,185520.0,155127.0,148756.0,5.0,8.0,45191.0,759.0,320.0,...,306.0,0.0,13451.0,1.0,0.0,0.0,6.0,70.0,306.0,0.0
2,230851.0,59.0,102312.0,121197.0,63298.0,5.0,1.0,31582.0,1618.0,494.0,...,20.0,0.0,15759.0,1.0,0.0,0.0,6.0,70.0,0.0,0.0
3,86653.0,58.0,230138.0,213589.0,143290.0,5.0,5.0,38566.0,1173.0,272.0,...,8339.0,42.0,12381.0,1.0,0.0,0.0,6.0,70.0,8339.0,0.0
4,176648.0,59.0,101937.0,33941.0,110623.0,5.0,5.0,4440.0,1438.0,217.0,...,163.0,126.0,14786.0,1.0,0.0,0.0,6.0,68.0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
600177,272384.0,56.0,93445.0,32768.0,31877.0,4.0,1.0,43612.0,217.0,231.0,...,96.0,0.0,16117.0,1.0,0.0,0.0,6.0,70.0,96.0,0.0
600178,279125.0,56.0,145420.0,189467.0,13107.0,5.0,5.0,43297.0,601.0,375.0,...,2017.0,0.0,15759.0,1.0,0.0,0.0,6.0,70.0,2017.0,0.0
600179,457650.0,58.0,3400.0,67871.0,66867.0,5.0,1.0,14124.0,1369.0,406.0,...,33.0,126.0,10300.0,1.0,0.0,0.0,6.0,70.0,33.0,0.0
600180,502156.0,59.0,134901.0,172514.0,73587.0,5.0,4.0,479.0,1504.0,288.0,...,1796.0,0.0,15759.0,1.0,0.0,0.0,6.0,70.0,1796.0,0.0


In [61]:
new_predictions = xgb_model.predict(tran_data)
new_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [62]:


# Get unique values from the 'new_predictions' array
unique_values = np.unique(new_predictions)

# Now, 'unique_values' contains the unique values in the 'new_predictions' array
print(unique_values)


[0 1]


In [63]:
value_counts = np.bincount(new_predictions)

# Now, 'value_counts' contains the counts of each unique value
print(value_counts)

[598618   1564]


In [64]:
import pandas as pd

new_data = pd.read_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/public_processed.csv')

# Create a DataFrame from "txkey" and "new_predictions"
result_df = pd.DataFrame({'txkey': new_data['txkey'], 'pred': new_predictions})

# Convert "txkey" to string (if it's not already)
result_df['txkey'] = result_df['txkey'].astype(str)

# Export the DataFrame to a CSV file
result_df.to_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/predictions.csv', index=False)


保存

In [2]:
import joblib

# 保存模型
joblib.dump(xgb_model, 'xgb_model.pkl')

# 保存 LabelEncoders 和 Imputer
for col, le in label_encoders.items():
    joblib.dump(le, f'label_encoder_{col}.pkl')
joblib.dump(imputer, 'imputer.pkl')


['imputer.pkl']

In [3]:
# 加載模型
xgb_model = joblib.load('xgb_model.pkl')

# 加載 LabelEncoders 和 Imputer
label_encoders_loaded = {}
for col in categorical_columns:
    label_encoders_loaded[col] = joblib.load(f'label_encoder_{col}.pkl')
imputer_loaded = joblib.load('imputer.pkl')


In [9]:
import pandas as pd
import joblib

def preprocess_and_predict(new_data, model_path, imputer_path, label_encoders_paths):
    # Load the trained model and preprocessing components
    model = joblib.load(model_path)
    imputer = joblib.load(imputer_path)
    label_encoders = {col: joblib.load(le_path) for col, le_path in label_encoders_paths.items()}
    
    # Apply label encodings
    for col, le in label_encoders.items():
        if col in new_data:
            new_data[col] = le.transform(new_data[col].astype(str))
    
    # Apply imputation
    new_data_preprocessed = pd.DataFrame(imputer.transform(new_data), columns=new_data.columns)
    
    # Predict using the model
    predictions = model.predict(new_data_preprocessed)
    
    return predictions

# Paths to the saved model and preprocessing components
model_path = '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/xgb_model.pkl'  # Update with the actual path
imputer_path = '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/imputer.pkl'  # Update with the actual path
label_encoders_paths = {
    'txkey': '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_txkey.pkl',  # Update with the actual paths
    'chid': '/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_chid.pkl',
    'mchno':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_mchno.pkl',
    'cano':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_cano.pkl',
    'acqic':'/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/model/label_encoder_acqic.pkl'
}

# Load new data
new_data = pd.read_csv('/Users/linyinghsiao/Documents/GitHub/fraud-detection-E.SUN/datasets/dataset_1st/public_processed.csv')  # Update with the actual path

# Predict on the new data
predictions = preprocess_and_predict(new_data, model_path, imputer_path, label_encoders_paths)
print(predictions)


ValueError: y contains previously unseen labels: 'a2c1209018e4e52e04f6fabb48f05f1b8bc09dc838ff6cb19906377fab414587'