01_import dataset
===

In [None]:
import os
import json
import pandas as pd

# Root folder that holds the raw JSON/CSV inputs.
# The original machine-specific location is kept as the default; set the
# FRAUD_DATA_DIR environment variable to point at another folder without
# editing the notebook.
base_dir = os.environ.get(
    "FRAUD_DATA_DIR",
    "/Users/Andypon/10_交大研究所/1141_01_機器學習與金融科技/data",
)

def load_json_to_df(filename: str) -> pd.DataFrame:
    """Load a JSON file from ``base_dir`` and return it as a DataFrame.

    A top-level JSON object (dict) becomes one row per key, with the key in
    column ``code`` and the value (stored as-is) in column ``desc``. Any
    other top-level value is passed to ``pd.DataFrame`` unchanged.

    Args:
        filename: File name relative to ``base_dir``.

    Returns:
        The parsed JSON content as a DataFrame.
    """
    json_path = os.path.join(base_dir, filename)
    with open(json_path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    # Non-dict payloads (e.g. a list of records) go straight to pandas.
    if not isinstance(payload, dict):
        return pd.DataFrame(payload)

    # Dict payloads are flattened into (code, desc) rows, one per key.
    rows = []
    for key, value in payload.items():
        rows.append({"code": key, "desc": value})
    return pd.DataFrame(rows)

def load_csv_to_df(filename: str) -> pd.DataFrame:
    """Read a CSV file located under ``base_dir`` into a DataFrame.

    Args:
        filename: File name relative to ``base_dir``.

    Returns:
        The file content parsed with ``pd.read_csv`` default settings.
    """
    csv_path = os.path.join(base_dir, filename)
    frame = pd.read_csv(csv_path)
    return frame

# JSON sources
mcc_codes_df = load_json_to_df("mcc_codes.json")
# Fixed: the helper is named load_json_to_df; the previous call to
# load_json_df raised NameError because no such function is defined.
train_fraud_labels_df = load_json_to_df("train_fraud_labels.json")

# CSV sources
cards_df = load_csv_to_df("cards_data.csv")
transactions_df = load_csv_to_df("transactions_data.csv")
users_df = load_csv_to_df("users_data.csv")

# Quick sanity checks (uncomment to inspect the first rows of each table)
#print(mcc_codes_df.head())
#print(train_fraud_labels_df.head())
#print(cards_df.head())
#print(transactions_df.head())
#print(users_df.head())


02_Rename columns in each dataset
===

In [3]:
# Give the generic columns table-specific names: `id` becomes
# transaction_id / client_id / card_id depending on the table, and `mcc`
# becomes mcc_code (presumably so keys line up for later merges — confirm).
transactions_df = transactions_df.rename(
    columns={'mcc': 'mcc_code', 'id': 'transaction_id'}
)
users_df = users_df.rename(columns={'id': 'client_id'})
cards_df = cards_df.rename(columns={'id': 'card_id'})

03_Missing value的處理
===

In [18]:
# Rows whose merchant_city is "online" (case-insensitive) get explicit
# placeholders: merchant_state -> 'online', zip -> -1.
is_online = transactions_df['merchant_city'].str.lower() == 'online'
transactions_df.loc[is_online, 'merchant_state'] = 'online'
transactions_df.loc[is_online, 'zip'] = -1

# Author's note: the rule above does not cover every gap — 89006 rows were
# still missing a zip afterwards. Those remaining rows get the placeholder -2.
transactions_df['zip'] = transactions_df['zip'].fillna(-2)

In [None]:
## Exploratory cell — per the author's note it does not need to be run.
# NOTE(review): the preceding cell fills every missing zip with -2, so when
# the notebook runs top to bottom the isna() filter below selects no rows.
# Confirm the intended execution order before relying on this cell.

## Inspect the transactions whose zip is still missing (89006 rows per the author's note).
# .copy() makes this an independent frame, so adding the `fullname` column
# below does not trigger SettingWithCopyWarning.
c_missing_zip = transactions_df[transactions_df["zip"].isna()].copy()
c_mexico_zip = transactions_df[transactions_df["merchant_state"]=="Mexico"]
#c_mcc_mv_zip = c_missing_zip[
#    (c_missing_zip["mcc_code"] > 5400) & (c_missing_zip["mcc_code"] < 5700)
#]



# Build the mapping table: one state+city pair may correspond to several zips.
mapping_df = (
    transactions_df
    .dropna(subset=["zip"])                                   # keep only rows that have a zip
    .drop_duplicates(subset=["merchant_state", "merchant_city", "zip"])
    [["merchant_state", "merchant_city", "zip"]]              # keep only the needed columns
    .copy()                                                   # independent frame; `mfullname` is added below
)

print(mapping_df.head())


# Build a combined lookup key by concatenating merchant_city and merchant_state.
c_missing_zip["fullname"] = c_missing_zip["merchant_city"].astype(str) + c_missing_zip["merchant_state"].astype(str)
# Keep only the transaction id, the lookup key and the zip (explicit copy
# because `zip` is overwritten below).
df_small = c_missing_zip[["transaction_id", "fullname","zip"]].copy()

mapping_df["mfullname"] = mapping_df["merchant_city"].astype(str) + mapping_df["merchant_state"].astype(str)

# Build the lookup dictionary. When one key has several zips, dict() keeps
# the last one encountered.
lookup_dict = dict(zip(mapping_df["mfullname"], mapping_df["zip"]))

# Use map as a VLOOKUP: fill missing zips from the city+state lookup.
df_small["zip"] = df_small["zip"].fillna(df_small["fullname"].map(lookup_dict))




  merchant_state merchant_city      zip
0             ND        Beulah  58523.0
1             IA    Bettendorf  52722.0
2             CA         Vista  92084.0
3             IN   Crown Point  46307.0
4             MD       Harwood  20776.0


In [19]:
# Treat `errors` as categorical and label missing values as 'No_error'.
errors_cat = transactions_df['errors'].astype('category')
# add_categories raises ValueError if the category already exists, which made
# re-running this cell fail; only add it when it is not there yet.
if 'No_error' not in errors_cat.cat.categories:
    errors_cat = errors_cat.cat.add_categories('No_error')
transactions_df['errors'] = errors_cat.fillna('No_error')

04_變數型態統一
===

In [None]:
# Strip "$" and "," from the amount values, then cast to float and on to int.
# NOTE(review): the final int cast discards the fractional part of each
# amount — confirm that dropping cents is intended.
amount_text = transactions_df['amount'].replace(r'[\$,]', '', regex=True)
transactions_df['amount'] = amount_text.astype(float).astype(int)
#transactions_df['mcc_code'] = transactions_df['mcc_code'].astype('int64')

# Encode has_chip as 1 for 'YES' and 0 for 'NO'; any other value maps to NaN.
chip_flags = {'YES': 1, 'NO': 0}
cards_df['has_chip'] = cards_df['has_chip'].map(chip_flags)

# Author's notes on the raw category shares:
#   card_type : Debit 57%, Credit 33%, Debit (Prepaid) 9%
#   card_brand: MasterCard 52%, Visa 38%, Amex 7%, Discovery 3%
cols_to_encode = ['card_type', 'card_brand']
cards_df[cols_to_encode] = cards_df[cols_to_encode].astype('category')

# One-hot encode both card columns (uint8 indicators) and append them to
# cards_df; the original columns are kept.
dummies_cards = pd.get_dummies(
    cards_df[cols_to_encode],
    prefix=cols_to_encode,
    dtype='uint8',
)
cards_df = pd.concat([cards_df, dummies_cards], axis=1)

# Author's note on use_chip shares: Swiped 52%, Chip 36%, Online 12%.
# One-hot encode use_chip and append the indicators to transactions_df.
dummies_chip = pd.get_dummies(
    transactions_df['use_chip'],
    prefix='use_chip',
    dtype='uint8',
)
transactions_df = pd.concat([transactions_df, dummies_chip], axis=1)



05_data資料整合
===

In [None]:
# cards one-hot encoding