<a href="https://colab.research.google.com/github/abidrozhan/MidTerm-Machine-Learning/blob/main/AbidRozhan_Midterm_Transaction_Data_MLDL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Cell 1: Instalasi dependensi dan pengunduhan dataset
# Jika library sudah terinstal di lingkungan Anda, bagian instalasi bisa dilewati
!pip install -q gdown polars scikit-learn imbalanced-learn xgboost

In [3]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Cell 2: Memuat dataset ke dalam data frame
import pandas as pd

# Path dasar tempat dataset tersimpan
BASE_PATH = "/content/sample_data"

# Memuat file CSV menggunakan polars (lebih cepat untuk dataset besar)
train_df = pd.read_csv(f"{BASE_PATH}/train_transaction.csv")
test_df  = pd.read_csv(f"{BASE_PATH}/test_transaction.csv")

# Menampilkan dimensi dataset (jumlah baris, jumlah kolom)
print("Train shape:", train_df.shape)  # 590540 baris, 394 kolom (393 fitur + 1 target)
print("Test shape :", test_df.shape)   # 506691 baris, 393 fitur


Train shape: (590540, 394)
Test shape : (506691, 393)


In [5]:
train_df.head() #isFraud adalah target featurenya

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
test_df.head() #uji data baru, setelah pembuatan model machine learningnya

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3663549,18403224,31.95,W,10409,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
1,3663550,18403263,49.0,W,4272,111.0,150.0,visa,226.0,debit,...,,,,,,,,,,
2,3663551,18403310,171.0,W,4476,574.0,150.0,visa,226.0,debit,...,,,,,,,,,,
3,3663552,18403310,284.95,W,10989,360.0,150.0,visa,166.0,debit,...,,,,,,,,,,
4,3663553,18403317,67.95,W,18018,452.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,


In [7]:

# Cell 4: EDA singkat

# Distribusi target isFraud
print("Distribusi target isFraud:")
print(train_df['isFraud'].value_counts(normalize=True))

# Hitung persentase missing value per kolom (tampilkan 10 teratas)
missing_pct = train_df.isnull().mean().sort_values(ascending=False) * 100
print("10 kolom dengan missing value tertinggi (%):")
print(missing_pct.head(10))


Distribusi target isFraud:
isFraud
0    0.96501
1    0.03499
Name: proportion, dtype: float64
10 kolom dengan missing value tertinggi (%):
dist2    93.628374
D7       93.409930
D13      89.509263
D14      89.469469
D12      89.041047
D6       87.606767
D9       87.312290
D8       87.312290
V153     86.123717
V149     86.123717
dtype: float64


In [8]:

# Cell 5: Optimasi memori dan preprocessing
from sklearn.preprocessing import LabelEncoder

# Pisahkan target dari fitur
y = train_df['isFraud']
X = train_df.drop(columns=['isFraud'])

# Identifikasi kolom numerik dan kategorikal
numeric_cols = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Konversi tipe data numerik ke format lebih kecil
X[numeric_cols] = X[numeric_cols].astype('float32')
test_df[numeric_cols] = test_df[numeric_cols].astype('float32')

# Label encoding untuk kolom kategorikal
# LightGBM (sklearn API) mengharuskan data numerik; LabelEncoder cocok untuk fitur kategorikal
for col in categorical_cols:
    le = LabelEncoder()
    # Gabungkan train dan test agar label konsisten
    combined = pd.concat([X[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

print("Preprocessing selesai. Ukuran X:", X.shape)


Preprocessing selesai. Ukuran X: (590540, 393)


In [9]:

# Cell 6: Membagi data menjadi train dan validation, serta menghitung scale_pos_weight
from sklearn.model_selection import train_test_split
import numpy as np

# Membagi data 80:20 dengan stratifikasi
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Hitung scale_pos_weight = (negatif / positif)
neg = np.sum(y_train == 0)
pos = np.sum(y_train == 1)
scale_pos_weight = neg / pos
print('Rasio scale_pos_weight:', scale_pos_weight)


Rasio scale_pos_weight: 27.580278281911674


In [19]:
!pip install --no-binary :all: lightgbm



In [10]:

# Cell 7: Pelatihan model LightGBM
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Definisikan model
lgb_model = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    n_estimators=200,
    max_depth=-1,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
    verbose=-1
)

# Melatih model
lgb_model.fit(X_train, y_train)

# Prediksi dan evaluasi pada validation set
val_proba = lgb_model.predict_proba(X_valid)[:,1]
val_pred = (val_proba >= 0.5).astype(int)

print("ROC-AUC:", roc_auc_score(y_valid, val_proba))
print("Classification Report:")
print(classification_report(y_valid, val_pred))


ROC-AUC: 0.9319636327682532
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.89      0.94    113975
           1       0.21      0.83      0.33      4133

    accuracy                           0.89    118108
   macro avg       0.60      0.86      0.64    118108
weighted avg       0.97      0.89      0.92    118108



In [None]:
# Cell 8: Hyperparameter tuning (opsional) dengan RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

params = {
    'n_estimators': [200, 400, 600],
    'max_depth': [-1, 4, 8, 12],
    'learning_rate': [0.03, 0.05, 0.1],
    'subsample': [0.7, 0.85, 1.0],
    'colsample_bytree': [0.7, 0.85, 1.0],
    'num_leaves': [31, 63, 127, 255]
}

model = LGBMClassifier(
    objective='binary',
    scale_pos_weight=scale_pos_weight,
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    boosting_type='gbdt',
    verbosity=-1
)

search = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    n_iter=12,
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=1,
    verbose=1
)

search.fit(X_train, y_train)


Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [11]:
# Cell 9: Training final model pada seluruh data dan membuat submission

# Latih ulang menggunakan model terbaik (atau lgb_model jika tuning dilewati)
final_model = best_lgb_model if 'best_lgb_model' in globals() else lgb_model
final_model.fit(X, y)

# Prediksi probabilitas pada test set
submission_proba = final_model.predict_proba(test_df)[:,1]

# Membuat DataFrame submission
submission = pd.DataFrame({
    'TransactionID': test_df['TransactionID'],
    'isFraud': submission_proba
})

# Simpan file submission
submission.to_csv('submission_lightgbm.csv', index=False)
print('File submission_lightgbm.csv telah dibuat.')

File submission_lightgbm.csv telah dibuat.


In [12]:
submission.head()

Unnamed: 0,TransactionID,isFraud
0,3663549.0,0.063076
1,3663550.0,0.169065
2,3663551.0,0.177772
3,3663552.0,0.085354
4,3663553.0,0.049336
