In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set_theme()

### Load training data

In [2]:
# Load the identity table of the training split and preview its first rows.
train_identity_df = pd.read_csv(
    filepath_or_buffer='./ieee-fraud-detection/train_identity.csv'
)

train_identity_df.head(n=5)

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [None]:
# Load the transaction table of the training split and preview its first rows.
train_transaction_df = pd.read_csv(
    filepath_or_buffer='./ieee-fraud-detection/train_transaction.csv'
)

train_transaction_df.head(n=5)

In [None]:
# Report (rows, columns) of both training tables.
print(f'train_identity_df.shape = {train_identity_df.shape!r}')
print(f'train_transaction_df.shape = {train_transaction_df.shape!r}')

In [None]:
# Number of TransactionIDs present in the identity table but absent from the transaction table.
len(set(train_identity_df['TransactionID']).difference(train_transaction_df['TransactionID']))

In [None]:
# Number of TransactionIDs present in the transaction table but absent from the identity table.
len(set(train_transaction_df['TransactionID']).difference(train_identity_df['TransactionID']))

### Load test data

In [None]:
# Load the identity table of the test split and preview its first rows.
test_identity_df = pd.read_csv(
    filepath_or_buffer='./ieee-fraud-detection/test_identity.csv'
)

test_identity_df.head(n=5)

In [None]:
# Load the transaction table of the test split and preview its first rows.
test_transaction_df = pd.read_csv(
    filepath_or_buffer='./ieee-fraud-detection/test_transaction.csv'
)

test_transaction_df.head(n=5)

In [None]:
# Report (rows, columns) of both test tables.
print(f'test_identity_df.shape = {test_identity_df.shape!r}')
print(f'test_transaction_df.shape = {test_transaction_df.shape!r}')

### Join the identity and transaction tables

In [None]:
# Left-join identity columns onto the transactions so that every transaction row is
# kept; transactions without a matching identity row get NaN in the identity columns.
train_df = pd.merge(
    train_transaction_df, train_identity_df, how='left', on='TransactionID'
)
test_df = pd.merge(
    test_transaction_df, test_identity_df, how='left', on='TransactionID'
)

In [None]:
# Shape and preview of the merged training table.
# The f-string uses ` = ` (with surrounding spaces) like every other shape print in
# this notebook, so the printed line reads "train_df.shape = (...)".
print(f'{train_df.shape = }')
train_df.head()

In [None]:
# Shape and preview of the merged test table.
print(f'test_df.shape = {test_df.shape!r}')
test_df.head(n=5)

## Preprocessing

### Drop columns that have too many missing values

In [None]:
# Fraction (0.0 - 1.0) of missing values in every column of the merged training table.
missing_percentages = train_df.isnull().mean()
# Columns whose missing fraction exceeds this threshold are plotted here.
missing_thr = 0.9

mostly_missing = missing_percentages[missing_percentages > missing_thr].sort_values(ascending=True)

plt.figure(figsize=(10, 6))
mostly_missing.plot(kind='barh')
# The plotted values are fractions, so the axis is labelled accordingly.
plt.xlabel('Fraction of Missing Values')
plt.ylabel('Columns')
# `:.0%` renders the 0.9 threshold as "90%".
plt.title(f'Columns with > {missing_thr:.0%} missing values')
plt.show()

In [None]:
# Keep only the columns whose missing fraction is below `missing_thr`.
N = len(train_df)  # number of training rows
n_cols_before = train_df.shape[1]
keep_mask = train_df.isnull().mean() < missing_thr
train_df = train_df.loc[:, keep_mask]
print(f"Number of columns {n_cols_before} -> {train_df.shape[1]}")

### Create os feature

In [None]:
# List the distinct raw id_30 values before deriving the `os` feature from them.
train_df['id_30'].unique()

In [None]:
# Create a new feature called `os`: the first space-separated token of id_30.
# The raw id_30 column is dropped afterwards.
os_tokens = train_df['id_30'].str.split(' ', n=1, expand=True)
train_df['os'] = os_tokens[0]
train_df.drop(columns=['id_30'], inplace=True)
train_df['os'].head()

### Screen size feature

In [None]:
# List the distinct raw id_33 values (strings such as "2220x1080") before splitting them.
train_df['id_33'].unique()

In [None]:
# Split id_33 at the first "x" into two parts, convert each part to a number and
# store them as `width` and `height`; the raw id_33 column is dropped afterwards.
screen_parts = train_df['id_33'].str.split('x', n=1, expand=True)
train_df[['width', 'height']] = screen_parts.apply(pd.to_numeric)
train_df.drop(columns=['id_33'], inplace=True)
train_df[['width', 'height']].head()

### Browser feature

In [None]:
# List the distinct raw id_31 values before deriving the `browser` feature from them.
train_df['id_31'].unique()

In [None]:
# `browser` is the lower-cased first space-separated token of id_31;
# the raw id_31 column is dropped afterwards.
browser_tokens = train_df['id_31'].str.split(' ', expand=True)
train_df['browser'] = browser_tokens[0].str.lower()
train_df.drop(columns=['id_31'], inplace=True)
train_df['browser'].head()

In [None]:
# List the distinct browser tokens that the mapping in the next cell has to cover.
train_df['browser'].unique()

In [None]:
# Map browser names to the most common ones and group all others into "other".
# Missing values become "unknown".
browser_mapping = {
    'samsung': 'samsung',
    'samsung/sm-g532m': 'samsung',
    'samsung/sch': 'samsung',
    'samsung/sm-g531h': 'samsung',
    'mobile': 'mobile',
    'chrome': 'chrome',
    'chromium': 'chrome',
    'firefox': 'firefox',
    'mozilla/firefox': 'firefox',
    'waterfox': 'firefox',
    'cyberfox': 'firefox',
    'icedragon': 'firefox',
    'edge': 'edge',
    'ie': 'ie',
    'safari': 'safari',
    'android': 'android',
    'generic/android': 'android',
    'opera': 'opera',
    'silk': 'opera',
    'palemoon': 'other',
    'maxthon': 'other',
    'line': 'other',
    'iron': 'other',
    'blu/dash': 'other',
    'seamonkey': 'other',
    'm4tel/m4': 'other',
    'comodo': 'other',
    'lanix/ilium': 'other',
    'inco/minion': 'other',
    'cherry': 'other',
    'google': 'google',
    'facebook': 'facebook',
    'aol': 'other',
    'zte/blade': 'other',
    'nokia/lumia': 'other',
    'lg/k-200': 'other',
    'microsoft/windows': 'windows',
    np.nan: 'unknown'
}

# Series.map(dict) turns every value that is not a key of the dict into NaN, so a
# browser name missing from the table above would silently become NaN instead of
# landing in the "other" bucket. Handle the two fallbacks explicitly:
#   * non-missing names without a mapping entry -> 'other'
#   * originally missing values                 -> 'unknown'
raw_browser = train_df['browser']
train_df['browser'] = (
    raw_browser.map(browser_mapping)
    .fillna('other')
    .where(raw_browser.notna(), 'unknown')
)

train_df['browser'].unique()

### Transaction Hour and day of the week

In [None]:
# TransactionDT is divided by 3600 and 86400 here, i.e. it is treated as a count of
# seconds. NOTE(review): both features are offsets from whatever reference time
# TransactionDT starts at, not necessarily wall-clock hour / calendar weekday - confirm.
seconds_per_hour = 60 * 60
seconds_per_day = 60 * 60 * 24
hours_elapsed = train_df['TransactionDT'] // seconds_per_hour
days_elapsed = train_df['TransactionDT'] // seconds_per_day
train_df['TransactionHour'] = hours_elapsed % 24
train_df['TransactionDayOfWeek'] = days_elapsed % 7

In [None]:
# Collect every column whose name starts with "V" and bucket those columns by their
# number of missing values: columns with the same missing count share a group.
# NOTE: the V_pca_* columns created by the PCA cell also start with "V", so
# re-running this cell after that one would pull them into the groups as well.
v_cols = [name for name in train_df.columns if name.startswith("V")]

v_df = train_df[v_cols]
missing_counts = v_df.isnull().sum()
groups = {}
for col, count in missing_counts.items():
    if count not in groups:
        groups[count] = []
    groups[count].append(col)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Compress each group of V columns into the smallest number of principal components
# whose cumulative explained variance reaches 90%, and store them as V_pca_<n> columns.
# NOTE(review): scaler and PCA are fitted on every row of train_df, i.e. before the
# train/validation split performed later in the notebook - confirm this is intended.
count = 0  # running index used to name the V_pca_<n> columns
new_v_features = []
for group in groups.values():
    # Missing values are replaced by the sentinel -999 so that scaling/PCA see no NaN.
    df_group = train_df[group].fillna(-999)
    v_normalized = StandardScaler().fit_transform(df_group)

    # Fit PCA once with all components. The leading components of this fit are the
    # ones a truncated PCA would keep, so projecting with it and slicing replaces the
    # second fit the cell used to perform (which doubled the cost and, depending on
    # the solver scikit-learn selects for a truncated fit, could be randomised).
    pca = PCA().fit(v_normalized)
    explained_variance = pca.explained_variance_ratio_.cumsum()
    n_components = int((explained_variance >= 0.9).argmax()) + 1  # first index where variance >= 90%
    print(f"Group: {group}, Components to Retain: {n_components}")

    reduced_data = pca.transform(v_normalized)[:, :n_components]
    for i in range(n_components):
        train_df[f'V_pca_{count}'] = reduced_data[:, i]
        new_v_features.append(f'V_pca_{count}')
        count += 1

print(f'Total V features created: {len(new_v_features)}')

In [None]:
# Show the column index of train_df after the feature engineering above.
train_df.columns

In [None]:
def create_email_features(train_df, col):
    """Add fraud-rate, provider and domain-suffix features for an email-domain column.

    The frame is modified in place. Missing values in ``col`` are first replaced
    by ``'unknown'``; then three columns are added:

    * ``{col}_fraud_corr``: mean of ``isFraud`` over all rows sharing the same domain.
    * ``{col}_email_provider``: the domain itself when it is one of five common
      providers, otherwise ``'other'``.
    * ``{col}_email_tld``: everything after the first ``'.'`` of the domain
      (``'yahoo.com.mx'`` -> ``'com.mx'``), or ``'unknown'`` when the value
      contains no dot.

    Args:
        train_df: DataFrame containing ``col`` and an ``isFraud`` column.
        col: Name of the email-domain column to derive features from.

    Returns:
        None. ``train_df`` is mutated.

    NOTE(review): ``{col}_fraud_corr`` is computed from the ``isFraud`` labels of
    every row passed in. If those rows later end up in a validation split, the
    feature leaks the target - consider computing it on training folds only.
    """
    train_df[col] = train_df[col].fillna('unknown')
    fraud_correlation = train_df.groupby(col)['isFraud'].mean()
    train_df[f'{col}_fraud_corr'] = train_df[col].map(fraud_correlation)

    common_providers = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'icloud.com']
    train_df[f'{col}_email_provider'] = train_df[col].where(
        train_df[col].isin(common_providers), 'other'
    )

    # Domains are dot-separated, so split on '.' (splitting on ' ' never matched).
    # `.str[1]` picks the suffix row by row and yields NaN when there is no dot;
    # the previous column-wise `apply` produced a single scalar, which left every
    # row except the first one empty.
    domain_suffix = train_df[col].str.split('.', n=1).str[1]
    train_df[f'{col}_email_tld'] = domain_suffix.fillna('unknown')

# Derive the email features for both email-domain columns (train_df is modified in place).
create_email_features(train_df, 'P_emaildomain')
create_email_features(train_df, 'R_emaildomain')

In [None]:
# Candidate categorical columns, in a fixed order. A candidate is treated as
# categorical only if it exists in train_df and has fewer than `cat_thr` distinct
# values; the remaining candidates (except DeviceInfo) are treated as numerical.
cand_categorical_features = ['ProductCD']
cand_categorical_features += [f'card{i}' for i in range(1, 7)]
cand_categorical_features += ['addr1', 'addr2']
cand_categorical_features += [
    f'{prefix}_emaildomain_email_{kind}'
    for prefix in ('P', 'R')
    for kind in ('provider', 'tld')
]
cand_categorical_features += ['DeviceType', 'DeviceInfo', 'os', 'browser']
cand_categorical_features += [f'id_{i}' for i in range(12, 39)]  # id features
cand_categorical_features += [f'M{i}' for i in range(1, 10)]  # M features

cat_thr = 15
# Distinct-value count of every candidate that is still present in train_df.
cardinality = {
    name: train_df[name].nunique()
    for name in cand_categorical_features
    if name in train_df.columns
}
categorical_features = [name for name, n_unique in cardinality.items() if n_unique < cat_thr]

numerical_features = ['TransactionAmt', 'TransactionHour', 'TransactionDayOfWeek', 'dist1']
numerical_features += ['P_emaildomain_fraud_corr', 'R_emaildomain_fraud_corr']
numerical_features += new_v_features  # V features
# D features. NOTE(review): range(1, 15) stops at D14 - confirm no D15 column is expected.
numerical_features += [
    name for name in (f'D{i}' for i in range(1, 15)) if name in train_df.columns
]
numerical_features += [name for name in train_df.columns if name.startswith("C")]  # C features
# High-cardinality candidates are used as numbers; DeviceInfo is excluded explicitly.
numerical_features += [
    name
    for name, n_unique in cardinality.items()
    if n_unique >= cat_thr and name != "DeviceInfo"
]
numerical_features += ['width', 'height']

print(categorical_features)
print(numerical_features)

### Remove outliers

Convert outliers to NaNs so that they can be imputed.

In [None]:
import math

# cand_outlier_cols = [c for c in numerical_features if "pca" not in c and "fraud_corr" not in c]
# num_features = len(cand_outlier_cols)
# cols = 4
# rows = math.ceil(num_features / cols)

# fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), sharex=False)
# axes = axes.flatten()
# for i, col in enumerate(cand_outlier_cols):
   
#     sns.boxplot(ax=axes[i], x='isFraud', y=col, data=train_df, hue='isFraud')
#     axes[i].set_title(f'{col} Box Plot by isFraud')
#     axes[i].set_ylabel(col)
#     axes[i].set_xlabel('isFraud')
#     axes[i].legend(title='isFraud')
# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')

# plt.tight_layout()
# plt.suptitle('Before outlier removal')
# plt.show()

In [None]:
# Use IQR method for outlier removal

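# Replacing IQR outliers with NaN lowered the RandomForest validation AUC from
# 0.9337 to 0.9317, so outlier removal is disabled.
# NOTE(review): the disabled loop below iterates over cand_categorical_features while
# the disabled plots use cand_outlier_cols - confirm which list is intended before
# re-enabling this cell.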


# for col in cand_categorical_features:
#     Q1 = train_df[col].quantile(0.25)
#     Q3 = train_df[col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     # Mark values outside the bounds as NaN
#     train_df.loc[(train_df[col] < lower_bound) |  (train_df[col] > upper_bound), col] = np.nan


# fig, axes = plt.subplots(rows, cols, figsize=(15, 5 * rows), sharex=False)
# axes = axes.flatten()
# for i, col in enumerate(cand_outlier_cols):
   
#     sns.boxplot(ax=axes[i], x='isFraud', y=col, data=train_df, hue='isFraud')
#     axes[i].set_title(f'{col} Box Plot by isFraud')
#     axes[i].set_ylabel(col)
#     axes[i].set_xlabel('isFraud')
#     axes[i].legend(title='isFraud')
# for j in range(i + 1, len(axes)):
#     axes[j].axis('off')

# plt.tight_layout()
# plt.suptitle('After outlier removal')
# plt.show()

### Normalize numerical features

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every numerical feature with a MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on all rows of train_df, before the
# train/validation split performed later - validation rows therefore influence
# the scaling parameters; confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(train_df[numerical_features])

train_df[numerical_features] = scaler.transform(train_df[numerical_features])

### One hot encode categorical features

In [None]:
# One-hot encode the categorical features (the first level of each is dropped) and
# record which columns the encoding added, i.e. those absent from train_df.
train_df_encoded = pd.get_dummies(
    data=train_df, columns=categorical_features, drop_first=True
)
added_columns = train_df_encoded.columns.difference(train_df.columns)
new_categorical_columns = list(added_columns)
new_categorical_columns

In [None]:
# Preview the table after one-hot encoding.
train_df_encoded.head()

In [None]:
# Feature matrix: one-hot columns followed by the numerical columns. Target: isFraud.
feature_columns = new_categorical_columns + numerical_features
X = train_df_encoded[feature_columns]
y = train_df_encoded['isFraud'].to_numpy()

print(f'X.shape = {X.shape!r}')
print(f'y.shape = {y.shape!r}')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions equal in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f'X_train.shape = {X_train.shape!r}')
print(f'y_train.shape = {y_train.shape!r}')
print(f'X_val.shape = {X_val.shape!r}')
print(f'y_val.shape = {y_val.shape!r}')

In [None]:
# Preview the one-hot encoded columns. (The list comprehension that preceded this
# line built a list and discarded it - it was neither displayed nor assigned.)
train_df_encoded[new_categorical_columns].head()

### Random Forest

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn import metrics

# rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2'],
# }

# grid_search = GridSearchCV(
#     estimator=rf, 
#     param_grid=param_grid, 
#     scoring='roc_auc',
#     cv=5,
#     verbose=2, 
#     n_jobs=-1
# )

# grid_search.fit(X_train, y_train)

# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
# print(best_params)

In [None]:
from sklearn import metrics

def evaluate(model, X_val, y_val):
    """Print validation metrics for a fitted classifier and plot its ROC curve.

    Prints the classification report (rounded to 4 decimals) computed from
    ``model.predict``, then the ROC AUC computed from column 1 of
    ``model.predict_proba``, and finally shows the ROC curve.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_val: Validation features.
        y_val: Validation labels.
    """
    # Report based on the hard class predictions.
    report = metrics.classification_report(y_val, model.predict(X_val), output_dict=True)
    print(pd.DataFrame(report).transpose().round(4))

    # ROC statistics based on the scores in column 1 of predict_proba.
    positive_scores = model.predict_proba(X_val)[:, 1]
    fpr, tpr, _ = metrics.roc_curve(y_val, positive_scores)
    roc_auc = metrics.auc(fpr, tpr)
    print(f'val auc: {roc_auc:.4f}')

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.set_title('ROC curve')
    ax.plot(fpr, tpr, label=f'AUC: {roc_auc:.2f}')
    ax.legend()
    plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees, fixed seed, all CPU cores.
# NOTE(review): X_train has not been imputed at this point (the imputation cells
# come later) - confirm the installed scikit-learn accepts NaN in this estimator.
rf_params = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
rf = RandomForestClassifier(**rf_params)

rf.fit(X_train, y_train)

In [None]:
# Print the validation metrics and plot the ROC curve of the random forest.
evaluate(rf, X_val, y_val)

### Missing value imputation

Impute missing values for the non-tree models.

In [None]:
# Impute missing values for numerical columns with the mean
# Impute missing numerical values with the per-column mean of the training split.
# The means come from X_train only and are reused for X_val.
# `fillna` with a Series fills each column named in the Series' index and returns a
# new frame. This replaces the column-by-column assignment into the frames returned
# by train_test_split, a pattern that can trigger pandas' SettingWithCopyWarning.
numerical_means = X_train[numerical_features].mean()
X_train = X_train.fillna(numerical_means)
X_val = X_val.fillna(numerical_means)

In [None]:
# Impute missing values for categorical columns with the mode
# Impute missing values of the one-hot columns with the per-column mode of the
# training split. `fillna` with a Series returns new frames, avoiding column-wise
# assignment into the frames returned by train_test_split.
# NOTE(review): these columns were produced by pd.get_dummies and are not expected
# to contain NaN, so this step is probably a no-op - confirm.
if new_categorical_columns:
    # Row 0 of DataFrame.mode() holds the first mode of every column.
    categorical_modes = X_train[new_categorical_columns].mode().iloc[0]
    X_train = X_train.fillna(categorical_modes)
    X_val = X_val.fillna(categorical_modes)

In [None]:
# from sklearn.svm import SVC

# svm = SVC()

# svm.fit(X_train, y_train)

In [None]:
# evaluate(svm, X_val, y_val)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, trained on the imputed training split
# and scored on the validation split.
lr = LogisticRegression().fit(X_train, y_train)

evaluate(lr, X_val, y_val)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the imputed training split
# and scored on the validation split.
gnb = GaussianNB().fit(X_train, y_train)

evaluate(gnb, X_val, y_val)