In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score



In [None]:
# Load datasets
df = pd.read_csv('/kaggle/input/credit-card-transactions/credit_card_transactions-ibm_v2.csv')
user = pd.read_csv('/kaggle/input/credit-card-transactions/sd254_users.csv')
card = pd.read_csv('/kaggle/input/credit-card-transactions/sd254_cards.csv')



In [None]:
# Strip any leading/trailing spaces in column names
df.columns = df.columns.str.strip()
user.columns = user.columns.str.strip()
card.columns = card.columns.str.strip()

## Data preprocessing for user
user["User"] = range(2000)  # Create User ID to join with 'card'
## Remove the dollar signs
user["Yearly Income - Person"] = user["Yearly Income - Person"].str.replace("$", "").astype(float)
user["Total Debt"] = user["Total Debt"].str.replace("$", "").astype(float)
user["User_Location_Income"] = user["Per Capita Income - Zipcode"].str.replace("$", "").astype(float)

## Define new variable indicating users' retirement status
user['Retired'] = 'No'
user.loc[user['Current Age'] > user['Retirement Age'], 'Retired'] = 'Yes'

In [2]:
## Define variables that are the ratio of their income, debt, and the income level at their location
user['Person_Location_Income_ratio'] = user["Yearly Income - Person"] / (user["User_Location_Income"] + 0.01)
user['Person_Income_toDebt'] = user["Yearly Income - Person"] / (user["Total Debt"] + 0.01)
user['Location_Income_toDebt'] = user["User_Location_Income"] / (user["Total Debt"] + 0.01)

## Select variables used for further analysis
user = user[['User', 'Gender', "Current Age", "Retired", "User_Location_Income", 'Yearly Income - Person', "Total Debt", "Num Credit Cards", 'Person_Location_Income_ratio','Person_Income_toDebt','Location_Income_toDebt']]

## Data preprocessing for cards
card["User_Card"] = card['User'].astype(str) + '_' + card['CARD INDEX'].astype(str)  ## Create card id to join with transaction data
## Remove the dollar sign
card["Credit Limit"] = card["Credit Limit"].str.replace("$", "").astype(float)

## Select variables of interest
card = card[["User_Card", "User", 'Card Brand', "Card Type", "Credit Limit"]]
## Left join with 'user'
card = card.merge(user, on='User', how='left')
card = card.drop(columns=['User'])

## Take a look
card.head()

Unnamed: 0,User_Card,Card Brand,Card Type,Credit Limit,Gender,Current Age,Retired,User_Location_Income,Yearly Income - Person,Total Debt,Num Credit Cards,Person_Location_Income_ratio,Person_Income_toDebt,Location_Income_toDebt
0,0_0,Visa,Debit,24295.0,Female,53,No,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428
1,0_1,Visa,Debit,21968.0,Female,53,No,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428
2,0_2,Visa,Debit,46414.0,Female,53,No,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428
3,0_3,Visa,Credit,12400.0,Female,53,No,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428
4,0_4,Mastercard,Debit (Prepaid),28.0,Female,53,No,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428


In [3]:
## Create the 'User_Card' index to join with credit card information
df["User_Card"] = df['User'].astype(str) + '_' + df['Card'].astype(str)
## Remove the dollar sign
df["Amount"] = df["Amount"].str.replace("$", "").astype(float)
## Keep data with a positive amount
df = df[df['Amount'] > 0]
df = df.merge(card, on='User_Card', how='left')

## Remove columns that will not be used
df = df.drop(columns=['User', 'Card', 'User_Card', "Errors?", "Merchant Name", "Merchant State", "Zip", 'MCC'])

## Create a column with combined transaction time
df["Transaction_Time"] = df["Year"].astype(str) + '-' + df["Month"].astype(str) + '-' + df["Day"].astype(str) + ' ' + df["Time"]
df["Transaction_Time"] = pd.to_datetime(df["Transaction_Time"])

## Day of the week
df["Weekday"] = df["Transaction_Time"].dt.day_name()

## Split the time of day into 8 different periods based on the hour
df['Time_of_Day'] = ''
df.loc[(df['Transaction_Time'].dt.hour >= 23) | (df['Transaction_Time'].dt.hour < 2), 'Time_of_Day'] = 'Midnight'
df.loc[(df['Transaction_Time'].dt.hour >= 2) & (df['Transaction_Time'].dt.hour < 5), 'Time_of_Day'] = 'Early Morning'
df.loc[(df['Transaction_Time'].dt.hour >= 5) & (df['Transaction_Time'].dt.hour < 8), 'Time_of_Day'] = 'Morning'
df.loc[(df['Transaction_Time'].dt.hour >= 8) & (df['Transaction_Time'].dt.hour < 11), 'Time_of_Day'] = 'Late Morning'
df.loc[(df['Transaction_Time'].dt.hour >= 11) & (df['Transaction_Time'].dt.hour < 14), 'Time_of_Day'] = 'Noon'
df.loc[(df['Transaction_Time'].dt.hour >= 14) & (df['Transaction_Time'].dt.hour < 17), 'Time_of_Day'] = 'Afternoon'
df.loc[(df['Transaction_Time'].dt.hour >= 17) & (df['Transaction_Time'].dt.hour < 20), 'Time_of_Day'] = 'Evening'
df.loc[(df['Transaction_Time'].dt.hour >= 20) & (df['Transaction_Time'].dt.hour < 23), 'Time_of_Day'] = 'Late Night'

## The last digit of the transaction amount can be a trait of fraud
df['last_digit'] = df['Amount'].astype(str).str[-1]

## Remove columns that will not be used
df = df.drop(columns=["Time", "Transaction_Time"])

## Show what the dataframe looks like after all the steps
df.head(10)

Unnamed: 0,Year,Month,Day,Amount,Use Chip,Merchant City,Is Fraud?,Card Brand,Card Type,Credit Limit,...,User_Location_Income,Yearly Income - Person,Total Debt,Num Credit Cards,Person_Location_Income_ratio,Person_Income_toDebt,Location_Income_toDebt,Weekday,Time_of_Day,last_digit
0,2002,9,1,134.09,Swipe Transaction,La Verne,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Sunday,Morning,9
1,2002,9,1,38.48,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Sunday,Morning,8
2,2002,9,2,120.34,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Monday,Morning,4
3,2002,9,2,128.95,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Monday,Evening,5
4,2002,9,3,104.71,Swipe Transaction,La Verne,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Tuesday,Morning,1
5,2002,9,3,86.19,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Tuesday,Noon,9
6,2002,9,4,93.84,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Wednesday,Morning,4
7,2002,9,4,123.5,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Wednesday,Morning,5
8,2002,9,5,61.72,Swipe Transaction,Monterey Park,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Thursday,Morning,2
9,2002,9,5,57.1,Swipe Transaction,La Verne,No,Visa,Debit,24295.0,...,29278.0,59696.0,127613.0,5,2.038936,0.467789,0.229428,Thursday,Late Morning,1


In [5]:
def data_preprocessing(X):
    ## Separate the column by numerical or categorical
    numerical_columns_selector = selector(dtype_exclude=object)
    categorical_columns_selector = selector(dtype_include=object)
    numerical_columns = numerical_columns_selector(X)
    categorical_columns = categorical_columns_selector(X)
    
    ## One Hot encoding the categorical variables and scale the numerical ones
    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()
    
    preprocessor = ColumnTransformer(
        transformers=[
            ("one-hot-encoder", categorical_preprocessor, categorical_columns),
            ("standard_scaler", numerical_preprocessor, numerical_columns)
        ]
    )
    
    ## Transform the variable
    X_preprocessed = preprocessor.fit_transform(X)
    
    ## Save the feature names for variable importance
    feature_names = preprocessor.get_feature_names_out()
    
    return X_preprocessed, feature_names

In [7]:
def resample_split(rsd):
    # Calculate the desired number of fraud cases based on the desired proportion
    desired_proportion = 0.05
    total_samples = 500000
    fraud_samples = int(total_samples * desired_proportion)
    
    # Create RandomUnderSampler with the desired sampling strategy
    rus = RandomUnderSampler(sampling_strategy={0: total_samples - fraud_samples, 1: fraud_samples}, random_state=rsd)
    
    # Apply random undersampling to the original dataset
    X_resampled, y_resampled = rus.fit_resample(X, y)
    
    ## Perform data preprocessing on the resampled data, and keep the feature names
    X_preprocessed, feature_names = data_preprocessing(X_resampled)
    
    # Split the resampled data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_resampled, test_size=0.2, stratify=y_resampled, random_state=rsd)
    
    return X_train, X_test, y_train, y_test, feature_names

In [8]:
## Select columns that will be used for model construction
col_formd = ["Merchant City", "Gender", "Current Age", 'Card Brand', "Card Type", "Credit Limit", "Amount", "Use Chip", "Year", "Month", "Retired", "Person_Location_Income_ratio", "Person_Income_toDebt", "Location_Income_toDebt", "Weekday", "Time_of_Day", "last_digit"]

# Split the dataset into features (X) and target variable (y)
X = df[col_formd]
y = df['Is Fraud?'].apply(lambda x: 1 if x == 'Yes' else 0)

In [10]:
def cus_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    return accuracy, auc, precision, recall, f1

In [18]:
## Create an empty dictionary to store the results
results = {}

## Assign seed for each repetition
sd_list = [1234, 3456, 4567, 6789, 7890, 1111, 2222, 3333, 4444, 5555, 6666, 7777, 8888, 9999, 1010, 2020, 3030, 4040, 5050, 6060, 7070, 8080, 9090, 8765, 4321, 9054, 3388, 3360, 3596, 1998]

for rsd in sd_list:
    ## Perform undersampling to make the data more balanced
    X_train, X_test, y_train, y_test, feature_names = resample_split(rsd)
    
    ## Create LightGBM dataset
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    
    ## Define the parameters
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': {'binary_logloss', 'binary_error', 'auc'},
        'num_leaves': 31,
        'learning_rate': 0.1,
        'lambda_l1': 0.01,  ## Avoid overfitting
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': 0
    }
    
    ## Train the model

    model = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)

    
    ## Predict the model using the testing sets
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    y_pred = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
    
    ## Obtain the model performance metrics
    accuracy, auc, precision, recall, f1 = cus_metrics(y_test, y_pred)
    
    ## Get the top 20 feature importance
    feature_importance = model.feature_importance(importance_type='gain')
    feature_importance_dict = dict(zip(feature_names, feature_importance))
    top20feat = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)[:20]
    
    ## Save the result
    results[f"Seed {rsd}"] = {
        'Accuracy': accuracy,
        'AUC': auc,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Top 20 Important Features': top20feat
    }

In [20]:

## Flatten the dictionary to separate feature names and values
flattened_results = []
for seed, data in results.items():
    row = {
        'Seed': seed,
        'Accuracy': data['Accuracy'],
        'AUC': data['AUC'],
        'Precision': data['Precision'],
        'Recall': data['Recall'],
        'F1 Score': data['F1 Score']
    }
    for i, (feat_name, feat_value) in enumerate(data['Top 20 Important Features']):
        row[f'Top{i+1} Feature Name'] = feat_name
        row[f'Top{i+1} Feature Value'] = feat_value
    flattened_results.append(row)

## Create a DataFrame from the flattened dictionary
res_df = pd.DataFrame(flattened_results)

# Save the DataFrame as a CSV file
res_df.to_csv('results_with_top_features.csv', index=False)

# Display the results DataFrame
res_df

Unnamed: 0,Seed,Accuracy,AUC,Precision,Recall,F1 Score,Top1 Feature Name,Top1 Feature Value,Top2 Feature Name,Top2 Feature Value,...,Top16 Feature Name,Top16 Feature Value,Top17 Feature Name,Top17 Feature Value,Top18 Feature Name,Top18 Feature Value,Top19 Feature Name,Top19 Feature Value,Top20 Feature Name,Top20 Feature Value
0,Seed 1234,0.978,0.823674,0.876142,0.6522,0.747764,one-hot-encoder__Use Chip_Online Transaction,102012.580865,one-hot-encoder__Merchant City_Rome,99019.352568,...,one-hot-encoder__Merchant City_Mexico City,3576.025889,one-hot-encoder__Card Type_Debit (Prepaid),3304.502712,one-hot-encoder__Weekday_Saturday,3187.011507,one-hot-encoder__Merchant City_Abuja,3118.10646,standard_scaler__Person_Income_toDebt,2961.409719
1,Seed 3456,0.97811,0.825437,0.8751,0.6558,0.749743,one-hot-encoder__Merchant City_Rome,96813.522418,one-hot-encoder__Use Chip_Online Transaction,78161.579472,...,one-hot-encoder__Merchant City_Istanbul,4905.435569,one-hot-encoder__Card Type_Debit (Prepaid),3620.104294,one-hot-encoder__Merchant City_ONLINE,3574.766668,one-hot-encoder__Merchant City_Abuja,3164.29086,standard_scaler__Person_Income_toDebt,2864.567007
2,Seed 4567,0.97716,0.822284,0.85869,0.6502,0.740041,one-hot-encoder__Use Chip_Online Transaction,110066.265115,one-hot-encoder__Merchant City_Rome,96943.300092,...,one-hot-encoder__Time_of_Day_Morning,3230.903707,one-hot-encoder__Time_of_Day_Noon,3150.190018,one-hot-encoder__Merchant City_Abuja,3009.542641,one-hot-encoder__Weekday_Saturday,2953.229422,one-hot-encoder__Time_of_Day_Late Morning,2855.555953
3,Seed 6789,0.97705,0.821847,0.856954,0.6494,0.738878,one-hot-encoder__Use Chip_Online Transaction,108827.05791,one-hot-encoder__Merchant City_Rome,98023.552222,...,one-hot-encoder__Merchant City_Mexico City,3863.367327,standard_scaler__Person_Income_toDebt,3825.622949,one-hot-encoder__Time_of_Day_Noon,3147.199053,one-hot-encoder__Merchant City_Abuja,2967.011548,one-hot-encoder__Time_of_Day_Late Morning,2573.216455
4,Seed 7890,0.97802,0.827474,0.868684,0.6602,0.750227,one-hot-encoder__Use Chip_Online Transaction,111833.543153,one-hot-encoder__Merchant City_Rome,98790.113807,...,standard_scaler__Person_Income_toDebt,4079.130177,one-hot-encoder__Time_of_Day_Midnight,3086.191851,one-hot-encoder__Merchant City_Abuja,2846.30502,one-hot-encoder__Time_of_Day_Morning,2717.054299,one-hot-encoder__Weekday_Tuesday,2663.585608
5,Seed 1111,0.9771,0.820926,0.859989,0.6474,0.738704,one-hot-encoder__Use Chip_Online Transaction,110946.268254,one-hot-encoder__Merchant City_Rome,98743.084996,...,one-hot-encoder__Merchant City_ONLINE,3237.2644,standard_scaler__Person_Income_toDebt,3209.741066,one-hot-encoder__Merchant City_Abuja,3121.175154,one-hot-encoder__Time_of_Day_Late Morning,2726.818186,one-hot-encoder__Weekday_Saturday,2720.346409
6,Seed 2222,0.97745,0.828121,0.854011,0.6622,0.745973,one-hot-encoder__Use Chip_Online Transaction,111222.052997,one-hot-encoder__Merchant City_Rome,97267.577724,...,one-hot-encoder__Time_of_Day_Midnight,3081.581146,one-hot-encoder__Merchant City_Abuja,3068.949191,standard_scaler__Person_Income_toDebt,3066.015025,one-hot-encoder__Weekday_Wednesday,2812.903088,standard_scaler__Location_Income_toDebt,2539.85814
7,Seed 3333,0.97727,0.821489,0.862923,0.6484,0.740436,one-hot-encoder__Use Chip_Online Transaction,108297.67411,one-hot-encoder__Merchant City_Rome,98597.63702,...,one-hot-encoder__Card Type_Debit (Prepaid),4473.216265,one-hot-encoder__Time_of_Day_Late Morning,3361.142496,one-hot-encoder__Merchant City_Abuja,3125.92268,one-hot-encoder__Time_of_Day_Noon,2719.201303,one-hot-encoder__Weekday_Wednesday,2676.92895
8,Seed 4444,0.97768,0.825495,0.864594,0.6564,0.746248,one-hot-encoder__Use Chip_Online Transaction,101659.232464,one-hot-encoder__Merchant City_Rome,97261.11031,...,one-hot-encoder__Time_of_Day_Evening,3587.359185,standard_scaler__Person_Income_toDebt,3569.439497,one-hot-encoder__Merchant City_Abuja,3340.36224,one-hot-encoder__Time_of_Day_Midnight,3027.512424,one-hot-encoder__Weekday_Wednesday,2800.798382
9,Seed 5555,0.97781,0.829163,0.860327,0.664,0.74952,one-hot-encoder__Use Chip_Online Transaction,105861.093602,one-hot-encoder__Merchant City_Rome,96041.645807,...,one-hot-encoder__Merchant City_Mexico City,4607.110013,standard_scaler__Person_Income_toDebt,3454.575098,one-hot-encoder__Merchant City_Abuja,3380.538336,one-hot-encoder__Time_of_Day_Noon,3090.150784,one-hot-encoder__Weekday_Saturday,2739.726718
