In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

import sys
import os
from pathlib import Path

# Project layout is resolved relative to the notebook's working directory:
# ROOT is its parent folder, DATA is ROOT/data, and the project helper modules
# imported below are looked up in ROOT/functions.
ROOT = Path.cwd().parent
DATA = ROOT / "data"
# Register the helpers folder only once so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if str(ROOT / "functions") not in sys.path:
    sys.path.append(str(ROOT / "functions"))

from categoric_functions import *
from numeric_functions import *

## Import Data

In [2]:
# Read the merged training table from the data folder and report its size.
train_path = DATA / "train_merged.csv"
df = pd.read_csv(train_path, low_memory=False)
print(f"Total shape: {df.shape}")

# Order the rows by TransactionDT and renumber them 0..n-1, so that a
# positional cut at split_idx separates the first 80% of rows from the rest.
df = df.sort_values('TransactionDT')
df = df.reset_index(drop=True)
split_idx = int(df.shape[0] * 0.8)

Total shape: (590540, 434)


In [3]:
# Positional hold-out split: the first split_idx rows of df become the
# training set, the remaining rows the test set. Each slice is copied so that
# later changes never touch df itself.
train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

# The slices above are already independent copies, so they are passed to
# reduce_mem_usage directly; copying them a second time would only duplicate
# a very large frame in memory before it is discarded.
# NOTE(review): the helper runs on each split separately. If it picks dtypes
# from each frame's own value range, the two splits may end up with different
# dtypes for the same column -- confirm against reduce_mem_usage.
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Memory usage decreased to 505.96 Mb (67.7% reduction)
Memory usage decreased to 128.29 Mb (67.2% reduction)


# Handling Missing Values

Some features have more than 95% missing values. Such features are redundant for ML models, so I define a threshold that determines whether a feature with missing values remains in the dataset.

In [4]:
# Percentage of missing values above which a feature is discarded.
threshold = 95

# top_missing_cols is a project helper; as used here, its result exposes a
# 'col' column (feature name) and a 'missing_percent' column.
high_missing = top_missing_cols(train_df, thresh=threshold)
cols_to_drop = high_missing.loc[high_missing['missing_percent'] > threshold, 'col'].tolist()

# The drop list is derived from the training split only and then applied to
# both splits; names absent from the test split are skipped.
train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=[name for name in cols_to_drop if name in test_df.columns])

print(f" Threshold : {threshold}%\n Dropped feature num : {len(cols_to_drop)}")

There are 414 columns with missing values.
There are 9 columns with missing percent > 95%
 Threshold : 95%
 Dropped feature num : 9


### Target

In [9]:
# Show how the target is distributed in the training split: absolute counts
# first, then the same breakdown as proportions.
print("Class distribution for 'isFraud':", train_df['isFraud'].value_counts(), sep="\n")
print("\nPercentage:", train_df['isFraud'].value_counts(normalize=True), sep="\n")

# counts = train_df['isFraud'].value_counts()
# percentages = train_df['isFraud'].value_counts(normalize=True) * 100

# colors = ['green', 'red'] 
# bars = plt.bar(counts.index, counts.values, color=colors[:len(counts)])

# plt.title('isFraud Class Frequencies')
# plt.xlabel('isFraud')
# plt.ylabel('Count')
# plt.xticks([0, 1])

# for bar, perc in zip(bars, percentages):
#     plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05, f'{perc:.1f}%', ha='center', va='bottom')

# plt.show()

Class distribution for 'isFraud':
isFraud
0    455833
1     16599
Name: count, dtype: int64

Percentage:
isFraud
0    0.964865
1    0.035135
Name: proportion, dtype: float64


The categorical features are known in advance. After the columns with too many missing values have been dropped, the categorical features that remain need to be analysed.

**Categorical features are initially divided into two groups**:
* High cardinality -> 18 features
* Ready for analysis -> 24 features

In [10]:
# Columns that are treated as categorical throughout the analysis.
categorical_features = [
    'ProductCD', 'P_emaildomain', 'R_emaildomain','DeviceType', 'DeviceInfo',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20',
    'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
    'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'
]

# Every remaining column of train_df is treated as numerical.
# NOTE(review): this list therefore still contains non-feature columns such as
# the target 'isFraud' and the ordering column 'TransactionDT' -- confirm they
# are excluded before modelling.
numerical_features = [col for col in train_df.columns if col not in categorical_features]

# Categorical columns that are still present in train_df. A set intersection
# would return them in an order that changes between kernel sessions (string
# hashing is randomized), so filter the declared list instead: same members,
# reproducible order.
present_cat_cols = [col for col in categorical_features if col in train_df.columns]

In [None]:
# A categorical column with more distinct values than this is considered
# high-cardinality.
cardinality_threshold = 15

# Count the distinct (non-null) values of every categorical column once and
# reuse the counts for both lists, instead of scanning each column twice.
unique_counts = train_df[present_cat_cols].nunique()
low_cardinality = [col for col in present_cat_cols if unique_counts[col] <= cardinality_threshold]
high_cardinality = [col for col in present_cat_cols if unique_counts[col] > cardinality_threshold]

# From here on `categorical_features` refers only to the low-cardinality
# columns; the high-cardinality ones are tracked in `cardinality_features`.
categorical_features = low_cardinality
cardinality_features = high_cardinality

18