In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

import sys
import os
from pathlib import Path

ROOT = Path.cwd().parent
DATA = ROOT / "data"
sys.path.append(str(ROOT / "functions"))

from categoric_functions import *
from numeric_functions import *

## Import Data

In [2]:
# Read the merged training table from the project's data directory.
train_path = DATA / "train_merged.csv"
df = pd.read_csv(train_path, low_memory=False)
print(f"Total shape: {df.shape}")

# Order the rows by TransactionDT so that a positional cut at split_idx
# (80% of the rows) separates earlier transactions from later ones.
df = df.sort_values('TransactionDT').reset_index(drop=True)
split_idx = int(len(df) * 0.8)

Total shape: (590540, 434)


In [3]:
# Positional hold-out split on the TransactionDT-sorted frame:
# rows before split_idx form the train part, the remaining rows the test part.
# Each slice is copied exactly once so the result is independent of `df`;
# a second defensive copy before reduce_mem_usage would only duplicate the
# full-size frame in memory without changing the outcome.
train_df = reduce_mem_usage(df.iloc[:split_idx].copy())
test_df = reduce_mem_usage(df.iloc[split_idx:].copy())

Memory usage decreased to 505.96 Mb (67.7% reduction)
Memory usage decreased to 128.29 Mb (67.2% reduction)


# Handling Missing Values

Some features have more than 95% missing values. Such features are redundant for ML models, so I define a missing-value threshold and drop every feature whose missing percentage exceeds it.

In [4]:
# Maximum tolerated share of missing values per column, in percent.
threshold = 95 
# top_missing_cols reports per-column missing statistics; the result exposes
# at least the 'col' and 'missing_percent' columns used below.
high_missing = top_missing_cols(train_df, thresh=threshold)
cols_to_drop = high_missing[high_missing['missing_percent'] > threshold]['col'].tolist()

train_df = train_df.drop(columns=cols_to_drop)
print(f" Threshold : {threshold}%\n Dropped feature num : {len(cols_to_drop)}")

# The drop list is decided on train only and then mirrored on test,
# skipping any column the test frame does not have.
test_df = test_df.drop(columns=[c for c in cols_to_drop if c in test_df.columns])

There are 414 columns with missing values.
There are 9 columns with missing percent > 95%
 Threshold : 95%
 Dropped feature num : 9


### Target

In [5]:
# Class balance of the target, measured on the training part only:
# absolute counts first, then relative frequencies.
print("Class distribution for 'isFraud':")
print(train_df['isFraud'].value_counts())
print("\nPercentage:")
print(train_df['isFraud'].value_counts(normalize=True))

# Disabled: bar chart of the same class counts with percentage labels.
# counts = train_df['isFraud'].value_counts()
# percentages = train_df['isFraud'].value_counts(normalize=True) * 100

# colors = ['green', 'red'] 
# bars = plt.bar(counts.index, counts.values, color=colors[:len(counts)])

# plt.title('isFraud Class Frequencies')
# plt.xlabel('isFraud')
# plt.ylabel('Count')
# plt.xticks([0, 1])

# for bar, perc in zip(bars, percentages):
#     plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05, f'{perc:.1f}%', ha='center', va='bottom')

# plt.show()

Class distribution for 'isFraud':
isFraud
0    455833
1     16599
Name: count, dtype: int64

Percentage:
isFraud
0    0.964865
1    0.035135
Name: proportion, dtype: float64


It is known which features are categorical. After cleaning up missing values, the remaining features need to be analysed.

**Categorical features are initially divided into two groups**:
* High cardinality -> 18 features
* Ready for analysis -> 24 features

In [31]:
# Columns that are declared categorical up front.
categorical_features = [
    'ProductCD', 'P_emaildomain', 'R_emaildomain','DeviceType', 'DeviceInfo',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20',
    'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30',
    'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38'
]
# Every remaining column of train_df counts as numerical here. This also
# includes non-feature columns such as the 'isFraud' target, because only the
# categorical list is subtracted.
numerical_features = [col for col in train_df.columns if col not in categorical_features]
original_numerical_features = numerical_features.copy()
# Categorical columns that are still present in train_df. A list comprehension
# keeps the declared order; a set intersection would return the columns in an
# arbitrary order that can differ from one kernel session to the next.
present_cat_cols = [col for col in categorical_features if col in train_df.columns]

In [7]:
cardinality_threshold = 15

# Route each present categorical column into the low or high bucket,
# depending on how its number of distinct values compares to the threshold.
low_cardinality, high_cardinality = [], []
for col in present_cat_cols:
    if train_df[col].nunique() <= cardinality_threshold:
        low_cardinality.append(col)
    else:
        high_cardinality.append(col)

categorical_features = low_cardinality
cardinality_features = high_cardinality

The functions that should be applied for categorical variables are ready, but I want to set them up as a pipeline.

In [None]:
# 4. Main pipeline: rare_maps carries every learned rare-category mapping
def apply_categorical_engineering(df, rare_maps=None):
    """
    Run the categorical feature-engineering steps on a copy of ``df``.

    Steps, in order:
      1. encode_rare_categories for 'card3' (threshold 200) and 'card5'
         (threshold 300);
      2. clean_email_domains, create_email_match, consolidate_device_info and
         extract_screen_features, each fed with the previous step's result.

    Args:
        df: input DataFrame; it is copied, the caller's frame is not modified.
        rare_maps: None to learn the rare-category mapping from ``df``
            (train), or a previously returned mapping to reuse (test).

    Returns:
        (transformed DataFrame, rare_maps)
    """
    working = df.copy()

    # Rare encoding
    working, rare_maps = encode_rare_categories(
        working,
        columns={'card3': 200, 'card5': 300},
        rare_maps=rare_maps,
    )

    # Domain-specific transformations, applied one after another
    working = clean_email_domains(working)
    working = create_email_match(working)
    working = consolidate_device_info(working)
    working = extract_screen_features(working)

    return working, rare_maps

In [10]:
# 1. Rare category encoding (reusable) --> used for card3 and card5.
def encode_rare_categories(df, columns, thresh = 200, rare_maps=None):
    """
    Replace rare categories with 'Others'.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame to transform.
        columns: dict {col: threshold}, or a list/tuple of column names that
            all share ``thresh``.
        thresh: minimum count used for every column when ``columns`` is a
            list/tuple; values seen fewer than this many times are rare.
        rare_maps: dict {col: [rare_values]} learned on the train set. When
            None, the mapping is learned from ``df``; otherwise it is reused
            as-is (test set).

    Returns:
        (df, rare_maps)
    """
    is_train = rare_maps is None
    if is_train:
        rare_maps = {}

    # A plain sequence of names means "same threshold everywhere": honour the
    # caller's `thresh` instead of a hard-coded constant.
    if isinstance(columns, (list, tuple)):
        columns = {col: thresh for col in columns}

    for col, min_count in columns.items():
        if col not in df.columns:
            continue

        if is_train:
            counts = df[col].value_counts()
            rare_maps[col] = counts[counts < min_count].index.tolist()

        # Nothing learned for this column (or nothing rare): leave it untouched
        # rather than failing on a missing key.
        rare_values = rare_maps.get(col)
        if not rare_values:
            continue

        # The replacement label is a string, so a numeric column is converted
        # to object explicitly. The decision depends only on the learned
        # mapping, which keeps the dtype identical for train and test frames.
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].astype(object)

        df.loc[df[col].isin(rare_values), col] = 'Others'

    return df, rare_maps


#  Screen resolution features
def extract_screen_features(df):
    """
    Parse id_33 ("<width>x<height>") into width, height, pixels, aspect ratio.

    Adds 'screen_width', 'screen_height', 'total_pixels' and 'aspect_ratio'
    to ``df`` (modified in place and returned). Whenever 'id_33' exists the
    four columns are always created, so train and test frames end up with the
    same schema even if one of them holds only missing or malformed values;
    unparsable parts become NaN.

    Args:
        df: DataFrame, optionally containing an 'id_33' column.

    Returns:
        The same DataFrame; unchanged when 'id_33' is absent.
    """
    if 'id_33' not in df.columns:
        return df

    # Split on the first 'x' only and force exactly two result columns:
    # a frame without any 'x' yields a single column, and a stray extra 'x'
    # must not widen the result and invalidate every other row.
    parts = (
        df['id_33']
        .astype(str)
        .str.split('x', n=1, expand=True)
        .reindex(columns=[0, 1])
    )

    df['screen_width'] = pd.to_numeric(parts[0], errors='coerce')
    df['screen_height'] = pd.to_numeric(parts[1], errors='coerce')
    df['total_pixels'] = df['screen_width'] * df['screen_height']
    df['aspect_ratio'] = (df['screen_width'] / df['screen_height']).round(2)

    return df

def create_interaction_features(df, interactions, prefix='inter'):
    """
    Create interaction features from predefined column pairs.
    Safe for train/test - no data leakage (nothing is learned from the data).

    Each new column is the string "<value of col1>_<value of col2>"; a missing
    value in either column is written as the literal 'missing'.

    Args:
        df: DataFrame (not modified; a copy is returned)
        interactions: list of tuples [(col1, col2), ...]; pairs with a column
            that is absent from df are skipped
        prefix: prefix for new feature names

    Returns:
        df copy with new interaction columns named "<prefix>_<col1>_x_<col2>"
    """
    df = df.copy()

    def _as_label(series):
        # The missing mask must come from the original values: after
        # astype(str) a NaN is already the text 'nan' and fillna finds nothing.
        return series.astype(str).where(series.notna(), 'missing')

    for col1, col2 in interactions:
        if col1 in df.columns and col2 in df.columns:
            new_name = f"{prefix}_{col1}_x_{col2}"
            df[new_name] = _as_label(df[col1]) + '_' + _as_label(df[col2])

    return df

In [11]:
# Train set - learn the rare-category mappings and keep them in rare_maps
train_df, rare_maps = apply_categorical_engineering(train_df, rare_maps=None)

# Test set - reuse the mappings learned on train (the returned mapping is discarded)
test_df, _ = apply_categorical_engineering(test_df, rare_maps=rare_maps)

In [12]:
# # Test email features --> ilerleyen aşamalrda özellik ekleyerek ne kadar CV skoruna etki ettiğine bakabiliriz.
# email_features = ['P_emaildomain_bin', 'R_emaildomain_bin', 'email_match']
# for feat in email_features:
#     auc = test_single_feature(train_df, feat)
#     print(f"{feat}: {auc:.4f}")

In [13]:
# Candidate columns whose pairwise combinations are scanned for interactions.
categorical_to_scan = [
    'P_emaildomain_bin', 'R_emaildomain_bin', 'email_match', 'OS_type', 'Device_name',
    'card5', 'card3', 'card6', 'card4', 'ProductCD', 'DeviceType',
    'screen_width', 'screen_height', 'total_pixels', 'aspect_ratio',
    'id_28', 'id_20', 'id_15', 'id_19',
]

# Step 1: scan the TRAIN frame only, so the choice of pairs never sees test rows
top_combos = scan_all_bivariate_combinations(
    train_df,
    feature_list=categorical_to_scan,
    target='isFraud',
    min_samples=50,
    top_n=20,
)

# Step 2: keep just the column pairs (the fraud rates are not carried over)
interactions = list(zip(top_combos['feature1'], top_combos['feature2']))

# Step 3: build the SAME interaction columns on both train and test
train_df = create_interaction_features(train_df, interactions)
test_df = create_interaction_features(test_df, interactions)

Scanning 171 feature pairs...
Progress: 100/171 pairs processed...

Analysis complete! Found 171 valid combinations.
Top fraud rate: 98.4%


Some original columns are now redundant because the transformations above replaced them with derived features, so they can be dropped.

In [14]:
# Original columns that were transformed; each one has a derived replacement.
drop_cols = [
    'P_emaildomain',      # replaced by P_emaildomain_bin
    'R_emaildomain',      # replaced by R_emaildomain_bin
    'id_30',              # replaced by OS_type
    'DeviceInfo',         # replaced by Device_name
    'id_33',              # replaced by screen_width, screen_height, etc.
]

# errors='ignore' keeps the cell re-runnable: columns that are already gone
# are skipped instead of raising.
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

# Encoding

In [15]:
# Collect the object-dtype columns of train_df (computed once).
object_columns = train_df.select_dtypes(include=['object']).columns.tolist()


cardinality_threshold = 15

# Split the object columns into low and high cardinality
low_cardinality_objects = [col for col in object_columns if train_df[col].nunique() <= cardinality_threshold]
high_cardinality_objects = [col for col in object_columns if train_df[col].nunique() > cardinality_threshold]

print("Düşük Kardinalite Object Sütunları:")
print(low_cardinality_objects)

print("\nYüksek Kardinalite Object Sütunları:")
print(high_cardinality_objects)

# Frequency encoding: the maps are fitted on train and reused for test
train_df, freq_maps = apply_frequency_encoding(train_df, high_cardinality_objects, normalize=True)
test_df, _ = apply_frequency_encoding(test_df, high_cardinality_objects, freq_dict=freq_maps)

# Label encoding: the encoders are fitted on train and reused for test
train_df, label_encoders = apply_label_encoding(train_df, low_cardinality_objects)
test_df, _ = apply_label_encoding(test_df, low_cardinality_objects, encoder_dict=label_encoders)

Düşük Kardinalite Object Sütunları:
['ProductCD', 'card3', 'card4', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'P_emaildomain_bin', 'R_emaildomain_bin', 'email_match', 'OS_type', 'Device_name']

Yüksek Kardinalite Object Sütunları:
['card5', 'id_31', 'inter_card3_x_id_19', 'inter_Device_name_x_card3', 'inter_R_emaildomain_bin_x_aspect_ratio', 'inter_R_emaildomain_bin_x_screen_height', 'inter_R_emaildomain_bin_x_total_pixels', 'inter_ProductCD_x_aspect_ratio', 'inter_ProductCD_x_total_pixels', 'inter_ProductCD_x_screen_height', 'inter_P_emaildomain_bin_x_screen_height', 'inter_P_emaildomain_bin_x_total_pixels', 'inter_P_emaildomain_bin_x_aspect_ratio', 'inter_P_emaildomain_bin_x_id_19', 'inter_R_emaildomain_bin_x_id_19', 'inter_aspect_ratio_x_id_28', 'inter_aspect_ratio_x_id_15', 'inter_email_match_x_screen_width', 'inter_email_match_x_total_pixels', 'inter_id_20_x

#### Evaluate the feature importance (categoric)

In [16]:

def test_single_feature(df, feature, target='isFraud'):
    """
    Quick AUC check for one feature on its own.

    Rows with a missing value in ``feature`` or ``target`` are discarded, the
    rest is split 70/30 (stratified on the target, random_state=42), an
    LGBMClassifier with 100 estimators is fitted on the 70% part, and the
    ROC AUC of its predicted positive-class probability on the 30% part is
    returned.

    Args:
        df: DataFrame holding both ``feature`` and ``target``.
        feature: name of the single predictor column.
        target: name of the binary label column.

    Returns:
        ROC AUC on the held-out 30%.
    """
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import train_test_split

    complete_rows = df[[feature, target]].dropna()
    labels = complete_rows[target]
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        complete_rows[[feature]],
        labels,
        test_size=0.3,
        random_state=42,
        stratify=labels,
    )

    from lightgbm import LGBMClassifier
    classifier = LGBMClassifier(n_estimators=100, random_state=42, verbose=-1)
    classifier.fit(fit_X, fit_y)

    # Column 1 of predict_proba is the probability of the positive class.
    positive_scores = classifier.predict_proba(holdout_X)[:, 1]
    return roc_auc_score(holdout_y, positive_scores)

In [17]:
# Test email features --> in later stages we can add features one by one and check how much each affects the CV score.
email_features = ['P_emaildomain_bin', 'R_emaildomain_bin', 'email_match']
for feat in email_features:
    auc = test_single_feature(train_df, feat)
    print(f"{feat}: {auc:.4f}")

P_emaildomain_bin: 0.5869
R_emaildomain_bin: 0.6728
email_match: 0.6567


 # Numerical Features

 The most striking thing about the numerical features is, of course, their sheer number :) The raw dataset contains more than 383 numerical features.

The vast majority of the numerical features are the "V" features derived by Vesta's engineering team. Their real names are hidden, so dimensionality-reduction methods should be applied to them.

* The group_by_missing_pattern and test_feature_discrimination functions will be used for this.

In [18]:
len(numerical_features)

383

In [19]:
# Resolve the numerical column labels once. Selecting train_df[numerical_features]
# builds a new wide frame (and raises KeyError for any missing label), so doing
# it for every list below would repeat that work five times.
numeric_columns = train_df[numerical_features].columns

# Group the numerical columns by name prefix.
v_cols = [col for col in numeric_columns if col.startswith('V')]
c_cols = [col for col in numeric_columns if col.startswith('C')]
d_cols = [col for col in numeric_columns if col.startswith('D')]
id_cols = [col for col in numeric_columns if col.startswith('id_')]

grouped_cols = set(v_cols + c_cols + d_cols + id_cols)  # used to find the columns outside these groups

indep_cols = [col for col in numeric_columns if col not in grouped_cols]

In [None]:
# v_missing = train_df[v_cols].isnull()

# missing_rates = v_missing.mean().sort_values(ascending=False)

# plt.figure(figsize=(20, 6))
# missing_rates.plot(kind='bar')
# plt.title('V Sütunları Missing Rates')
# plt.ylabel('Missing Rate')
# plt.xlabel('V Columns')
# plt.axhline(0.5, color='red', linestyle='--', label='50% threshold')
# plt.legend()
# plt.tight_layout()
# plt.show()

In [20]:
# Partition the V columns into groups (presumably columns that share the same
# missing-value pattern - confirm in group_by_missing_pattern); every group's
# info dict exposes its members under 'columns'.
v_pattern_groups = group_by_missing_pattern(train_df, v_cols)

representative_v = []

for pattern_id, info in v_pattern_groups.items():
    group_cols = info['columns']

    # A single-column group is its own representative.
    if len(group_cols) == 1:
        representative_v.extend(group_cols)
        continue

    ks_res = test_feature_discrimination(train_df, group_cols, test='ks')

    # Keep the feature in the first result row as the group's representative
    # (presumably the rows are ordered best-first - verify in the helper).
    # NOTE(review): a group with an empty result adds no representative,
    # so all of its columns are removed by the filter below.
    if not ks_res.empty:
        best = ks_res.iloc[0]['Feature']
        representative_v.append(best)

print("Selected Representatives Summary")
print("--------------------------------")
print(f"Original V columns: {len(v_cols)}")
print(f"Selected representatives: {len(representative_v)}")
print(f"Reduction: {(1 - len(representative_v)/len(v_cols)) * 100:.1f}%")

# Filter the V columns - BOTH TRAIN AND TEST
# NOTE(review): Index.difference / Index.union sort their result by default
# when the labels are sortable, so the column order likely changes here.
cols_to_keep = train_df.columns.difference(v_cols).union(pd.Index(representative_v))
train_df = train_df[cols_to_keep]
test_df = test_df[[c for c in cols_to_keep if c in test_df.columns]]

# Refresh the v_cols list so it only holds the V columns that survived
v_cols = [col for col in train_df.columns if col.startswith('V')]

print(f"\nFinal shapes:")
print(f"  Train: {train_df.shape}")
print(f"  Test: {test_df.shape}")

Selected Representatives Summary
--------------------------------
Original V columns: 339
Selected representatives: 14
Reduction: 95.9%

Final shapes:
  Train: (472432, 124)
  Test: (118108, 124)


In [21]:
# Every step below is applied to train_df and test_df separately, in this order.

# 1. Time baseline first
train_df = convert_dt_to_day(train_df)
test_df = convert_dt_to_day(test_df)

# 2. Create UID (needs D1 in original form!)
train_df = create_uid(train_df, uid_cols=['card1', 'addr1', 'D1'])
test_df = create_uid(test_df, uid_cols=['card1', 'addr1', 'D1'])

# 3. NOW normalize D columns (after UID creation)
train_df = normalize_d_columns(train_df)
test_df = normalize_d_columns(test_df)

# 4. Amount features
train_df = extract_amt_decimal(train_df)
test_df = extract_amt_decimal(test_df)

# 5. Apply log transform to TransactionAmt (log1p, i.e. log(1 + x))
train_df['TransactionAmt_log'] = np.log1p(train_df['TransactionAmt'])
test_df['TransactionAmt_log'] = np.log1p(test_df['TransactionAmt'])

# 6. C velocity features
train_df = create_c_velocity_features(train_df)
test_df = create_c_velocity_features(test_df)

# 7. UID aggregations
# NOTE(review): the earlier note on this step said "only on train", yet the
# helper is also called on test_df, which presumably derives the aggregates
# from the test rows alone - confirm which behaviour is intended.
train_df = create_uid_aggregations(train_df, uid_col='uid')
test_df = create_uid_aggregations(test_df, uid_col='uid')

In [None]:
# Columns that must not be used as model inputs: identifiers, the raw time
# column, the target and the uid key.
exclude_cols = ['TransactionID', 'TransactionDT', 'isFraud', 'uid']


# Numeric-dtype columns of train_df, minus the exclusions above.
numerical_features = [
    col
    for col in train_df.select_dtypes(include=[np.number]).columns
    if col not in exclude_cols
]

print(f"Total numerical features: {len(numerical_features)}")

Total numerical features: 160


In [33]:
# Select the "original" numerical columns by name prefix.
# NOTE(review): the prefix match is broad - besides the raw V/C/D/id_ columns
# it also picks up derived ones (e.g. C13_C1_ratio, D8_normalized, D2_uid_std
# show up in the KS table) and presumably any other column starting with 'D',
# such as DeviceType / Device_name - confirm this is intended.
original_numerical_features = [col for col in train_df.columns if col.startswith(('V', 'C', 'D', 'id_')) and col not in exclude_cols]
print(f"Total original numerical features: {len(original_numerical_features)}")


Total original numerical features: 101


In [35]:
# Run test_feature_discrimination with test='ks' for every selected column
# against the isFraud target, on the training part only.
ks_results = test_feature_discrimination(
    train_df,
    columns=original_numerical_features,
    target='isFraud',
    test='ks',
    min_samples=30
)

# Display top results (up to 101 rows)
display(ks_results.head(101))

Unnamed: 0,Feature,Test_Stat,P_Value,Significance,n_fraud,n_normal,Unique_Ratio_Fraud,Unique_Ratio_Normal
0,V258,0.4613,0.000000,***,8174,101761,0.006,0.000
1,D5,0.4414,0.000000,***,8569,210018,0.036,0.003
2,C13_C1_ratio,0.4014,0.000000,***,16599,455833,0.071,0.008
3,V52,0.3835,0.000000,***,11537,321086,0.001,0.000
4,D8_normalized,0.3785,0.000000,***,6247,55363,0.999,0.994
...,...,...,...,...,...,...,...,...
96,D2_uid_std,0.0202,0.000458,***,10775,216130,0.019,0.006
97,id_04,0.0126,0.396785,ns,5645,48445,0.002,0.000
98,id_11,0.0092,0.479335,ns,9036,108760,0.007,0.001
99,id_10,0.0051,0.998398,ns,6247,55363,0.002,0.001


In [None]:
def remove_high_correlation(df, features, threshold=0.95, ks_scores=None):
    """
    Remove highly correlated features.
    Keeps the feature with higher fraud discrimination.

    Every pair of features whose absolute correlation exceeds ``threshold``
    loses the member with the lower KS statistic; on a tie the feature that
    comes first in ``features`` is the one removed.

    Args:
        df: DataFrame containing ``features`` (numeric columns).
        features: list of column names to filter.
        threshold: absolute correlation above which a pair counts as redundant.
        ks_scores: KS statistics, either a DataFrame with 'Feature' and
            'Test_Stat' columns or a mapping {feature: statistic}. When None,
            the notebook-level ``ks_results`` table is used.

    Returns:
        ``features`` without the dropped names, in the original order.

    Raises:
        NameError: ``ks_scores`` is None and no global ``ks_results`` exists.
    """
    if ks_scores is None:
        # Backward-compatible default: fall back to the notebook global.
        ks_scores = globals().get('ks_results')
        if ks_scores is None:
            raise NameError(
                "ks_results is not defined; run the KS test cell first or pass ks_scores"
            )

    # One dictionary lookup per comparison instead of filtering the table
    # inside the nested loops. The first row wins for a duplicated feature.
    if isinstance(ks_scores, pd.DataFrame):
        ks_lookup = (
            ks_scores.drop_duplicates('Feature')
            .set_index('Feature')['Test_Stat']
            .to_dict()
        )
    else:
        ks_lookup = dict(ks_scores)

    def _ks(name):
        # A feature without a KS score ranks below every scored feature
        # instead of aborting the whole run.
        return ks_lookup.get(name, float('-inf'))

    corr_matrix = df[features].corr().abs()

    # Upper triangle to avoid duplicates
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )

    to_drop = set()

    for column in upper.columns:
        # Features listed before `column` that correlate with it above threshold.
        correlated = upper.index[upper[column] > threshold]
        for corr_col in correlated:
            # Keep the one with better KS score
            if _ks(column) < _ks(corr_col):
                to_drop.add(column)
            else:
                to_drop.add(corr_col)

    print(f"Removing {len(to_drop)} highly correlated features (r > {threshold})")
    return [f for f in features if f not in to_drop]

# Apply
# NOTE(review): strong_features and moderate_features are not defined anywhere
# in the visible part of this notebook; they must exist before this call runs.
filtered_features = remove_high_correlation(
    train_df, 
    strong_features + moderate_features, 
    threshold=0.95
)

def cap_outliers(df, columns, lower_percentile=1, upper_percentile=99):
    """
    Clip each listed column to its own percentile band.

    For every column the lower and upper percentiles are computed from ``df``
    itself and values outside that band are replaced by the band's edges.
    The frame is modified in place and returned.
    """
    cut_points = [lower_percentile / 100, upper_percentile / 100]
    for name in columns:
        floor, ceiling = df[name].quantile(cut_points).tolist()
        df[name] = df[name].clip(floor, ceiling)
    return df