In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2

In [2]:
df_transactions = pd.read_json('data/transactions.json', lines=True)
df_users = pd.read_csv('data/users.csv')
df_merchants = pd.read_csv('data/merchants.csv')

In [3]:
df_merged = pd.merge(df_transactions, df_users, on='user_id', how='left')

# Then, join the result with df_merchants on merchant_id
df_merged = pd.merge(df_merged, df_merchants, on='merchant_id', how='left')

df = df_merged

In [4]:
categorical_cols = [
    'channel', 'currency', 'device', 'payment_method', 'category',
    'country_x', 'country_y', 'sex', 'education', 'primary_source_of_income'
]

binary_cols = [
    'is_international', 'is_first_time_merchant', 'has_fraud_history'
]

numerical_cols = [
    'amount', 'session_length_seconds', 'age', 'risk_score', 'trust_score',
    'number_of_alerts_last_6_months', 'avg_transaction_amount',
    'account_age_months', 'sum_of_monthly_expenses', 'sum_of_monthly_installments'
]

In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['signup_date'] = pd.to_datetime(df['signup_date'])

# Temporal features
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['days_since_signup'] = (df['timestamp'] - df['signup_date']).dt.total_seconds() / (3600 * 24)

numerical_cols += ['hour', 'day_of_week', 'days_since_signup']

# Location
df['lat'] = df['location'].apply(lambda x: x['lat'] if isinstance(x, dict) else np.nan)
df['long'] = df['location'].apply(lambda x: x['long'] if isinstance(x, dict) else np.nan)

numerical_cols += ['lat', 'long']

In [6]:
# Process categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Process binary variables
for col in binary_cols:
    df[col] = df[col].astype(int)

# Normalize numerical features
scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   transaction_id                  500000 non-null  object        
 1   timestamp                       500000 non-null  datetime64[ns]
 2   user_id                         500000 non-null  object        
 3   merchant_id                     500000 non-null  object        
 4   amount                          500000 non-null  float64       
 5   channel                         500000 non-null  int32         
 6   currency                        500000 non-null  int32         
 7   device                          500000 non-null  int32         
 8   location                        500000 non-null  object        
 9   payment_method                  500000 non-null  int32         
 10  is_international                500000 non-null  int32  

In [8]:
#Create Time-Based Sequences
target_column = 'is_fraud'
user_column = 'user_id'
time_window_hours = 24  # Look back window of 24 hours
sequence_length = 10    # Max transactions to consider per window
padding_value = 0  

data_sequences = []
labels = []
sequence_indices = []  # To track which rows correspond to which sequences

# Group by user
grouped = df.groupby(user_column)

for user_id, user_data in grouped:
    # Sort by timestamp
    user_data = user_data.sort_values(by='timestamp')
    
    # Convert to numpy arrays
    user_data_values = user_data[numerical_cols].values
    timestamps = user_data['timestamp'].values
    
    # Convert timestamps to numpy datetime64
    if not np.issubdtype(timestamps.dtype, np.datetime64):
        timestamps = np.array([np.datetime64(ts) for ts in timestamps])
    
    for i in range(len(user_data)):
        current_time = timestamps[i]
        time_threshold = current_time - np.timedelta64(time_window_hours, 'h')
        
        # Get indices of transactions within last 24 hours
        mask = (timestamps >= time_threshold) & (timestamps <= current_time)
        window_indices = np.where(mask)[0]
        
        # Get transactions within window
        window_data = user_data_values[window_indices]
        
        if len(window_data) > 0:
            # Pad if fewer than sequence_length transactions
            if len(window_data) < sequence_length:
                pad_length = sequence_length - len(window_data)
                padding = np.full((pad_length, len(numerical_cols)), padding_value)
                window_data = np.vstack([padding, window_data])
            
            # Take most recent sequence_length transactions
            window_data = window_data[-sequence_length:]
            
            data_sequences.append(window_data)
            labels.append(user_data[target_column].iloc[i])
            sequence_indices.append(user_data.index[i])  # Store the original index

# Convert to numpy arrays
data_sequences = np.array(data_sequences)
labels = np.array(labels)

In [13]:
# Create Transaction-Count-Based Sequences
target_column = 'is_fraud'
user_column = 'user_id'
sequence_length = 7    # Last 7 transactions to consider
padding_value = 0  

data_sequences = []
labels = []
sequence_indices = []  # To track which rows correspond to which sequences

# Group by user
grouped = df.groupby(user_column)

for user_id, user_data in grouped:
    # Sort by timestamp to ensure chronological order
    user_data = user_data.sort_values(by='timestamp')
    
    # Convert to numpy arrays
    user_data_values = user_data[numerical_cols].values
    
    for i in range(len(user_data)):
        # Get indices of previous transactions (up to sequence_length-1)
        start_idx = max(0, i - (sequence_length - 1))
        window_indices = range(start_idx, i + 1)
        
        # Get the sequence of transactions
        window_data = user_data_values[window_indices]
        
        # Pad if fewer than sequence_length transactions
        if len(window_data) < sequence_length:
            pad_length = sequence_length - len(window_data)
            padding = np.full((pad_length, len(numerical_cols)), padding_value)
            window_data = np.vstack([padding, window_data])
        
        data_sequences.append(window_data)
        labels.append(user_data[target_column].iloc[i])
        sequence_indices.append(user_data.index[i])  # Store the original index

# Convert to numpy arrays
data_sequences = np.array(data_sequences)
labels = np.array(labels)

In [14]:
# Create DataFrame index to position mapping
index_to_pos = {idx: pos for pos, idx in enumerate(df.index)}

# Get the positions of the last transaction in each sequence
seq_positions = [index_to_pos[idx] for idx in sequence_indices]

# Split into train and test sets (80-20 split)
(X_seq_train, X_seq_test, 
 seq_pos_train, seq_pos_test,
 y_train, y_test) = train_test_split(
    data_sequences,
    seq_positions,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

# Prepare input dictionaries
def create_input_dict(sequences, positions, df):
    inputs = {'numerical_seq': sequences}
    
    # Add categorical inputs
    for col in categorical_cols:
        inputs[f'{col}_input'] = df.iloc[positions][col].values.reshape(-1, 1)
    
    # Add binary features
    inputs['binary_features'] = df.iloc[positions][binary_cols].values
    
    return inputs

train_data = create_input_dict(X_seq_train, seq_pos_train, df)
test_data = create_input_dict(X_seq_test, seq_pos_test, df)

In [15]:
# Analyze categorical feature cardinalities
categorical_cardinalities = {col: df[col].nunique() for col in categorical_cols}
embedding_dims = {
    col: min(50, max(2, card // 2 + 1)) 
    for col, card in categorical_cardinalities.items()
}

In [16]:
def create_deep_rnn_model(sequence_length, numerical_cols, categorical_cols, binary_cols, 
                         categorical_cardinalities, embedding_dims):
    # Numerical sequence input (Deep LSTM path)
    numerical_input = Input(shape=(sequence_length, len(numerical_cols)), name='numerical_seq')
    
    # Deep LSTM layers (3 layers)
    lstm_layer1 = LSTM(128, return_sequences=True, name='LSTM_1')(numerical_input)
    lstm_layer1 = Dropout(0.2)(lstm_layer1)
    
    lstm_layer2 = LSTM(128, return_sequences=True, name='LSTM_2')(lstm_layer1)
    lstm_layer2 = Dropout(0.2)(lstm_layer2)
    
    lstm_layer3 = LSTM(128, name='LSTM_3')(lstm_layer2)
    lstm_layer3 = Dropout(0.2)(lstm_layer3)
    
    # Entity embeddings for categorical features
    categorical_inputs = []
    categorical_embeddings = []
    
    for col in categorical_cols:
        input_layer = Input(shape=(1,), name=f'{col}_input')
        categorical_inputs.append(input_layer)
        
        embedding = Embedding(
            input_dim=categorical_cardinalities[col] + 1,
            output_dim=embedding_dims[col],
            name=f'{col}_embedding',
            embeddings_regularizer=l2(1e-4)
        )(input_layer)
        
        flattened = Flatten()(embedding)
        flattened = Dropout(0.1)(flattened)
        categorical_embeddings.append(flattened)
    
    # Binary features input
    binary_input = Input(shape=(len(binary_cols),), name='binary_features')
    
    # Combine all features
    combined = Concatenate()([lstm_layer3] + categorical_embeddings + [binary_input])
    
    # Feedforward 
    x = Dense(256, activation='relu')(combined)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    
    x = Dense(128, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    # Output layer
    output = Dense(1, activation='sigmoid')(x)
    
    # Create model
    model = Model(
        inputs=[numerical_input] + categorical_inputs + [binary_input],
        outputs=output
    )
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
    )
    
    return model

In [17]:
train_data.keys()

dict_keys(['numerical_seq', 'channel_input', 'currency_input', 'device_input', 'payment_method_input', 'category_input', 'country_x_input', 'country_y_input', 'sex_input', 'education_input', 'primary_source_of_income_input', 'binary_features'])

In [20]:
# Create the model with correct input names
model = create_deep_rnn_model(
    sequence_length=sequence_length,
    numerical_cols=numerical_cols,
    categorical_cols=categorical_cols,
    binary_cols=binary_cols,
    categorical_cardinalities=categorical_cardinalities,
    embedding_dims=embedding_dims
)

# Compute class weights
class_weights = compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}

# Train the model
history = model.fit(
    train_data,
    y_train,
    batch_size=128,
    epochs=50,
    validation_data=(test_data, y_test),
    class_weight=class_weight_dict,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_auc',
            mode='max',
            patience=5,
            restore_best_weights=True,
            verbose=1
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_auc',
            mode='max',
            factor=0.1,
            patience=5,
            min_lr=1e-6,
            verbose=1
        ),
        tf.keras.callbacks.ModelCheckpoint(
            'best_model.keras',
            monitor='val_auc',
            mode='max',
            save_best_only=True,
            verbose=1
        )
    ],
    verbose=1
)

Epoch 1/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.5081 - auc: 0.5243 - loss: 0.7487
Epoch 1: val_auc improved from -inf to 0.55311, saving model to best_model.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 22ms/step - accuracy: 0.5081 - auc: 0.5243 - loss: 0.7487 - val_accuracy: 0.6812 - val_auc: 0.5531 - val_loss: 0.6584 - learning_rate: 0.0010
Epoch 2/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5146 - auc: 0.5482 - loss: 0.6906
Epoch 2: val_auc improved from 0.55311 to 0.55653, saving model to best_model.keras
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 21ms/step - accuracy: 0.5146 - auc: 0.5482 - loss: 0.6906 - val_accuracy: 0.5104 - val_auc: 0.5565 - val_loss: 0.6922 - learning_rate: 0.0010
Epoch 3/50
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.4963 - auc: 0.5445 - loss: 0.6934
Ep

In [21]:
test_results = model.evaluate(test_data, y_test)
print("\nTest Set Evaluation:")
print(f"Loss: {test_results[0]:.4f}")
print(f"Accuracy: {test_results[1]:.4f}")
print(f"AUC: {test_results[2]:.4f}")


y_pred = model.predict(test_data)
y_pred_classes = (y_pred > 0.4).astype(int)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))

[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - accuracy: 0.4989 - auc: 0.5749 - loss: 0.6742

Test Set Evaluation:
Loss: 0.6741
Accuracy: 0.4993
AUC: 0.5737
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     91518
           1       0.12      0.04      0.06      8482

    accuracy                           0.89    100000
   macro avg       0.52      0.51      0.50    100000
weighted avg       0.85      0.89      0.87    100000


Confusion Matrix:
[[89106  2412]
 [ 8151   331]]


In [None]:
len(df)

500000

In [26]:
from sklearn.metrics import accuracy_score
# y_pred = model.predict(test_data)
y_pred_classes = (y_pred > 0.55).astype(int)

print("\nModel Evaluation:")
print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_classes)))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))


Model Evaluation:
Accuracy: 0.7103

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.75      0.82     91518
           1       0.11      0.33      0.16      8482

    accuracy                           0.71    100000
   macro avg       0.52      0.54      0.49    100000
weighted avg       0.85      0.71      0.77    100000


Confusion Matrix:
[[68220 23298]
 [ 5673  2809]]


In [28]:
from sklearn.metrics import accuracy_score
# y_pred = model.predict(test_data)
y_pred_classes = (y_pred > 0.525).astype(int)

# Calculate metrics
print("\nModel Evaluation:")
print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_classes)))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))


Model Evaluation:
Accuracy: 0.5742

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.58      0.71     91518
           1       0.10      0.53      0.17      8482

    accuracy                           0.57    100000
   macro avg       0.52      0.55      0.44    100000
weighted avg       0.86      0.57      0.67    100000


Confusion Matrix:
[[52954 38564]
 [ 4015  4467]]


In [29]:
from sklearn.metrics import accuracy_score
# y_pred = model.predict(test_data)
y_pred_classes = (y_pred > 0.575).astype(int)

# Calculate metrics
print("\nModel Evaluation:")
print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred_classes)))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_classes))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_classes))


Model Evaluation:
Accuracy: 0.8420

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.91     91518
           1       0.12      0.14      0.13      8482

    accuracy                           0.84    100000
   macro avg       0.52      0.52      0.52    100000
weighted avg       0.85      0.84      0.85    100000


Confusion Matrix:
[[83049  8469]
 [ 7335  1147]]
