In [15]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [10]:
# Load data
train_path = 'data/train.csv'
test_path = 'data/test.csv'
sample_submission_path = 'data/sample_submission.csv'

# Read data
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

In [None]:
# Feature Engineering
def preprocess_and_engineer_features(df):
    # Date and time features
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['trans_year'] = df['trans_date'].dt.year
    df['trans_month'] = df['trans_date'].dt.month
    df['trans_day'] = df['trans_date'].dt.day
    df['trans_weekday'] = df['trans_date'].dt.weekday
    df['is_weekend'] = df['trans_weekday'].isin([5, 6]).astype(int)

    # Transaction time
    df['trans_time'] = pd.to_datetime(df['trans_time'], format='%H:%M:%S').dt.hour * 3600 + \
                       pd.to_datetime(df['trans_time'], format='%H:%M:%S').dt.minute * 60 + \
                       pd.to_datetime(df['trans_time'], format='%H:%M:%S').dt.second
    # Average Spend
    df['average_spend'] = df.groupby('cc_num')['amt'].transform('mean')

    # Recency spend ratio
    # Calculate rolling average spend for each cardholder
    df['recency_spend'] = df.groupby('cc_num')['amt'].rolling(window=10, min_periods=1).mean().reset_index(0, drop=True)

    # Calculate the recency spend ratio
    df['recency_spend_ratio'] = df['amt'] / df['recency_spend']

    # average spend at a given merchant
    df['merchant_average_spend'] = df.groupby('merchant')['amt'].transform('mean')

    # Calculate distance between user and merchant
    df['distance'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + (df['long'] - df['merch_long'])**2)

    # Log-transform transaction amount
    df['log_amt'] = np.log1p(df['amt'])

    # Ensure no missing values in cc_num
    df['cc_num'] = df['cc_num'].fillna(-1)
    df['cc_num'] = df['cc_num'].fillna(-1)

    # Calculate percentage change in spending for each cardholder
    df['spend_percentage_change'] = df.groupby('cc_num')['amt'].pct_change().fillna(0)
    df['spend_percentage_change'] = df.groupby('cc_num')['amt'].pct_change().fillna(0)

    # Extract transaction day
    df['trans_day'] = pd.to_datetime(df['trans_date']).dt.date

    # Calculate velocity (number of transactions per day per cardholder)
    df['velocity'] = df.groupby(['cc_num', 'trans_day'])['trans_num'].transform('count')

    # Extract age from DOB
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = (df['trans_date'] - df['dob']).dt.days // 365

    return df

In [16]:
# Preprocess datasets
train_df = preprocess_and_engineer_features(train_df)
test_df = preprocess_and_engineer_features(test_df)

ValueError: time data "24579" doesn't match format "%H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
# Separate features and target
# Define the target column as 'is_fraud'
# Drop unnecessary columns from train and test datasets to keep only relevant features
target = 'is_fraud'
features = train_df.drop(columns=['id', 'trans_num', 'trans_date', 'recency_spend_ratio', 'average_spend','distance', 'spend_percentage_change','velocity', 'trans_time', 'dob', 'is_fraud'])
test_features = test_df.drop(columns=['id', 'trans_num', 'trans_date','recency_spend_ratio', 'average_spend', 'distance', 'spend_percentage_change', 'velocity', 'trans_time', 'dob'])

# Separate numerical and categorical columns
# Identify numerical columns as those with data types 'float64' or 'int64'
# Identify categorical columns as those with the data type 'object'
numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = features.select_dtypes(include=['object']).columns

# Define preprocessing for numerical features
# Apply mean imputation for missing values and scale the data using StandardScaler
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Define preprocessing for categorical features
# Apply most frequent imputation for missing values and one-hot encoding to handle categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors into a ColumnTransformer
# Apply the numerical transformer to numerical columns and the categorical transformer to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Preprocess the data
# Define X as the feature set and y as the target for the training data
# Define X_test as the preprocessed test feature set
X = features
y = train_df[target]
X_test = test_features

In [None]:
# Train-test split
# Split the data into training and validation sets (80% train, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the preprocessor
# Fit the preprocessor on the training data and transform both training and validation sets
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
# Transform the test data using the same preprocessor
X_test = preprocessor.transform(X_test)

# Train the model
# originally was using random forest but I realized it was much slower and 
# less accurate to XGBClassifier
model = XGBClassifier(n_estimators=1000, max_depth=20, scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), learning_rate=0.3, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_val_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_val_pred)
print(f"Validation F1-Score: {val_f1:.4f}")

# Predict on the test dataset
# Predict the validation set labels and calculate the F1-Score for validation
test_predictions = model.predict(X_test)

# Create the submission file
# Copy the sample submission file and add predictions to the 'is_fraud' column
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions
# Save the submission file as 'submission.csv'
submission.to_csv('submission.csv', index=False)

# Confirm successful creation of the submission file
print("Submission file 'submission.csv' created successfully.")

Validation F1-Score: 0.9727
Submission file 'submission.csv' created successfully.
