In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, average_precision_score
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Load the datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Preprocess the data
def preprocess_data(df):
    # Convert categorical variables into dummy/indicator variables
    df = pd.get_dummies(df, columns=['category', 'gender', 'state', 'job'])
    
    # Convert date columns to datetime
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['dob'] = pd.to_datetime(df['dob'])
    
    # Extract useful features from date columns
    df['trans_year'] = df['trans_date'].dt.year
    df['trans_month'] = df['trans_date'].dt.month
    df['trans_day'] = df['trans_date'].dt.day
    df['dob_year'] = df['dob'].dt.year
    df['dob_month'] = df['dob'].dt.month
    df['dob_day'] = df['dob'].dt.day
    
    # Create interaction features
    df['amt_trans_hour'] = df['amt'] * pd.to_datetime(df['trans_time'], format='%H:%M:%S').dt.hour
    df['age_amt'] = (2023 - df['dob'].dt.year) * df['amt']
    
    # Binning continuous features
    df['amt_bin'] = pd.cut(df['amt'], bins=10, labels=False)
    df['age_bin'] = pd.cut(2023 - df['dob'].dt.year, bins=10, labels=False)
    
    # Drop columns that are not useful for prediction
    df.drop(['trans_num', 'trans_date', 'trans_time', 'unix_time', 'cc_num', 'first', 'last', 'street', 'city', 'zip', 'lat', 'long', 'dob', 'merchant', 'merch_lat', 'merch_long'], axis=1, inplace=True)
    
    return df

train = preprocess_data(train)
test = preprocess_data(test)

# Separate features and target variable from training data
X = train.drop('is_fraud', axis=1)
y = train['is_fraud']

# Address class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test = scaler.transform(test)

# Define the LightGBM model with class weight adjustment and early stopping
model = lgb.LGBMClassifier(
    random_state=42, 
    learning_rate=0.1, 
    n_estimators=2000, 
    num_leaves=31, 
    max_depth=-1, 
    min_child_samples=20,
    reg_alpha=0.1,  # L1 regularization
    reg_lambda=0.1,  # L2 regularization
    class_weight='balanced',  # Adjust class weights
)

# Train the model with stratified K-fold cross-validation
skf = StratifiedKFold(n_splits=5)
for train_index, val_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], eval_metric='binary_logloss')

# Make predictions on the validation set with probability threshold adjustment
y_val_pred_proba = model.predict_proba(X_val)[:, 1]
optimal_threshold = 0.5  # You can adjust this threshold based on your validation set performance
y_val_pred = (y_val_pred_proba >= optimal_threshold).astype(int)

# Evaluate the model using various metrics for imbalanced data
f1 = f1_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred_proba)
average_precision = average_precision_score(y_val, y_val_pred_proba)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"ROC AUC Score: {roc_auc}")
print(f"Average Precision Score: {average_precision}")
print(f"F1 Score: {f1}")

# Make predictions on the test set with probability threshold adjustment
test_predictions_proba = model.predict_proba(test)[:, 1]
test_predictions = (test_predictions_proba >= optimal_threshold).astype(int)

# Create a submission file
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions

submission.to_csv('submission16.csv', index=False)

print("Submission file created successfully.")

[LightGBM] [Info] Number of positive: 210070, number of negative: 210286
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.483221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3560
[LightGBM] [Info] Number of data points in the train set: 420356, number of used features: 717
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 210071, number of negative: 210286
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.021301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3562
[LightGBM] [Info] Number of data points in the train set: 420357, number of used features: 717
[LightGBM] [