In [None]:
# Libraries
from datasets import load_dataset

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# https://huggingface.co/datasets/Nooha/cc_fraud_detection_dataset
ds = load_dataset("Nooha/cc_fraud_detection_dataset")
df = ds["train"].to_pandas() # convert to pandas dataframe

In [3]:
# Split into train and test 
raw_train_df = df.copy()
raw_test_df = df.copy()

# Train: Jan 2021 - Apr 2023
raw_train_df['trans_date'] = pd.to_datetime(raw_train_df['trans_date'], errors = 'coerce')
raw_train_df = raw_train_df.loc[raw_train_df['trans_date'] < '2023-05-01']

# Test: May 2023 - Dec 2023
raw_test_df['trans_date'] = pd.to_datetime(raw_test_df['trans_date'], errors = 'coerce')
raw_test_df = raw_test_df.loc[raw_test_df['trans_date'] >= '2023-05-01']

# Check train and test data split
print(f"Train-test split size: {round(len(raw_train_df)/len(df), 2) * 100}/{round(len(raw_test_df)/len(df), 2)}")

Train-test split size: 75.0/0.25


In [4]:
# Uncomment to save a copy of prepared train and test dfs into csv files
#raw_train_df.to_csv("../../data/raw_train_data.csv", index = False)
#raw_test_df.to_csv("../../data/raw_test_data_raw.csv", index = False)

# Preprocessing
Drop or alter categorical columns

In [5]:
def calc_age(data):
    est = data['year'] - data['dob'].year
    if (data['trans_date'].month < data['dob'].month) or ((data['trans_date'].month == data['dob'].month) and (data['trans_date'].day < data['dob'].day)):
        return est - 1
    else:
        return est

In [6]:
# Apply data cleaning (similar to EDA)
def clean_df(df):

    # Convert date and time to appropriate data types
    df['dob'] = pd.to_datetime(df['dob'], errors = 'coerce') 
    df['trans_time'] = pd.to_datetime(df['trans_time'], format='%H:%M:%S')

    # Extract transaction time features
    df['hour'] = df['trans_time'].dt.hour
    df['day_of_week'] = df['trans_date'].dt.dayofweek
    df['month'] = df['trans_date'].dt.month
    df['year'] = df['trans_date'].dt.year
    df['trans_datetime'] = pd.to_datetime(df['trans_date'].dt.date.astype(str) + ' ' + df['trans_time'].dt.time.astype(str))

    # Sort df by ssn and transaction time
    df = df.sort_values(['ssn', 'trans_datetime'])

    # Calculate age at the time of the transaction 
    df['age'] = df.apply(lambda x: calc_age(x), axis = 1)

    # Convert gender to binary
    df['gender'] = (df['gender'] == 'M').astype(int)

    # Convert category to categorical encoding
    df['category']=df['category'].astype('category').cat.codes
    
    # Drop categorical and unused columns
    df = df.drop(columns=['ssn', 'first', 'last', 'cc_num', 'acct_num', 'city', 'state', 'merchant', 'job', 'trans_datetime', 'unix_time', 'trans_num', 'trans_date', 'trans_time', 'dob'])

    return df

raw_train = clean_df(raw_train_df)
raw_test = clean_df(raw_test_df)

In [7]:
y_train, x_train = raw_train['is_fraud'], raw_train.drop(columns=['is_fraud'])


# Model

In [8]:
model = LogisticRegression(solver='saga')
model.fit(x_train, y_train)

y_pred = model.predict(x_train) 



In [9]:
f1_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='f1')
print(f"Mean F1 Score: {f1_scores.mean():.4f}")

auc_scores = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc')
print(f"Mean AUC: {auc_scores.mean():.4f}")



Mean F1 Score: 0.0000




Mean AUC: 0.5089


