## Top 5 purchaser names by transaction count in `fraudTest.csv`

In [2]:
import pandas as pd

# Read the transactions; column 0 of the CSV is used as the row index.
# Adjust the filename/path as needed.
df = pd.read_csv('../fraudTest.csv', index_col=0)

# Count how many rows fall under each (first, last) name pair.
transaction_counts = (
    df.groupby(['first', 'last'])
    .size()
    .reset_index(name='transaction_count')
)

# Order the name pairs from most to fewest transactions.
top_names = transaction_counts.sort_values(by='transaction_count', ascending=False)

print(top_names.head(5))


        first        last  transaction_count
788     Scott      Martin               1965
385   Jeffrey       Smith               1526
314      Gina      Grimes               1474
655  Michelle     Gregory               1466
132    Carrie  Washington               1462


In [None]:
# Extract the five most frequent names as a list of [first, last] pairs for easy iteration.
top_5_names = top_names.head(5)[['first', 'last']].values.tolist()

transactions_by_purchaser = {}

for first, last in top_5_names:
    # Select the rows of the full DataFrame whose first AND last name both match.
    is_purchaser = (df['first'] == first) & (df['last'] == last)
    transactions = df[is_purchaser]

    # Keep the subset around, keyed by the (first, last) tuple.
    transactions_by_purchaser[(first, last)] = transactions

    print(f"Transactions for {first} {last}:")
    print(transactions)


Transactions for Scott Martin:
       trans_date_trans_time            cc_num  \
407      2020-06-21 14:31:28  3502088871723054   
517      2020-06-21 15:08:54  3502088871723054   
770      2020-06-21 16:36:37  3502088871723054   
962      2020-06-21 17:43:07  4334230547694630   
1704     2020-06-21 22:13:27  3502088871723054   
...                      ...               ...   
554914   2020-12-31 19:15:35  4334230547694630   
554924   2020-12-31 19:16:58  4334230547694630   
555016   2020-12-31 19:46:44  3502088871723054   
555029   2020-12-31 19:54:46  4334230547694630   
555401   2020-12-31 22:04:19  3502088871723054   

                                  merchant       category    amt  first  \
407               fraud_Altenwerth-Kilback           home  27.12  Scott   
517                      fraud_Osinski Inc  personal_care  23.33  Scott   
770                    fraud_Hills-Witting   shopping_net  59.46  Scott   
962     fraud_Gutmann, McLaughlin and Wiza           home   9.62  Sc

## Simulating Transactions

In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import load_model

def haversine(lat1, lon1, lat2, lon2):
    """
    Great-circle distance, in kilometers, between two points given as
    latitude/longitude in degrees (haversine formula).

    The arithmetic uses NumPy functions, so scalars and array-likes
    (e.g. pandas Series) are handled element-wise.
    """
    earth_radius_km = 6371

    # The trigonometric functions below expect radians.
    phi1 = np.radians(lat1)
    lam1 = np.radians(lon1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lon2)

    sin_half_dlat = np.sin((phi2 - phi1) / 2)
    sin_half_dlon = np.sin((lam2 - lam1) / 2)

    # Haversine term, then the central angle between the two points.
    hav = sin_half_dlat**2 + np.cos(phi1) * np.cos(phi2) * sin_half_dlon**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

def load_and_preprocess_data(csv_file):
    """
    Read a transactions CSV and add the derived ``distance`` and ``time_diff`` columns.

    Steps, in order:
      1. parse ``trans_date_trans_time`` into datetimes;
      2. sort rows by ``cc_num`` then ``unix_time``;
      3. ``distance``: haversine distance between (lat, long) and (merch_lat, merch_long);
      4. ``time_diff``: difference in ``unix_time`` to the previous row of the same
         ``cc_num`` (presumably seconds), with 0 for each card's first row.

    Returns the sorted DataFrame with the two extra columns.
    """
    raw = pd.read_csv(csv_file)
    raw["trans_date_trans_time"] = pd.to_datetime(raw["trans_date_trans_time"])

    # Per-card chronological order is what the sequence builder relies on.
    ordered = raw.sort_values(by=["cc_num", "unix_time"])

    ordered["distance"] = haversine(
        ordered["lat"], ordered["long"], ordered["merch_lat"], ordered["merch_long"]
    )

    # diff() yields NaN on the first row of every card; replace it with 0.
    gaps = ordered.groupby("cc_num")["unix_time"].diff()
    ordered["time_diff"] = gaps.fillna(0)

    return ordered

def scale_features(df, features, scaler=None):
    """
    Scale the given feature columns of ``df``, overwriting them in place.

    Args:
        df: DataFrame containing the columns named in ``features``; those
            columns are replaced by their scaled values.
        features: List of column names to scale.
        scaler: Optional, already-fitted scaler object exposing ``transform``.
            When provided it is applied as-is, so the scaling learned at
            training time can be reused. When omitted, a new ``MinMaxScaler``
            is fitted on ``df[features]`` (the previous behavior).

    Returns:
        Tuple ``(df, scaler)``: the same DataFrame object and the scaler used.
    """
    if scaler is None:
        # No fitted scaler supplied: fit one on this data. Note that this makes
        # the scaling depend on the data being evaluated.
        scaler = MinMaxScaler()
        df[features] = scaler.fit_transform(df[features])
    else:
        # Reuse the caller's fitted parameters without re-fitting.
        df[features] = scaler.transform(df[features])
    return df, scaler

def create_sequences(df, features, target, sequence_length=10):
    """
    Build sliding-window sequences per card for next-transaction prediction.

    For every ``cc_num`` group (ordered by ``unix_time``), each run of
    ``sequence_length`` consecutive rows of ``features`` becomes one input
    sequence, and the ``target`` value of the row immediately after the run
    becomes its label. Groups with ``sequence_length`` rows or fewer
    contribute nothing.

    Args:
        df: DataFrame with ``cc_num``, ``unix_time``, the feature columns and the target column.
        features: List of feature column names.
        target: Name of the label column.
        sequence_length: Number of rows per input sequence.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. With a list of features, ``X`` has shape
        ``(n_sequences, sequence_length, len(features))`` and ``y`` has shape
        ``(n_sequences,)``. When no sequence can be built, correctly shaped
        empty arrays are returned instead of a bare ``(0,)`` array.
    """
    windows, labels = [], []
    # cc_num is used as the unique identifier of a user.
    for _, group in df.groupby("cc_num"):
        group = group.sort_values("unix_time")
        feature_rows = group[features].values
        fraud_flags = group[target].values
        for start in range(len(group) - sequence_length):
            stop = start + sequence_length
            windows.append(feature_rows[start:stop])
            # Label is the flag of the transaction that follows the window.
            labels.append(fraud_flags[stop])

    if not windows:
        # np.array([]) would collapse to shape (0,); keep the trailing
        # dimensions so downstream code sees a consistent rank.
        empty_X = np.empty((0, sequence_length) + df[features].shape[1:])
        return empty_X, np.empty((0,))

    return np.array(windows), np.array(labels)

def main(csv_file="../fraudTrain.csv", model_path="../fraud_detection_model.h5", sequence_length=10):
    """
    Build per-card transaction sequences from a CSV file and evaluate a saved Keras model on them.

    Args:
        csv_file: Path of the transactions CSV to evaluate on.
        model_path: Path of the saved Keras model file.
        sequence_length: Number of consecutive transactions per input sequence;
            should match the value used when the model was trained.

    NOTE(review): the printed labels say "Test", but the default ``csv_file`` is
    "../fraudTrain.csv". Pass the held-out file explicitly if a true test-set
    evaluation is intended.
    """
    # Load the data and derive the distance / time_diff columns.
    df = load_and_preprocess_data(csv_file)

    # Model inputs and the label column.
    features = ["amt", "distance", "time_diff"]
    target = "is_fraud"

    # Called without a fitted scaler, so scale_features fits one on this data.
    # For results consistent with training, the training-time scaler should be applied.
    df, _ = scale_features(df, features)

    X, y = create_sequences(df, features, target, sequence_length=sequence_length)

    print("Test sequence data shape:", X.shape)
    print("Test target data shape:", y.shape)

    # Load the pre-trained model (the file at model_path must exist).
    model = load_model(model_path)
    model.summary()

    # Unpacking assumes the model was compiled with exactly one metric (accuracy).
    loss, accuracy = model.evaluate(X, y, batch_size=32)
    print("Test Loss: {:.4f}, Test Accuracy: {:.4f}".format(loss, accuracy))

if __name__ == "__main__":
    main()




Test sequence data shape: (1286910, 10, 3)
Test target data shape: (1286910,)


[1m40216/40216[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 953us/step - accuracy: 0.9970 - loss: 0.0135
Test Loss: 0.0133, Test Accuracy: 0.9970


In [88]:
import numpy as np
import pandas as pd
import io
import pickle
import warnings
import logging
from math import radians, cos, sin, asin, sqrt
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Suppress Python warnings and restrict the TensorFlow / absl loggers to errors only.
warnings.filterwarnings('ignore')
for _noisy_logger in ('tensorflow', 'absl'):
    logging.getLogger(_noisy_logger).setLevel(logging.ERROR)


def predict_fraud_for_new_transaction(user_transactions, new_transaction, scaler, max_seq_len, model):
    """
    Return the model's fraud score for a new transaction.

    The new transaction is appended to the user's earlier transactions, the
    resulting rows are scaled with ``scaler``, left-padded to ``max_seq_len``
    time steps, and passed to ``model.predict`` as a batch of one.

    Args:
        user_transactions: List of earlier feature rows for the user. In this
            notebook every transaction is scored independently, so the caller
            passes an empty list.
        new_transaction: Feature row of the transaction to score.
        scaler: Fitted scaler exposing ``transform``.
        max_seq_len: Sequence length the padded input is brought to.
        model: Model exposing ``predict``.

    Returns:
        The first value of the first prediction row.
    """
    rows = np.array(user_transactions + [new_transaction])
    scaled_rows = scaler.transform(rows)

    # padding='pre' places the filler steps before the real transactions.
    batch = pad_sequences([scaled_rows], maxlen=max_seq_len, dtype='float32', padding='pre')

    # verbose=0 silences the per-call prediction output.
    predictions = model.predict(batch, verbose=0)
    return predictions[0][0]


# ------------------------------
# Step 1: Load the Existing Model and Scaler
# ------------------------------

# Sequence length fed to the model; must equal the one used in your training process.
max_seq_len = 10

# Locations of the saved artifacts -- replace with the actual paths to your files.
MODEL_PATH = '../fraud_detection_model.h5'
SCALER_PATH = '../scaler.pkl'

# Restore the pre-trained Keras model; compile=False avoids compilation warnings.
model = load_model(MODEL_PATH, compile=False)

# Restore the fitted scaler object that was pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open(SCALER_PATH, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# ------------------------------
# Step 2: Define the Haversine Function for Distance Calculation
# ------------------------------

def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance between two points on Earth.

    Uses NumPy functions rather than ``math`` so the function accepts both
    scalars and array-likes (e.g. pandas Series). This keeps it interchangeable
    with the earlier NumPy-based ``haversine`` in this notebook, which this
    definition replaces once the cell runs.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
        lat2, lon2: Latitude and longitude of the second point, in decimal degrees.

    Returns:
        Distance in kilometers (a NumPy float for scalar inputs, an array-like
        for array inputs).
    """
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    # Haversine formula calculation
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# ------------------------------
# Step 3: Load and Process Test Data
# ------------------------------

# Read the transactions to score.
df = pd.read_csv("test.csv")

# Remove the label column, when present, before any processing.
if 'is_fraud' in df.columns:
    df = df.drop(columns='is_fraud')

# One group per user, identified by first name + last name.
user_groups = df.groupby(['first', 'last'])

for (first, last), user_transactions in user_groups:
    print(f"\nProcessing transactions for {first} {last}")

    # Go through this user's transactions in time order.
    user_transactions = user_transactions.sort_values('unix_time')

    for idx, transaction in user_transactions.iterrows():
        amt = float(transaction['amt'])

        # Distance between the user's location and the merchant's location.
        user_lat, user_lon, merch_lat, merch_lon = (
            float(transaction[col]) for col in ('lat', 'long', 'merch_lat', 'merch_long')
        )
        distance = haversine(user_lat, user_lon, merch_lat, merch_lon)

        # Independent evaluation: no gap to a previous transaction and no history.
        time_diff = 0
        history = []

        transaction_features = [amt, distance, time_diff]

        fraud_probability = predict_fraud_for_new_transaction(
            history, transaction_features, scaler, max_seq_len, model
        )

        print(f"Transaction {transaction['trans_num']}: Fraud Probability = {fraud_probability:.4f}")
      



Processing transactions for Bill Zhang
Transaction c3939b412c44c4eece77f4a527479629: Fraud Probability = 0.4636
Transaction 3d11340fab65815a86edd2c5386dd664: Fraud Probability = 0.5749
Transaction 9494a3f97b837dccdccbfca029b48d0f: Fraud Probability = 0.8581

Processing transactions for Lia Sindhunirmala
Transaction 902360c2990e949a8f7fbc2bd28322a0: Fraud Probability = 0.4791
Transaction d8103c37a1dbb77b12f083a597476478: Fraud Probability = 0.5883
Transaction f5e607cb00ed9926334429a02c7ec212: Fraud Probability = 0.8232

Processing transactions for Lisa Juan
Transaction 765ed5fc5af9dc18191ef41c4b5670f1: Fraud Probability = 0.4645
Transaction 9ab1ae384dceeb899cd4f859f87ca9ab: Fraud Probability = 0.7642
Transaction 4c3feb6a6795326731c7e8ceaa5a0671: Fraud Probability = 0.8311

Processing transactions for Warren Yun
Transaction 45e3842baf79e19f647b64c2174200e6: Fraud Probability = 0.5697
Transaction e3727fc6901a29a76e573b9c52e96c5b: Fraud Probability = 0.8665
Transaction 7cd3d0445bbee5a45f6

# Running the method