<a href="https://colab.research.google.com/github/canterville184/FraudDetection/blob/main/FraudDetection_feature_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U 'hopsworks[python]' --quiet

In [None]:
!pip install faker



In [None]:
import pandas as pd
import os
import numpy as np
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset archive stored under
# MyDrive can be read by the cells below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the transaction log CSV straight out of the zip archive on Drive.
# Both the archive and the member stream are opened in context managers so the
# file handles are released as soon as the frame has been read.
with zipfile.ZipFile('/content/drive/MyDrive/Colab Notebooks/ML_portfolio/FraudDetection/synthetic_financial_dataset_fraud_detection.zip', 'r') as zip_data:
    with zip_data.open('PS_20174392719_1491204439457_log.csv') as csv_member:
        data = pd.read_csv(csv_member)



In [None]:
# Preview the first rows of the freshly loaded transaction log.
data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


# Feature Engineering

In [None]:
from datetime import timedelta
import random

def random_date(start, end):
    """Return a random datetime in the half-open interval ``[start, end)``.

    The result is ``start`` plus a whole number of seconds drawn with
    ``random.randrange``, so any sub-second part of the span is ignored.

    Args:
        start: Lower bound (inclusive), a ``datetime``.
        end: Upper bound (exclusive), a ``datetime``.

    Returns:
        A ``datetime`` ``d`` with ``start <= d < end``. When the two bounds are
        less than one second apart, ``start`` itself is returned.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    # Floor division by one second yields the whole-second span, the same value
    # as ``delta.days * 86400 + delta.seconds``.
    span_seconds = (end - start) // timedelta(seconds=1)
    if span_seconds < 0:
        raise ValueError(
            'end ({}) must not be earlier than start ({})'.format(end, start)
        )
    if span_seconds == 0:
        # randrange(0) would raise; an empty span has exactly one candidate.
        return start
    return start + timedelta(seconds=random.randrange(span_seconds))

In [None]:
from datetime import datetime

# Bounds of the window from which the random base timestamp is later drawn.
_stamp_format = '%m/%d/%Y %I:%M %p'
d1, d2 = (
    datetime.strptime(stamp, _stamp_format)
    for stamp in ('1/1/2018 1:30 PM', '1/1/2019 4:50 AM')
)

# Smoke-check the helper by printing one sample from the window.
sample_date = random_date(d1, d2)
print(sample_date)

2018-11-26 18:39:06


In [None]:
# Set a date for the transaction based on step.
# A single random base timestamp is drawn, then each row is shifted forward by
# its `step` value interpreted as a number of hours.
x_date = random_date(d1, d2)
# Vectorised: one timedelta conversion for the whole column instead of
# constructing a pd.DateOffset per row in a Python-level loop.
data['date'] = pd.Timestamp(x_date) + pd.to_timedelta(data['step'], unit='h')

In [None]:
# Give every row a 1-based sequential identifier: it becomes the frame's index
# and is also exposed as the `transaction_id` column.
row_ids = np.arange(1, len(data) + 1)
data.index = row_ids
data['transaction_id'] = data.index

In [None]:
from faker import Faker

# Create the Faker generator used for all synthetic location/device columns.
fake = Faker()

n_rows = len(data)

# One random state abbreviation per transaction, then a zipcode drawn for that state.
states = [fake.state_abbr() for _ in range(n_rows)]
data['state'] = states
data['zipcode'] = [fake.zipcode_in_state(abbr) for abbr in states]

# Independent synthetic network identifiers for every transaction.
data['ipv4'] = [fake.ipv4() for _ in range(n_rows)]
data['mac_address'] = [fake.mac_address() for _ in range(n_rows)]


In [None]:
# Preview the frame again, now with the date, transaction_id and synthetic
# state / zipcode / ipv4 / mac_address columns.
data.head()


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,date,transaction_id,state,zipcode,ipv4,mac_address
1,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,2018-02-12 12:47:34,1,IN,47952,63.246.78.18,0e:34:d4:31:82:87
2,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,2018-02-12 12:47:34,2,ID,83509,98.202.124.92,5a:52:9e:a5:44:0d
3,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,2018-02-12 12:47:34,3,NH,3211,164.179.147.212,08:2b:76:89:b5:f3
4,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,2018-02-12 12:47:34,4,AZ,86410,102.172.178.234,e8:c8:17:19:a9:c3
5,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,2018-02-12 12:47:34,5,CA,95865,140.16.79.169,6c:0a:a1:51:08:61


# Customer Profile Information
- Customer Account Age: Helps assess risk based on account longevity.
- Transaction Frequency: Number of transactions per customer over different time windows (hourly, daily, weekly, etc.).
- Historical Fraud Flag: Has the customer been involved in past fraudulent activity?

In [None]:
# Inspect the column names currently present in the transactions frame.
data.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud', 'date', 'transaction_id', 'state', 'zipcode', 'ipv4',
       'mac_address'],
      dtype='object')

# Creating Feature Groups in Feature Store

In [None]:
# Every distinct account name that appears as either sender or recipient.
customer_list = list(set(data['nameOrig']) | set(data['nameDest']))

In [None]:
# Class balance of the fraud label: number of rows with isFraud == 0 vs 1.
data['isFraud'].value_counts()

Unnamed: 0_level_0,count
isFraud,Unnamed: 1_level_1
0,6354407
1,8213


In [None]:
import random

max_account_age = 120
limit = 20


def build_customer_profiles(transactions, max_account_age, limit):
    """Build one profile row per account that appears in ``transactions``.

    An account is "involved" in a transaction when it is the sender
    (``nameOrig``) or the recipient (``nameDest``). All per-account statistics
    are computed with a single groupby instead of re-filtering the whole frame
    once per account.

    Args:
        transactions: Frame with ``nameOrig``, ``nameDest``, ``date``
            (datetime-like) and ``isFraud`` columns.
        max_account_age: Upper bound (inclusive) of the synthetic account age.
        limit: Accounts with more than one fraudulent transaction get an age in
            ``[0, limit]``; all others get an age in ``[limit, max_account_age]``.

    Returns:
        DataFrame indexed 1..N with columns ``customer_id``, ``account_age``
        and ``trans_per_day``, ordered by first appearance of each account.
    """
    # Long format: one row per (account, transaction) involvement. A row whose
    # sender and recipient are the same account is counted once, as before.
    as_sender = transactions[['nameOrig', 'date', 'isFraud']].rename(
        columns={'nameOrig': 'customer_id'}
    )
    distinct_recipient = transactions['nameDest'] != transactions['nameOrig']
    as_recipient = transactions.loc[
        distinct_recipient, ['nameDest', 'date', 'isFraud']
    ].rename(columns={'nameDest': 'customer_id'})
    involvement = pd.concat([as_sender, as_recipient], ignore_index=True)
    involvement['is_fraud'] = involvement['isFraud'] == 1

    stats = involvement.groupby('customer_id', sort=False).agg(
        total_trans=('date', 'size'),
        total_fraud_trans=('is_fraud', 'sum'),
        first_seen=('date', 'min'),
        last_seen=('date', 'max'),
    )

    # Fix: the activity window is last - first. The previous max() - max() was
    # always zero, which made the per-day rate divide by zero.
    days_active = (stats['last_seen'] - stats['first_seen']).dt.days
    # Accounts whose activity spans less than a full day are rated over one day.
    days_active = days_active.clip(lower=1)
    trans_per_day = (stats['total_trans'] / days_active).round().astype(int)

    # NOTE(review): `> 1` means a single fraudulent transaction does not mark
    # the account as young; confirm that `> 0` was not the intent.
    has_repeat_fraud = (stats['total_fraud_trans'] > 1).to_numpy()
    n_customers = len(stats)
    # np.random.randint excludes the upper bound, hence the +1 to keep both
    # ranges inclusive like random.randint.
    account_age = np.where(
        has_repeat_fraud,
        np.random.randint(0, limit + 1, size=n_customers),
        np.random.randint(limit, max_account_age + 1, size=n_customers),
    )

    # Assemble the frame in one go rather than growing it row by row.
    profiles = pd.DataFrame({
        'customer_id': stats.index.to_numpy(),
        'account_age': account_age,
        'trans_per_day': trans_per_day.to_numpy(),
    })
    profiles.index = np.arange(1, n_customers + 1)
    return profiles


customer_ds = build_customer_profiles(data, max_account_age, limit)

# Show a small preview instead of printing one line per account.
customer_ds.head()

total_trans: 1, total_fraud_trans: 0, account_age: 24
total_trans: 1, total_fraud_trans: 0, account_age: 120
total_trans: 1, total_fraud_trans: 0, account_age: 106
total_trans: 1, total_fraud_trans: 0, account_age: 105
total_trans: 1, total_fraud_trans: 0, account_age: 87
total_trans: 1, total_fraud_trans: 0, account_age: 67
total_trans: 1, total_fraud_trans: 0, account_age: 95
total_trans: 1, total_fraud_trans: 0, account_age: 67
total_trans: 1, total_fraud_trans: 0, account_age: 71
total_trans: 5, total_fraud_trans: 0, account_age: 77
total_trans: 14, total_fraud_trans: 0, account_age: 29
total_trans: 1, total_fraud_trans: 0, account_age: 63
total_trans: 1, total_fraud_trans: 0, account_age: 89
total_trans: 1, total_fraud_trans: 0, account_age: 64
total_trans: 1, total_fraud_trans: 0, account_age: 112
total_trans: 5, total_fraud_trans: 0, account_age: 55
total_trans: 1, total_fraud_trans: 0, account_age: 117
total_trans: 1, total_fraud_trans: 0, account_age: 76
total_trans: 1, total_

In [None]:
import hopsworks

# Authenticate against Hopsworks. In the recorded run this prompts
# interactively for an API key before returning the project handle.
project = hopsworks.login()

# Handle to the project's feature store, used below to create feature groups.
fs = project.get_feature_store()

Copy your Api Key (first register/login): https://c.app.hopsworks.ai/account/api/generated

Paste it here: ··········

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1213626


In [None]:
# Get or create the 'transactions' feature group
# (version 1, keyed on transaction_id, with the `date` column as event time).
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Transaction data",
    primary_key=["transaction_id"],  # the 1-based row id assigned to every transaction
    event_time="date",  # synthetic timestamp derived from `step`
    online_enabled=True,  # NOTE(review): presumably enables online serving — confirm in the Hopsworks docs
)

In [None]:
# Insert data into feature group.
# Uploads the full transactions frame; the recorded run shows that this also
# launches an offline materialization job on the Hopsworks side.
trans_fg.insert(data)



Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1213626/fs/1200267/fg/1400957


Uploading Dataframe: 100.00% |██████████| Rows 6362620/6362620 | Elapsed Time: 09:15 | Remaining Time: 00:00


Launching job: transactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1213626/jobs/named/transactions_1_offline_fg_materialization/executions


(Job('transactions_1_offline_fg_materialization', 'SPARK'), None)

In [None]:
# Human-readable descriptions for the features stored in the group, keyed by
# the lower-cased feature names used in the feature store.
_described_features = (
    ("transaction_id", "Transaction id"),
    ("date", "Transaction time"),
    ("step", "maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation)."),
    ("amount", "amount of the transaction in local currency"),
    ("nameorig", "customer who started the transaction"),
    ("oldbalanceorg", "initial balance before the transaction"),
    ("newbalanceorig", "new balance after the transaction"),
    ("namedest", "customer who is the recipient of the transaction"),
    ("oldbalancedest", "initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants)"),
    ("newbalancedest", "new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants)"),
    ("isfraud", "fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system"),
    ("isflaggedfraud", "An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction"),
    ("state", "state where transaction happened"),
    ("zipcode", "zipcode where transaction happened"),
)

feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in _described_features
]

# Push each description to the feature group, one feature at a time.
for entry in feature_descriptions:
    trans_fg.update_feature_description(entry["name"], entry["description"])

In [None]:
# Columns to pull from the feature group for training data. The synthetic
# state / zipcode / ipv4 / mac_address columns are left out.
data_cols = [
    'transaction_id',
    'step',
    'type',
    'amount',
    'nameOrig',
    'oldbalanceOrg',
    'newbalanceOrig',
    'nameDest',
    'oldbalanceDest',
    'newbalanceDest',
    'isFraud',
    'isFlaggedFraud',
    'date',
]

# Build the feature query and preview its first five rows.
selected_features = trans_fg.select(data_cols)
selected_features.show(n=5)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (53.73s) 


Unnamed: 0,transaction_id,step,type,amount,nameorig,oldbalanceorg,newbalanceorig,namedest,oldbalancedest,newbalancedest,isfraud,isflaggedfraud,date
0,1050236,95,CASH_IN,327060.99,C277542917,9679606.12,10006667.11,C18058083,449422.09,122361.11,0,0,2018-01-18 16:08:15+00:00
1,487075,19,CASH_OUT,98056.68,C1739707237,8469.0,0.0,C1965963623,529722.54,627779.22,0,0,2018-01-15 12:08:15+00:00
2,1925088,167,PAYMENT,3457.36,C1367949555,1052.0,0.0,M194892973,0.0,0.0,0,0,2018-01-21 16:08:15+00:00
3,423455,18,CASH_IN,113256.95,C826616230,11674912.08,11788169.03,C1010549417,517791.59,778301.47,0,0,2018-01-15 11:08:15+00:00
4,455607,19,CASH_OUT,260417.16,C253927786,36519.0,0.0,C2008807592,680742.69,941159.85,0,0,2018-01-15 12:08:15+00:00


In [None]:
# Inspect the type of the query object returned by `trans_fg.select`.
# (The previous `.s` attribute access was a leftover from a half-edited cell.)
type(selected_features)

TypeError: Query.show() missing 1 required positional argument: 'n'