In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
from datetime import datetime

In [158]:
# Read the purchases CSV, discard every row that contains a missing value,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv('../../datas/credit_card_purchases.csv')
    .dropna()
    .reset_index(drop=True)
)

In [159]:
def to_hash(cc_num):
    """Map a card-number string to an integer in ``[0, 10**8)`` using SHA-256.

    Parameters
    ----------
    cc_num : str
        Text to hash; it is encoded with ``str.encode()`` before hashing.

    Returns
    -------
    int
        The SHA-256 digest read as one big integer, reduced modulo ``10**8``.
        Because of the reduction, distinct inputs can map to the same value.
    """
    digest_bytes = hashlib.sha256(cc_num.encode()).digest()
    # Reading the raw digest big-endian gives the same number as parsing the
    # hexadecimal digest in base 16.
    digest_value = int.from_bytes(digest_bytes, 'big')
    return digest_value % 10**8

# Derive an integer customer_id from the card number; the value is cast to
# str first because to_hash encodes its argument as text.
df['customer_id'] = df['cc_num'].astype('str').map(to_hash)

In [160]:
# Remove the columns that are not carried into the rest of the notebook.
# customer_id was derived from cc_num above, so cc_num itself can go.
df = df.drop(
    labels=[
        'Unnamed: 0',   # presumably a row index saved into the CSV; verify
        'cc_num',
        'first',
        'last',
        'street',
        'lat',
        'long',
        'trans_num',
        'merchant',
    ],
    axis='columns',
)

In [161]:
# Store the transaction timestamp under the shorter name 'trans_date', keep
# only its first 10 characters and parse them; parse 'dob' as well.
# NOTE(review): the 10-character slice presumably isolates a YYYY-MM-DD
# prefix; confirm against the raw file.
df = df.rename(columns={'trans_date_trans_time': 'trans_date'})
df['trans_date'] = pd.to_datetime(df['trans_date'].str[:10])
df['dob'] = pd.to_datetime(df['dob'])

In [162]:
# Cast merch_zipcode to an integer dtype; every other column is left as is.
df = df.astype({'merch_zipcode': 'int'})

In [163]:
def find_age(dob, as_of=None):
    """Return the age in completed years of someone born on ``dob``.

    Parameters
    ----------
    dob : datetime-like
        Date of birth. Only its ``year``, ``month`` and ``day`` attributes
        are read.
    as_of : datetime-like, optional
        Date on which the age is evaluated. When omitted, ``datetime.today()``
        is used, so the result depends on the day the code runs. Pass a fixed
        date to get reproducible ages.

    Returns
    -------
    int
        Difference in calendar years, reduced by one when the birthday has
        not yet occurred in the ``as_of`` year.
    """
    reference = datetime.today() if as_of is None else as_of
    # Tuples compare lexicographically, so this is true exactly when the
    # reference (month, day) falls before the birthday's (month, day).
    birthday_pending = (reference.month, reference.day) < (dob.month, dob.day)
    return reference.year - dob.year - int(birthday_pending)

# Compute each row's age from its date of birth.
df['age'] = df['dob'].map(find_age)

In [164]:
# Preview the first rows to check the parsed dates and the derived
# customer_id and age columns.
df.head()

Unnamed: 0,trans_date,category,amt,gender,city,state,zip,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode,customer_id,age
0,2019-01-01,misc_net,4.97,F,Moravian Falls,NC,28654,3495,"Psychologist, counselling",1988-03-09,1325376018,36.011293,-82.048315,0,28705,67709251,36
1,2019-01-01,entertainment,220.11,M,Malad City,ID,83252,4154,Nature conservation officer,1962-01-19,1325376051,43.150704,-112.154481,0,83236,29260595,62
2,2019-01-01,misc_pos,41.96,M,Doe Hill,VA,24433,99,Dance movement psychotherapist,1986-03-28,1325376186,38.674999,-78.632459,0,22844,50636118,38
3,2019-01-01,gas_transport,94.63,F,Dublin,PA,18917,2158,Transport planner,1961-06-19,1325376248,40.653382,-76.152667,0,17972,42522027,63
4,2019-01-01,gas_transport,71.65,M,Edinburg,VA,22824,6018,"Designer, multimedia",1947-08-21,1325376308,38.948089,-78.540296,0,22644,34306959,76


In [165]:
# Reorder the columns: fraud label first, then transaction, customer,
# location and merchant fields. Selecting by name rather than by position
# keeps this cell idempotent when re-run, and raises a KeyError instead of
# silently misordering if an earlier cell changes the set of columns.
column_order = [
    'is_fraud', 'trans_date', 'customer_id', 'category', 'amt',
    'gender', 'age', 'job', 'dob',
    'city', 'state', 'zip', 'city_pop',
    'merch_lat', 'merch_long', 'merch_zipcode', 'unix_time',
]
df = df[column_order]

In [166]:
# Preview the frame again to confirm the new column order.
df.head()

Unnamed: 0,is_fraud,trans_date,customer_id,category,amt,gender,age,job,dob,city,state,zip,city_pop,merch_lat,merch_long,merch_zipcode,unix_time
0,0,2019-01-01,67709251,misc_net,4.97,F,36,"Psychologist, counselling",1988-03-09,Moravian Falls,NC,28654,3495,36.011293,-82.048315,28705,1325376018
1,0,2019-01-01,29260595,entertainment,220.11,M,62,Nature conservation officer,1962-01-19,Malad City,ID,83252,4154,43.150704,-112.154481,83236,1325376051
2,0,2019-01-01,50636118,misc_pos,41.96,M,38,Dance movement psychotherapist,1986-03-28,Doe Hill,VA,24433,99,38.674999,-78.632459,22844,1325376186
3,0,2019-01-01,42522027,gas_transport,94.63,F,63,Transport planner,1961-06-19,Dublin,PA,18917,2158,40.653382,-76.152667,17972,1325376248
4,0,2019-01-01,34306959,gas_transport,71.65,M,76,"Designer, multimedia",1947-08-21,Edinburg,VA,22824,6018,38.948089,-78.540296,22644,1325376308


In [167]:
# List the distinct purchase categories.
# NOTE(review): the `_pos` / `_net` suffixes presumably mark in-person
# (point-of-sale) versus online purchases, so misc_pos / misc_net would be
# miscellaneous purchases of each kind; confirm against the dataset docs.
df['category'].unique()

array(['misc_net', 'entertainment', 'misc_pos', 'gas_transport',
       'grocery_pos', 'shopping_net', 'shopping_pos', 'food_dining',
       'grocery_net', 'health_fitness', 'travel', 'kids_pets',
       'personal_care', 'home'], dtype=object)

In [170]:
# no fraud
df_nf = df[df['is_fraud'] == 0]
df_nf = df.drop(columns=['is_fraud'])


In [171]:
# Preview df_nf, which no longer carries the is_fraud column.
df_nf.head()

Unnamed: 0,trans_date,customer_id,category,amt,gender,age,job,dob,city,state,zip,city_pop,merch_lat,merch_long,merch_zipcode,unix_time
0,2019-01-01,67709251,misc_net,4.97,F,36,"Psychologist, counselling",1988-03-09,Moravian Falls,NC,28654,3495,36.011293,-82.048315,28705,1325376018
1,2019-01-01,29260595,entertainment,220.11,M,62,Nature conservation officer,1962-01-19,Malad City,ID,83252,4154,43.150704,-112.154481,83236,1325376051
2,2019-01-01,50636118,misc_pos,41.96,M,38,Dance movement psychotherapist,1986-03-28,Doe Hill,VA,24433,99,38.674999,-78.632459,22844,1325376186
3,2019-01-01,42522027,gas_transport,94.63,F,63,Transport planner,1961-06-19,Dublin,PA,18917,2158,40.653382,-76.152667,17972,1325376248
4,2019-01-01,34306959,gas_transport,71.65,M,76,"Designer, multimedia",1947-08-21,Edinburg,VA,22824,6018,38.948089,-78.540296,22644,1325376308
