In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import datetime
from imblearn.over_sampling import SMOTE

In [2]:
from zipfile import ZipFile

# Open the archive here only so the next cell can list its member names;
# acquire_fraud() further down opens the archive again on its own.
zip_file = ZipFile('cc_fraud.zip')
# df = pd.read_csv(zip_file.open('fraudTrain.csv'))
# df.head(1)

In [3]:
zip_file.namelist()

['fraudTest.csv', 'fraudTrain.csv']

In [4]:
def acquire_fraud(zip_file_name_in_quotes):
    '''
    Create the test and train dfs from a zipped file source. Requires ZipFile import

    Parameters
    ----------
    zip_file_name_in_quotes : str or path-like
        Path of a zip archive holding the two CSV members.

    Returns
    -------
    (df_test, df_train) : tuple of pandas.DataFrame
        The test frame first, then the train frame.

    Members are matched on their file name (case-insensitive "test" /
    "train") so the result does not depend on the order in which the
    archive happens to list them. When the names cannot be matched
    unambiguously, the first two members are used as (test, train).
    '''
    def _matching(names, keyword):
        # Look only at the bare file name so a directory component inside the
        # archive (e.g. "training/") cannot produce a false match.
        return [n for n in names if keyword in n.rsplit('/', 1)[-1].lower()]

    # Context managers close the archive and each member handle once read.
    with ZipFile(zip_file_name_in_quotes) as archive:
        names = archive.namelist()
        test_hits = _matching(names, 'test')
        train_hits = _matching(names, 'train')

        if len(test_hits) == 1 and len(train_hits) == 1 and test_hits != train_hits:
            test_name, train_name = test_hits[0], train_hits[0]
        else:
            # Positional fallback for archives whose members are named differently.
            test_name, train_name = names[0], names[1]

        with archive.open(test_name) as handle:
            df_test = pd.read_csv(handle)
        with archive.open(train_name) as handle:
            df_train = pd.read_csv(handle)

    return df_test, df_train

In [5]:
# acquire_fraud returns (test frame, train frame); the train frame is bound to df.
test, df = acquire_fraud('cc_fraud.zip')

In [6]:
df.sample()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
872104,872104,2019-12-20 19:39:21,4926376199189801,"fraud_Schuppe, Nolan and Hoeger",shopping_pos,2.14,Claire,Davis,F,83685 Matthew Center Suite 870,...,36.3011,-91.5281,4726,Pharmacologist,1977-06-07,aef33bd3cae97896f092f3cb6ecd9659,1356032361,35.656891,-91.654446,0


In [7]:
df.shape

(1296675, 23)

In [None]:
# df.info()

In [8]:
def prep(df):
    '''
    Apply some clean and prep to the df

    Steps
    -----
    * Index the rows by the parsed 'trans_date_trans_time' and sort by it.
    * Add 'age' (completed calendar years between 'dob' and the transaction),
      'dayofweek' (day name) and 'hourofday' (0-23) derived from the index.
    * Drop the identifier / location / raw-date columns.
    * Add 'age_group', a categorical binning of 'age'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain 'trans_date_trans_time', 'dob' and every
        column named in the drop list below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is not modified.
    '''
    parsed_times = pd.to_datetime(df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
    df = df.set_index(parsed_times).sort_index()

    # Age in completed years: subtract the calendar years, then take one off
    # when this year's birthday has not happened yet at transaction time.
    # Dividing the elapsed days by 365 ignores leap days and rounds the age
    # up a few days before each birthday.
    seen = pd.Series(df.index)
    born = pd.Series(pd.DatetimeIndex(df['dob']))
    birthday_pending = (seen.dt.month < born.dt.month) | (
        (seen.dt.month == born.dt.month) & (seen.dt.day < born.dt.day)
    )
    # to_numpy() -> positional assignment (the helper Series carry a RangeIndex).
    df['age'] = (seen.dt.year - born.dt.year - birthday_pending.astype('int64')).to_numpy()

    df['dayofweek'] = df.index.day_name()
    df['hourofday'] = df.index.hour

    df = df.drop(columns=[
        'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'first', 'last',
        'street', 'city', 'state', 'trans_num', 'lat', 'long', 'dob',
        'unix_time',
    ])

    # Left-closed bins: [0, 25), [25, 35), ... The last bin is open-ended so
    # ages of 100 and above are labelled instead of turning into NaN.
    age_edges = [0, 25, 35, 45, 55, 65, 75, float('inf')]
    age_labels = ['Youth', 'Young_Adult', 'Adult', 'Early_Mid_Age', 'Mid_Age', 'Retirement_Age', 'Older_Person']
    df['age_group'] = pd.cut(df['age'], age_edges, labels=age_labels, right=False)

    return df

In [9]:
# NOTE: rebinding df to the prepped frame makes this cell non-idempotent —
# prep() reads 'trans_date_trans_time' and then drops it, so running this
# cell a second time raises KeyError.
df = prep(df)
df.head()

Unnamed: 0_level_0,merchant,category,amt,gender,zip,city_pop,job,merch_lat,merch_long,is_fraud,age,dayofweek,hourofday,age_group
trans_date_trans_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,28654,3495,"Psychologist, counselling",36.011293,-82.048315,0,30,Tuesday,0,Young_Adult
2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,99160,149,Special educational needs teacher,49.159047,-118.186462,0,40,Tuesday,0,Adult
2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,M,83252,4154,Nature conservation officer,43.150704,-112.154481,0,56,Tuesday,0,Mid_Age
2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,59632,1939,Patent attorney,47.034331,-112.561071,0,52,Tuesday,0,Early_Mid_Age
2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,M,24433,99,Dance movement psychotherapist,38.674999,-78.632459,0,32,Tuesday,0,Young_Adult


In [11]:
# df.dropna()
# There are no NA rows

In [None]:
# prep() has already indexed df by transaction time and dropped the source
# column, so only re-index when this cell is run against a raw frame;
# otherwise the column lookup raises KeyError.
if 'trans_date_trans_time' in df.columns:
    df = df.set_index(pd.to_datetime(df['trans_date_trans_time'],format= '%Y-%m-%d %H:%M:%S')).sort_index()

In [None]:
df.index.min(),df.index.max()

In [None]:
df.head(1)

In [None]:
# (pd.to_datetime('now') - pd.date_range('19791215',freq='D',periods=365))//pd.Timedelta('1D')

In [None]:
# 'dob' is dropped by prep(); recompute the age only when the column is still
# present so this cell does not raise KeyError on an already-prepped frame.
if 'dob' in df.columns:
    df['age'] = (df.index - pd.DatetimeIndex(df['dob']))// pd.Timedelta('365D')

In [None]:
df['dayofweek'] = df.index.day_name()

In [None]:
df['hourofday'] = df.index.hour

In [None]:
# These columns are already removed once prep() has run; errors='ignore'
# lets the cell be re-run without raising KeyError for labels that are gone.
df = df.drop(
    columns=['Unnamed: 0','trans_date_trans_time','cc_num','first','last','street','city','state','trans_num','lat','long','dob','unix_time'],
    errors='ignore',
)

In [None]:
# raw = pd.read_csv(zip_file.open('fraudTrain.csv'))
# deduplicated = raw.groupby(level=0).first()  # remove duplicates
# deduplicated.flags.allows_duplicate_labels = False  # disallow going forward

In [None]:
# deduplicated = deduplicated.set_index(pd.to_datetime(deduplicated['trans_date_trans_time'],format= '%Y-%m-%d %H:%M:%S')).sort_index()
# deduplicated.head()

In [None]:
# raw[raw.duplicated()]
# by_min = raw.asfreq('M')

In [None]:
# df.hourofday.value_counts()

In [None]:
# plt.plot(df.is_fraud)
from matplotlib.pyplot import figure

# Explicit figure/axes instead of the pyplot state machine, with a title and
# axis labels so the chart stands alone when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(20, 8), dpi=80)
ax.plot(df.is_fraud.resample('D').mean())
ax.set(title='Daily mean of is_fraud', xlabel='Transaction date', ylabel='Mean of is_fraud');

In [None]:
df.sample()

In [None]:
# Calendar days with no transaction at all. Both sides are normalized to
# midnight so whole days are compared; without that, the daily range inherits
# the first timestamp's time of day and almost never matches an index entry.
pd.date_range(start=df.index[0].normalize(), end=df.index[-1].normalize(), freq='D').difference(df.index.normalize())

In [None]:
# pd.date_range("00:00", "23:59", freq="8H").strftime('%H:%M:%S') #+ pd.Timedelta(5,unit="H")

In [None]:
# Left-closed bins ([0, 25), [25, 35), ...). The last edge is open-ended so an
# age of 100 or more is labelled 'Older_Person' instead of becoming NaN.
df['age_group'] = pd.cut(
    df['age'],
    [0, 25, 35, 45, 55, 65, 75, float('inf')],
    labels=['Youth','Young_Adult','Adult','Early_Mid_Age','Mid_Age','Retirement_Age','Older_Person'],
    right=False,
)

In [None]:
df.age_group.value_counts().sort_values()

In [None]:
# Count missing values per column. A bare df.dropna() renders the whole frame
# and throws the result away, which does not show where NAs occur.
df.isna().sum()

## Putting "shift" bin on ice until further notice

In [None]:
# first_shift = pd.date_range(pd.to_datetime('21:00:00'),freq='S',periods= 480)
# first_shift

In [None]:
# first_shift[0]

In [None]:
# first_shift = pd.date_range('00:00:00','08:00:00', periods= 1)
#second_shift = pd.date_range('08:00:00','16:00:00', periods= 1)
#overnight_shift = pd.date_range('16:00:00','00:00:00', periods= 1)
#bin_times = [first_shift[0],second_shift[0],overnight_shift[0]]
#bin_labels = ['first_shift','second_shift']

In [None]:
#df['time_bin_test'] = pd.cut(df.index, bins = bin_times, labels=bin_labels, right=False)

In [None]:
df.columns

In [None]:
df.head()

In [None]:
# notebookbreak

In [None]:
# df['test3'] = pd.to_datetime(df.index,format= '%Y-%m-%d %H:%M:%S').astype('str')

In [None]:
# df['datetime_for_bin'] = pd.bdate_range(start=df.index[0].hour,end=df.index[-1],freq='H')
# df.index[0].time()

In [None]:
# define the bins
# shift_bins = [5, 13, 21]

# add custom labels if desired
# labels = ['night', 'first', 'second']

# add the bins to the dataframe
# df['shift_bin'] = pd.cut(df.test1.dt.hour, bins, labels=labels, right=False)

In [None]:
df.columns

In [None]:
def get_target_and_features(df, target='is_fraud'):
    '''
    Split the df's column names into the target and the feature list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are being split.
    target : str, default 'is_fraud'
        Name of the target column. It is looked up by name, so the result
        does not depend on where the column sits in the frame.

    Returns
    -------
    (target, features) : tuple of (str, list)
        The target name and every other column name, in frame order.

    Raises
    ------
    KeyError
        If `target` is not one of the df's columns.
    '''
    if target not in df.columns:
        raise KeyError(f"target column {target!r} not found in df columns")
    features = [column for column in df.columns if column != target]
    return target, features

In [None]:
# Resolve the target / feature column names, then print them one per line
# with a divider between the target and the feature list.
target, features = get_target_and_features(df)
print(target, '______', features, sep='\n')

In [None]:
def train_val(df, target_col=None, feature_cols=None, test_size=.30, random_state=42):
    '''
    Split the df into train and validate sets of features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the target and the feature columns.
    target_col : str, optional
        Target column name. Defaults to the notebook-level `target`.
    feature_cols : list, optional
        Feature column names. Defaults to the notebook-level `features`.
    test_size : float, default .30
        Passed to train_test_split as `test_size`.
    random_state : int, default 42
        Passed to train_test_split as `random_state`.

    Returns
    -------
    (x_train, x_validate, y_train, y_validate)
    '''
    # Fall back to the notebook globals so the existing train_val(df) call
    # keeps working; passing the names explicitly removes that hidden dependency.
    if target_col is None:
        target_col = target
    if feature_cols is None:
        feature_cols = features

    y = df[target_col]
    x = df[feature_cols]
    x_train, x_validate, y_train, y_validate = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    return x_train, x_validate, y_train, y_validate

In [None]:
x_train,x_validate,y_train,y_validate = train_val(df)

In [None]:
x_train.sample()

# Need to encode here. (pd.get_dummies)

In [None]:
features

In [None]:
# Feature columns (the target is excluded) that are treated as categorical.
categoricals = ['merchant','category','gender','zip','job','dayofweek','hourofday','age_group']

In [None]:
# dummy_train = pd.get_dummies([])
# dummy_val
# dummy_test - need to create test df since this was previously split and exists as its own dataset

In [None]:
# non_categoricals = df.columns[~df.columns.isin(categoricals)].to_list()
# non_categoricals

In [None]:
# xtrain_smote = df[non_categoricals]
# xvalidate_smote = df[non_categoricals]
# xtrain_smote.sample()

In [None]:
# sns.histplot(y_train)
# plt.title("Class Imbalance: is_Fraud")

In [None]:
# y_train.value_counts()

In [None]:
# from imblearn.over_sampling import SMOTENC

# sm = SMOTENC(random_state = 42,categorical_features = categoricals)
# xtrain_resample, ytrain_resample = sm.fit_resample(x_train, y_train)
# xval_resample, yval_resample = sm.fit_resample(x_validate,y_validate)

In [None]:
# from imblearn.over_sampling import SMOTE

# sm = SMOTE(random_state = 42)
# xtrain_resample, ytrain_resample = sm.fit_resample(x_train, y_train)
# xval_resample, yval_resample = sm.fit_resample(x_validate,y_validate)

In [None]:
# plot_resample(x_train,y_train,xtrain_resample,ytrain_resample,"SMOTE REBALANCING")