In [1]:
import numpy as np
import pandas as pd
import datetime
from multiprocessing import Pool

In [2]:
# Read both splits and discard the first CSV column of each
# (presumably a saved row index -- confirm against the raw files).
train = pd.read_csv('Data/fraudTrain.csv').iloc[:, 1:]
test = pd.read_csv('Data/fraudTest.csv').iloc[:, 1:]

# Keep the test labels aside, then strip them from the test features.
test_ans = test['is_fraud']
test.drop(columns = ['is_fraud'], inplace = True)

# Stack train on top of test so feature engineering can be applied to both;
# the test rows have no is_fraud value in the combined frame.
traintest = pd.concat([train, test])
traintest.shape

(1852394, 22)

In [3]:
# Row/column count of the training split on its own.
train.shape

(1296675, 22)

In [4]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Print the frequency table of every training column, each followed by
# blank lines, to get a feel for cardinality and dominant values.
for col in train.columns:
    print("Value Counts of ", col)
    print(train[col].value_counts(), '\n\n', sep = '\n')

Value Counts of  trans_date_trans_time
2020-06-02 12:47:07    4
2019-04-22 16:02:01    4
2020-06-01 01:37:47    4
2019-01-15 06:58:17    3
2019-12-09 23:43:01    3
                      ..
2019-08-06 14:59:58    1
2019-01-27 11:39:15    1
2020-05-13 03:30:33    1
2019-11-08 22:47:24    1
2020-01-15 14:03:50    1
Name: trans_date_trans_time, Length: 1274791, dtype: int64



Value Counts of  cc_num
571365235126           3123
4512828414983801773    3123
36722699017270         3119
213112402583773        3117
3545109339866548       3113
                       ... 
340187018810220           7
4975457191020             7
3545578418030657          7
4734310647841293          7
4714017207228610634       7
Name: cc_num, Length: 983, dtype: int64



Value Counts of  merchant
fraud_Kilback LLC                       4403
fraud_Cormier LLC                       3649
fraud_Schumm PLC                        3634
fraud_Kuhn LLC                          3510
fraud_Boyer PLC                         349

- Consider treating this as an anomaly-detection problem rather than a regular classification problem
- Extract the hour of the day and check whether transactions fall within office hours
- Check for weekend purchases (day of the week)
- Which generation (birth decade) faces more fraud?
- Drop `unix_time`, `cc_num` and `trans_num`

In [6]:
def get_time_details(row):
    """Split a row's ``trans_date_trans_time`` into year/month/day/hour fields.

    The timestamp is read as text of the form ``YYYY-MM-DD HH:MM:SS``.
    The four pieces are written back onto ``row`` under the keys
    ``'year'``, ``'month'``, ``'day'`` and ``'hour'`` as the substrings
    found in the text (no numeric conversion), and the same ``row`` is
    returned.
    """
    date_part, clock_part = str(row['trans_date_trans_time']).split(' ')
    yyyy, mm, dd = date_part.split('-')
    hh = clock_part.split(':')[0]

    # Assign in a fixed order so the new columns appear as year, month, day, hour.
    for key, value in (('year', yyyy), ('month', mm), ('day', dd), ('hour', hh)):
        row[key] = value

    return row

def apply_func(df):
    """Run ``get_time_details`` on every row of ``df`` and return the result."""
    return df.apply(get_time_details, axis = 1)

In [2]:
def parallelize_dataframe(df, func, n_cores = 10):
    """Apply ``func`` to consecutive row chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data to process. It is cut into ``n_cores`` consecutive row chunks
        whose sizes differ by at most one row; chunks are empty when
        ``len(df) < n_cores``.
    func : callable
        Called once per chunk. It is sent to the workers, so it must be
        picklable (e.g. a module-level function), and it must return a
        pandas object that ``pd.concat`` accepts.
    n_cores : int, default 10
        Number of chunks and of worker processes.

    Returns
    -------
    The per-chunk results concatenated in their original order.
    """
    # Split row *positions* instead of calling np.array_split on the frame
    # itself: the latter goes through DataFrame.swapaxes, which pandas has
    # deprecated.
    positions = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[chunk] for chunk in positions]

    pool = Pool(n_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Always release the worker processes, even when func raised.
        pool.close()
        pool.join()

    return pd.concat(results)

In [8]:
# Add the year/month/day/hour columns to train, spreading the row-wise
# parsing over worker processes.
train = parallelize_dataframe(train, apply_func)

In [9]:
# Same year/month/day/hour extraction for the combined train+test frame,
# followed by a preview of the result.
traintest = parallelize_dataframe(traintest, apply_func)

traintest.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,year,month,day,hour
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,...,1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0.0,2019,1,1,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,...,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0.0,2019,1,1,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,...,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0.0,2019,1,1,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,...,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0.0,2019,1,1,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,...,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0.0,2019,1,1,0


In [10]:
# Card number, transaction id and unix timestamp are removed from both frames.
drop_cols = ['cc_num', 'trans_num', 'unix_time']

for frame in (train, traintest):
    frame.drop(columns = drop_cols, inplace = True)

In [11]:
# Street and city are removed from both frames as well.
drop_cols = ['street', 'city']

for frame in (train, traintest):
    frame.drop(columns = drop_cols, inplace = True)

In [12]:
# Persist both engineered frames (without the index) so they can be reloaded later.
for frame, path in ((train, 'Data/train_mod.csv'), (traintest, 'Data/traintest_mod.csv')):
    frame.to_csv(path, index = False)

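### Reloading the saved data after a kernel restart (memory issues)

The cells below start from `Data/train_mod.csv` and `Data/traintest_mod.csv`; re-run the imports cell and the `parallelize_dataframe` definition first.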

In [3]:
# Read back the two intermediate CSV files written earlier in the notebook.
train, traintest = (
    pd.read_csv(path)
    for path in ('Data/train_mod.csv', 'Data/traintest_mod.csv')
)

In [4]:
# Maps datetime's weekday() number (0 = Monday ... 6 = Sunday) to the day name.
day_dict = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

def get_day_of_week(row):
    """Store the weekday name of a row's transaction date under ``'DOW'``.

    Only the date part of ``trans_date_trans_time`` (the text before the
    first space, expected as ``YYYY-MM-DD``) is used. The name is looked up
    in ``day_dict`` and written onto ``row``, which is then returned.
    """
    date_text = str(row['trans_date_trans_time']).split(' ')[0]
    yyyy, mm, dd = map(int, date_text.split('-'))

    row['DOW'] = day_dict[datetime.date(yyyy, mm, dd).weekday()]

    return row

def apply_DOW_func(df):
    """Run ``get_day_of_week`` on every row of ``df`` and return the result."""
    return df.apply(get_day_of_week, axis = 1)

In [5]:
# Dimensions of the reloaded combined frame.
traintest.shape

(1852394, 21)

In [6]:
# Add the DOW (weekday name) column to train using worker processes.
train = parallelize_dataframe(train, apply_DOW_func)

In [7]:
# Add the DOW (weekday name) column to the combined frame as well.
traintest = parallelize_dataframe(traintest, apply_DOW_func)

In [8]:
# Preview the combined frame to check the newly added columns.
traintest.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,first,last,gender,state,zip,lat,...,job,dob,merch_lat,merch_long,is_fraud,year,month,day,hour,DOW
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,NC,28654,36.0788,...,"Psychologist, counselling",1988-03-09,36.011293,-82.048315,0.0,2019,1,1,0,Tuesday
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,WA,99160,48.8878,...,Special educational needs teacher,1978-06-21,49.159047,-118.186462,0.0,2019,1,1,0,Tuesday
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,ID,83252,42.1808,...,Nature conservation officer,1962-01-19,43.150704,-112.154481,0.0,2019,1,1,0,Tuesday
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,MT,59632,46.2306,...,Patent attorney,1967-01-12,47.034331,-112.561071,0.0,2019,1,1,0,Tuesday
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,VA,24433,38.4207,...,Dance movement psychotherapist,1986-03-28,38.674999,-78.632459,0.0,2019,1,1,0,Tuesday


In [9]:
def get_birth_range(row):
    """Label a row with the decade of its ``dob`` under ``'generation'``.

    The label is built from the third character of the year part of ``dob``
    (the text before the first ``-``): ``'1988-03-09'`` gives ``'80s'``.
    The century is not part of the label, and a decade digit of 0 gives
    ``'0s'``. The same ``row`` is returned.
    """
    decade_digit = str(row['dob']).split('-')[0][2]
    row['generation'] = f"{int(decade_digit) * 10}s"

    return row

def apply_gen_func(df):
    """Run ``get_birth_range`` on every row of ``df`` and return the result."""
    return df.apply(get_birth_range, axis = 1)

In [10]:
# Add the generation (birth-decade label) column to train using worker processes.
train = parallelize_dataframe(train, apply_gen_func)

In [None]:
# Add the generation (birth-decade label) column to the combined frame as well.
traintest = parallelize_dataframe(traintest, apply_gen_func)

In [None]:
# Write both frames back to the intermediate CSV paths (without the index).
for frame, path in ((train, 'Data/train_mod.csv'), (traintest, 'Data/traintest_mod.csv')):
    frame.to_csv(path, index = False)