In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.svm import OneClassSVM
from sklearn.preprocessing import LabelEncoder

import warnings

warnings.simplefilter("ignore")


In [3]:
df_train = pd.read_csv("data/fraudTrain.csv")
df_train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
from scripts.model import Model

model = Model(df_train)

model.fraud_rows_index.shape

(0,)

In [9]:
df_train.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [10]:
df_train['state'].unique()

array(['NC', 'WA', 'ID', 'MT', 'VA', 'PA', 'KS', 'TN', 'IA', 'WV', 'FL',
       'CA', 'NM', 'NJ', 'OK', 'IN', 'MA', 'TX', 'WI', 'MI', 'WY', 'HI',
       'NE', 'OR', 'LA', 'DC', 'KY', 'NY', 'MS', 'UT', 'AL', 'AR', 'MD',
       'GA', 'ME', 'AZ', 'MN', 'OH', 'CO', 'VT', 'MO', 'SC', 'NV', 'IL',
       'NH', 'SD', 'AK', 'ND', 'CT', 'RI', 'DE'], dtype=object)

In [11]:
df_train['is_fraud'].value_counts(normalize=True)

0    0.994211
1    0.005789
Name: is_fraud, dtype: float64

In [13]:
data = df_train[['merchant', 'category', 'amt', 'gender', 'state']]
data.head()

Unnamed: 0,merchant,category,amt,gender,state
0,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,NC
1,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,WA
2,fraud_Lind-Buckridge,entertainment,220.11,M,ID
3,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,M,MT
4,fraud_Keeling-Crist,misc_pos,41.96,M,VA


In [None]:
# data[data.columns[0]]

In [14]:
le = LabelEncoder()

for col in data.select_dtypes('O'):
    data.loc[:, col] = le.fit_transform(data[col])

data.head()

Unnamed: 0,merchant,category,amt,gender,state
0,514,8,4.97,0,27
1,241,4,107.23,0,47
2,390,0,220.11,1,13
3,360,2,45.0,1,26
4,297,9,41.96,1,45


In [None]:
model = OneClassSVM(nu=0.005789).fit(data)

In [None]:
y_pred = model.predict(data)

In [None]:
anomaly_values = data.iloc[np.where(y_pred == -1)]
anomaly_values

Unnamed: 0,merchant,category,amt,gender
138,32,12,636.41,1
409,355,8,494.16,0
511,416,8,1636.87,0
583,346,11,1.13,0
798,346,11,1.13,0
...,...,...,...,...
1295433,688,8,1.03,1
1295491,571,11,1210.91,0
1295868,688,8,343.81,0
1296228,673,9,645.13,1


In [None]:
anomaly_values.to_csv('fraud_anomalies.csv', index=False)

In [18]:
df_train[df_train['is_fraud'] == 1][['merchant', 'category', 'amt', 'gender']]

Unnamed: 0,merchant,category,amt,gender
2449,fraud_Rutherford-Mertz,grocery_pos,281.06,M
2472,"fraud_Jenkins, Hauck and Friesen",gas_transport,11.52,F
2523,fraud_Goodwin-Nitzsche,grocery_pos,276.31,F
2546,fraud_Erdman-Kertzmann,gas_transport,7.03,M
2553,fraud_Koepp-Parker,grocery_pos,275.73,F
...,...,...,...,...
1295399,fraud_Kassulke PLC,shopping_net,977.01,F
1295491,fraud_Schumm PLC,shopping_net,1210.91,F
1295532,"fraud_Tillman, Dickinson and Labadie",gas_transport,10.24,M
1295666,fraud_Corwin-Collins,gas_transport,21.69,F


In [None]:
# grouping the databy full name and counting the number of transactions per customer
fraud_trans_count = (
    fraud_trans.groupby(["full_name", "trans_date"])
    .agg({"trans_num": "count"})
    .reset_index()
    .rename(columns={"trans_num": "no_transactions_per_customer"})
)
# grouping the databy full name and counting the number of transactions per customer
legit_trans_count = (
    legit_trans.groupby(["full_name", "trans_date"])
    .agg({"trans_num": "count"})
    .reset_index()
    .rename(columns={"trans_num": "no_transactions_per_customer"})
)
