In [74]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import cross_val_score

from sklearn import metrics
from collections import Counter

In [75]:
train_df = pd.read_csv('data/fraudTest.csv')
test_df = pd.read_csv('data/fraudTrain.csv')

In [76]:
train_df.shape

(555719, 23)

In [77]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [78]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [79]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

def label_encoding(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = label_encoder.fit_transform(df[column])


In [80]:
def time_features(df, function):
    if function == 'drop':
        df.drop('trans_date_trans_time', axis=1, inplace=True)

In [81]:
def drop_columns(df):
    df.drop(columns=['Unnamed: 0','cc_num','first', 'last', 'street', 'zip', 'trans_num'],inplace=True)

In [82]:
def data_preprocessing(df):
    drop_columns(df)
    label_encoding(df)
    time_features(df, 'drop')
    df.fillna(0, inplace=True)
    return df


In [83]:
train_df = data_preprocessing(train_df)
X = train_df.drop('is_fraud', axis=1)
y = train_df['is_fraud']

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size = 0.3, random_state = 42)

In [85]:
def train_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Model score: {model.score(X_train, y_train)}')
    print(f'Accuracy: {metrics.accuracy_score(y_test, y_pred)}')
    return model

In [86]:
from sklearn.svm import SVC
model = SVC()
train_model(model, X_train, y_train, X_test, y_test)

Model score: 0.9961414179325095
Accuracy: 0.9961371434055519


In [87]:
test_df = data_preprocessing(test_df)
X = train_df.drop('is_fraud', axis=1)
y = train_df['is_fraud']

In [88]:
def evaluate_model(model, X, y):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f'Cross-validated scores: {scores}')
    print(f'Mean score: {scores.mean()}')

In [89]:
evaluate_model(model, X, y)

Cross-validated scores: [0.99614014 0.99614014 0.99614014 0.99614014 0.99614011]
Mean score: 0.9961401355720646
