In [1]:
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, plot_roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_selector 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
skip_start = True

# Import data

In [3]:
if not skip_start:
    train_identity = pd.read_csv('data/ieee-fraud-detection/train_identity.csv')
    train_transaction = pd.read_csv('data/ieee-fraud-detection/train_transaction.csv')
    train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
    print(train.shape)
    del train_identity, train_transaction
    gc.collect()
    train.to_pickle("train.pkl")
else:
    train = pd.read_pickle("train.pkl")

In [4]:
y = train['isFraud']   # target class
X = train.drop('isFraud', axis=1)
print(X.shape, y.shape)

(590540, 433) (590540,)


In [5]:
del train
gc.collect()

40

# split data into test and train

In [6]:
X_train_hold, X_test_hold, y_train_hold, y_test_hold = train_test_split(X, y, test_size=0.33, random_state=42)
X_train_hold.shape, X_test_hold.shape, y_train_hold.shape, y_test_hold.shape

((395661, 433), (194879, 433), (395661,), (194879,))

In [7]:
del X, y
gc.collect()

40

# Now work on train

### Parameters to tune:

In [8]:
pick_sample_size_for_speed = 20000 # to speed up the processing - False meaning skip 

In [9]:
threshold_column_missing = 0.65 # more than this number is not used

In [10]:
correct_bias_isFruad_times_multiply = 2 # x times as many not fraud as fraud

## handle bias in data for isFraud

In [11]:
X_train = X_train_hold.copy()

In [12]:
X_train.loc[:, 'isFraud'] = y_train_hold

In [13]:
total_is_fraud = X_train[X_train["isFraud"] == 1]["isFraud"].value_counts().item()
total_is_not_fraud = X_train[X_train["isFraud"] == 0]["isFraud"].value_counts().item()
(X_train
    .filter(['isFraud'])
    .value_counts()
)

isFraud
0          381944
1           13717
dtype: int64

In [14]:
X_train = X_train[X_train["isFraud"] == 1].append(X_train[X_train["isFraud"] == 0].sample(total_is_fraud*correct_bias_isFruad_times_multiply))
X_train.shape

(41151, 434)

In [15]:
(X_train
    .filter(['isFraud'])
    .value_counts()
)

isFraud
0          27434
1          13717
dtype: int64

In [16]:
if pick_sample_size_for_speed:
    print("pick_sample_size_for_speed", pick_sample_size_for_speed)
    X_train = X_train.sample(pick_sample_size_for_speed)
    y_train = X_train.loc[:, 'isFraud']
    X_train.drop(labels=['isFraud'], axis=1, inplace = True)
else:
    X_train = X_train
    y_train = X_train.loc[:, 'isFraud']
    X_train.drop(labels=['isFraud'], axis=1, inplace = True)

pick_sample_size_for_speed 20000


In [17]:
print("dropping:")
dropped_columns = X_train.loc[:,
    (X_train
        .isna()
        .mean()
        .ge(threshold_column_missing)
     )
].columns.tolist()
dropped_columns

dropping:


['dist1',
 'dist2',
 'R_emaildomain',
 'D6',
 'D7',
 'D8',
 'D9',
 'D12',
 'D13',
 'D14',
 'V138',
 'V139',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165',
 'V166',
 'V167',
 'V168',
 'V169',
 'V170',
 'V171',
 'V172',
 'V173',
 'V174',
 'V175',
 'V176',
 'V177',
 'V178',
 'V179',
 'V180',
 'V181',
 'V182',
 'V183',
 'V184',
 'V185',
 'V186',
 'V187',
 'V188',
 'V189',
 'V190',
 'V191',
 'V192',
 'V193',
 'V194',
 'V195',
 'V196',
 'V197',
 'V198',
 'V199',
 'V200',
 'V201',
 'V202',
 'V203',
 'V204',
 'V205',
 'V206',
 'V207',
 'V208',
 'V209',
 'V210',
 'V211',
 'V212',
 'V213',
 'V214',
 'V215',
 'V216',
 'V217',
 'V218',
 'V219',
 'V220',
 'V221',
 'V222',
 'V223',
 'V224',
 'V225',
 'V226',
 'V227',
 'V228',
 'V229',
 'V230',
 'V231',
 'V232',
 'V233',
 'V234',
 'V235',
 'V236',
 'V237',
 'V238',
 

In [18]:
X_train = X_train.loc[:,
    (X_train
        .isna()
        .mean()
        .le(threshold_column_missing)
     )
]

In [19]:
X_train

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
149657,3136657,3089033,485.950,W,15323,216.0,150.0,visa,226.0,credit,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
251749,3238749,6019402,125.000,R,11862,562.0,150.0,visa,226.0,credit,...,125.000000,125.000000,125.000000,125.000000,0.0,0.000000,0.000000,225.0,225.0,225.0
76949,3063949,1694145,100.000,R,1260,512.0,150.0,mastercard,117.0,debit,...,0.000000,0.000000,0.000000,0.000000,100.0,100.000000,100.000000,0.0,0.0,0.0
473736,3460736,12246555,95.000,W,12598,111.0,150.0,visa,166.0,debit,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
344812,3331812,8515982,226.000,W,10613,250.0,150.0,visa,226.0,debit,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,280.0,280.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541930,3528930,14304704,280.000,W,5606,318.0,150.0,visa,166.0,debit,...,157.000000,146.000000,457.000000,300.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
475387,3462387,12277745,29.000,W,2518,555.0,150.0,visa,226.0,debit,...,49.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
180274,3167274,3963848,49.000,W,15372,241.0,150.0,visa,226.0,debit,...,49.000000,49.000000,152.500000,49.000000,49.0,108.000000,49.000000,0.0,0.0,0.0
363227,3350227,9010206,45.436,C,18301,459.0,185.0,mastercard,224.0,credit,...,39.333401,39.333401,39.333401,39.333401,0.0,38.970901,38.970901,0.0,0.0,0.0


In [20]:
X_train.dtypes

TransactionID       int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
card1               int64
                   ...   
V317              float64
V318              float64
V319              float64
V320              float64
V321              float64
Length: 224, dtype: object

In [21]:
numeric_features = make_column_selector(dtype_include=['float64','int64'])
numeric_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
    ('imputer', KNNImputer(n_neighbors=3)),
    ('scaler', StandardScaler())])

In [22]:
categorical_features = make_column_selector(dtype_include='object')
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [23]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [24]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                        ('pca', PCA(n_components=50)),
                      ('classifier', SVC(probability=True))],
               verbose=True)

In [25]:
ss = ShuffleSplit(test_size=0.5, n_splits=1, random_state=7)

In [26]:
param_grid = [{
                  'classifier': (KNeighborsClassifier(),),
                  'classifier__n_neighbors':[20], 
                  'classifier__metric':['manhattan'],
                  'classifier__weights':['distance'],
                  'pca__n_components':[100]        
              },
              {
                  'classifier': (SVC(gamma="auto"),),
                  'pca__n_components':[100]  
              },
              {
                  'classifier': (MLPClassifier(hidden_layer_sizes=(100,), max_iter=20, alpha=1e-4,
                    solver='sgd', verbose=0, random_state=1,
                    learning_rate_init=.1),),
                  'pca__n_components':[100],
                  'classifier__alpha':[1e-6],
                  'classifier__hidden_layer_sizes':[(50,)],
                  'classifier__max_iter':[100],
                  'classifier__learning_rate_init':[0.01],
              },
                  {
                  'classifier': (LogisticRegression(solver='liblinear'),),
                  'pca__n_components':[100]  
              }
              ]


In [27]:
pipe_gs = GridSearchCV(pipe, param_grid, cv=ss, scoring="roc_auc",
                      verbose = 10, n_jobs = -1)

In [28]:
pipe_gs = pipe_gs.fit(X_train, y_train)

Fitting 1 folds for each of 4 candidates, totalling 4 fits
[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 5.5min
[Pipeline] ............... (step 2 of 3) Processing pca, total=   1.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  47.2s


In [29]:
pipe_gs.best_params_

{'classifier': SVC(gamma='auto'), 'pca__n_components': 100}

In [30]:
pipe_gs.cv_results_

{'mean_fit_time': array([181.63479519, 204.0879879 , 186.44016051, 183.39459372]),
 'std_fit_time': array([0., 0., 0., 0.]),
 'mean_score_time': array([228.06550479, 206.89650273, 214.16485524, 215.20980072]),
 'std_score_time': array([0., 0., 0., 0.]),
 'param_classifier': masked_array(data=[KNeighborsClassifier(), SVC(gamma='auto'),
                    MLPClassifier(learning_rate_init=0.1, max_iter=20, random_state=1, solver='sgd',
                                  verbose=0)                                                        ,
                    LogisticRegression(solver='liblinear')],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__metric': masked_array(data=['manhattan', --, --, --],
              mask=[False,  True,  True,  True],
        fill_value='?',
             dtype=object),
 'param_classifier__n_neighbors': masked_array(data=[20, --, --, --],
              mask=[False,  True,  True,  True],
    

In [31]:
(pd.concat([pd.DataFrame(pipe_gs
                        .cv_results_["params"]),
           pd.DataFrame(pipe_gs.cv_results_["mean_test_score"],
                        columns=["roc_auc"])],
          axis=1)
 .sort_values("roc_auc",ascending=False)
)


Unnamed: 0,classifier,classifier__metric,classifier__n_neighbors,classifier__weights,pca__n_components,classifier__alpha,classifier__hidden_layer_sizes,classifier__learning_rate_init,classifier__max_iter,roc_auc
1,SVC(gamma='auto'),,,,100,,,,,0.850178
0,KNeighborsClassifier(),manhattan,20.0,distance,100,,,,,0.839816
2,"MLPClassifier(learning_rate_init=0.1, max_iter...",,,,100,1e-06,"(50,)",0.01,100.0,0.839188
3,LogisticRegression(solver='liblinear'),,,,100,,,,,0.81893


# test model on hold out test from above

In [32]:
X_test_hold = X_test_hold.drop(columns=dropped_columns)

In [None]:
y_pred_gs = pipe_gs.predict(X_test_hold)

In [None]:
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test_hold,y_pred_gs)))

# Finally predict the Kaggle test data set

In [None]:
test_identity = pd.read_csv('data/ieee-fraud-detection/test_identity.csv')
test_transaction = pd.read_csv('data/ieee-fraud-detection/test_transaction.csv')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
test.shape

In [None]:
test.columns=test.columns.str.replace('-','_')

In [None]:
def package_for_kaggle(test_isFraud_prob):
    submission = pd.DataFrame({"TransactionID": test.index, "isFraud": test_isFraud_prob[:,1]}) 
    submission.to_csv("data/submission.csv", index=False)
    
package_for_kaggle(test_isFraud_prob)