In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import gc
import matplotlib.pyplot as plt  
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, plot_roc_curve, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_selector 
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# True: reuse the cached train.pkl (and subsample it) in the load cell below.
# False: rebuild the merged table from the raw CSV files and rewrite the cache.
skip_start = True

# Import data

In [3]:
# Load the merged training table.
# - skip_start is False: read both raw CSVs, left-join the identity columns onto
#   every transaction, and cache the merged frame as train.pkl.
# - skip_start is True: reload the cache and continue with a random subsample of
#   at most 200,000 rows.
if not skip_start:
    train_identity = pd.read_csv('data/ieee-fraud-detection/train_identity.csv')
    train_transaction = pd.read_csv('data/ieee-fraud-detection/train_transaction.csv')
    train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
    print(train.shape)
    # Only the merged frame is needed from here on; release the two inputs.
    del train_identity, train_transaction
    gc.collect()
    train.to_pickle("train.pkl")
else:
    # NOTE(review): unpickling can execute arbitrary code; only load a train.pkl
    # that this notebook produced.
    train = pd.read_pickle("train.pkl")
    # Seeded so the subsample, and every number derived from it, is identical on
    # each run; capped so a smaller cache does not make sample() raise.
    train = train.sample(min(200000, len(train)), random_state=42)

In [4]:
# Separate the label column from the feature columns.
X = train.drop(columns='isFraud')
y = train['isFraud']   # target class
print(X.shape, y.shape)

(200000, 433) (200000,)


In [5]:
# X and y were derived from the merged frame above; drop it and ask the
# garbage collector to reclaim the memory.
del train
gc.collect()

40

# split data into test and train

In [6]:
# Reserve 33% of the rows as a hold-out test set; the fixed seed keeps the split stable.
X_train_hold, X_test_hold, y_train_hold, y_test_hold = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
(X_train_hold.shape, X_test_hold.shape, y_train_hold.shape, y_test_hold.shape)

((134000, 433), (66000, 433), (134000,), (66000,))

In [7]:
# The four split parts replace X and y from here on; release the originals.
del X, y
gc.collect()

40

# Now work on train

### Parameters to tune:

In [8]:
# Missing-value fraction at which a column is discarded: the drop list further
# down is built with `.ge(threshold_column_missing)`, i.e. fraction >= this value.
threshold_column_missing = 0.22

In [9]:
# Down-sampling ratio: the number of non-fraud rows kept is
# total_is_fraud * this value (x times as many not fraud as fraud).
# NOTE(review): "isFruad" is a typo for "isFraud"; the name is used by the
# down-sampling cell, so rename both together if it is ever cleaned up.
correct_bias_isFruad_times_multiply = 3

## Handle class imbalance in isFraud (down-sample the non-fraud rows)

In [10]:
# Work on a copy so the held-out training split X_train_hold itself is not modified.
X_train = X_train_hold.copy()

In [11]:
# Put the label back next to the features so rows can be filtered and sampled by class.
X_train.loc[:, 'isFraud'] = y_train_hold

In [12]:
# Row count per class; each filtered label series holds a single distinct value,
# so value_counts() has exactly one entry and .item() unwraps it to a scalar.
total_is_fraud = X_train.loc[X_train["isFraud"] == 1, "isFraud"].value_counts().item()
total_is_not_fraud = X_train.loc[X_train["isFraud"] == 0, "isFraud"].value_counts().item()
# Show the class balance before down-sampling.
X_train[['isFraud']].value_counts()

isFraud
0          129362
1            4638
dtype: int64

In [13]:
# Rebalance the classes: keep every fraud row and a random subset of the
# non-fraud rows, correct_bias_isFruad_times_multiply times as many as fraud rows.
fraud_rows = X_train[X_train["isFraud"] == 1]
not_fraud_rows = X_train[X_train["isFraud"] == 0]
# Never ask for more rows than exist, otherwise sample() raises.
n_not_fraud_kept = min(len(not_fraud_rows),
                       total_is_fraud * correct_bias_isFruad_times_multiply)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The seed makes the retained non-fraud rows the same on every run.
X_train = pd.concat([fraud_rows,
                     not_fraud_rows.sample(n_not_fraud_kept, random_state=42)])
X_train.shape

(18552, 434)

In [14]:
# Class balance after down-sampling.
X_train[['isFraud']].value_counts()

isFraud
0          13914
1           4638
dtype: int64

In [15]:
# Detach the label again: pop() returns the column and removes it from X_train in place.
y_train = X_train.pop('isFraud')

In [None]:
# Fraction of missing values per column, restricted to columns that have at least
# one missing value, ordered from least to most incomplete.
missing_fraction = X_train.isna().mean()
missing_df = (
    missing_fraction[missing_fraction > 0]
    .sort_values()
    .rename_axis('column')
    .reset_index(name='percent_missing')
)

# Plot against the rank in the sorted order. The values are passed as a plain
# array so matplotlib does not use the frame's index as the x coordinate.
fig, ax = plt.subplots()
ax.plot(missing_df['percent_missing'].to_numpy())
ax.set_xlabel('columns with missing values (sorted)')
ax.set_ylabel('fraction of rows missing')
ax.set_title('Missing data per column')

plt.show()

In [17]:
print("dropping:")
# Names of the columns whose missing fraction is >= the threshold; the same list
# is reused later to drop these columns from the test frames.
missing_share = X_train.isna().mean()
dropped_columns = missing_share.index[missing_share.ge(threshold_column_missing)].tolist()
dropped_columns

dropping:


['dist1',
 'dist2',
 'R_emaildomain',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D11',
 'D12',
 'D13',
 'D14',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V138',
 'V139',
 'V140',
 'V141',
 'V142',
 'V143',
 'V144',
 'V145',
 'V146',
 'V147',
 'V148',
 'V149',
 'V150',
 'V151',
 'V152',
 'V153',
 'V154',
 'V155',
 'V156',
 'V157',
 'V158',
 'V159',
 'V160',
 'V161',
 'V162',
 'V163',
 'V164',
 'V165',
 'V166',
 'V167',
 'V168',
 'V169',
 'V170',
 'V171',
 'V172',
 'V173',
 'V174',
 'V175',
 'V176',
 'V177',
 'V178',
 'V179',
 'V180',
 'V181',
 'V182',
 'V183',
 'V184',
 'V185',
 'V186',
 'V187',
 'V188',
 'V189',
 'V190',
 'V191',
 'V192',
 'V193',
 'V194',
 'V195',
 'V196',
 'V197',
 'V198',
 'V199',
 'V200',
 'V201',
 'V202',
 'V

In [18]:
# Keep only the columns whose missing fraction is strictly below the threshold.
# `.lt` is the exact complement of the `.ge` test that builds dropped_columns, so
# a column sitting exactly on the threshold is removed here as well. With `.le`
# it stayed in X_train but was dropped from the test frames.
X_train = X_train.loc[:,
    (X_train
        .isna()
        .mean()
        .lt(threshold_column_missing)
     )
]

In [19]:
# Move TransactionID into the index so it is not passed to the model as a feature.
X_train = X_train.set_index("TransactionID")
X_train

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3329536,8448784,171.000,W,7919,194.0,150.0,mastercard,166.0,debit,269.0,...,59.000000,59.000000,59.000000,59.000000,177.0,177.0,177.0,0.0,0.0,0.0
3155699,3618877,100.000,H,5714,170.0,150.0,visa,195.0,credit,337.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3300036,7783908,29.002,C,10568,204.0,185.0,visa,226.0,credit,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3166731,3953005,100.000,H,15266,555.0,119.0,visa,102.0,credit,296.0,...,0.000000,0.000000,0.000000,0.000000,100.0,100.0,100.0,0.0,0.0,0.0
3260871,6644977,250.000,R,17188,321.0,150.0,visa,226.0,debit,299.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3433988,11385239,171.000,W,17781,555.0,150.0,mastercard,117.0,debit,177.0,...,107.949997,107.949997,359.799988,107.949997,0.0,0.0,0.0,0.0,0.0,0.0
3145799,3340104,49.000,W,7919,194.0,150.0,mastercard,202.0,debit,123.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3479756,12858497,30.864,C,3887,202.0,185.0,mastercard,137.0,credit,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3183759,4422811,59.000,W,14712,111.0,150.0,visa,226.0,debit,264.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# TransactionDT is treated as a number of seconds and added to a fixed reference
# date; only the resulting weekday name is kept as the feature.
# NOTE(review): the weekday labels depend on this arbitrary reference date;
# confirm the true origin of TransactionDT if real weekdays matter.
start = dt.datetime(2021,1,1,0,0) # need a reference point use this for train and test
# Convert only while the column is still numeric, so re-running the cell is harmless.
if pd.api.types.is_numeric_dtype(X_train["TransactionDT"]):
    # Vectorised conversion. Plain column assignment replaces the int64 column
    # with the string column instead of writing strings into it through .loc.
    X_train["TransactionDT"] = (
        start + pd.to_timedelta(X_train["TransactionDT"], unit="s")
    ).dt.day_name()

In [21]:
# Display the frame after the TransactionDT conversion.
X_train

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3329536,Thursday,171.000,W,7919,194.0,150.0,mastercard,166.0,debit,269.0,...,59.000000,59.000000,59.000000,59.000000,177.0,177.0,177.0,0.0,0.0,0.0
3155699,Thursday,100.000,H,5714,170.0,150.0,visa,195.0,credit,337.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3300036,Thursday,29.002,C,10568,204.0,185.0,visa,226.0,credit,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3166731,Monday,100.000,H,15266,555.0,119.0,visa,102.0,credit,296.0,...,0.000000,0.000000,0.000000,0.000000,100.0,100.0,100.0,0.0,0.0,0.0
3260871,Thursday,250.000,R,17188,321.0,150.0,visa,226.0,debit,299.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3433988,Wednesday,171.000,W,17781,555.0,150.0,mastercard,117.0,debit,177.0,...,107.949997,107.949997,359.799988,107.949997,0.0,0.0,0.0,0.0,0.0,0.0
3145799,Monday,49.000,W,7919,194.0,150.0,mastercard,202.0,debit,123.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3479756,Saturday,30.864,C,3887,202.0,185.0,mastercard,137.0,credit,,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3183759,Sunday,59.000,W,14712,111.0,150.0,visa,226.0,debit,264.0,...,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Column dtypes: the numeric and categorical column selectors defined below pick
# columns by dtype.
X_train.dtypes

TransactionDT      object
TransactionAmt    float64
ProductCD          object
card1               int64
card2             float64
                   ...   
V317              float64
V318              float64
V319              float64
V320              float64
V321              float64
Length: 180, dtype: object

In [23]:
# float64/int64 columns: fill missing values with a 3-nearest-neighbour imputer,
# then standardise.
numeric_features = make_column_selector(dtype_include=['float64', 'int64'])
numeric_transformer = Pipeline(
    steps=[
        ('imputer', KNNImputer(n_neighbors=3)),
        ('scaler', StandardScaler()),
    ]
)

In [24]:
# Object-dtype columns are one-hot encoded; handle_unknown='ignore' keeps the
# encoder from raising on categories it did not see while fitting.
categorical_features = make_column_selector(dtype_include='object')
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [25]:
# Route each column to the transformer that matches its dtype.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [26]:
# Preprocess -> PCA -> classifier. The PCA size and the SVC given here are only
# defaults: every entry of param_grid sets 'classifier' and 'pca__n_components'.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=50)),
        ('classifier', SVC()),
    ],
    verbose=True,
)

In [27]:
# Validation scheme for the grid search: one seeded random split (n_splits=1)
# with half of the rows used for validation.
ss = ShuffleSplit(test_size=0.5, n_splits=1, random_state=7)

In [28]:
# One grid entry per candidate model. Each entry replaces the pipeline's
# 'classifier' step and lists that model's own hyper-parameters; every candidate
# uses 100 PCA components.
param_grid = [
    # k-nearest neighbours
    dict(
        classifier=(KNeighborsClassifier(),),
        classifier__n_neighbors=[20],
        classifier__metric=['manhattan'],
        classifier__weights=['distance'],
        pca__n_components=[100],
    ),
    # support vector machine with probability estimates enabled
    dict(
        classifier=(SVC(gamma="auto", probability=True),),
        pca__n_components=[100],
    ),
    # multi-layer perceptron; the classifier__* values below override the
    # matching constructor arguments
    dict(
        classifier=(
            MLPClassifier(
                hidden_layer_sizes=(100,),
                max_iter=20,
                alpha=1e-4,
                solver='sgd',
                verbose=0,
                random_state=1,
                learning_rate_init=.1,
            ),
        ),
        pca__n_components=[100],
        classifier__alpha=[1e-6],
        classifier__hidden_layer_sizes=[(50,)],
        classifier__max_iter=[100],
        classifier__learning_rate_init=[0.01],
    ),
    # logistic regression
    dict(
        classifier=(LogisticRegression(solver='liblinear'),),
        pca__n_components=[100],
    ),
]


In [29]:
# Grid search over the candidate models, scored by ROC AUC on the single
# ShuffleSplit fold, using all available cores.
pipe_gs = GridSearchCV(
    pipe,
    param_grid,
    cv=ss,
    scoring="roc_auc",
    verbose=10,
    n_jobs=-1,
)

In [30]:
# Run the grid search on the down-sampled, column-filtered training frame.
pipe_gs = pipe_gs.fit(X_train, y_train)

Fitting 1 folds for each of 4 candidates, totalling 4 fits
[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 3.2min
[Pipeline] ............... (step 2 of 3) Processing pca, total=   1.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s


In [31]:
# Parameter combination with the best ROC AUC in the search.
pipe_gs.best_params_

{'classifier': KNeighborsClassifier(metric='manhattan', n_neighbors=20, weights='distance'),
 'classifier__metric': 'manhattan',
 'classifier__n_neighbors': 20,
 'classifier__weights': 'distance',
 'pca__n_components': 100}

In [32]:
# Raw search results (timings, parameters and scores per candidate).
pipe_gs.cv_results_

{'mean_fit_time': array([ 64.2783215 , 114.60409451,  68.751266  ,  63.76017356]),
 'std_fit_time': array([0., 0., 0., 0.]),
 'mean_score_time': array([56.79827905, 26.59423852, 45.34440684, 47.59935236]),
 'std_score_time': array([0., 0., 0., 0.]),
 'param_classifier': masked_array(data=[KNeighborsClassifier(metric='manhattan', n_neighbors=20, weights='distance'),
                    SVC(gamma='auto', probability=True),
                    MLPClassifier(learning_rate_init=0.1, max_iter=20, random_state=1, solver='sgd',
                                  verbose=0)                                                        ,
                    LogisticRegression(solver='liblinear')],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_classifier__metric': masked_array(data=['manhattan', --, --, --],
              mask=[False,  True,  True,  True],
        fill_value='?',
             dtype=object),
 'param_classifier__n_neighbors': m

In [33]:
# One row per candidate: its parameters plus its mean ROC AUC, best first.
(
    pd.DataFrame(pipe_gs.cv_results_["params"])
    .assign(roc_auc=pipe_gs.cv_results_["mean_test_score"])
    .sort_values("roc_auc", ascending=False)
)


Unnamed: 0,classifier,classifier__metric,classifier__n_neighbors,classifier__weights,pca__n_components,classifier__alpha,classifier__hidden_layer_sizes,classifier__learning_rate_init,classifier__max_iter,roc_auc
0,"KNeighborsClassifier(metric='manhattan', n_nei...",manhattan,20.0,distance,100,,,,,0.825837
1,"SVC(gamma='auto', probability=True)",,,,100,,,,,0.8256
2,"MLPClassifier(learning_rate_init=0.1, max_iter...",,,,100,1e-06,"(50,)",0.01,100.0,0.823555
3,LogisticRegression(solver='liblinear'),,,,100,,,,,0.809719


In [34]:
# Column names of the untouched training split, for reference.
X_train_hold.columns.tolist()

['TransactionID',
 'TransactionDT',
 'TransactionAmt',
 'ProductCD',
 'card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'dist1',
 'dist2',
 'P_emaildomain',
 'R_emaildomain',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C7',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D6',
 'D7',
 'D8',
 'D9',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37',
 'V38',
 'V39',
 'V40',
 'V41',
 'V42',
 'V43',
 'V44',
 'V45',
 'V46',
 'V47',
 'V48',
 'V49',
 'V50',
 'V51',
 'V52',
 'V53',
 'V54',
 'V55',
 'V56',
 'V57',
 'V58',
 'V59',
 'V60',
 'V61',
 'V62',
 'V63',
 'V64',
 'V65',
 'V66',

# test model on hold out test from above

In [35]:
# Give the hold-out test frame the same preparation as X_train: drop the sparse
# columns, turn TransactionDT into a weekday name (same reference date `start`),
# and move TransactionID into the index.
# Built as one chained expression rather than three in-place mutations. The
# TransactionDT conversion is vectorised, and assign() replaces the int64 column
# with the string column instead of writing strings into it through .loc.
X_test_hold = (
    X_test_hold
    .drop(columns=dropped_columns)
    .assign(TransactionDT=lambda frame: (
        start + pd.to_timedelta(frame["TransactionDT"], unit="s")
    ).dt.day_name())
    .set_index("TransactionID")
)

In [36]:
# Class predictions of the fitted grid search on the hold-out test frame.
y_pred_gs = pipe_gs.predict(X_test_hold)

In [37]:
# NOTE(review): the hold-out set was not down-sampled, and the training split
# shown above held 4638 fraud rows against 129362 non-fraud rows, so accuracy is
# dominated by the majority class; confirm conclusions with the ROC AUC below.
print("Accuracy: {0:4.2f}".format(accuracy_score(y_test_hold,y_pred_gs)))

Accuracy: 0.94


In [None]:
# Second column of predict_proba, used as the fraud score for the ROC AUC below.
# NOTE(review): this is the isFraud == 1 probability assuming the classes are ordered [0, 1] — confirm.
y_test_hold_probs = pipe_gs.predict_proba(X_test_hold)[:, 1]

In [None]:
# ROC AUC on the hold-out set, the same metric the grid search optimised.
print("roc_auc_score: {0:4.2f}".format(roc_auc_score(y_test_hold, y_test_hold_probs)))

In [None]:
# Deliberate stop: raising here halts "Run All" before the Kaggle submission
# cells below. Run those cells by hand when a submission is wanted.
raise Exception("Stop right here!") 

# Finally predict the Kaggle test data set

In [None]:
# Load the Kaggle test files and left-join the identity columns onto every transaction.
test_transaction = pd.read_csv('data/ieee-fraud-detection/test_transaction.csv')
test_identity = pd.read_csv('data/ieee-fraud-detection/test_identity.csv')
test = test_transaction.merge(test_identity, on='TransactionID', how='left')
test.shape

In [None]:
# Replace '-' with '_' in every column name.
# NOTE(review): presumably the test identity columns are spelled with '-' where
# the training columns use '_' — confirm against the CSV headers.
test.columns=test.columns.str.replace('-','_')

In [None]:
# Remove the same sparse columns that were identified on the training frame.
# Mutates `test` in place, so running this cell twice raises a KeyError.
test.drop(columns=dropped_columns, inplace=True)

In [None]:
# Score the Kaggle test set on features prepared exactly like X_train and
# X_test_hold: TransactionID moved into the index and TransactionDT converted to
# a weekday name using the same reference date `start`. Without this the pipeline
# would receive raw TransactionDT seconds where it was fitted on weekday names.
# `test` itself is left untouched so its TransactionID column is still available
# when the submission file is written.
test_features = (
    test
    .set_index("TransactionID")
    .assign(TransactionDT=lambda frame: (
        start + pd.to_timedelta(frame["TransactionDT"], unit="s")
    ).dt.day_name())
)
# Second column of predict_proba, as in the hold-out evaluation.
test_isFraud_prob = pipe_gs.predict_proba(test_features)[:, 1]

In [None]:
def package_for_kaggle(test_isFraud_prob, transaction_ids=None, path="data/submission.csv"):
    """Write a two-column submission CSV (TransactionID, isFraud).

    Parameters
    ----------
    test_isFraud_prob : sequence of float
        One fraud score per transaction, in the same order as the ids.
    transaction_ids : sequence, optional
        Transaction ids to pair with the scores. When omitted, the
        ``TransactionID`` column of the notebook-level ``test`` frame is used.
    path : str, optional
        Destination of the CSV file.

    Raises
    ------
    ValueError
        If the number of ids and the number of scores differ.
    """
    if transaction_ids is None:
        # Original behaviour: take the ids from the global Kaggle test frame.
        transaction_ids = test.TransactionID
    if len(transaction_ids) != len(test_isFraud_prob):
        raise ValueError("transaction_ids and test_isFraud_prob must have the same length")
    submission = pd.DataFrame({"TransactionID": transaction_ids, "isFraud": test_isFraud_prob})
    # index=False keeps the row index out of the submission file.
    submission.to_csv(path, index=False)
    
# Write the submission file from the probabilities computed for the Kaggle test set.
package_for_kaggle(test_isFraud_prob)