Imports X_train, y_train (160 K rows), X_test and y_test (40 K rows), which have already been pre-processed.

Also imports train, which is the full 200 K rows and has not been pre-processed yet, other than being a join of the transaction and identity data.

The focus is on train, as pre-processing steps are compared here. 


## Settings

In [20]:
load_from_csv = True # Set to True to load the raw .csv data and do some basic pre-processing (joining); otherwise the cached pickles are loaded
n_rows = 200000 # Number of transaction rows to read from the .csv; set to None to load all data (recommended: 200'000)

## Imports

In [1]:
import os
import gc
import pickle
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale as preproc_scale
import plotly.express as px
%matplotlib inline
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

Export the packages and versions to requirements.txt

In [7]:
# Code from Stackoverflow
# https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook

import pkg_resources
import types
def get_imports():
    """Yield the pip-style root package name of every module or class in globals().

    Only module objects and classes are considered; plain variables,
    functions and other objects living in the global namespace are skipped,
    so that a variable which happens to be named like an installed package
    is not reported as an import.

    Yields:
        str: root package name, translated to its pip name where the import
        name and the pip name are known to differ. Names may repeat.
    """
    # Some packages are weird and have different
    # imported names vs. system/pip names. Unfortunately,
    # there is no systematic way to get pip names from
    # a package's imported name. You'll have to add
    # exceptions to this list manually!
    poorly_named_packages = {
        "PIL": "Pillow",
        "sklearn": "scikit-learn"
    }
    # Iterate over a snapshot: the generator may be suspended while the
    # caller binds new global names, which would otherwise raise
    # "dictionary changed size during iteration".
    for val in list(globals().values()):
        if isinstance(val, types.ModuleType):
            # Split ensures you get root package,
            # not just imported function
            name = val.__name__.split(".")[0]
        elif isinstance(val, type):
            name = val.__module__.split(".")[0]
        else:
            # Not a module and not a class: not an import we can resolve
            continue

        yield poorly_named_packages.get(name, name)
imports = list(set(get_imports()))

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names
# of installed packages vs. imported packages
imported_names = set(imports)  # set gives O(1) membership tests in the loop below
requirements = [
    (m.project_name, m.version)
    for m in pkg_resources.working_set
    if m.project_name in imported_names and m.project_name != "pip"
]

# One "name==version" line per package, both printed and written to file
requirement_lines = ["{}=={}".format(*r) for r in requirements]
for line in requirement_lines:
    print(line)
with open('requirements.txt', 'wt') as f:
    f.writelines(line + "\n" for line in requirement_lines)
    

seaborn==0.9.0
scikit-learn==0.20.2
plotly==4.3.0
pandas==0.23.4
numpy==1.17.2
matplotlib==3.0.2


## Function definitions

In [15]:
def train_test_isoF(X_train, y_train, X_test=None, y_test=None, max_samples=1024, feature_list=None):
    """Fit an IsolationForest on X_train and print the AUC of its outlier scores.

    The labels are only used for evaluation; the forest itself is fitted
    without them.

    Args:
        X_train: training features.
        y_train: training labels, used to compute the train AUC.
        X_test: optional test features; when None only the train AUC is printed.
        y_test: test labels, required when X_test is given.
        max_samples: passed on to IsolationForest.
        feature_list: optional column selection applied to X_train (and to
            X_test when it is given).

    Returns:
        The fitted IsolationForest.
    """
    if feature_list is not None:
        X_train = X_train[feature_list]
        # X_test is optional, so only subset it when it was actually passed
        if X_test is not None:
            X_test = X_test[feature_list]
    ifo = IsolationForest(n_estimators=50, max_samples=max_samples)
    ifo.fit(X_train)
    # decision_function is lower for more abnormal samples; negate it so that
    # a higher score means "more anomalous" before computing the AUC
    y_pred_ifo = ifo.decision_function(X_train)
    print('AUC Score on Train: {:.3f}'.format(roc_auc_score(y_train, -y_pred_ifo)))
    if X_test is None:
        return ifo
    y_pred_ifo_test = ifo.decision_function(X_test)
    print('AUC Score on Test: {:.3f}'.format(roc_auc_score(y_test, -y_pred_ifo_test)))
    return ifo


In [16]:
def median_imputation(df, median_impute_limit=0.95, impute_val=-999):
    """Impute inf/nan values in the non-object columns of df.

    +/-inf are first treated as NaN. For every non-object column:
    - if the fraction of NaN is larger than median_impute_limit, NaNs are
      replaced by the column median;
    - otherwise (some NaN, but not more than the limit) they are replaced
      by impute_val.
    Set median_impute_limit to 0 to always do median imputation.

    A column without any finite value has no median; it is filled with
    impute_val instead so that no NaN is left behind.

    Args:
        df: input DataFrame; it is not modified.
        median_impute_limit: NaN fraction above which the median is used.
        impute_val: constant used when the median is not used or undefined.

    Returns:
        A new DataFrame with the imputed values.
    """
    # replace() returns a new frame, so the caller's frame is left untouched
    df = df.replace([np.inf, -np.inf], np.nan)
    for col in df.columns:
        if df[col].dtype == 'object':
            continue
        nan_fraction = df[col].isna().mean()
        if not nan_fraction > 0:
            # nothing to impute (also covers an empty column, where the mean is NaN)
            continue
        fill_value = impute_val
        if nan_fraction > median_impute_limit:
            median = df[col].median()
            # the median of an all-NaN column is NaN: keep the constant then
            if not pd.isna(median):
                fill_value = median
        df[col] = df[col].fillna(fill_value)

    return df

In [17]:
def label_encoding(df, columns, test_df=None):
    """Replace the given columns by integer labels, in place.

    Every column is cast to str and encoded with its own LabelEncoder. When
    test_df is given, the encoder is fitted on the values of both frames and
    the column is encoded in test_df as well.

    Args:
        df: frame whose columns are overwritten with their encoded values.
        columns: names of the columns to encode.
        test_df: optional second frame, encoded with the same encoders.

    Returns:
        df (the same object that was passed in).
    """
    def as_labels(frame, name):
        # string representation of one column, as a plain list
        return list(frame[name].astype(str).values)

    for col in columns:
        vocabulary = as_labels(df, col)
        if test_df is not None:
            vocabulary = vocabulary + as_labels(test_df, col)
        encoder = LabelEncoder().fit(vocabulary)
        df[col] = encoder.transform(as_labels(df, col))
        if test_df is not None:
            test_df[col] = encoder.transform(as_labels(test_df, col))
    return df

In [22]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of df to a smaller dtype where possible.

    Function from Kaggle. Integer columns are stored as int32 when their
    values fit, float columns as float32 when their values fit. 8- and
    16-bit targets are deliberately not used. A column is never converted
    to a wider dtype than it already has.

    Args:
        df: DataFrame to shrink; its columns are replaced in place.
        verbose: when True, print the memory usage before and after.

    Returns:
        df (the same object that was passed in).
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            limits = np.iinfo(np.int32)
            fits = c_min > limits.min and c_max < limits.max
            target = np.int32 if fits else np.int64
        else:
            limits = np.finfo(np.float32)
            fits = c_min > limits.min and c_max < limits.max
            target = np.float32 if fits else np.float64
        # Only convert when it actually saves memory: without this check an
        # int16/float16 column would be widened to 32 bits.
        if np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # guard against a frame that reports zero memory
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased from {:5.2f} to {:5.2f} Mb ({:.1f}% reduction)'.format(
            start_mem, end_mem, reduction))
    return df



## Data loading and basic pre-processing

Data can be downloaded here: 

https://www.kaggle.com/c/ieee-fraud-detection

Note: there are two data sets that need to be joined: Transaction data, on which Identity data (which is oftentimes missing) is left-joined. 


In [29]:
TRAIN_DATA_PATH = 'data/train.pkl'
TRAXCOLUMNS_PATH = 'data/trax_columns.pkl'
if load_from_csv:
    # Transactions are the base table; only the first n_rows are read
    data_transaction = pd.read_csv('data/ieee-fraud-detection/train_transaction.csv', nrows=n_rows)

    data_identity = pd.read_csv('data/ieee-fraud-detection/train_identity.csv')
    data_identity['has_id'] = 1 # to identify those that had identity info

    # Left join keeps every transaction; rows without identity info get
    # NaN in the identity columns and has_id = 0
    train = data_transaction.merge(data_identity, on='TransactionID', how='left')
    train['has_id'] = train['has_id'].fillna(0)

    identity_cols = data_identity.columns
    transaction_cols = data_transaction.columns
    # pd.to_pickle accepts any picklable object, so the column Index is
    # stored as-is and read back unchanged in the else-branch
    pd.to_pickle(transaction_cols, TRAXCOLUMNS_PATH)
    del data_transaction, data_identity
    print('train shape: {}'.format(train.shape))
    train = reduce_mem_usage(train)
    # Cache the joined frame so that later runs can skip the .csv files
    train.to_pickle(TRAIN_DATA_PATH)
else:
    base_path='basic_data'
    train = pd.read_pickle(TRAIN_DATA_PATH)
    # transaction_cols: the column labels of the transactions data
    transaction_cols = pd.read_pickle(TRAXCOLUMNS_PATH)
    

train shape: (200000, 435)
Mem. usage decreased from 665.28 to 357.06 Mb (46.3% reduction)


In [14]:
train.isFraud.sum()

6024

In [None]:
np.iinfo(np.int16).max


In [None]:
train.isFraud.to_pickle('y_train_full.pkl')

## Data exploration

In [None]:
y_train.sample(10)

In [None]:
# Group the columns of train by their naming pattern; the keys are used
# below to select one feature family at a time.
column_groups = {'card': [c for c in train.columns if c.startswith('card')],
                   'addr': [c for c in train.columns if c.startswith('addr')],
                   # NB: the key is 'dist1' but the group holds both distance columns
                   'dist1': ['dist1', 'dist2'],
                   'C' : [c for c in train.columns if c.startswith('C')],
                   # 'D' followed by digits only, so other names starting with 'D' are excluded
                   'D' : [c for c in train.columns if c.lstrip('D').isnumeric()],
                   'M' : [c for c in train.columns if c.startswith('M')],
                   'id' : [c for c in train.columns if c.startswith('id')],   
                   'V': [c for c in train.columns if c.startswith('V')],
                   'trans': [c for c in train.columns if c.startswith('Trans')]}



In [None]:
# Flatten the per-group lists into one list with every grouped column name
allcols_in_groups = [name for members in column_groups.values() for name in members]
    

In [None]:
set(train.columns) - set(allcols_in_groups)


### Card Columns

From the Kaggle forum:

card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.

All categorical! (according to the competition host)


In [None]:
train.sample(5).loc[:, column_groups['card']]


In [None]:
train.loc[:, column_groups['card']].nunique()


In [None]:
train.loc[:, column_groups['card']].isna().sum()


In [None]:
# label_encoding assigns the encoded values into the frame it receives, so
# hand it an explicit copy instead of a slice of train (avoids pandas'
# SettingWithCopyWarning and makes it obvious that train is left untouched)
card_encoded = label_encoding(train[column_groups['card']].copy(), column_groups['card'])

### Dist1  Column

dist: distance
"distances between (not limited) billing address, mailing address, zip code, IP address, phone area, etc."


In [None]:
train[column_groups['dist1']].nunique()

In [None]:
train[column_groups['dist1']].isna().mean()

In [None]:
train[column_groups['dist1']].sample(5, random_state=11)

In [None]:
# NB: this expression is not the last statement of the cell, so its value is not displayed
train[column_groups['dist1']].isna().mean()
# For each distance column: the median-imputed values, followed by a 0/1
# indicator column (<name>_nan) marking where the value was missing.
dist_cols = pd.concat((train[['dist1']].fillna(train[['dist1']].median()), 
                        train[['dist1']].isna().astype(int).rename(columns={'dist1': 'dist1_nan'}),
                      train[['dist2']].fillna(train[['dist2']].median()), 
                        train[['dist2']].isna().astype(int).rename(columns={'dist2': 'dist2_nan'})), axis=1)

In [None]:
dist_cols.head()

Observation: these fields are mostly empty. Indicator column was added for both


### Address Columns 


In [None]:
train.sample(10, random_state=1)[column_groups['addr']]


In [None]:
train[column_groups['addr']].nunique()


In [None]:
train[column_groups['addr']].isna().sum()
addr2 = train['addr2'].fillna(train['addr2'].mode()[0])

In [None]:
train['addr2'].mode()[0]

Observations:
- Indicator for missing values may be useful
- According to the description, these columns are categorical. Do not seem very useful


### C- columns

In [None]:
train[column_groups['C']].dtypes.value_counts()

In [None]:
sns.heatmap(train[column_groups['C']].corr());

Do PCA on the C columns

In [None]:
# Standardise the C columns, then project them onto 3 principal components
C_original = preproc_scale(train[column_groups['C']])
pca = PCA(n_components=3)
C_transformed = pca.fit_transform(C_original)

# Cumulative explained variance in percent, one entry per component
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) # NB: 3 variables is plenty


Observation: C columns can effectively be represented by 3 PCA coefficients

### D- columns

D1-D15: timedelta, such as days between previous transaction, etc.

In [None]:
# PLOT ORIGINAL D
plt.figure(figsize=(15,5))
plt.scatter(train.TransactionDT,train.D15, s=2)
plt.title('Original D15')
plt.xlabel('Time')
plt.ylabel('D15')
plt.show()

In [None]:
train[column_groups['D']].dtypes.value_counts()


In [None]:
sns.heatmap(train[column_groups['D']].corr());

In [None]:
column_groups['D']

In [None]:
train[column_groups['D']].isna().sum()

In [None]:
# median_impute_limit=0: every D column that contains NaN is imputed with its median
D_imputed = median_imputation(train[column_groups['D']].copy(), median_impute_limit=0)
D_original = preproc_scale(D_imputed)

pca = PCA(n_components=10)
D_transformed = pca.fit_transform(D_original)
# Cumulative explained variance in percent, one entry per component
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) # NB: 10 components are kept here


Observation: D columns can effectively be represented by 10 PCA coefficients (91%)

### V- columns

In [None]:
train[column_groups['V']].dtypes.value_counts()

In [None]:
train[column_groups['V']].max().value_counts().head(5)


In [None]:
sns.heatmap(train[column_groups['V'][:20]].corr());

In [None]:
# Median-impute every V column that contains NaN, then standardise.
# After preproc_scale, V_original holds the scaled values.
V_original = median_imputation(train[column_groups['V']], median_impute_limit=0)
V_original = preproc_scale(V_original)
# whiten=True is requested so that the retained components are rescaled by PCA
pca = PCA(n_components=25, whiten=True)
V_transformed = pca.fit_transform(V_original)
# Cumulative explained variance in percent, one entry per component
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) 


In [None]:
train[column_groups['V']].isna().sum().unique()

Do PCA on the "NaN-ness" of the columns

In [None]:
# 0/1 matrix: 1 where the V value is missing, 0 otherwise
V_nans = train[column_groups['V']].isna().astype(int)
pca = PCA(n_components=3)
V_nans_transformed = pca.fit_transform(V_nans)
# Cumulative explained variance in percent, one entry per component
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) 

With merely 3 components, more than 90% of variance is explained. 

Observations: 
- There are about 330 V columns, with a lot of NaN's (~40% is missing)
- These can be compressed to about 25 PCA coefficients (75% explained variance) if they are median-imputed
- The rows with NaN's are equal for many V-columns
- The NaN-ness can be efficiently PCA-compressed to 3 columns (93%) or perfectly reconstructed with 10 components.


### M- columns

M1-M9: match, such as names on card and address, etc.
Categorical

In [None]:
train[column_groups['M']].nunique()


In [None]:
train[column_groups['M']].isna().sum()

In [None]:
train[column_groups['M']].sample(10)

In [None]:
# label_encoding assigns the encoded values into the frame it receives, so
# hand it an explicit copy instead of a slice of train (avoids pandas'
# SettingWithCopyWarning and makes it obvious that train is left untouched)
M_encoded = label_encoding(train[column_groups['M']].copy(), column_groups['M'])

In [None]:
sns.heatmap(M_encoded.corr());

In [None]:
# Standardise the label-encoded M columns; M_transformed is then
# overwritten with their projection onto 3 principal components
M_transformed = preproc_scale(M_encoded)
pca = PCA(n_components=3, whiten=True)
M_transformed = pca.fit_transform(M_transformed)
# Cumulative explained variance in percent, one entry per component
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) 


In [None]:
train[column_groups['V']].isna().sum().unique()

Do PCA on the "NaN-ness" of the columns

In [None]:
V_nans = train[column_groups['V']].isna().astype(int)
pca = PCA(n_components=3)
V_nans_transformed = pca.fit_transform(V_nans)
var1=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(var1) 

### Transaction columns

TransactionAMT: transaction payment amount in USD

“Some of the transaction amounts have three decimal places to the right of the decimal point. There seems to be a link to three decimal places and a blank addr1 and addr2 field. Is it possible that these are foreign transactions and that, for example, the 75.887 in row 12 is the result of multiplying a foreign currency amount by an exchange rate?”

TransactionDT --> get the time in hours

In [None]:
column_groups['trans']


In [None]:
# Remainder modulo 86400 divided by 3600: a float in [0, 24).
# Assumes TransactionDT is an offset in seconds, which makes this the hour of day — TODO confirm
transaction_hour = (train.TransactionDT % (60*60*24))/3600

In [None]:
train['addr2'].isna().sum()

### Unsupervised Outlier Detection, Numerical columns


In [None]:
def plot_outlier_scores(scores):
    """Plot the score densities of frauds and non-frauds, with the AUC as title.

    Args:
        scores: one outlier score per row of the global train frame, in row
            order (array, list or Series); higher means more anomalous.

    Returns:
        DataFrame with the columns 'isFraud' and 'score', indexed like train.
    """
    roc_score = roc_auc_score(train.isFraud, scores)
    # Pair labels and scores by position. Building the frame from plain
    # arrays avoids index alignment and does not depend on how (or whether)
    # a Series passed as scores is named.
    classify_results = pd.DataFrame({'isFraud': train.isFraud.values,
                                     'score': np.asarray(scores)},
                                    index=train.index)
    sns.kdeplot(classify_results.loc[classify_results.isFraud==0, 'score'], label='negatives', shade=True, bw=0.01)
    sns.kdeplot(classify_results.loc[classify_results.isFraud==1, 'score'], label='positives', shade=True, bw=0.01)
    plt.title('AUC: {:.3f}'.format(roc_score))
    plt.xlabel('Score');
    return classify_results

In [None]:
def plot_top_N(scores, N=100):
    """Show which of the N highest-scoring rows are frauds.

    The rows are ranked by descending score and drawn as a single strip of
    N cells coloured by their isFraud label; the title reports the number
    of frauds among them and the resulting precision at rank N.

    Args:
        scores: one outlier score per row of the global train frame, in row
            order (array, list or Series); higher means more anomalous.
        N: number of top-ranked rows to show; capped at len(scores).

    Returns:
        DataFrame with the columns 'isFraud' and 'score' for the N
        highest-scoring rows, sorted by descending score.
    """
    N = min(N, len(scores))
    # Pair labels and scores by position. Building the frame from plain
    # arrays avoids index alignment and does not depend on how (or whether)
    # a Series passed as scores is named.
    classify_results = pd.DataFrame({'isFraud': train.isFraud.values,
                                     'score': np.asarray(scores)},
                                    index=train.index)
    classify_results = classify_results.sort_values(by='score', ascending=False)[:N]
    Npos_in_N = classify_results['isFraud'].sum()

    fig, ax = plt.subplots(1, 1, figsize=(16, 2))
    # One row of N cells; the extent keeps the strip at a 50:1 aspect ratio
    ims = ax.imshow(np.reshape(classify_results.isFraud.values, [1, -1]), extent=[-0.5, N, N/50, -0.5])
    ax.yaxis.set_visible(False)
    plt.colorbar(ims)
    plt.xlabel('Outlier rank [-]')
    plt.title(f'Number of positives found: {Npos_in_N} (P@Rank{N}: {Npos_in_N/N:.1%})')
    return classify_results

In [None]:
classify_results.head()

In [None]:
?sns.kdeplot(classify_results.loc[classify_results.isFraud==0, 'score'], label='negatives', shade=True)


### 1) Time and Region

In [None]:
# plt.get_cmap is the supported lookup; matplotlib.cm.get_cmap has been
# deprecated and is no longer available in recent matplotlib releases
cmap = plt.get_cmap('YlOrRd')

In [None]:
#### Hour and Region
# Two features per row: transaction_hour and the mode-imputed addr2
X_hour_region = pd.concat((transaction_hour, addr2), axis=1)
# Labels are only used to print the AUC; the forest is fitted on the features alone
isof = train_test_isoF(X_hour_region, train.isFraud)
# Negated so that a higher score means "more anomalous"
scores = - isof.decision_function(X_hour_region)


In [None]:
# Scatter of hour vs. region, coloured by the isolation-forest outlier score
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
sc = ax.scatter(transaction_hour, addr2, c=scores, cmap=cmap)
fig.colorbar(sc)
ax.set_title('Isolation Forest outliers')
# transaction_hour is expressed in hours (remainder of the day / 3600), not seconds
ax.set_xlabel('Time [h]')
ax.set_ylabel('Region [-]')
plt.show()


In [None]:
res = plot_outlier_scores(scores)

In [None]:
res = plot_top_N(scores, N=500)

Conclusion: Hour and region "mismatches" are only a weak indicator for fraud.


### 2) V columns

**1. Untransformed**

In [None]:
isof = train_test_isoF(V_original, train.isFraud)
scores = - isof.decision_function(V_original)


In [None]:
classify_results = plot_outlier_scores(scores)

In [None]:
res = plot_top_N(scores, N=100)

In [None]:
# Mean squared value per row of the standardised V columns.
# NOTE(review): this ignores correlations between columns, so it is only a
# rough stand-in for a Mahalanobis distance — confirm this is intended
maha_score = np.mean(V_original ** 2, axis=1)
# Log-compress and rescale the score before plotting
maha_score = np.log(1 + maha_score)  / 10
classify_results = plot_outlier_scores(maha_score)

In [None]:
res = plot_top_N(maha_score, N=100)
res = plot_top_N(scores, N=1000)
res = plot_top_N(scores, N=800000)

**2. PCA Transformed**

In [None]:
isof = train_test_isoF(V_transformed, train.isFraud)
scores = - isof.decision_function(V_transformed)


In [None]:
classify_results = plot_outlier_scores(scores)

In [None]:
scores.shape

In [None]:
res = plot_top_N(scores, N=100)
res = plot_top_N(scores, N=500)
res = plot_top_N(scores, N=2500)

Alternative: calculate the Mahalanobis distance

In [None]:
# Mean squared value per row of the PCA components (PCA was fitted with whiten=True)
maha_score = np.mean(V_transformed ** 2, axis=1)
# Log-compress and rescale the score before plotting
maha_score = np.log(1 + maha_score)  / 10
classify_results = plot_outlier_scores(maha_score)

In [None]:
res = plot_top_N(maha_score, N=100)
res = plot_top_N(maha_score, N=1000)
res = plot_top_N(maha_score, N=10000)

### 2) V-nan columns

In [None]:
isof = train_test_isoF(V_nans, train.isFraud)
scores = - isof.decision_function(V_nans)


In [None]:
isof = train_test_isoF(V_nans_transformed, train.isFraud)
scores = - isof.decision_function(V_nans_transformed)

### 3) C-columns

In [None]:
isof = train_test_isoF(train[column_groups['C']], train.isFraud)
# scores = - isof.decision_function(C_transformed)

In [None]:
isof = train_test_isoF(C_transformed, train.isFraud)
#scores = - isof.decision_function(C_transformed)

### 4) M-columns (categorical)


In [None]:
isof = train_test_isoF(M_encoded, train.isFraud)
scores = - isof.decision_function(M_encoded)

In [None]:
isof = train_test_isoF(M_transformed, train.isFraud)
scores = - isof.decision_function(M_transformed)

### 5) card-columns (categorical)


**1. All columns**

In [None]:
isof = train_test_isoF(card_encoded, train.isFraud)
scores = - isof.decision_function(card_encoded)


In [None]:
classify_results = plot_outlier_scores(scores)

**2. All but first column**

In [None]:
isof = train_test_isoF(card_encoded.iloc[:, 1:], train.isFraud)
scores = - isof.decision_function(card_encoded.iloc[:, 1:])

In [None]:
classify_results = plot_outlier_scores(scores)

### 6) Combining the best groups

In [None]:
# Column-wise stack of: the PCA components of the V columns, the encoded
# card columns without the first one, and the raw C columns
data_combined = np.concatenate((V_transformed, card_encoded.iloc[:, 1:].values, train[column_groups['C']]), axis=1)

In [None]:
isof = train_test_isoF(data_combined, train.isFraud)
scores = - isof.decision_function(data_combined)

In [None]:
classify_results = plot_outlier_scores(scores)

In [None]:
res = plot_top_N(scores, N=100)
res = plot_top_N(scores, N=1000)
res = plot_top_N(scores, N=10000)