# E-commerce Fraud Detection Analysis

In [1]:
#Load libraries
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import glob
import pickle

%matplotlib inline

## Load data

In [2]:
# Load the training response vector (one entry per training row).
# NOTE(review): presumably the fraud / non-fraud label -- confirm against the data prep step.
# read_pickle executes pickled code on load, so only point it at trusted files.
y = pd.read_pickle("data/train_y.pkl")


In [3]:
# Load the training feature matrix and map every -999 entry to 0 in one pass.
# NOTE(review): -999 looks like the placeholder written for missing values upstream
# (the earlier comment described this step as "replace NaN") -- confirm against the
# cleaning notebook.
X = pd.read_pickle("data/cleanXG_LR/XGBoost_LR_train_X_1.pkl").replace(-999, 0)

In [4]:
# Sanity check: the response vector and the feature matrix should have the same number of rows.
y.shape, X.shape

((590540,), (590540, 1323))

## Random Undersampling non-fraud data

In [None]:
#Load libraries
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE


In [None]:
#To use the library "imblearn", we need to convert the dataframe to numpy format
def random_undersample_nonfraud(X, y, ratio=1, randomState=None):
    """Randomly undersample the data so the classes reach the requested ratio.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (m x n): m data points, n features.
    y : pandas.Series
        Response vector (m x 1), aligned row-for-row with ``X``.
    ratio : float, default 1
        Forwarded to ``RandomUnderSampler`` as ``sampling_strategy``: the ratio
        of fraud to non-fraud after resampling. ``ratio=1`` means the number of
        fraud rows equals the number of non-fraud rows.
    randomState : int or None, default None
        Forwarded to ``RandomUnderSampler`` as ``random_state``. Pass an int to
        get the same sampled subset on every run.

    Returns
    -------
    X_random_res : pandas.DataFrame
        Resampled features with the column names (and, when the names are
        unique, the column dtypes) of ``X`` and a fresh 0..k-1 index.
    y_random_res : pandas.Series
        Resampled response, carrying the same name as ``y``.

    Side effects
    ------------
    Prints the class counts before and after resampling.
    """
    # The sampler is fed plain numpy arrays.
    X_arr, y_arr = X.to_numpy(), y.to_numpy()

    # Before balancing
    print('Original dataset shape %s' % Counter(y))

    # Undersampling without replacement
    rus = RandomUnderSampler(sampling_strategy=ratio, random_state=randomState)
    X_res_arr, y_res_arr = rus.fit_resample(X_arr, y_arr)
    print('Resampled dataset shape %s' % Counter(y_res_arr))

    # Convert the balanced numpy data back to pandas objects for downstream use.
    X_random_res = pd.DataFrame(data=X_res_arr, columns=X.columns)
    # to_numpy() collapses a mixed-dtype frame into one common dtype (e.g. int
    # indicator columns become float); restore the original per-column dtypes.
    # Skipped when column names repeat, because a name -> dtype mapping would
    # then be ambiguous.
    if X.columns.is_unique:
        X_random_res = X_random_res.astype(X.dtypes.to_dict())
    y_random_res = pd.Series(data=y_res_arr).rename(y.name)

    return X_random_res, y_random_res

In [None]:
# Build a 1:1 balanced training set by random undersampling.
# A fixed seed is passed so the sampled subset is the same on every run.
X_random_res, y_random_res = random_undersample_nonfraud(X, y, ratio=1, randomState=42)

## Cluster Based Undersampling non-fraud data

###### Scale data to mean 0 and unit variance

In [5]:
# Collect every column that holds exactly two distinct values; these are treated
# as binary and excluded from feature scaling.
binary_list = [
    colname
    for colname in X.columns
    if len(np.unique(X[colname])) == 2
]

In [6]:
# Report how many of the feature columns were classified as binary.
n_binary_features = len(binary_list)
n_total_features = X.shape[1]
print('There are %d binary features in a total of %d features' % (n_binary_features, n_total_features))

There are 1154 binary features in a total of 1323 features


In [7]:
# Columns that are not binary: these are the ones that get standardized.
# Filter X.columns instead of taking a set difference so the list keeps the
# frame's column order; a set difference returns the names in arbitrary order.
binary_set = set(binary_list)
multivar_list = [colname for colname in X.columns if colname not in binary_set]

In [8]:
# Preview the first three rows before scaling, for comparison with the scaled preview further down.
X.head(3)

Unnamed: 0,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_13,...,DeviceInfo_za409,DeviceInfo_za509,DeviceInfo_za990,DeviceInfo_zeia8,DeviceInfo_zte,DeviceInfo_zte a2017u build/nrd90m,DeviceInfo_zte-z835,DeviceInfo_zte-z956,DeviceInfo_zur70016,DeviceInfo_zuum_zen_i build/lrx21m
0,0.0,70787.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,-5.0,98945.0,0.0,0.0,0.0,-5.0,0.0,0.0,100.0,49.0,...,0,0,0,0,0,0,0,0,0,0
2,-5.0,191631.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,52.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Standardize the non-binary (quantitative) columns to mean 0 and variance 1.
# The two-valued columns collected in binary_list are deliberately left unscaled.
scaler = StandardScaler()
X[multivar_list] = scaler.fit_transform(X[multivar_list])

In [10]:
# Same preview after scaling: the non-binary columns are now standardized, the binary ones are unchanged.
X.head(3)

Unnamed: 0,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_10,id_11,id_13,...,DeviceInfo_za409,DeviceInfo_za509,DeviceInfo_za990,DeviceInfo_zeia8,DeviceInfo_zte,DeviceInfo_zte a2017u build/nrd90m,DeviceInfo_zte-z835,DeviceInfo_zte-z956,DeviceInfo_zur70016,DeviceInfo_zuum_zen_i build/lrx21m
0,0.298237,0.269977,-0.033568,0.028088,-0.143039,0.184225,-0.032833,0.038258,1.791583,-0.505287,...,0,0,0,0,0,0,0,0,0,0
1,-0.302071,0.531136,-0.033568,0.028088,-0.143039,-0.409089,-0.032833,0.038258,1.791583,1.884536,...,0,0,0,0,0,0,0,0,0,0
2,-0.302071,1.390776,-0.033568,0.028088,-0.143039,0.184225,-0.032833,0.038258,1.791583,2.030852,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Display the dimensions of the feature matrix after scaling (rows, columns).
X.shape

In [12]:
# Pearson correlation between every pair of feature columns.
# NOTE(review): the recorded run of this cell ended in MemoryError on the full
# 590540 x 1323 frame; consider a row sample or a column subset before re-running.
pearsoncorr = X.corr(method='pearson')

MemoryError: 

##### PCA - find no. of principal components

In [None]:
# Plot cumulative variance explained by principal components
pca = PCA().fit(X)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Entry i of the cumulative sum is the variance kept by the first i + 1
# components, so the x axis has to start at 1 (plotting against the default
# 0-based index would shift every reading by one component).
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.plot(component_counts, cumulative_variance, '-o')
plt.title('Cumulative explained variance (PCA)')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.grid()
plt.show()

With 2 PCs, more than 80% of the variance is preserved, and two components are also easy to visualize. Hence, we pick 2 principal components.

In [11]:
# Project the feature matrix onto its first two principal components; k-means
# is run on this 2-D projection in the cells below.
# random_state is fixed because scikit-learn may pick a randomized SVD solver
# for a matrix of this size, which would otherwise give slightly different
# components on each run.
# NOTE(review): the recorded run of this cell ended in MemoryError on the full
# feature matrix -- a row sample or IncrementalPCA may be needed.
pca = PCA(n_components=2, random_state=0)
principalComponents = pca.fit_transform(X)

# Columns keep the default integer labels 0 and 1 (PC1 and PC2); the plotting
# cells index them by position with .iloc.
principalDf = pd.DataFrame(data=principalComponents)
principalDf.head()

MemoryError: 

In [None]:
# Scatter plot of the two principal components; the low alpha keeps dense
# regions readable when many points overlap.
pc1_values = principalDf.iloc[:, 0]
pc2_values = principalDf.iloc[:, 1]
plt.scatter(pc1_values, pc2_values, alpha=0.05)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Scatter Plot of fraud data')
plt.show()

##### k-means - find no. of clusters

In [None]:
# k-means clustering: elbow method to find the optimal k.
# Fit one model per candidate k and record its inertia (the value k-means
# minimizes); the "elbow" is where adding clusters stops paying off.
k_values = range(1, 21)

Error = []
for k in k_values:
    # Fixed seed so the elbow curve is reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(principalDf)
    Error.append(kmeans.inertia_)

plt.plot(k_values, Error, '-o')
plt.title('Elbow method')
plt.xlabel('No of clusters')
plt.ylabel('Error')
plt.xticks(list(k_values))  # one tick per k so the elbow can be read off directly
plt.grid()
plt.show()

We observe an elbow at k = 4, so we use 4 clusters.

In [None]:
# Implement optimal k:
# Fit k-means once with the k chosen from the elbow plot and reuse the labels
# from that fit. Calling fit_predict() after fit() would cluster the whole
# data set a second time.
# random_state is fixed so the cluster assignment is reproducible.
kmeans_optimal = KMeans(n_clusters=4, random_state=0).fit(principalDf)
labels_pc = kmeans_optimal.labels_

In [None]:
# Visualize the clusters: same PC1/PC2 scatter, colored by k-means label.
# (Cluster centers are available in kmeans_optimal.cluster_centers_ if they
# should be overlaid.)
cluster_x = principalDf.iloc[:, 0]
cluster_y = principalDf.iloc[:, 1]
plt.scatter(cluster_x, cluster_y, c=labels_pc, cmap='rainbow')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Scatter Plot of fraud data')
plt.show()