In [1]:
from warnings import simplefilter
simplefilter('ignore')

import sklearn 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.special import rel_entr
from sklearn.neighbors import KernelDensity
from imblearn.under_sampling import NearMiss, RandomUnderSampler

np.random.seed(10120024)

# Basic Info

In [2]:
data = pd.read_csv('../creditcard.csv')

In [3]:
data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [4]:
feature_columns = ['V17', 'V12', 'V14', 'V10', 'V11', 'V16']
target_column = data.columns[-1]

feature_data = data.loc[:, feature_columns].values
target_data = data.loc[:, target_column].values

target_data[target_data == 0] = -1

# Visualize Probability Density

In [5]:
# fig, ax = plt.subplots(8, 4, figsize=(30, 30))

# for i, feature in enumerate(data.columns[:-1]):
#     plt.subplot(8, 4, i+1)
    
#     sns.kdeplot(data.loc[(target_data.values == 1).reshape(-1, 1).T[0], feature], bw=0.5,label="Class = 1")
#     sns.kdeplot(data.loc[(target_data.values == -1).reshape(-1, 1).T[0], feature], bw=0.5,label="Class = -1")
    
#     plt.xlabel(feature, fontsize=12)
    
#     plt.tick_params(axis='both', which='major', labelsize=12)
#     plt.legend(['Fraud', 'Non-Fraud'])

# plt.savefig('probability_density_vis.png')
# plt.show()

# Visualize KL Divergence

In [6]:
# kl = []

# for feature in data.columns[1:-1]:
#     # select fraud and non-fraud data
#     fraud_data = data.loc[(target_data.values == 1).reshape(-1, 1).T[0], feature]
#     non_fraud_data = data.loc[(target_data.values == -1).reshape(-1, 1).T[0], feature]

#     # create kde instance for fraud and non-fraud data
#     kde_fraud = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(fraud_data.values.reshape(-1, 1))
#     kde_non_fraud = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(non_fraud_data.values.reshape(-1, 1))

#     # create a set of numbers
#     X = np.linspace(data[feature].min(), data[feature].max(), 100).reshape(-1, 1)

#     # calculate the score
#     sample_fraud = np.exp(kde_fraud.score_samples(X)) 
#     sample_non_fraud = np.exp(kde_non_fraud.score_samples(X))
#     sample_fraud /= np.sum(sample_fraud)
#     sample_non_fraud /= np.sum(sample_non_fraud)

#     # calculate the kl-divergence
#     kl.append(sum(rel_entr(sample_fraud, sample_non_fraud)))

In [7]:
# kl_data = pd.DataFrame({'Columns':data.columns[1:-1], 
#                  'KL':kl}).sort_values('KL', ascending=False)

# fig = plt.figure()
# fig.set_size_inches(10, 5)

# sns.barplot(data=kl_data, 
#                x='Columns',
#                y='KL',
#                palette=['green' if val >= 4 else 'blue' for val in kl_data['KL'].values])

# plt.xticks(rotation=-45)

# plt.savefig('kl_div_vis.png')
# plt.show()

# Undersampling The Data

In [8]:
undersample = NearMiss(version=3, n_neighbors=10)

feature_sample, target_sample = undersample.fit_resample(feature_data, target_data)
undersampled_data = pd.concat((feature_sample, target_sample), axis=1)

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

# Select Non-Undersampled Data

In [None]:
undersampled_indexes = feature_sample.apply(lambda row: np.where(feature_data.values == row.values)[0][0], axis=1).values

non_undersampled_data = feature_data.drop(index=undersampled_indexes)
non_undersampled_data['Class'] = -1

# Visualize Distributions Before and After Undersampling

In [None]:
# undersampled_data['Type'] = 'Undersampled'
# non_undersampled_data['Type'] = 'Non-Undersampled'

# combined = pd.concat([
#                         undersampled_data,
#                         non_undersampled_data
#                     ], axis=0)

# combined['Type'] = combined.apply(lambda row: 'Non-Undersampled Non-Fraud' if row['Type'] == 'Non-Undersampled' else 'Undersampled Fraud' if row['Class'] == 1 else 'Undersampled Non-Fraud', axis=1)

In [None]:
# fig, ax = plt.subplots(2, 3, figsize=(30, 30))

# for i, feature in enumerate(combined.columns[:-2]):
#     plt.subplot(2, 3, i+1)
    
#     sns.boxplot(data=combined, x='Type', y=feature, order=['Non-Undersampled Non-Fraud', 'Undersampled Non-Fraud', 'Undersampled Fraud'])
#     plt.xlabel(feature, fontsize=12)
    
#     plt.xticks(rotation = 25)
#     plt.legend(['Fraud', 'Non-Fraud'])
    
# plt.savefig('distribution_before_after_vis.png')
# plt.show()

# Train-Test Split

In [None]:
test_size = 25
undersampled_data_train = pd.concat([
                                    undersampled_data.loc[undersampled_data['Class'] == 1].head(len(undersampled_data.loc[undersampled_data['Class'] == 1]) - test_size),
                                    undersampled_data.loc[undersampled_data['Class'] == -1].head(len(undersampled_data.loc[undersampled_data['Class'] == -1]) - test_size)
                                    ], axis=0)
undersampled_data_test = pd.concat([
                                    undersampled_data.loc[undersampled_data['Class'] == 1].tail(test_size),
                                    undersampled_data.loc[undersampled_data['Class'] == -1].tail(test_size)
                                    ], axis=0)

# Saves All Data

In [None]:
pd.concat((feature_data, target_data), axis=1).to_csv('processed_data.csv', index=False)
undersampled_data.to_csv('undersampled_data.csv', index=False)
undersampled_data_train.to_csv('undersampled_data_train.csv', index=False)
undersampled_data_test.to_csv('undersampled_data_test.csv', index=False)
non_undersampled_data.to_csv('non_undersampled_data.csv', index=False)