In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the credit-card transactions and report how the two classes are balanced.
# print('') emits the same blank separator line under both the Python 2 print
# statement and the Python 3 print function; a bare `print` is only a no-op
# expression on Python 3.
print('')
print('Reading file...')
ccdata = pd.read_csv('creditcard.csv')

# Rows with Class == 1 are treated as fraudulent, rows with Class == 0 as normal.
fraudtr = ccdata.loc[ccdata['Class'] == 1, :]
normaltr = ccdata.loc[ccdata['Class'] == 0, :]
fraudcnt = fraudtr.shape[0]
normalcnt = normaltr.shape[0]
print('fraudlent transactions count = {}'.format(fraudcnt))
print('normal transactions count = {}'.format(normalcnt))

# The trailing dot on 100. forces float division on Python 2 as well.
fraud_pct = fraudcnt * 100. / (normalcnt + fraudcnt)
print('percentage of fraudulent transactions {:.3f}%'.format(fraud_pct))

# Seed shared by the sampling below and by the later train/test splits.
seed = 0

# Share of the rebalanced dataset that should consist of fraudulent rows.
FRAUD_FRACTION = 0.05

print('')
print('Upsampling data...')
# Despite the message, no rows are duplicated: every fraudulent row is kept and
# the normal class is randomly down-sampled until fraud makes up FRAUD_FRACTION
# of the result.
# round() before int() guards against float error in the division truncating
# the total one row short.
upsample_total = int(round(fraudcnt / FRAUD_FRACTION))
normal_count = upsample_total - fraudcnt
normal_data = normaltr.sample(n=normal_count, random_state=seed)
upsample_data = pd.concat([fraudtr, normal_data])
print('upsample size = {}'.format(upsample_total))

# `seed` is already assigned earlier in this cell, so it is not reassigned here.
# Select the features by dropping the label column by name rather than assuming
# it is the last column of the file.
feature_columns = ccdata.columns.drop('Class')
print(feature_columns)
print('feature columns count: {}'.format(len(feature_columns)))


Reading file...
fraudlent transactions count = 492
normal transactions count = 284315
percentage of fraudulent transactions 0.173%

Upsampling data...
upsample size = 9840
Index([u'Time', u'V1', u'V2', u'V3', u'V4', u'V5', u'V6', u'V7', u'V8', u'V9',
       u'V10', u'V11', u'V12', u'V13', u'V14', u'V15', u'V16', u'V17', u'V18',
       u'V19', u'V20', u'V21', u'V22', u'V23', u'V24', u'V25', u'V26', u'V27',
       u'V28', u'Amount'],
      dtype='object')
feature columns count: 30


In [15]:
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    # Build stratified train / validation / test splits of the rebalanced data
    # and min-max scale the features.
    print('')
    print('Splitting data...')
    splitdata = []
    Xy = upsample_data
    fraud_share_pct = (Xy['Class'] == 1).sum() * 100. / Xy.shape[0]
    print('dataset size {}: {}, %fraud: {:.3f}'.format(len(splitdata) + 1, Xy.shape[0], fraud_share_pct))
    X = Xy.loc[:, feature_columns]
    y = Xy.loc[:, ['Class']]

    # Split the raw features first: hold out a test set, then carve a
    # validation set out of the remainder. Both splits are stratified on the
    # label so each part keeps the same fraud ratio.
    X_valtr_raw, X_test_raw, y_valtr, y_test = train_test_split(
        X, y, stratify=y, random_state=seed)
    X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
        X_valtr_raw, y_valtr, stratify=y_valtr, random_state=seed)

    # Fit the scaler on the training rows only, so the min/max of validation
    # and test rows cannot leak into training. Held-out values may therefore
    # fall slightly outside [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    X_train = scaler.fit_transform(X_train_raw)
    X_valid = scaler.transform(X_valid_raw)
    X_test = scaler.transform(X_test_raw)

    # Kept so any other cell that refers to these names keeps working; both
    # use the training-fitted scaler.
    X_valtr = scaler.transform(X_valtr_raw)
    X_scaled = scaler.transform(X)

    splitdata.append({
        'X_train': X_train,
        'y_train': y_train,
        'X_valid': X_valid,
        'y_valid': y_valid,
        'X_test': X_test,
        'y_test': y_test
    })


Splitting data...
dataset size 1: 9840, %fraud: 5.000


In [16]:
# Show the shape of every array in the first split.
# A single formatted print(...) call prints the same "name (rows, cols)" text
# on both Python 2 and Python 3; the `print key, value` statement form is a
# SyntaxError on Python 3.
for key, part in splitdata[0].items():
    print('{} {}'.format(key, part.shape))

X_test (2460, 30)
X_train (5535, 30)
X_valid (1845, 30)
y_valid (1845, 1)
y_train (5535, 1)
y_test (2460, 1)


In [20]:
# Write each split to disk as two CSV files: ccX<split>.csv holds the features
# (10 decimal places) and ccy<split>.csv holds the integer labels.
split_arrays = splitdata[0]
for split_name in ('test', 'train', 'valid'):
    features_path = 'ccX' + split_name + '.csv'
    labels_path = 'ccy' + split_name + '.csv'
    np.savetxt(features_path, split_arrays['X_' + split_name], fmt='%.10f', delimiter=',')
    np.savetxt(labels_path, split_arrays['y_' + split_name], fmt='%d', delimiter=',')

In [21]:
# Peek at the first five test labels; the index keeps the original row numbers.
y_test.head(n=5)

Unnamed: 0,Class
11710,1
120687,0
61627,0
197517,0
147780,0


Then, in bash, paste each split's feature file and label file side by side into a single CSV per split:
```
# Join each split's feature and label files column-wise into cc<split>.csv.
$ for split in test train valid; do paste -d"," "ccX${split}.csv" "ccy${split}.csv" > "cc${split}.csv"; done
```