In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
# Read the raw credit-card transactions into a DataFrame, then preview
# the first five rows to check that the columns were parsed.
data = pd.read_csv(filepath_or_buffer="../input/creditcard.csv")
data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Count how many rows carry each label in the `Class` column.
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

##### Clearly the data is totally unbalanced!!

###### This is a clear example where using a typical accuracy score to evaluate our classification algorithm would be misleading. For example, if we just used the majority class to assign values to all records, we would still have a high accuracy, BUT WE WOULD BE CLASSIFYING ALL "1" INCORRECTLY!!
There are several ways to approach this classification problem taking into consideration this unbalance.
Collect more data? Nice strategy but not applicable in this case
Changing the performance metric:
Use the confusion matrix to calculate Precision, Recall
F1score (weighted average of precision recall)
Use Kappa - which is a classification accuracy normalized by the imbalance of the classes in the data
ROC curves - show the trade-off between sensitivity (true positive rate) and specificity (1 - false positive rate) across classification thresholds.
Resampling the dataset
Essentially this is a method that will process the data to have an approximate 50-50 ratio.
One way to achieve this is by OVER-sampling, which is adding copies of the under-represented class (better when you have little data)
Another is UNDER-sampling, which deletes instances from the over-represented class (better when we have lots of data)

In [4]:
from sklearn.preprocessing import StandardScaler

In [6]:
# Preview `Amount` as a single-column 2-D array.
# `Series.reshape` is deprecated and has been removed from pandas, so
# reshape the underlying NumPy array instead.
data['Amount'].values.reshape(-1, 1)

array([[ 149.62],
       [   2.69],
       [ 378.66],
       ..., 
       [  67.88],
       [  10.  ],
       [ 217.  ]])

In [12]:
# Standardise the transaction amount and store it in a new `normAmount` column.
# `Series.reshape` is deprecated and has been removed from pandas, so the
# underlying NumPy array is reshaped to the single-column 2-D layout that
# `fit_transform` is given here. The scaled (n, 1) result is flattened back
# to 1-D so it is assigned as an ordinary column.
data['normAmount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
).ravel()

In [15]:
# Remove the `Time` and raw `Amount` columns from the working frame.
# NOTE: this rebinds `data`, so re-running the cell on the already-trimmed
# frame raises a KeyError for the missing labels.
data = data.drop(labels=['Time', 'Amount'], axis='columns')

In [16]:
# Preview the first five rows of the transformed frame.
data.head(n=5)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,normAmount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0,0.244964
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0,-0.342475
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0,-0.073403


In [54]:
# Split the frame into the feature matrix (every column except `Class`)
# and the target (a single-column DataFrame holding `Class`).
# `.ix` is deprecated and was removed in pandas 1.0. `.loc` accepts the
# same boolean column mask and selects the same columns.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

In [26]:
# Index labels of the minority class (Class == 1) and how many there are.
fraud_indices = np.array(data.index[data['Class'] == 1])
number_records_fraud = len(fraud_indices)

# Index labels of the majority class (Class == 0).
normal_indices = np.array(data.index[data['Class'] == 0])

In [27]:
# Draw, without replacement, as many majority-class index labels as there
# are minority-class records. A dedicated RandomState seeded with 0 (the
# same seed the train/test splits in this notebook use) makes the
# under-sample reproducible across kernel restarts.
random_normal_indices = np.random.RandomState(0).choice(
    normal_indices, number_records_fraud, replace=False
)

In [41]:
# Sanity check: number of sampled majority-class indices.
random_normal_indices.size

492

In [31]:
# Stack the sampled majority-class labels and all minority-class labels
# into one 1-D array (sampled normals first, then frauds).
under_sample_indices = np.concatenate(
    (random_normal_indices, fraud_indices), axis=0
)

In [44]:
# Build the under-sampled frame by selecting rows by index label.
# `.ix` is deprecated and was removed in pandas 1.0. The values in
# `under_sample_indices` are labels taken from `data.index`, so `.loc`
# is the matching label-based accessor.
under_sample_data = data.loc[under_sample_indices, :]
under_sample_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,normAmount
99491,-0.984677,0.568697,0.135391,-4.016034,0.406649,-0.509106,0.479542,0.486028,1.634792,-2.618002,...,-0.158616,-0.301181,-0.307951,-1.334014,0.44211,-1.109293,0.262984,0.097393,0,-0.329681
108895,0.658865,1.573861,-1.483989,1.606028,1.233124,-1.575252,1.249426,-0.577478,0.157303,-0.491822,...,-0.237145,0.018458,0.128589,0.04141,-0.577907,-0.441335,0.056127,-0.373906,0,-0.338237
255741,-0.829746,0.69773,1.1117,-0.736462,-0.353659,0.109797,-0.217719,0.651756,0.123429,-0.744705,...,0.326945,0.848629,-0.22989,-0.355075,-0.414129,0.532632,-0.126367,0.062578,0,-0.307251
205663,-2.688138,3.095469,-2.119114,-0.468013,-0.622565,-1.67347,-0.079085,1.097805,0.916147,0.643766,...,0.169969,0.835843,-0.030065,-0.087878,-0.129981,-0.17508,0.129899,-0.023263,0,-0.344514
183293,2.235258,-1.47813,-0.703674,-1.875986,-1.174868,0.040214,-1.457019,0.068532,-1.405483,1.739106,...,-0.306479,-0.595695,0.417357,0.206465,-0.599842,-0.505945,0.020427,-0.037631,0,-0.218294


In [33]:
# Features / target for the under-sampled frame, split the same way as the
# full data set: everything except `Class` versus the `Class` column alone.
# `.ix` is deprecated and was removed in pandas 1.0. `.loc` accepts the
# same boolean column mask.
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
# NOTE: the double underscore in `y__undersample` looks like a typo, but
# the name is kept because the train/test split cell refers to it.
y__undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']



In [50]:
# Fraction of the under-sampled rows that belong to class 0.
# The explicit float() keeps this a true division on Python 2 as well.
float((under_sample_data['Class'] == 0).sum()) / len(under_sample_data)

0.5

In [53]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20. `train_test_split` now lives in `sklearn.model_selection` and is
# called with the same arguments.
from sklearn.model_selection import train_test_split

# 70/30 train/test split of the under-sampled data, with a fixed seed.
(X_train_undersample, X_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(
    X_undersample, y__undersample, test_size=0.3, random_state=0
)

In [55]:
# 70/30 train/test split of the full (imbalanced) data, with a fixed seed.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0, test_size=0.3)

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold,cross_val_score
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report

In [62]:
# Five consecutive (unshuffled) folds over the under-sampled training rows.
# NOTE(review): this is the legacy `sklearn.cross_validation.KFold`
# signature (n, n_folds, ...). Moving to `sklearn.model_selection.KFold`
# would also require `fold.split(...)` wherever `fold` is consumed.
fold = KFold(n=len(y_train_undersample), n_folds=5, shuffle=False)

In [69]:
# Show a bounded preview of the training features instead of echoing the
# whole DataFrame into the notebook output.
X_train_undersample.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,normAmount
92839,-1.777462,-0.406835,0.889257,-0.736005,2.679054,-2.006523,0.446889,-0.189831,-0.221247,-1.287869,...,0.225006,-0.088777,-0.691805,-0.268128,-0.603095,0.821405,0.153463,-0.274795,-0.135001,-0.350191
214775,-0.395582,-0.751792,-1.984666,-0.203459,1.903967,-1.430289,-0.076548,-0.992260,0.756307,0.217630,...,-1.027716,1.377515,2.151787,0.189225,0.772943,-0.872443,-0.200612,0.356856,0.032113,-0.350471
8845,-4.727713,3.044469,-5.598354,5.928191,-2.190770,-1.529323,-4.487422,0.916392,-1.307010,-4.138891,...,-0.207759,0.650988,0.254983,0.628843,-0.238128,-0.671332,-0.033590,-1.331777,0.705698,-0.231728
142754,1.072132,-0.281689,1.188254,1.739634,-0.848114,0.564672,-0.633982,0.371500,1.484735,-0.372933,...,-0.301034,-0.431254,-0.847681,0.100170,0.039475,0.372361,-0.504906,0.080580,0.026029,-0.297256
113343,1.139015,0.078841,0.266070,1.208853,-0.002932,0.317428,-0.094059,0.199578,0.178884,0.067532,...,-0.215207,-0.079771,-0.074708,-0.108405,-0.315464,0.648632,-0.307105,0.029654,0.001221,-0.307731
1151,-0.395115,0.607197,1.416013,-0.239631,-0.206378,0.061775,0.535386,0.058800,-0.048408,-0.486505,...,-0.036989,0.020585,0.205178,-0.251440,0.090647,0.114307,0.620850,-0.131471,-0.012454,-0.183910
136557,-2.719390,-2.646198,1.917540,-0.740002,0.140664,-0.751428,-1.716089,0.864723,-0.260219,-0.688452,...,0.726244,0.592356,0.865426,-0.120048,0.161862,0.281261,-0.145971,0.023806,-0.328578,0.002680
238466,1.833191,0.745333,-1.133009,3.893556,0.858164,0.910235,-0.498200,0.344703,-0.667939,0.398155,...,-0.085579,0.039289,0.181652,0.072981,-0.155299,-0.149891,0.012792,0.040854,0.022903,-0.283703
247995,1.146259,1.403458,-4.159148,2.660107,-0.323217,-1.836071,-1.623740,0.259562,-1.132044,-3.356474,...,0.284831,0.564450,0.445744,-0.141136,-0.265517,0.362260,-0.416062,0.507370,0.243744,-0.147848
9509,-4.710529,8.636214,-15.496222,10.313349,-4.351341,-3.322689,-10.788373,5.060381,-5.689311,-11.712187,...,1.434240,1.990545,0.223785,0.554408,-1.204042,-0.450685,0.641836,1.605958,0.721644,-0.349231
