<a href="https://colab.research.google.com/github/castrostephano/CreditCardDetection/blob/main/CreditCardFraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [9]:
credit_card_data = pd.read_csv('/creditcard.csv')

In [10]:
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [None]:
credit_card_data.tail()

In [None]:
credit_card_data.info()

In [15]:
#distribution of legit/fraud transactions - (0=normal trans 1=fraud trans)
#more than 99% is non-fraud, so ml can't recognize fraud because there's a lot less data
#so we have to handle this unbalanced datasets
credit_card_data['Class'].value_counts()

0.0    41574
1.0      108
Name: Class, dtype: int64

In [None]:
credit_card_data.isnull().sum()

In [None]:
#Distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

In [17]:
#we have to separate the data into fraud and legit to feed into the machine

legit = credit_card_data[credit_card_data.Class == (0)]
fraud = credit_card_data[credit_card_data.Class == (1)]

In [18]:
#normal trans, 31 columns/ fraud trans, 31 columns
print(legit.shape)
print(fraud.shape)

(41574, 31)
(108, 31)


In [19]:
#Amount is the amount that was used in the the trans
# average is $88 
legit.Amount.describe()

count    41574.000000
mean        89.092554
std        237.129967
min          0.000000
25%          7.550000
50%         24.000000
75%         80.000000
max       7879.420000
Name: Amount, dtype: float64

In [20]:
#mean is $93, more than legit trans
fraud.Amount.describe()

count     108.000000
mean       93.512593
std       250.863273
min         0.000000
25%         1.000000
50%         2.320000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [22]:
#compare the values for both of the classes
#the differences are important. they train the machine what's fraud and what's legit
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0.0,26220.447082,-0.203702,0.021029,0.74135,0.17207,-0.222039,0.105713,-0.09336,0.035106,0.201757,-0.050262,0.397763,-0.415135,0.226988,0.246219,0.102912,0.009163,0.165792,-0.084449,-0.033582,0.04542,-0.030057,-0.110755,-0.039591,0.007931,0.135806,0.022783,0.005968,0.004583,89.092554
1.0,21414.990741,-7.454643,5.595067,-10.62406,5.849772,-5.11595,-2.288288,-7.382457,3.691487,-2.904887,-6.511032,5.397581,-8.12995,0.345852,-8.295162,-0.007414,-4.722107,-7.554117,-2.66742,0.428298,0.630658,0.602725,-0.336878,-0.313708,-0.237598,0.310378,0.18347,0.791559,0.112874,93.512593


In [29]:
#we have to undersample the larger dataset
#build a sample dataset containing similar distribution of legit trans (because that's the much larger dataset)
#because there are 492 fraud trans, i'm going to take 492 random legit trans and join them. makes the distribution even

legit_sample = legit.sample(n=108)




In [30]:
#now i will concatenate two df's
#axis is 0 = concat row wise, 1 would be column (which we don't want)

concat_dataset = pd.concat([legit_sample,fraud], axis=0)

In [31]:
concat_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
16,12,1.103215,-0.040296,1.267332,1.289091,-0.735997,0.288069,-0.586057,0.18938,0.782333,-0.267975,-0.450311,0.936708,0.70838,-0.468647,0.354574,-0.246635,-0.009212,-0.595912,-0.575682,-0.11391,-0.024612,0.196002,0.013802,0.103758,0.364298,-0.382261,0.092809,0.037051,12.99,0.0
27931,34773,1.130214,0.261687,0.347462,1.218957,-0.616529,-1.089168,0.003354,-0.176947,0.448494,-0.803362,0.210979,0.734992,0.409219,-1.662385,-0.345774,-0.081056,1.291167,-0.217976,-0.228301,-0.013056,-0.12721,-0.132587,-0.052853,0.876395,0.492759,0.374385,0.003284,0.052867,29.56,0.0
28043,34823,-0.685598,-0.773272,0.957227,-2.647685,-0.015797,-0.54595,-0.368213,-0.167319,-1.963036,1.80212,0.325893,-0.579822,0.857181,-0.71658,-0.771298,0.056705,-0.496983,0.953191,-0.417815,0.026145,0.002639,0.725083,-0.209376,-0.488714,-0.268698,-0.237374,0.371485,0.122312,30.0,0.0
30869,36111,-0.764152,0.500283,1.191497,-1.397281,-0.719888,-0.703611,0.482568,-0.247566,-1.580204,0.847102,1.675528,0.294901,0.911609,-0.307583,-0.325517,0.968837,0.04331,-0.934832,0.878416,0.172899,0.324851,0.960745,-0.172578,0.591163,0.194616,-0.203914,-0.050639,0.081701,64.8,0.0
30738,36062,1.155097,0.033699,-0.090881,1.026957,0.435032,0.809886,-0.036617,0.207924,0.13376,0.039198,0.090821,0.74608,-0.174612,0.231627,-0.688402,-0.175799,-0.400551,-0.077159,0.354429,-0.101388,-0.113392,-0.184716,-0.239467,-1.144058,0.773572,-0.259407,0.028774,-0.002077,33.56,0.0


In [39]:
concat_dataset['Class'].value_counts()

1.0    108
0.0    108
Name: Class, dtype: int64

In [40]:
#this step is important bc it lets us know if its good or bad, bad would be very dif. this is looking good to go
concat_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0.0,25294.759259,-0.012506,0.145995,0.660691,0.176024,-0.156906,0.271292,0.000626,0.069214,0.126333,-0.057993,0.530235,-0.158841,0.036984,0.217469,0.098595,-0.004948,0.125774,-0.133175,-0.000769,0.003656,0.063703,-0.064915,-0.020604,0.059905,0.163558,0.042373,0.005333,-0.018222,78.346481
1.0,21414.990741,-7.454643,5.595067,-10.62406,5.849772,-5.11595,-2.288288,-7.382457,3.691487,-2.904887,-6.511032,5.397581,-8.12995,0.345852,-8.295162,-0.007414,-4.722107,-7.554117,-2.66742,0.428298,0.630658,0.602725,-0.336878,-0.313708,-0.237598,0.310378,0.18347,0.791559,0.112874,93.512593


In [42]:
#split the data into targets (0 or 1) and features
#axis is 1 bc we want to drop a column, not a row

X= concat_dataset.drop(columns = 'Class', axis = 1)
Y = concat_dataset['Class']

In [44]:
print(X)

        Time        V1        V2  ...       V27       V28  Amount
16        12  1.103215 -0.040296  ...  0.092809  0.037051   12.99
27931  34773  1.130214  0.261687  ...  0.003284  0.052867   29.56
28043  34823 -0.685598 -0.773272  ...  0.371485  0.122312   30.00
30869  36111 -0.764152  0.500283  ... -0.050639  0.081701   64.80
30738  36062  1.155097  0.033699  ...  0.028774 -0.002077   33.56
...      ...       ...       ...  ...       ...       ...     ...
39183  39729 -0.964567 -1.643541  ... -0.308555 -0.164500  776.83
40085  40086  1.083693  1.179501  ...  0.096701  0.114972    1.00
40525  40276  1.159373  2.844795  ...  0.009979  0.160769    1.00
41395  40662 -4.446847 -0.014793  ...  0.492560  0.971834    1.00
41569  40742 -2.377533  0.520539  ...  0.500326  0.551760    1.00

[216 rows x 30 columns]


In [45]:
print(Y)

16       0.0
27931    0.0
28043    0.0
30869    0.0
30738    0.0
        ... 
39183    1.0
40085    1.0
40525    1.0
41395    1.0
41569    1.0
Name: Class, Length: 216, dtype: float64


In [46]:
#split into train and test data
# 20% is going to test data, meaning 80% will be train
#strat Y = evenly dist 2 classes in both x/y train/test
#random state is just reproducing 2 times

X_train, X_test, Y_train, Y_test = train_test_split(X , Y, test_size = 0.2, stratify = Y, random_state=2)

In [48]:
print(X.shape, X_train.shape, X_test.shape)

(216, 30) (172, 30) (44, 30)


In [49]:
#model training. Using logistic regression model (LR) for binary classification models (0,1...like this one)

model = LogisticRegression()

In [50]:
#training (or fitting) our LR model w training data

model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [51]:
#evaluate our model based on accuracy score
#predicting the labels for all the training data 
# second line will compare trained data fed through the model vs original labels stored in Y_train
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [52]:
#very good accuracy score, out of 100 predictions, our model can predict 98 of them. test data is next
#why did we print training data? because if the training and test data accuracy scores are very different, then your model isn't good. over/underfit
print(training_data_accuracy)

0.9883720930232558


In [53]:
#at this point we've only fed the model  training data, next we do the same thing as above, but with test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [54]:
#slightly lower than training data, but still really good
print(test_data_accuracy)

0.9772727272727273
