Transaction Table *

* TransactionDT: timedelta from a given reference datetime (not an actual timestamp)
* TransactionAMT: transaction payment amount in USD
* ProductCD: product code, the product for each transaction
* card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.
* addr: address
* dist: distance
* P_ and (R__) emaildomain: purchaser and recipient email domain
* C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
* D1-D15: timedelta, such as days between previous transaction, etc.
* M1-M9: match, such as names on card and address, etc.
* Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.

Categorical Features:

* ProductCD
* card1 - card6
* addr1, addr2
* P_emaildomain
* R_emaildomain
* M1 - M9

Identity Table *

Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
They're collected by Vesta’s fraud protection system and digital security partners.
(The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement)

* Categorical Features:
* DeviceType
* DeviceInfo
* id_12 - id_38



In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
train_identity = pd.read_csv('data/ieee-fraud-detection/train_identity.csv')
train_transaction = pd.read_csv('data/ieee-fraud-detection/train_transaction.csv')
# test_identity = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')
# test_transaction = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')

In [3]:
train_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [4]:
train_transaction.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

In [6]:
train.shape

(590540, 434)

In [7]:
print(train.sample(5).T.to_string())


                 170461               49312      212714     381730     77131 
TransactionID   3157461              3036312    3199714    3368730    3064131
isFraud               0                    0          0          0          0
TransactionDT   3687259              1181785    4930096    9564978    1696839
TransactionAmt    226.0                250.0     109.95       97.0       59.0
ProductCD             W                    R          W          W          W
card1              9749                15063      17188      15497       1546
card2             528.0                514.0      321.0      490.0      111.0
card3             150.0                150.0      150.0      150.0      150.0
card4              visa                 visa       visa       visa       visa
card5             226.0                226.0      226.0      226.0      226.0
card6            credit               credit      debit      debit      debit
addr1             337.0                299.0      299.0      299

In [8]:
train.shape

(590540, 434)

In [9]:
train.isnull().sum()


TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

In [10]:
dict(zip(train.columns, train.isna().sum()))

{'TransactionID': 0,
 'isFraud': 0,
 'TransactionDT': 0,
 'TransactionAmt': 0,
 'ProductCD': 0,
 'card1': 0,
 'card2': 8933,
 'card3': 1565,
 'card4': 1577,
 'card5': 4259,
 'card6': 1571,
 'addr1': 65706,
 'addr2': 65706,
 'dist1': 352271,
 'dist2': 552913,
 'P_emaildomain': 94456,
 'R_emaildomain': 453249,
 'C1': 0,
 'C2': 0,
 'C3': 0,
 'C4': 0,
 'C5': 0,
 'C6': 0,
 'C7': 0,
 'C8': 0,
 'C9': 0,
 'C10': 0,
 'C11': 0,
 'C12': 0,
 'C13': 0,
 'C14': 0,
 'D1': 1269,
 'D2': 280797,
 'D3': 262878,
 'D4': 168922,
 'D5': 309841,
 'D6': 517353,
 'D7': 551623,
 'D8': 515614,
 'D9': 515614,
 'D10': 76022,
 'D11': 279287,
 'D12': 525823,
 'D13': 528588,
 'D14': 528353,
 'D15': 89113,
 'M1': 271100,
 'M2': 271100,
 'M3': 271100,
 'M4': 281444,
 'M5': 350482,
 'M6': 169360,
 'M7': 346265,
 'M8': 346252,
 'M9': 346252,
 'V1': 279287,
 'V2': 279287,
 'V3': 279287,
 'V4': 279287,
 'V5': 279287,
 'V6': 279287,
 'V7': 279287,
 'V8': 279287,
 'V9': 279287,
 'V10': 279287,
 'V11': 279287,
 'V12': 76073,
 

In [11]:
for col, values in train.iteritems():
    num_uniques = values.nunique()
    print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))
    print (values.unique()[:20], "....")
    print ('\n')

TransactionID: 590540
[2987000 2987001 2987002 2987003 2987004 2987005 2987006 2987007 2987008
 2987009 2987010 2987011 2987012 2987013 2987014 2987015 2987016 2987017
 2987018 2987019] ....


isFraud: 2
[0 1] ....


TransactionDT: 573349
[86400 86401 86469 86499 86506 86510 86522 86529 86535 86536 86549 86555
 86564 86585 86596 86618 86620 86668 86725 86730] ....


TransactionAmt: 20902
[ 68.5    29.     59.     50.     49.    159.    422.5    15.    117.
  75.887  16.495  40.     10.5    57.95   30.    100.     47.95  186.
  39.    159.95 ] ....


ProductCD: 5
['W' 'H' 'C' 'S' 'R'] ....


card1: 13553
[13926  2755  4663 18132  4497  5937 12308 12695  2803 17399 16496  4461
  3786 12866 11839  7055  1790 11492  7005  7875] ....


card2: 500
[ nan 404. 490. 567. 514. 555. 360. 100. 111. 352. 375. 418. 303. 314.
 543. 583. 148. 321. 269. 361.] ....


card3: 114
[150. 117. 185. 143. 144. 163. 146. 191. 162. 119. 147. 100. 135. 137.
 138. 102. 213. 106. 214. 148.] ....


card4: 4
['discov

[nan  0.  2.  1.  3.  4.  6.  5.] ....


V52: 9
[nan  0.  2.  1.  3.  4.  5.  6.  7. 12.] ....


V53: 6
[ 1.  0. nan  2.  3.  5.  4.] ....


V54: 7
[ 1.  0. nan  2.  3.  5.  4.  6.] ....


V55: 18
[ 1. nan  4.  2.  3.  5.  6.  0.  7.  8.  9. 14. 10. 11. 12. 13. 15. 16.
 17.] ....


V56: 52
[ 1. nan  4.  2.  3.  5.  6.  0.  9. 10.  7.  8. 11. 12. 13. 14. 15. 24.
 29. 16.] ....


V57: 7
[ 0. nan  1.  2.  6.  3.  4.  5.] ....


V58: 11
[ 0. nan  1.  2.  6.  7.  4.  3.  5.  8.  9. 10.] ....


V59: 17
[ 0. nan  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 12. 11. 13. 14. 15. 16.] ....


V60: 17
[ 0. nan  1.  2.  3.  5.  4.  6.  7.  8.  9. 10. 12. 11. 13. 14. 15. 16.] ....


V61: 7
[ 1. nan  0.  2.  3.  6.  4.  5.] ....


V62: 11
[ 1. nan  2.  0.  3.  6.  4.  7.  5.  8.  9. 10.] ....


V63: 8
[ 0. nan  1.  2.  3.  4.  5.  6.  7.] ....


V64: 8
[ 0. nan  1.  2.  4.  3.  5.  6.  7.] ....


V65: 2
[ 1. nan  0.] ....


V66: 8
[ 1. nan  2.  0.  5.  3.  4.  6.  7.] ....


V67: 9
[ 1. nan  2.  0.  3.  5.

V156: 25
[nan  0.  1.  2.  8.  9.  3.  4.  5.  6.  7. 10. 11. 12. 13. 14. 15. 16.
 17. 18.] ....


V157: 25
[nan  0.  1.  2.  7.  3.  4.  5.  6.  8.  9. 10. 11. 12. 13. 14. 15. 16.
 17. 18.] ....


V158: 25
[nan  0.  1.  2.  8.  3.  9.  4.  5.  6.  7. 10. 11. 12. 13. 14. 15. 16.
 17. 18.] ....


V159: 6663
[           nan 15557.99023438 15607.99023438 15622.99023438
 15652.99023438 15672.99023438     0.         15457.99023438
 15382.99023438 15582.99023438 15632.99023438 15657.99023438
 15732.99023438 15782.99023438 15762.99023438 15862.99023438
    30.         16062.99023438 16077.99023438 16102.99023438] ....


V160: 9621
[           nan 1.69690797e+05 1.69740797e+05 1.69755797e+05
 1.69785797e+05 1.69885797e+05 1.45000000e+02 1.69970797e+05
 1.70020797e+05 1.20000000e+02 1.70320797e+05 0.00000000e+00
 1.70420797e+05 1.70445797e+05 1.70545797e+05 1.70595797e+05
 1.70695797e+05 1.70725797e+05 1.70825797e+05 2.77000000e+02] ....


V161: 79
[ nan   0. 500.  30.  50.  20. 100. 150. 200. 

[nan  1.  4.  5.  6.  2.  3. 59. 60. 61. 62.  8. 63.  9. 10. 64. 65.  7.
 35. 36.] ....


V231: 294
[nan  0.  1.  2.  3.  4.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.  5.  6.
 17. 18.] ....


V232: 338
[nan  0.  2.  1.  3. 33.  7.  8.  4.  9. 10. 11. 12. 13. 14. 15. 16.  5.
  6. 32.] ....


V233: 333
[nan  0.  1.  2.  3. 18.  4.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.  5.
  6. 17.] ....


V234: 122
[nan  0.  1.  2. 35.  3.  4.  5. 34. 22. 49. 50.  7. 51. 18.  8. 54.  6.
 55. 36.] ....


V235: 24
[nan  0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.
 17. 18.] ....


V236: 46
[nan  0.  1.  2. 29.  3.  4.  5. 11.  7. 18.  8. 28. 30. 26. 27. 31.  9.
  6. 19.] ....


V237: 40
[nan  0.  1.  2. 13.  3.  4.  7.  5.  8. 12. 10.  9.  6. 11. 14. 15. 16.
 17. 18.] ....


V238: 24
[nan  0.  1.  2.  3.  4.  7.  5.  6.  8. 11.  9. 14. 10. 23. 16. 12. 13.
 15. 20.] ....


V239: 24
[nan  0.  1.  2.  3.  4.  7.  5.  8.  6. 11.  9. 14. 10. 23. 16. 12. 13.
 15. 20.] ....


V240: 6
[nan  1

[ 0.  1.  2.  7.  4.  3. 13.  5.  8.  6.  9. nan 17. 10. 11. 12. 15. 14.
 16. 18.] ....


V304: 17
[ 0.  1.  2.  3.  7.  4.  5. nan  6.  8.  9. 10. 11. 12. 13. 14. 15. 16.] ....


V305: 2
[ 1.  2. nan] ....


V306: 16210
[   0.           50.          166.21539307   29.          774.
  200.           58.95000076  530.         1054.          500.
   77.          780.          100.           42.59609985  280.
   54.37799835  107.94999695   75.88749695   66.15989685   89.9654007 ] ....


V307: 37367
[  117.             0.          1758.           166.21539307
    60.           100.           663.5          170.91999817
    29.          9969.5          145.           774.
    48.95000076    49.         10169.5          103.94999695
   421.8999939    527.           167.8999939    140.        ] ....


V308: 23064
[   0.          925.          166.21539307  102.5          27.96999931
   29.         2507.5          25.          774.           49.
 2707.5         527.           58.95000076  655.

id_16: 2
[nan 'NotFound' 'Found'] ....


id_17: 104
[ nan 166. 121. 225. 102. 148. 199. 146. 144. 133. 100. 130. 218. 150.
 195. 153. 159. 142. 210. 200.] ....


id_18: 18
[nan 15. 18. 13. 12. 20. 21. 14. 26. 24. 17. 11. 29. 16. 28. 25. 23. 27.
 10.] ....


id_19: 522
[ nan 542. 621. 410. 176. 529. 352. 484. 254. 278. 307. 266. 290. 548.
 122. 215. 100. 345. 242. 193.] ....


id_20: 394
[ nan 144. 500. 142. 507. 575. 600. 533. 333. 549. 566. 305. 401. 391.
 535. 325. 222. 277. 368. 597.] ....


id_21: 490
[ nan 252. 657. 724. 228. 369. 796. 755. 848. 734. 849. 596. 672. 255.
 457. 164. 409. 130. 680. 510.] ....


id_22: 25
[nan 14. 41. 21. 33. 35. 19. 20. 31. 12. 36. 23. 28. 40. 24. 39. 22. 43.
 26. 44.] ....


id_23: 3
[nan 'IP_PROXY:TRANSPARENT' 'IP_PROXY:ANONYMOUS' 'IP_PROXY:HIDDEN'] ....


id_24: 12
[nan 11. 15. 16. 12. 21. 18. 25. 26. 19. 23. 24. 17.] ....


id_25: 341
[ nan 321. 161. 460. 426. 205. 268. 509. 132. 516. 485. 365. 501. 427.
 356. 191. 514. 442. 525. 533.] ....


id_

In [12]:
train_noV = train.drop(columns=[x for x in train.columns if x.startswith("V")])
train_noV.shape

(590540, 95)

In [13]:
frauds = train_noV.shape[0]
yes = train_noV['isFraud'].sum()
no = frauds - yes
print('{} frauds'.format(frauds))
print('{} yes \n{} no'.format(yes,no))

590540 frauds
20663 yes 
569877 no


# Hold Out Validation

In [14]:
train_noV.dtypes

TransactionID       int64
isFraud             int64
TransactionDT       int64
TransactionAmt    float64
ProductCD          object
                   ...   
id_36              object
id_37              object
id_38              object
DeviceType         object
DeviceInfo         object
Length: 95, dtype: object

In [15]:
train_noV_noObjects = train_noV.loc[:, train_noV.dtypes != object]
train_noV_noObjects.shape

(590540, 64)

In [16]:
# dict(zip(train_noV_noObjects.columns, train_noV_noObjects.isna().sum()))

In [17]:
y = train_noV_noObjects.pop('isFraud').values   #target classes
Xraw = train_noV_noObjects.values 
# Using the median for missing values
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp = imp.fit(Xraw)
Ximp = imp.transform(Xraw)
# normalise the raw data
scaler = StandardScaler()
X = scaler.fit_transform(Ximp)
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=1/3)
print("X_test shape: ",X_test.shape)

X_test shape:  (196847, 63)


error seems to come from dividing by a NaN 


In [18]:
# train a NB model and predict for test data
mnb = GaussianNB()
fraud_NB = mnb.fit(X_train, y_train)
y_dash = fraud_NB.predict(X_test)

In [19]:
y_dash

array([0, 0, 1, ..., 0, 0, 0])

# Accuracy & Confusion Matrix

In [20]:
acc = accuracy_score(y_test, y_dash)
print("Accuracy: {0:.2f}".format(acc)) 
confusion = confusion_matrix(y_test, y_dash)
print("Confusion matrix:\n{}".format(confusion)) 

Accuracy: 0.89
Confusion matrix:
[[173611  16429]
 [  4490   2317]]


In [21]:
y_dash_prob = fraud_NB.predict_proba(X_test)
y_dash_prob_dec3 = np.round(y_dash_prob, decimals=3)[:,1] # get column 1

# Test data


In [22]:
# Read test data
test_transaction = pd.read_csv("data/ieee-fraud-detection/test_transaction.csv")
print("test_transaction.csv shape:", test_transaction.shape)
test_identity = pd.read_csv("data/ieee-fraud-detection/test_identity.csv")
print("test_identity.csv shape:", test_identity.shape)
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
test_drop = test.drop(columns=[x for x in test.columns if x.startswith("V")])
test_drop = test_drop.loc[:, test_drop.dtypes != object]

test_transaction.csv shape: (506691, 393)
test_identity.csv shape: (141907, 41)


In [23]:
test_drop.shape

(506691, 63)

In [24]:
# Using the median for train for my missing values
test_drop = imp.transform(test_drop)
# normalise the raw data
test_drop = scaler.fit_transform(test_drop)

In [25]:
test_dash_prob = fraud_NB.predict_proba(test_drop)
test_dash_prob_dec3 = np.round(test_dash_prob, decimals=3)[:,1] # get column 1

In [26]:
submission = pd.DataFrame({"TransactionID": test.TransactionID, "isFraud": test_dash_prob_dec3}) 

In [27]:
submission.to_csv("data/submission.csv", index=False)

In [28]:
mnb.get_params()

{'priors': None, 'var_smoothing': 1e-09}

In [29]:
submission.shape

(506691, 2)