# Prepare identity

Dataset Description
-------------------

In this competition you are predicting the probability that an online transaction is fraudulent, as denoted by the binary target `isFraud`.

The data is broken into two files `identity` and `transaction`, which are joined by `TransactionID`. Not all transactions have corresponding identity information.

### Categorical Features - Transaction

*   `ProductCD`
*   `card1` - `card6`
*   `addr1`, `addr2`
*   `P_emaildomain`
*   `R_emaildomain`
*   `M1` - `M9`

The `TransactionDT` feature is a timedelta from a given reference datetime (not an actual timestamp).

You can read more about the data from [this post by the competition host](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203).

Files
-----

*   **train\_{transaction, identity}.csv** - the training set
*   **test\_{transaction, identity}.csv** - the test set (you must predict the `isFraud` value for these observations)
*   **sample\_submission.csv** - a sample submission file in the correct format

Link: https://www.kaggle.com/competitions/ieee-fraud-detection

In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

<IPython.core.display.Javascript object>

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [12]:
scaler = StandardScaler()
le = LabelEncoder()

<IPython.core.display.Javascript object>

In [3]:
test_transaction_df = pd.read_csv(
    "../../data/ieee-fraud-detection/test_transaction.csv"
).set_index("TransactionID")
test_transaction_df

Unnamed: 0_level_0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663549,18403224,31.950,W,10409,111.0,150.0,visa,226.0,debit,170.0,...,,,,,,,,,,
3663550,18403263,49.000,W,4272,111.0,150.0,visa,226.0,debit,299.0,...,,,,,,,,,,
3663551,18403310,171.000,W,4476,574.0,150.0,visa,226.0,debit,472.0,...,,,,,,,,,,
3663552,18403310,284.950,W,10989,360.0,150.0,visa,166.0,debit,205.0,...,,,,,,,,,,
3663553,18403317,67.950,W,18018,452.0,150.0,mastercard,117.0,debit,264.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,34214279,94.679,C,13832,375.0,185.0,mastercard,224.0,debit,284.0,...,,,,,,,,,,
4170236,34214287,12.173,C,3154,408.0,185.0,mastercard,224.0,debit,,...,,,,,,,,,,
4170237,34214326,49.000,W,16661,490.0,150.0,visa,226.0,debit,327.0,...,,,,,,,,,,
4170238,34214337,202.000,W,16621,516.0,150.0,mastercard,224.0,debit,177.0,...,,,,,,,,,,


<IPython.core.display.Javascript object>

In [4]:
train_transaction_df = pd.read_csv(
    "../../data/ieee-fraud-detection/train_transaction.csv"
).set_index("TransactionID")
train_transaction_df

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,
2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,
2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,
2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,
2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,debit,...,,,,,,,,,,
3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,
3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,


<IPython.core.display.Javascript object>

In [5]:
df = pd.concat([train_transaction_df, test_transaction_df]).sort_index()
df["isTest"] = df.index.isin(test_transaction_df.index).astype(int)
df

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V331,V332,V333,V334,V335,V336,V337,V338,V339,isTest
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,0.0,86400,68.500,W,13926,,150.0,discover,142.0,credit,...,,,,,,,,,,0
2987001,0.0,86401,29.000,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,,,,,,0
2987002,0.0,86469,59.000,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,,,,,,0
2987003,0.0,86499,50.000,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,,,,,,0
2987004,0.0,86506,50.000,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,,34214279,94.679,C,13832,375.0,185.0,mastercard,224.0,debit,...,,,,,,,,,,1
4170236,,34214287,12.173,C,3154,408.0,185.0,mastercard,224.0,debit,...,,,,,,,,,,1
4170237,,34214326,49.000,W,16661,490.0,150.0,visa,226.0,debit,...,,,,,,,,,,1
4170238,,34214337,202.000,W,16621,516.0,150.0,mastercard,224.0,debit,...,,,,,,,,,,1


<IPython.core.display.Javascript object>

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1097231 entries, 2987000 to 4170239
Columns: 394 entries, isFraud to isTest
dtypes: float64(377), int64(3), object(14)
memory usage: 3.2+ GB


<IPython.core.display.Javascript object>

In [7]:
(df.isna().sum() / len(df)).sort_values(ascending=False)

dist2             0.932500
D7                0.909727
D12               0.877901
D8                0.863963
D9                0.863963
                    ...   
TransactionDT     0.000000
card1             0.000000
ProductCD         0.000000
TransactionAmt    0.000000
isTest            0.000000
Length: 394, dtype: float64

<IPython.core.display.Javascript object>

# Feature engineering

## Numeric

In [9]:
numeric_df = df._get_numeric_data().drop(["isFraud", "isTest"], axis=1).copy()
numeric_df

Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,86400,68.500,13926,,150.0,142.0,315.0,87.0,19.0,,...,,,,,,,,,,
2987001,86401,29.000,2755,404.0,150.0,102.0,325.0,87.0,,,...,,,,,,,,,,
2987002,86469,59.000,4663,490.0,150.0,166.0,330.0,87.0,287.0,,...,,,,,,,,,,
2987003,86499,50.000,18132,567.0,150.0,117.0,476.0,87.0,,,...,,,,,,,,,,
2987004,86506,50.000,4497,514.0,150.0,102.0,420.0,87.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,34214279,94.679,13832,375.0,185.0,224.0,284.0,60.0,,,...,,,,,,,,,,
4170236,34214287,12.173,3154,408.0,185.0,224.0,,,,157.0,...,,,,,,,,,,
4170237,34214326,49.000,16661,490.0,150.0,226.0,327.0,87.0,,,...,,,,,,,,,,
4170238,34214337,202.000,16621,516.0,150.0,224.0,177.0,87.0,,,...,,,,,,,,,,


<IPython.core.display.Javascript object>

In [13]:
numeric_df[numeric_df.columns] = scaler.fit_transform(numeric_df.fillna(0))
numeric_df

Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,-1.508623,-0.274058,0.817417,-2.186194,-0.176293,-1.260537,0.435966,0.375378,-0.104394,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
2987001,-1.508623,-0.437119,-1.465279,0.285881,-0.176293,-2.159569,0.510346,0.375378,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
2987002,-1.508616,-0.313275,-1.075396,0.812114,-0.176293,-0.721117,0.547536,0.375378,1.069600,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
2987003,-1.508614,-0.350428,1.676877,1.283277,-0.176293,-1.822432,1.633481,0.375378,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
2987004,-1.508613,-0.350428,-1.109316,0.958970,-0.176293,-2.159569,1.216954,0.375378,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,1.646657,-0.165987,0.798209,0.108430,2.094471,0.582480,0.205389,-0.579101,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
4170236,1.646658,-0.506583,-1.383747,0.310357,2.094471,0.582480,-1.906998,-2.700166,-0.187625,0.924042,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
4170237,1.646662,-0.354556,1.376291,0.812114,-0.176293,0.627431,0.525222,0.375378,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324
4170238,1.646663,0.277047,1.368117,0.971208,-0.176293,0.582480,-0.590475,0.375378,-0.187625,-0.103594,...,-0.053743,-0.025069,-0.03211,-0.029217,-0.010574,-0.030063,-0.01842,-0.016521,-0.017925,-0.014324


<IPython.core.display.Javascript object>

## Categorical

In [14]:
cat_columns = set(df.columns) - set(df._get_numeric_data().columns)
cat_columns

{'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'P_emaildomain',
 'ProductCD',
 'R_emaildomain',
 'card4',
 'card6'}

<IPython.core.display.Javascript object>

In [15]:
categorical_df = df[cat_columns].apply(le.fit_transform).copy()
categorical_df

  categorical_df = df[cat_columns].apply(le.fit_transform).copy()


Unnamed: 0_level_0,P_emaildomain,M6,M8,M2,M1,M9,M4,ProductCD,R_emaildomain,card6,card4,M3,M7,M5
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2987000,60,1,2,1,1,2,2,4,60,1,1,1,2,0
2987001,16,1,2,2,2,2,0,4,60,1,2,2,2,1
2987002,35,0,0,1,1,0,0,4,60,2,3,1,0,0
2987003,54,0,2,2,2,2,0,4,60,2,2,2,2,1
2987004,16,2,2,2,2,2,3,1,60,1,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,16,2,2,2,2,2,2,0,16,2,2,2,2,2
4170236,19,2,2,2,2,2,2,0,19,2,2,2,2,2
4170237,19,0,1,1,1,1,0,4,60,2,3,1,0,0
4170238,19,0,0,1,1,0,0,4,60,2,2,1,0,0


<IPython.core.display.Javascript object>

In [16]:
prep_df = numeric_df.join(categorical_df).join(df[["isFraud", "isTest"]])
prep_df

Unnamed: 0_level_0,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,M4,ProductCD,R_emaildomain,card6,card4,M3,M7,M5,isFraud,isTest
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,-1.508623,-0.274058,0.817417,-2.186194,-0.176293,-1.260537,0.435966,0.375378,-0.104394,-0.103594,...,2,4,60,1,1,1,2,0,0.0,0
2987001,-1.508623,-0.437119,-1.465279,0.285881,-0.176293,-2.159569,0.510346,0.375378,-0.187625,-0.103594,...,0,4,60,1,2,2,2,1,0.0,0
2987002,-1.508616,-0.313275,-1.075396,0.812114,-0.176293,-0.721117,0.547536,0.375378,1.069600,-0.103594,...,0,4,60,2,3,1,0,0,0.0,0
2987003,-1.508614,-0.350428,1.676877,1.283277,-0.176293,-1.822432,1.633481,0.375378,-0.187625,-0.103594,...,0,4,60,2,2,2,2,1,0.0,0
2987004,-1.508613,-0.350428,-1.109316,0.958970,-0.176293,-2.159569,1.216954,0.375378,-0.187625,-0.103594,...,3,1,60,1,2,2,2,2,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170235,1.646657,-0.165987,0.798209,0.108430,2.094471,0.582480,0.205389,-0.579101,-0.187625,-0.103594,...,2,0,16,2,2,2,2,2,,1
4170236,1.646658,-0.506583,-1.383747,0.310357,2.094471,0.582480,-1.906998,-2.700166,-0.187625,0.924042,...,2,0,19,2,2,2,2,2,,1
4170237,1.646662,-0.354556,1.376291,0.812114,-0.176293,0.627431,0.525222,0.375378,-0.187625,-0.103594,...,0,4,60,2,3,1,0,0,,1
4170238,1.646663,0.277047,1.368117,0.971208,-0.176293,0.582480,-0.590475,0.375378,-0.187625,-0.103594,...,0,4,60,2,2,1,0,0,,1


<IPython.core.display.Javascript object>

# Save

In [17]:
prep_df.to_orc("../../data/ieee-fraud-detection/ieee_transaction.orc")

<IPython.core.display.Javascript object>