# Prepare identity

Dataset Description
-------------------

In this competition you are predicting the probability that an online transaction is fraudulent, as denoted by the binary target `isFraud`.

The data is broken into two files `identity` and `transaction`, which are joined by `TransactionID`. Not all transactions have corresponding identity information.

### Categorical Features - Identity

*   `DeviceType`
*   `DeviceInfo`
*   `id_12` - `id_38`

The `TransactionDT` feature is a timedelta from a given reference datetime (not an actual timestamp).

You can read more about the data from [this post by the competition host](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203).

Files
-----

*   **train\_{transaction, identity}.csv** - the training set
*   **test\_{transaction, identity}.csv** - the test set (you must predict the `isFraud` value for these observations)
*   **sample\_submission.csv** - a sample submission file in the correct format

Link: https://www.kaggle.com/competitions/ieee-fraud-detection

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
scaler = StandardScaler()
le = LabelEncoder()

<IPython.core.display.Javascript object>

In [4]:
test_identity_df = pd.read_csv(
    "../../data/ieee-fraud-detection/test_identity.csv"
).set_index("TransactionID")
test_identity_df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3663586,-45.0,280290.0,,,0.0,0.0,,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
3663588,0.0,3579.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 67.0 for android,24.0,1280x720,match_status:2,T,F,T,T,mobile,LGLS676 Build/MXB48T
3663597,-5.0,185210.0,,,1.0,0.0,,,,,...,ie 11.0 for tablet,,,,F,T,T,F,desktop,Trident/7.0
3663601,-45.0,252944.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 67.0 for android,,,,F,F,T,F,mobile,MYA-L13 Build/HUAWEIMYA-L13
3663602,-95.0,328680.0,,,7.0,-33.0,,,,,...,chrome 67.0 for android,,,,F,F,T,F,mobile,SM-G9650 Build/R16NW
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-20.0,473365.0,,,0.0,0.0,,,,,...,chrome 71.0 for android,,,,F,F,T,F,mobile,SM-J700M
4170233,-5.0,489917.0,0.0,0.0,-4.0,-32.0,,,0.0,0.0,...,chrome 71.0 for android,,,,F,F,T,F,mobile,SM-J320M
4170234,-5.0,110081.0,,,22.0,-31.0,,,,,...,mobile safari 10.0,32.0,1334x750,match_status:2,T,F,F,T,mobile,iOS Device
4170236,-45.0,266704.0,,,-3.0,-10.0,,,,,...,chrome 43.0 for android,,,,F,F,T,F,mobile,ALE-L23 Build/HuaweiALE-L23


<IPython.core.display.Javascript object>

In [5]:
train_identity_df = pd.read_csv(
    "../../data/ieee-fraud-detection/train_identity.csv"
).set_index("TransactionID")
train_identity_df.columns = train_identity_df.columns.str.replace("_", "-")
train_identity_df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,id-31,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3577521,-15.0,145955.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,chrome 66.0 for android,,,,F,F,T,F,mobile,F3111 Build/33.3.A.1.97
3577526,-5.0,172059.0,,,1.0,-5.0,,,,,...,chrome 55.0 for android,32.0,855x480,match_status:2,T,F,T,F,mobile,A574BL Build/NMF26F
3577529,-20.0,632381.0,,,-1.0,-36.0,,,,,...,chrome 65.0 for android,,,,F,F,T,F,mobile,Moto E (4) Plus Build/NMA26.42-152
3577531,-5.0,55528.0,0.0,0.0,0.0,-7.0,,,0.0,0.0,...,chrome 66.0,24.0,2560x1600,match_status:2,T,F,T,F,desktop,MacOS


<IPython.core.display.Javascript object>

In [6]:
df = pd.concat([train_identity_df, test_identity_df]).sort_index()
df["isTest"] = df.index.isin(test_identity_df.index).astype(int)
df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,id-32,id-33,id-34,id-35,id-36,id-37,id-38,DeviceType,DeviceInfo,isTest
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,0
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device,0
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,,,,F,F,T,T,desktop,Windows,0
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,,,,F,F,T,T,desktop,,0
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-20.0,473365.0,,,0.0,0.0,,,,,...,,,,F,F,T,F,mobile,SM-J700M,1
4170233,-5.0,489917.0,0.0,0.0,-4.0,-32.0,,,0.0,0.0,...,,,,F,F,T,F,mobile,SM-J320M,1
4170234,-5.0,110081.0,,,22.0,-31.0,,,,,...,32.0,1334x750,match_status:2,T,F,F,T,mobile,iOS Device,1
4170236,-45.0,266704.0,,,-3.0,-10.0,,,,,...,,,,F,F,T,F,mobile,ALE-L23 Build/HuaweiALE-L23,1


<IPython.core.display.Javascript object>

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 286140 entries, 2987004 to 4170239
Data columns (total 41 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id-01       286140 non-null  float64
 1   id-02       277848 non-null  float64
 2   id-03       132805 non-null  float64
 3   id-04       132805 non-null  float64
 4   id-05       271615 non-null  float64
 5   id-06       271615 non-null  float64
 6   id-07       10214 non-null   float64
 7   id-08       10214 non-null   float64
 8   id-09       149264 non-null  float64
 9   id-10       149264 non-null  float64
 10  id-11       277756 non-null  float64
 11  id-12       286140 non-null  object 
 12  id-13       257606 non-null  float64
 13  id-14       151401 non-null  float64
 14  id-15       277962 non-null  object 
 15  id-16       255087 non-null  object 
 16  id-17       275335 non-null  float64
 17  id-18       95988 non-null   float64
 18  id-19       275224 non-null  float64


<IPython.core.display.Javascript object>

In [8]:
(df.isna().sum() / len(df)).sort_values(ascending=False)

id-24         0.966845
id-25         0.964454
id-26         0.964318
id-07         0.964304
id-08         0.964304
id-21         0.964290
id-22         0.964245
id-23         0.964245
id-27         0.964245
id-18         0.664542
id-04         0.535874
id-03         0.535874
id-33         0.496890
id-30         0.481988
id-32         0.481873
id-09         0.478353
id-10         0.478353
id-34         0.475851
id-14         0.470885
DeviceInfo    0.183187
id-16         0.108524
id-13         0.099720
id-05         0.050762
id-06         0.050762
id-20         0.039302
id-19         0.038149
id-17         0.037761
id-31         0.032267
DeviceType    0.029353
id-28         0.029300
id-29         0.029300
id-11         0.029300
id-02         0.028979
id-15         0.028580
id-35         0.028580
id-36         0.028580
id-37         0.028580
id-38         0.028580
id-01         0.000000
id-12         0.000000
isTest        0.000000
dtype: float64

<IPython.core.display.Javascript object>

# Feature engineering

## Numeric

In [9]:
numeric_df = df._get_numeric_data().drop("isTest", axis=1).copy()
numeric_df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,id-17,id-18,id-19,id-20,id-21,id-22,id-24,id-25,id-26,id-32
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.0,70787.0,,,,,,,,,...,166.0,,542.0,144.0,,,,,,32.0
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,...,166.0,,621.0,500.0,,,,,,32.0
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,...,121.0,,410.0,142.0,,,,,,
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,...,225.0,,176.0,507.0,,,,,,
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,...,166.0,15.0,529.0,575.0,,,,,,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-20.0,473365.0,,,0.0,0.0,,,,,...,225.0,,153.0,325.0,,,,,,
4170233,-5.0,489917.0,0.0,0.0,-4.0,-32.0,,,0.0,0.0,...,225.0,17.0,417.0,595.0,,,,,,
4170234,-5.0,110081.0,,,22.0,-31.0,,,,,...,166.0,15.0,122.0,177.0,,,,,,32.0
4170236,-45.0,266704.0,,,-3.0,-10.0,,,,,...,225.0,15.0,176.0,507.0,,,,,,


<IPython.core.display.Javascript object>

In [10]:
numeric_df[numeric_df.columns] = scaler.fit_transform(numeric_df.fillna(0))
numeric_df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,id-17,id-18,id-19,id-20,id-21,id-22,id-24,id-25,id-26,id-32
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.744040,-0.625225,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,-0.362672,-0.700402,1.331305,-1.434781,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,1.364417
2987008,0.397763,-0.461389,-0.059848,0.064284,-0.269624,0.088768,-0.142214,0.157637,-0.060508,0.079481,...,-0.362672,-0.700402,1.847440,0.638359,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,1.364417
2987010,0.397763,0.077901,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,-1.318933,-0.700402,0.468903,-1.446428,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,-1.016641
2987011,0.397763,0.253624,-0.059848,0.064284,-0.269624,0.025730,-0.142214,0.157637,-0.060508,0.079481,...,0.891091,-0.700402,-1.059900,0.679123,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,-1.016641
2987016,0.744040,-0.993690,-0.059848,0.064284,-0.071304,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,-0.362672,1.454544,1.246372,1.075117,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,0.769153
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-0.641068,1.717158,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,0.891091,-0.700402,-1.210167,-0.380741,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,-1.016641
4170233,0.397763,1.813465,-0.059848,0.064284,-1.062900,-1.613244,-0.142214,0.157637,-0.060508,0.079481,...,0.891091,1.741870,0.514637,1.191585,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,-1.016641
4170234,0.397763,-0.396595,-0.059848,0.064284,4.093396,-1.550206,-0.142214,0.157637,-0.060508,0.079481,...,-0.362672,1.454544,-1.412701,-1.242608,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,1.364417
4170236,-2.372452,0.514710,-0.059848,0.064284,-0.864581,-0.226420,-0.142214,0.157637,-0.060508,0.079481,...,0.891091,1.454544,-1.059900,0.679123,-0.170535,-0.17819,-0.180778,-0.184696,-0.188009,-1.016641


<IPython.core.display.Javascript object>

## Categorical

In [11]:
cat_columns = set(df.columns) - set(df._get_numeric_data().columns)
cat_columns

{'DeviceInfo',
 'DeviceType',
 'id-12',
 'id-15',
 'id-16',
 'id-23',
 'id-27',
 'id-28',
 'id-29',
 'id-30',
 'id-31',
 'id-33',
 'id-34',
 'id-35',
 'id-36',
 'id-37',
 'id-38'}

<IPython.core.display.Javascript object>

In [12]:
categorical_df = df[cat_columns].apply(le.fit_transform).copy()
categorical_df

  categorical_df = df[cat_columns].apply(le.fit_transform).copy()


Unnamed: 0_level_0,DeviceType,id-36,id-31,id-28,id-33,id-16,id-30,id-38,DeviceInfo,id-27,id-34,id-37,id-29,id-23,id-15,id-12,id-35
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2987004,1,0,161,1,268,1,7,1,1565,2,3,1,1,3,1,1,1
2987008,1,0,130,1,80,1,70,1,2693,2,2,0,1,3,1,1,1
2987010,0,0,46,0,461,0,87,1,2526,2,4,1,0,3,0,1,0
2987011,0,0,46,1,461,1,87,1,2799,2,4,1,1,3,1,1,0
2987016,0,0,46,0,68,0,26,1,1170,2,3,1,0,3,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,1,0,74,1,461,1,87,0,2165,2,4,1,1,3,1,1,0
4170233,1,0,74,0,461,0,87,0,2106,2,4,1,0,3,0,1,0
4170234,1,0,129,1,80,1,63,1,2693,2,3,0,1,3,1,1,1
4170236,1,0,22,1,461,1,87,0,141,2,4,1,1,3,1,1,0


<IPython.core.display.Javascript object>

In [13]:
prep_df = numeric_df.join(categorical_df).join(df[["isTest"]])
prep_df

Unnamed: 0_level_0,id-01,id-02,id-03,id-04,id-05,id-06,id-07,id-08,id-09,id-10,...,DeviceInfo,id-27,id-34,id-37,id-29,id-23,id-15,id-12,id-35,isTest
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987004,0.744040,-0.625225,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,1565,2,3,1,1,3,1,1,1,0
2987008,0.397763,-0.461389,-0.059848,0.064284,-0.269624,0.088768,-0.142214,0.157637,-0.060508,0.079481,...,2693,2,2,0,1,3,1,1,1,0
2987010,0.397763,0.077901,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,2526,2,4,1,0,3,0,1,0,0
2987011,0.397763,0.253624,-0.059848,0.064284,-0.269624,0.025730,-0.142214,0.157637,-0.060508,0.079481,...,2799,2,4,1,1,3,1,1,0,0
2987016,0.744040,-0.993690,-0.059848,0.064284,-0.071304,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,1170,2,3,1,0,3,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4170230,-0.641068,1.717158,-0.059848,0.064284,-0.269624,0.403955,-0.142214,0.157637,-0.060508,0.079481,...,2165,2,4,1,1,3,1,1,0,1
4170233,0.397763,1.813465,-0.059848,0.064284,-1.062900,-1.613244,-0.142214,0.157637,-0.060508,0.079481,...,2106,2,4,1,0,3,0,1,0,1
4170234,0.397763,-0.396595,-0.059848,0.064284,4.093396,-1.550206,-0.142214,0.157637,-0.060508,0.079481,...,2693,2,3,0,1,3,1,1,1,1
4170236,-2.372452,0.514710,-0.059848,0.064284,-0.864581,-0.226420,-0.142214,0.157637,-0.060508,0.079481,...,141,2,4,1,1,3,1,1,0,1


<IPython.core.display.Javascript object>

# Save

In [14]:
prep_df.to_orc("../../data/ieee-fraud-detection/ieee_identity.orc")

<IPython.core.display.Javascript object>