In [33]:
"""
The purpose of this notebook is to downcast the datatypes of the numerical columns in the dataset, 
so that we can save memory while working with the dataset later.

The whole work of fraud detection is done in another notebook called 'fraud_detection.ipynb',
which uses the downcasted dataset.
""";

In [3]:
import pandas as pd
import numpy as np

In [4]:
import os  # needed for os.path.abspath below; it was previously not imported in this notebook
from os import listdir
from os.path import isfile, join

# Absolute path of the current working directory; the data files are expected in its "data" subfolder.
loc = os.path.abspath("")
data_loc = f"{loc}/data"
print(data_loc)

C:\Users\ahmed\Desktop\IEEE-CIS Fraud Detection/data


In [5]:
%%time
# Read the four raw CSV files found under data_loc, in the order the names are listed.
train_id_before, train_tr_before, test_id_before, test_tr_before = (
    pd.read_csv(f"{data_loc}/{stem}.csv")
    for stem in (
        "train_identity",
        "train_transaction",
        "test_identity",
        "test_transaction",
    )
)

Wall time: 27.2 s


In [6]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each report.
train_id_before.info()
train_tr_before.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionID  144233 non-null  int64  
 1   id_01          144233 non-null  float64
 2   id_02          140872 non-null  float64
 3   id_03          66324 non-null   float64
 4   id_04          66324 non-null   float64
 5   id_05          136865 non-null  float64
 6   id_06          136865 non-null  float64
 7   id_07          5155 non-null    float64
 8   id_08          5155 non-null    float64
 9   id_09          74926 non-null   float64
 10  id_10          74926 non-null   float64
 11  id_11          140978 non-null  float64
 12  id_12          144233 non-null  object 
 13  id_13          127320 non-null  float64
 14  id_14          80044 non-null   float64
 15  id_15          140985 non-null  object 
 16  id_16          129340 non-null  object 
 17  id_17          139369 non-nul

In [35]:
"""
Loading the dataset consumes 1.7+ GB of memory on our system, so something must be
done to reduce the memory consumed. It can be seen that all the numerical columns are of
the float64 or int64 datatypes, so we can downcast these datatypes to 32 bit or 16 bit.
In order to choose which datatype to convert to, we have to check the maximum value in
each dataframe.

If max_value > 2.1B (the signed 32-bit integer limit), we cannot downcast the datatypes. They should remain at 64 bit.

If 2.1B > max_value > 65K, we have to downcast the numerical datatypes to 32 bit.

If 65K > max_value > 255, we can downcast the numerical datatypes to 16 bit.

Note: 65K and 255 are the unsigned 16-bit and 8-bit limits; for signed integer types the
corresponding limits are about 32K and 127. These limits apply to integer columns only:
float32 keeps roughly 7 significant digits, so float values may be rounded when downcast.

""";

Checking maximum values in each dataframe:

In [25]:
# Per-column maximum of every numeric column in the identity training frame.
x = train_id_before.select_dtypes(include=[np.number]).max()
print(x)
print('_______________________________________')
print(type(x))
print('_______________________________________')
# Series.max() skips NaN entries, whereas the builtin max() returns NaN
# whenever a NaN (e.g. from an all-NaN column) is the first element.
print(x.max())

TransactionID    3577534.0
id_01                  0.0
id_02             999595.0
id_03                 10.0
id_04                  0.0
id_05                 52.0
id_06                  0.0
id_07                 61.0
id_08                  0.0
id_09                 25.0
id_10                  0.0
id_11                100.0
id_13                 64.0
id_14                720.0
id_17                229.0
id_18                 29.0
id_19                671.0
id_20                661.0
id_21                854.0
id_22                 44.0
id_24                 26.0
id_25                548.0
id_26                216.0
id_32                 32.0
dtype: float64
_______________________________________
<class 'pandas.core.series.Series'>
_______________________________________
3577534.0


In [26]:
# Largest numeric value in each of the four frames, printed in loading order.
# The inner .max() gives one maximum per column; the outer Series.max() skips
# NaN, which the builtin max() does not handle reliably.
for frame in (train_id_before, train_tr_before, test_id_before, test_tr_before):
    print(frame.select_dtypes(include=[np.number]).max().max())

3577534.0
15811131.0
4170239.0
34214345.0


The maximum value is about 34 million. Hence, we can downcast the numerical values to 32 bit: a 32-bit signed integer can store positive values up to about 2.1 billion.

In [27]:
# Function for downcasting the datatypes to reduce memory usage:
def downcast_dtypes(df):
    """Downcast the 64-bit numeric columns of ``df`` to 32 bit, in place.

    ``float64`` columns are converted to ``float32`` and ``int64`` columns to
    ``int32``. A column is left at 64 bit (and reported) when the cast would
    corrupt its values: an integer outside the int32 range would silently
    wrap around, and a finite float beyond the float32 range would turn into
    ``inf``. Columns of any other dtype are not touched.

    Note that float32 keeps only about 7 significant decimal digits, so
    float values may be rounded by the cast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Prints the percentage of memory saved, measured with
    ``memory_usage(deep=True)``.
    """
    _start = df.memory_usage(deep=True).sum() / 1024 ** 2

    int32_range = np.iinfo(np.int32)
    float32_max = np.finfo(np.float32).max

    def _int_fits(col):
        # An empty column has no value that could overflow.
        return col.empty or (
            col.min() >= int32_range.min and col.max() <= int32_range.max
        )

    def _float_fits(col):
        values = col.to_numpy()
        # NaN and +/-inf are representable in float32; only finite values can overflow.
        finite = values[np.isfinite(values)]
        return finite.size == 0 or np.abs(finite).max() <= float32_max

    float_cols, int_cols, skipped = [], [], []
    for c in df:
        dtype = df[c].dtype
        if dtype == "float64":
            (float_cols if _float_fits(df[c]) else skipped).append(c)
        elif dtype == "int64":
            (int_cols if _int_fits(df[c]) else skipped).append(c)

    # Skip the assignment entirely when there is nothing to convert.
    if float_cols:
        df[float_cols] = df[float_cols].astype(np.float32)
    if int_cols:
        df[int_cols] = df[int_cols].astype(np.int32)

    _end = df.memory_usage(deep=True).sum() / 1024 ** 2

    # Guard against a frame that reports zero memory (avoids a nan percentage).
    saved = (_start - _end) / _start * 100 if _start else 0.0

    if skipped:
        print(f"Kept at 64 bit (values exceed the 32-bit range): {skipped}")
    print(f"Saved {saved:.2f}%")

    return df

In [28]:
# Run downcast_dtypes over each loaded frame, in loading order, and bind the results.
train_id, train_tr, test_id, test_tr = map(
    downcast_dtypes,
    (train_id_before, train_tr_before, test_id_before, test_tr_before),
)

Saved 8.38%
Saved 40.75%
Saved 8.43%
Saved 40.52%


In [34]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each report.
train_id.info()
train_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144233 entries, 0 to 144232
Data columns (total 41 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionID  144233 non-null  int32  
 1   id_01          144233 non-null  float32
 2   id_02          140872 non-null  float32
 3   id_03          66324 non-null   float32
 4   id_04          66324 non-null   float32
 5   id_05          136865 non-null  float32
 6   id_06          136865 non-null  float32
 7   id_07          5155 non-null    float32
 8   id_08          5155 non-null    float32
 9   id_09          74926 non-null   float32
 10  id_10          74926 non-null   float32
 11  id_11          140978 non-null  float32
 12  id_12          144233 non-null  object 
 13  id_13          127320 non-null  float32
 14  id_14          80044 non-null   float32
 15  id_15          140985 non-null  object 
 16  id_16          129340 non-null  object 
 17  id_17          139369 non-nul

In [None]:
"""
It can now be seen that the memory consumed by the two dataframes is reduced to 919 MB. Next,
we save the dataframes to our local drive.
"""

Saving the downcasted datasets to our local drive.

In [37]:
# Write each downcasted frame to a gzip-compressed parquet file under data_loc.
for stem, frame in (
    ("train_identity", train_id),
    ("train_transaction", train_tr),
    ("test_identity", test_id),
    ("test_transaction", test_tr),
):
    frame.to_parquet(f"{data_loc}/{stem}.parquet.gzip", compression='gzip')

Loading the downcasted datasets to check whether the time taken to load is indeed reduced.

In [38]:
%%time
# Reload the four parquet files written to data_loc, in the order the names are listed.
train_id, train_tr, test_id, test_tr = [
    pd.read_parquet(f"{data_loc}/{stem}.parquet.gzip")
    for stem in (
        "train_identity",
        "train_transaction",
        "test_identity",
        "test_transaction",
    )
]

Wall time: 1.63 s
