## Import the library

* pandas
* numpy
* matplotlib
* seaborn
* sklearn
* dask

In [1]:
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
import sklearnex
import dask_cudf
import cudf
from xgboost import XGBClassifier
from xgboost import XGBRegressor

# INTEL sklearn patching for optimizing sklearn
from sklearnex import patch_sklearn
patch_sklearn(global_patch=True)

# Reimport sklearn
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression

# # Modin, faster alternative compare to Pandas

# import modin.pandas as pd
# import modin.config as cfg

# cfg.Engine.put('Dask')
# cfg.Memory.put(9000000000)
# cfg.CpuCount.put(4)

#Dask
import dask.dataframe as dd
from dask.distributed import Client
client = Client()
from dask.diagnostics import ProgressBar
pbar = ProgressBar()                
pbar.register()
from pprint import pprint

#Ray
# import ray
# ray.init()

Scikit-learn was successfully globally patched by Intel(R) Extension for Scikit-learn


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
2022-09-14 13:59:42,652 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-9j27acrh', purging
2022-09-14 13:59:42,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-4w4q_ubb', purging
2022-09-14 13:59:42,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-11ph1jmm', purging
2022-09-14 13:59:42,654 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-edaw53kh', purging
2022-09-14 13:59:42,654 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-vyivrila', purging
2022-09-14 13:59:42,654 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-s0e3vt01', purging
2022-09-14 13:59:42,654 - distributed.diskutils -

In [2]:
# Set max display dataframe
import pandas as pd
pd.set_option('display.max_column', None)
pd.set_option('display.max_rows', None)

## Preprocessing
- Import the train and test data
- Numerical features
- Categorical features
- Data type
- Missing value
- Train test split
- Handling missing values, data imputation or encoding or both
- Feature scaling, standardzation or normalization if necessary

In [3]:
# check the number CPU
# import modin
# print(modin.config.NPartitions.get())

In [4]:
# Check Dask client status
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.48 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:38403,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.48 GiB

0,1
Comm: tcp://127.0.0.1:43005,Total threads: 2
Dashboard: http://127.0.0.1:45813/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:46403,
Local directory: /tmp/dask-worker-space/worker-j97uk55n,Local directory: /tmp/dask-worker-space/worker-j97uk55n
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:36735,Total threads: 2
Dashboard: http://127.0.0.1:46663/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:46399,
Local directory: /tmp/dask-worker-space/worker-56n0f1ws,Local directory: /tmp/dask-worker-space/worker-56n0f1ws
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:38089,Total threads: 2
Dashboard: http://127.0.0.1:44153/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:37721,
Local directory: /tmp/dask-worker-space/worker-7_c5brj3,Local directory: /tmp/dask-worker-space/worker-7_c5brj3
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB

0,1
Comm: tcp://127.0.0.1:41289,Total threads: 2
Dashboard: http://127.0.0.1:44269/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:45521,
Local directory: /tmp/dask-worker-space/worker-5m47fmbh,Local directory: /tmp/dask-worker-space/worker-5m47fmbh
GPU: NVIDIA GeForce RTX 3050,GPU memory: 8.00 GiB


In [5]:
train_data = '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/df_parquet_train'


In [6]:
test_data = '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/df_parquet_test'

In [7]:
target_data = '/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/amex-default-prediction/train_labels.csv'

In [8]:
# target data, y
# in this case, assume train labels share similar index and customer id arrangement
# else, have to merge, common column is customer_ID
# then separate target with training data 
# convert dataframe into series to feed the target data into the prediction model

# y = dd.read_csv(target_data)
y = dask_cudf.read_csv(target_data)

In [9]:
y.head()

Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [10]:
y = y.drop(['customer_ID'], axis=1)

In [11]:
y.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [12]:
type(y)

dask_cudf.core.DataFrame

In [13]:
y = y.squeeze()

In [14]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [15]:
type(y)

dask_cudf.core.Series

In [16]:
# df = dd.read_parquet(train_data)
gdf = dask_cudf.read_parquet(train_data)

In [17]:
# df
gdf

Unnamed: 0_level_0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
npartitions=52,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1
,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,object,object,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,int64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [18]:
# # Number of rows

# with ProgressBar():
#     row = df.shape[0].compute()

# row

In [19]:
# df.shape[0]
gdf.shape[0]

Delayed('int-f7f3087b-ab47-4d7c-826c-310e94a21c98')

In [20]:
# Number of columns/features
# df.shape[1]
gdf.shape[1]

190

In [21]:
# df.info()
gdf.info()

<class 'dask_cudf.core.DataFrame'>
Columns: 190 entries, customer_ID to D_145
dtypes: object(4), float64(185), int64(1)

In [22]:
# df.columns
gdf.columns

Index(['customer_ID', 'S_2', 'P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41',
       'B_3',
       ...
       'D_136', 'D_137', 'D_138', 'D_139', 'D_140', 'D_141', 'D_142', 'D_143',
       'D_144', 'D_145'],
      dtype='object', length=190)

In [23]:
# Out of much memory
# pandas_df = df.compute()

In [24]:
# df.dtypes
gdf.dtypes

customer_ID     object
S_2             object
P_2            float64
D_39           float64
B_1            float64
B_2            float64
R_1            float64
S_3            float64
D_41           float64
B_3            float64
D_42           float64
D_43           float64
D_44           float64
B_4            float64
D_45           float64
B_5            float64
R_2            float64
D_46           float64
D_47           float64
D_48           float64
D_49           float64
B_6            float64
B_7            float64
B_8            float64
D_50           float64
D_51           float64
B_9            float64
R_3            float64
D_52           float64
P_3            float64
B_10           float64
D_53           float64
S_5            float64
B_11           float64
S_6            float64
D_54           float64
R_4            float64
S_7            float64
B_12           float64
S_8            float64
D_55           float64
D_56           float64
B_13           float64
R_5        

In [25]:
# df.head()
gdf.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,,0.00063,0.080986,0.708906,0.1706,0.006204,0.358587,0.525351,0.255736074,,1.00008,0.006805,,0.002052,0.005972,,0.004345,0.001535,,,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,,,0.002526,0.069419,0.712795,0.113239,0.006206,0.35363,0.521311,0.223328869,,1.008344,0.004407,,0.001034,0.004838,,0.007495,0.004931,,,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,,0.007605,0.068839,0.720884,0.060492,0.003259,0.33465,0.524568,0.189423979,,1.006878,0.003221,,0.005681,0.005497,,0.009227,0.009123,,,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960384,0.002455,0.013683,1.0027,0.001373,0.117169,0.000685,0.005531,,,0.006406,0.05563,0.723997,0.166782,0.009918,0.323271,0.530929,0.135586161,,1.007573,0.007703,,0.007108,0.008261,,0.007206,0.002409,,,,,,0.006117,0.004516,0.0032,,0.008419,0.006527,0.0096
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947248,0.002483,0.015193,1.000727,0.007605,0.117325,0.004653,0.009312,,,0.007731,0.038862,0.720619,0.14363,0.006667,0.231009,0.529305,,,1.008132,0.009823,,0.00968,0.004848,,0.006312,0.004462,,,,,,0.003671,0.004946,0.008889,,0.00167,0.008126,0.009827


In [26]:
# df.isnull().sum().compute()
gdf.isnull().sum().compute()

customer_ID          0
S_2                  0
P_2              45985
D_39                 0
B_1                  0
B_2               2016
R_1                  0
S_3            1020544
D_41              2016
B_3               2016
D_42           4740137
D_43           1658396
D_44            274319
B_4                  0
D_45              2017
B_5                  0
R_2                  0
D_46           1211699
D_47                 0
D_48            718725
D_49           4985917
B_6                233
B_7                  0
B_8              22268
D_50           3142402
D_51                 0
B_9                  0
R_3                  0
D_52             29563
P_3             301492
B_10                 0
D_53           4084585
S_5                  0
B_11                 0
S_6                  0
D_54              2016
R_4                  0
S_7            1020544
B_12                 0
S_8                  0
D_55            184803
D_56           2990943
B_13             49519
R_5        

In [27]:
# df.duplicated().sum()

In [30]:
# as we poke into the data, looks like S2 is object with date data
# so we have to parse it

# df['S_2'] = dd.to_datetime(df['S_2'], infer_datetime_format=False ,format="%Y/%m/%d")
# gdf['S_2'] = dask_cudf.to_datetime(gdf['S_2'])
gdf['S_2'] = gdf['S_2'].astype('datetime64[s]')

In [32]:
# df['S_2'].head()
gdf['S_2'].head()

0   2017-03-09
1   2017-04-07
2   2017-05-28
3   2017-06-13
4   2017-07-16
Name: S_2, dtype: datetime64[s]

In [33]:
# df['S_2'].dtype
gdf['S_2'].dtype

dtype('<M8[s]')

In [34]:
# df.shape[0].compute()
gdf.shape[0].compute()

Key:       ('read-parquet-4c003f3df95633838148a445d64052e5', 19)
Function:  subgraph_callable-48c014e3-1e84-4797-867d-78c5a43f
args:      ({'piece': ('/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/df_parquet_train/part.19.parquet', None, None)})
kwargs:    {}
Exception: "MemoryError('Parquet data was larger than the available GPU memory!\\n\\nSee the notes on split_row_groups in the read_parquet documentation.\\n\\nOriginal Error: std::bad_alloc: out_of_memory: CUDA error at: /home/allifizzuddin/anaconda3/envs/dev_v2/include/rmm/mr/device/cuda_memory_resource.hpp')"

Key:       ('read-parquet-4c003f3df95633838148a445d64052e5', 2)
Function:  subgraph_callable-48c014e3-1e84-4797-867d-78c5a43f
args:      ({'piece': ('/mnt/b7b917e1-da96-4995-b7db-c30035d41dbe/Machine Learning Project/AMEX_2022/df_parquet_train/part.2.parquet', None, None)})
kwargs:    {}
Exception: "MemoryError('Parquet data was larger than the available GPU memory!\\n\\nSee the notes on spli

MemoryError: Parquet data was larger than the available GPU memory!

See the notes on split_row_groups in the read_parquet documentation.

Original Error: std::bad_alloc: out_of_memory: CUDA error at: /home/allifizzuddin/anaconda3/envs/dev_v2/include/rmm/mr/device/cuda_memory_resource.hpp

In [None]:
len(y)

In [None]:
# It seems the train and the target data has different shape
# df = 5531451 rows
# y = 458913 rows

In [None]:
# cols_miss_val = [cols for cols in df.columns
#                 if df[cols].isnull().any().compute()]

cols_miss_val = [cols for cols in gdf.columns
                if gdf[cols].isnull().any().compute()]

In [None]:
cols_miss_val

In [None]:
# Split train test the data

X_train, X_test, y_train, y_test = train_test_split(gdf, y, test_size=20, train_size=80, random_state=0)